运行淘宝图片爬取脚本时,经常出错:
import urllib.request
import re
# Search keyword; percent-encoded because it is embedded in the query string.
keyname = "秋上新"
key = urllib.request.quote(keyname)

# http.client encodes header values as latin-1, so the User-Agent must not
# contain characters outside that range. The previous value had a "…"
# (U+2026) copied from a truncated display, which raised UnicodeEncodeError
# on the first request; the full Firefox 62 string is used instead.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
)
openers = urllib.request.build_opener()
openers.addheaders = [headers]
# Install globally so that urlopen() sends the browser-like header.
urllib.request.install_opener(openers)

# Compiled once; captures the part of each "pic_url" value after the leading "//".
PIC_URL_PATTERN = re.compile(r'"pic_url":"//(.*?)"')

# Directory prefix for the downloaded images.
OUTPUT_DIR = "E:/Test/result/img/"


def build_search_url(page):
    """Return the search-result URL for *page*; the `s` offset is page * 60."""
    return (
        "https://s.taobao.com/list?spm=a21bo.2017.201867-links-0.3.5af911d9qdrblU&q="
        + key
        + "&cat=16&seller_type=taobao&oetag=6745&source=qiangdiao&bcoffset=12&s="
        + str(page * 60)
    )


def extract_image_urls(data):
    """Return every image URL found in *data*, each prefixed with "http://"."""
    return ["http://" + path for path in PIC_URL_PATTERN.findall(data)]


def main():
    """Fetch result pages 1-3 and save every image found on each page."""
    for i in range(1, 4):
        # The context manager closes the HTTP response once the page is read.
        with urllib.request.urlopen(build_search_url(i)) as response:
            data = response.read().decode("utf-8", "ignore")
        for j, this_img_url in enumerate(extract_image_urls(data)):
            # File name is the page number followed by the image index.
            file_path = OUTPUT_DIR + str(i) + str(j) + ".jpg"
            urllib.request.urlretrieve(this_img_url, filename=file_path)


if __name__ == "__main__":
    main()
一直报错
Traceback (most recent call last):
File "D:/Users/jiang/workspaceForPycharm/Test.py", line 12, in <module>
data = urllib.request.urlopen(url).read().decode("utf-8","ignore")
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\urllib\request.py", line 526, in open
response = self._open(req, data)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\urllib\request.py", line 544, in _open
'_open', req)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\urllib\request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\urllib\request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\http\client.py", line 1280, in _send_request
self.putheader(hdr, value)
File "D:\Users\jiang\AppData\Local\Continuum\anaconda3\lib\http\client.py", line 1212, in putheader
values[i] = one_value.encode('latin-1')
UnicodeEncodeError: 'latin-1' codec can't encode character '\u2026' in position 30: ordinal not in range(256)
尝试了很多解决方案,后来发现是在伪装浏览器时把请求头(User-Agent)写错了:http.client 发送请求头时会用 latin-1 对其值进行编码,而省略号「…」(U+2026)不在 latin-1 的范围内,所以请求头中间不能有省略号。
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/62.0")
把 10.0; 后面的省略号删除后,运行成功。