踩过的坑:1. OSError: [Errno 22] Invalid argument —— 创建 jpg 文件时,直接用图片链接作为文件名,而链接中含有 '/',所以报错;解决方法是对链接做切片,只取其中不含 '/' 的一段作为文件名。2. TypeError: a bytes-like object is required, not 'str' —— 把 URL 返回的 response 写入图片文件时报错:resp.text 返回的是 str(Unicode 文本),而以 'wb' 模式打开的文件只接受 bytes,所以要改用 resp.content,它返回的是 bytes,也就是二进制数据。
#coding=utf-8
"""Download the photos referenced by Pexels listing pages.

Each listing page is fetched with ``requests``, the ``<img>`` elements of the
photo tiles are located with an lxml XPath query, and every image is written
to the current working directory as ``<name>.jpg``.
"""
import os
import re
import time
from multiprocessing.dummy import Pool
from urllib.parse import urljoin, urlparse

import requests
from lxml import etree

# The header name has to be spelled "User-Agent"; under any other key the
# value travels as an unrelated custom header and the default
# python-requests agent string is what the server sees.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Mobile Safari/537.36'
    ),
}

# Seconds to wait on a single HTTP request; requests never times out unless
# told to, so one stalled connection would otherwise hang the whole run.
_REQUEST_TIMEOUT = 30

# Selects the <img> inside the first link of every photo tile.
_PHOTO_XPATH = '//*[@class="photo-item photo-item--overlay"]/a[1]/img'

# Anything outside this set is replaced when building a file name, so the
# result can never contain a path separator.
_UNSAFE_FILENAME_CHARS = re.compile(r'[^A-Za-z0-9._-]+')


def _image_filename(image_url):
    """Return a filesystem-safe ``.jpg`` file name derived from *image_url*.

    The name is taken from the URL path: the segment that follows ``photos``
    when there is one, otherwise the stem of the last path segment.
    NOTE(review): this presumes Pexels image URLs look like
    ``/photos/<id>/<file>`` -- confirm against live pages.
    """
    segments = [part for part in urlparse(image_url).path.split('/') if part]
    stem = ''
    if 'photos' in segments[:-1]:
        stem = segments[segments.index('photos') + 1]
    elif segments:
        stem = os.path.splitext(segments[-1])[0]
    stem = _UNSAFE_FILENAME_CHARS.sub('_', stem).strip('._')
    return (stem or 'image') + '.jpg'


def get_info(url):
    """Fetch one listing page and save every photo it references.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        The file names that were written, in page order. An image that
        cannot be downloaded is reported on stdout and skipped.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (connection error, timeout or HTTP error status).
    """
    page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    page.raise_for_status()

    # The parser is handed UTF-8 bytes rather than the decoded str.
    markup = page.text.encode('utf-8')
    if not markup.strip():
        return []
    tree = etree.HTML(markup)
    if tree is None:
        return []

    saved = []
    for img in tree.xpath(_PHOTO_XPATH):
        src = img.get('src')
        if not src:
            continue  # <img> without a src attribute: nothing to download
        image_url = urljoin(url, src)  # tolerate relative src values
        try:
            # Download before opening the output file so that a failed
            # request never leaves an empty .jpg behind.
            image = requests.get(
                image_url, headers=headers, timeout=_REQUEST_TIMEOUT)
            image.raise_for_status()
        except requests.RequestException as exc:
            print('skipped {}: {}'.format(image_url, exc))
            continue
        filename = _image_filename(image_url)
        with open(filename, 'wb') as fp:
            # .content is the raw bytes of the body; .text would be str and
            # cannot be written to a file opened in binary mode.
            fp.write(image.content)
        saved.append(filename)
    return saved


if __name__ == '__main__':
    urls = ['https://www.pexels.com/?page={}'.format(i) for i in range(1, 2)]

    start_time = time.time()
    for url in urls:
        print(url)
        get_info(url)
    end_time = time.time()
    print('time1 : ', end_time - start_time)

    # Thread-pool variant (multiprocessing.dummy.Pool is backed by threads,
    # not processes):
    # start_time2 = time.time()
    # pool = Pool(processes=6)
    # pool.map(get_info, urls)
    # end_time2 = time.time()
    # print('time2 : ', end_time2 - start_time2)