python 爬虫保存图片/多进程

踩过的坑:
1. OSError: [Errno 22] Invalid argument:创建 jpg 文件时直接把图片链接用作文件名,而链接中含有 '/'(文件名中不能出现该字符),因此报错。解决方法是对链接做切片,只取其中不含 '/' 的一段作为文件名。
2. TypeError: a bytes-like object is required, not 'str':把响应内容写入以 'wb'(二进制)模式打开的图片文件时报错。resp.text 返回的是 str(Unicode 文本),
而二进制写入需要 bytes,所以应改用 resp.content,它返回的是 bytes 类型,也就是二进制数据。

#coding=utf-8
import time
import requests
from lxml import etree
import time
from multiprocessing.dummy import Pool


# Request headers sent with every HTTP call so the site sees a mobile Chrome
# browser instead of the default python-requests client.
# The header name must be 'User-Agent'; a key such as 'userAgent' is just an
# unknown custom header and leaves the real User-Agent unchanged.
# Adjacent string literals are used instead of a backslash continuation so the
# source indentation does not end up inside the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Mobile Safari/537.36'
    ),
}


def get_info(url):
    """Download every photo found on one listing page.

    Fetches ``url``, parses the returned HTML, collects the ``src`` attribute
    of the ``img`` under the first link of each element whose class is
    ``photo-item photo-item--overlay``, and saves each image in the current
    working directory as ``<characters 33..38 of the image URL>.jpg``.

    Args:
        url: Address of the listing page to scrape.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (connection error, timeout, or HTTP error status).
            Failures on individual images are reported and skipped.
    """
    # requests waits indefinitely unless a timeout is given.
    timeout = 10  # seconds

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # do not parse an error page as a listing

    selector = etree.HTML(response.text.encode('utf-8'))
    if selector is None:
        # Nothing parseable came back, so there is nothing to download.
        return

    images = selector.xpath(
        '//*[@class="photo-item photo-item--overlay"]/a[1]/img')

    # An <img> without a src attribute yields None; drop those up front.
    image_urls = [src for src in (img.get('src') for img in images) if src]

    for item in image_urls:
        # Download BEFORE opening the output file, so a failed request does
        # not leave an empty .jpg on disk.
        try:
            data = requests.get(item, headers=headers, timeout=timeout)
            data.raise_for_status()
        except requests.RequestException as exc:
            print('skipped', item, exc)
            continue

        # The file name is a fixed slice of the image URL.
        # NOTE(review): presumably this slice is the start of the photo id in
        # the site's image URLs - confirm against real URLs.
        # A '/' inside the slice would make open() fail, so it is replaced.
        name = item[33:39].replace('/', '_')

        # .content is the raw bytes of the response; .text would be a decoded
        # str, which cannot be written to a file opened in binary mode.
        with open(name + '.jpg', 'wb') as fp:
            fp.write(data.content)


if __name__ == '__main__':
    # Listing pages to scrape; range(1, 2) yields only page 1.
    urls = []
    for page in range(1, 2):
        urls.append('https://www.pexels.com/?page={}'.format(page))

    # Sequential run: handle the pages one after another and time the pass.
    began = time.time()
    for page_url in urls:
        print(page_url)
        get_info(page_url)
    elapsed = time.time() - began
    print('time1 : ', elapsed)

    # Concurrent variant, kept disabled for comparison.
    # Note: multiprocessing.dummy.Pool exposes the multiprocessing Pool API
    # but is backed by threads, not separate processes.
    # start_time2 = time.time()
    # pool = Pool(processes=6)
    # pool.map(get_info, urls)
    # end_time2 = time.time()
    # print('time2 : ', end_time2 - start_time2)

python 爬虫保存图片/多进程

猜你喜欢