# 根据关键字爬取搜狗图库图片
# 说明:关键字是从 Excel 表格文件中读取的
# 如果想看爬虫分析过程的可以参考博主之前写的博客,完整代码如下:
'''
@Time : 2019/10/24 15:38
@Software: PyCharm
'''
import json
import os
import urllib
import urllib.request

import jsonpath
import requests
import xlrd
def search(startPage, endPage, path, keywords):
    """Download Sogou image-search results for each keyword.

    For every keyword, pages ``startPage``..``endPage`` (inclusive) of the
    Sogou picture-search AJAX endpoint are fetched, the original picture
    URLs are extracted via JSONPath, and each image is saved under
    ``path + keyword + "\\"`` as ``<keyword>_<index>.jpg``.

    Args:
        startPage: First result page to fetch (1-based).
        endPage: Last result page to fetch (inclusive).
        path: Root download directory (Windows-style, trailing backslash).
        keywords: Iterable of query strings read from the Excel file.
    """
    url = "https://pic.sogou.com/pics"
    # Browser-like headers; X-Requested-With makes the endpoint return JSON.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'pic.sogou.com',
        'Referer': 'https://pic.sogou.com/pics?query=%B1%ED%B8%F1%CD%BC%C6%AC&p=40230500&st=255&mode=255&policyType=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    datas = {"mode": 1, "reqType": "ajax", "reqFrom": "result", "tn": 0}
    for keyword in keywords:
        datas['query'] = keyword
        temp = path + keyword + "\\"
        if not os.path.exists(temp):
            os.makedirs(temp)
        for num in range(startPage, (endPage + 1)):
            datas['start'] = num
            # Guard the page fetch/parse so one bad page doesn't abort the
            # whole crawl for the remaining pages and keywords.
            try:
                images = requests.get(url, headers=headers, params=datas, timeout=5)
                jsonObjs = json.loads(images.text)
            except Exception as e:
                print(keyword + '——第' + str(num) + '页获取出错,错误信息如下:')
                print(' ' * 10 + str(e))
                continue
            # jsonpath.jsonpath returns False (not []) when nothing matches;
            # normalize to an empty list so the loop below is safe.
            images_urls = jsonpath.jsonpath(jsonObjs, '$.items..ori_pic_url') or []
            i = 1
            for image_url in images_urls:
                try:
                    print('*' * 10 + '正在下载——' + keyword + '——第' + str((num - 1) * 48 + i) + '张图片' + '*' * 10)
                    res = urllib.request.urlopen(image_url, timeout=5).read()
                    # ``with`` closes the file automatically; no explicit close needed.
                    with open(temp + keyword + '_' + str((num - 1) * 48 + i) + '.jpg', 'wb') as file:
                        file.write(res)
                except Exception as e:
                    print(keyword + '——第' + str((num - 1) * 48 + i) + '张图片下载出错,错误信息如下:')
                    print(' ' * 10 + str(e))
                    print('')
                    continue
                finally:
                    i += 1
    print('*' * 15 + '下载完成' + '*' * 15)
def read_excel(excel_path):
    """Return the search keywords stored in column A of the first sheet.

    Args:
        excel_path: Path to the Excel workbook holding one keyword per row.

    Returns:
        A list with every cell value from the first column of sheet 0.
    """
    workbook = xlrd.open_workbook(excel_path)
    first_sheet = workbook.sheets()[0]
    return first_sheet.col_values(0)
def main(excel_path, startPage, endPage, path):
    """Drive the crawl: load keywords from the Excel file, then download."""
    search(startPage, endPage, path, read_excel(excel_path))
if __name__ == '__main__':
    # Crawl configuration: keyword spreadsheet, page range, download root.
    excel_path = "D:\\My Documents\\Desktop\\搜狗爬虫\\10.xlsx"
    startPage, endPage = 1, 2
    path = 'd:\\download\\搜狗\\'
    main(excel_path, startPage, endPage, path)
# 看完了,随手点个赞呗!