下面两个爬虫代码参考了多篇资料,若需要标注出处,请私信联系。闲言少叙,直接上代码。
# -*- coding:utf-8 -*-
import re
import requests
def dowmloadPic(html, keyword):
    """Download every image referenced by an "objURL" entry in *html*.

    Images are written to ``../images/<keyword>_<n>.jpg`` where ``n`` starts
    at 1 and only advances after a successful download, so the saved files
    are numbered without gaps.

    Args:
        html: Text of a search-result page; image addresses are taken from
            every ``"objURL":"...",`` occurrence.
        keyword: Search keyword, used in progress messages and file names.

    Returns:
        None.
    """
    import os  # local import: this script's header does not import os

    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('找到关键词:' + keyword + '的图片,现在开始下载图片...')
    if not pic_url:
        return

    save_dir = '../images/'
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    i = 1
    for each in pic_url:
        print('正在下载第' + str(i) + '张图片,图片地址:' + str(each))
        try:
            pic = requests.get(each, timeout=10)
            # Treat HTTP error responses as failures instead of saving the
            # error page under a .jpg name.
            pic.raise_for_status()
        except requests.exceptions.RequestException:
            # RequestException also covers timeouts and invalid URLs, which
            # would otherwise abort the whole run.
            print('【错误】当前图片无法下载')
            continue
        target = save_dir + keyword + '_' + str(i) + '.jpg'
        with open(target, 'wb') as fp:
            fp.write(pic.content)
        i += 1
# NOTE(review): this session is created but never used by any request in this
# script, so setting keep_alive on it appears to have no effect - confirm and
# remove.
s = requests.session()
s.keep_alive = False
if __name__ == '__main__':
    word = input("Input key word: ")
    # Only a single result page is requested here (no "pn" paging offset).
    # The keyword goes through `params` so that requests URL-encodes it;
    # concatenating it raw breaks the query when it contains '&', '#' or
    # spaces.
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&ct=201326592&v=flip'
    # Without a timeout a stalled connection would hang the script forever.
    result = requests.get(url, params={'word': word}, timeout=10)
    dowmloadPic(result.text, word)
上面这个版本只会下载打开的那一个搜索结果页面中的图片,不会自动滚动下拉条(即不会翻页)。
# -*- coding:utf-8 -*-
import re
import requests
import traceback
import os
def dowmloadPic(html, keyword, startNum):
    """Download the images referenced by "objURL" entries in *html*.

    Each image is saved as ``L:/pics/<keyword>v1_<index>.jpg``. The index
    starts at *startNum* and advances after every image that was downloaded
    or was already present on disk; a failed download leaves the index
    unchanged so the next image reuses it.

    Args:
        html: Text of a search-result page; image addresses are taken from
            every ``"objURL":"...",`` occurrence.
        keyword: Search keyword, used in progress messages and file names.
        startNum: Index to use for the first image of this page.

    Returns:
        The index following the last one used, suitable as *startNum* for
        the next call.
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    i = startNum
    root = 'L:/pics/'
    print('找到关键词:' + keyword + '的图片,现在开始下载图片...')
    for each in pic_url:
        # The progress message is 1-based while the file name index is 0-based.
        print('正在下载第' + str(i + 1) + '张图片,图片地址:' + str(each))
        target = root + keyword + 'v1_' + str(i) + '.jpg'
        try:
            # makedirs also creates missing parent directories and does not
            # fail when the directory already exists.
            os.makedirs(root, exist_ok=True)
            # Skip files that are already on disk. The check must look at the
            # name that is actually written, otherwise it never matches and
            # every image is downloaded again.
            if not os.path.exists(target):
                pic = requests.get(each, headers=kv, timeout=10)
                # Do not store HTTP error pages under a .jpg name.
                pic.raise_for_status()
                with open(target, 'wb') as f:
                    f.write(pic.content)
        except Exception:
            # Best effort: report the failure and move on to the next image.
            # Catching Exception (not a bare except) lets Ctrl-C stop the run.
            traceback.print_exc()
            print('【错误】当前图片无法下载')
            continue
        i += 1
    return i
if __name__ == '__main__':
    kv = {'user-agent': 'Mozilla/5.0'}
    # Fixed part of the query. The keyword and the paging offset are passed
    # through `params` so that requests URL-encodes them; concatenating the
    # keyword raw breaks the query when it contains '&', '#' or spaces.
    base_url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
                '&gsm=?&ct=&ic=0&lm=-1&width=0&height=0')
    page_count = 10  # number of result pages to crawl per keyword
    page_step = 20   # the "pn" offset advances by 20 from one page to the next
    lastNum = 0
    # Every keyword in this list is crawled in turn.
    words = ['篮球', '排球']
    for word in words:
        # A keyword of "exit" stops the crawl.
        if word.strip() == "exit":
            break
        for pageId in range(0, page_count * page_step, page_step):
            try:
                # Without a timeout a stalled connection would hang forever.
                result = requests.get(
                    base_url,
                    params={'word': word, 'pn': pageId},
                    headers=kv,
                    timeout=10,
                )
            except requests.exceptions.RequestException:
                # One unreachable result page should not abort the whole
                # crawl; report it and continue with the next page.
                traceback.print_exc()
                continue
            # The running index is carried across pages and keywords.
            lastNum = dowmloadPic(result.text, word, lastNum)
上面这个版本会自动翻页(相当于滚动下拉条),适用于批量下载大量图片。