# imports
import requests
import urllib.parse
import re
import os
from os.path import join
def get_url_one_page(url):
    """Fetch one Baidu image-search ("flip") result page and parse it.

    Parameters
    ----------
    url : str
        Absolute URL of the search-result page to download.

    Returns
    -------
    tuple
        ``(url_pic_this_page, url_next_page)`` where the first item is a
        list of original-image URLs found on this page, and the second is
        the absolute URL of the next result page, or ``None`` when this
        is the last page.
    """
    # Timeout so a stalled server cannot hang the crawl forever.
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    html = resp.text
    # "objURL" carries the original (non-thumbnail) image address.
    url_pic_this_page = re.findall(r'"objURL":"(.*?)",', html)
    # FIX: the original pattern used a greedy, optional group `(.*)?`,
    # which can over-match up to the LAST `" class="n"` occurrence on the
    # page; `(.*?)` matches minimally, capturing just the href value.
    url_next_page_prefix = re.findall(r'<a href="(.*?)" class="n" >下一页', html)
    if url_next_page_prefix:
        url_next_page = 'http://image.baidu.com' + url_next_page_prefix[0]
    else:
        print("已经到达最后一页")  # reached the last page
        url_next_page = None
    return url_pic_this_page, url_next_page
def fetch_pictures(key, num_pics):
    """Crawl Baidu image search for `key` and collect image URLs.

    Walks result pages via `get_url_one_page` until at least `num_pics`
    image URLs have been collected or the last page is reached.

    Parameters
    ----------
    key : str
        Search keyword (will be URL-quoted).
    num_pics : int
        Number of image URLs requested.

    Returns
    -------
    list[str]
        At most `num_pics` image URLs.  (FIX: the original collected the
        URLs but returned nothing and could print more than requested;
        returning the trimmed list is backward-compatible — no caller
        used the previous implicit ``None``.)
    """
    print('开始爬虫:关键字「%s」,爬取图片数量「%d」' % (key, num_pics))
    url_init_base = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_base + urllib.parse.quote(key)
    url_pic_list = []  # accumulated image URLs across all visited pages
    # 1. Crawl page by page until enough URLs or no next page.
    while True:
        url_pic_this_page, url_next_page = get_url_one_page(url=url_init)
        url_pic_list += url_pic_this_page
        if url_next_page is None:
            print('图片页数已经达到最后')
            break
        url_init = url_next_page
        # `>= num_pics` replaces the original's equivalent `> num_pics - 1`.
        if len(url_pic_list) >= num_pics:
            print("已满足你的数量")
            break
    # Trim to the requested count before reporting.
    url_pic_list = url_pic_list[:num_pics]
    print(url_pic_list)
    # 2. Saving the images to disk was never implemented in the original
    #    ("保存图片" step) — left as TODO.
    return url_pic_list
# 2。保存图片
if __name__ == '__main__':
    # Demo run: search keyword and how many image URLs to gather.
    search_key = "蔡徐坤打篮球"
    wanted = 10
    # Intended save directory (named after the keyword); creation is
    # disabled because saving to disk is not implemented yet.
    SAVE_DIR = search_key
    # os.mkdir(SAVE_DIR)
    fetch_pictures(search_key, wanted)
# Adapted from: blog.csdn.net/houlaos/article/details/103995893