# -*- coding: utf-8 -*-
"""根据搜索词下载百度图片"""
import re
import sys
import urllib
import urllib.request

import requests
def get_onepage_urls(onepageurl):
    """Scrape one listing page of ivsky.com for image URLs.

    Parameters
    ----------
    onepageurl : str
        URL of the listing page to fetch; a falsy value means the
        pagination is exhausted.

    Returns
    -------
    tuple[list[str], str]
        ``(pic_urls, fanye_url)`` — up to 35 thumbnail image URLs found
        on the page, and the absolute URL of the next page (``''`` when
        there is no next page or the fetch failed).
    """
    if not onepageurl:
        print('已到最后一页, 结束')
        # BUG FIX: the original returned a bare [] here, which crashed the
        # caller's two-value tuple unpacking once pagination ran out.
        # Return the same (urls, next_url) shape as every other path.
        return [], ''
    try:
        req = urllib.request.Request(onepageurl)
        # Spoof a browser User-Agent; the site may reject urllib's default.
        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        html = urllib.request.urlopen(req).read().decode('utf-8')
    except Exception as e:
        # Best effort: report the error and treat the page as empty.
        print(e)
        return [], ''
    # Listing-page thumbnails are rendered with width="190"; cap at 35 per page.
    pic_urls = re.findall('img src="(.*?)" width="190"', html, re.S)[0:35]
    # The 下一页 ("next page") anchor carries a site-relative href.
    fanye_urls = re.findall(re.compile(r"'page-next' href='(.*)'>下一页"), html, flags=0)
    fanye_url = 'http://www.ivsky.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url
def down_pic(pic_urls, all_title, save_dir='../2_picture/online_picture_3/'):
    """Download every image in *pic_urls*, saving them as numbered JPEGs.

    Parameters
    ----------
    pic_urls : list[str]
        Direct image URLs to fetch.
    all_title : list[str]
        Unused; kept for backward compatibility with existing callers.
    save_dir : str, optional
        Directory prefix for the output files ``1.jpg``, ``2.jpg``, ...
        Defaults to the path the original script hard-coded.

    A failure on any single image is reported and skipped, so one bad
    URL does not abort the whole batch.
    """
    for i, pic_url in enumerate(pic_urls, start=1):
        try:
            pic = requests.get(pic_url, timeout=15)
            with open(save_dir + str(i) + '.jpg', 'wb') as f:
                f.write(pic.content)
            print('成功下载第%s张图片: %s' % (str(i), str(pic_url)))
        except Exception as e:
            print('下载第%s张图片时失败: %s' % (str(i), str(pic_url)))
            print(e)
if __name__ == '__main__':
    # Starting page: the "national flags" category on ivsky.com.
    url_init = r'http://www.ivsky.com/tupian/geguoguoqi_t2928/'
    all_pic_urls = []
    all_title = []  # never populated; passed through for down_pic's signature

    # Fetch the first page, then follow "next page" links.
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    print(all_pic_urls)

    fanye_count = 0  # pages fetched after the first; hard cap below
    while True:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        all_pic_urls.extend(onepage_urls)
        # Safety cap: never crawl more than 10 follow-up pages.
        if fanye_count >= 10:
            break
        # Stop early once the site reports no further pages or images.
        if fanye_url == '' and onepage_urls == []:
            break

    down_pic(all_pic_urls, all_title)
# 爬虫 爬国旗 — crawler for national-flag images
# Adapted from: blog.csdn.net/dss875914213/article/details/84996106