#import urllib
import requests
import time
from lxml import etree
# Target site and the request headers used for every fetch.  The
# Referer/User-Agent pair imitates a desktop Chrome browser so the
# server serves the normal page instead of rejecting the scraper.
url='http://www.doutula.com/'
headers={'Referer':'http://www.doutula.com/',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
# Exploratory fetch: dump the homepage HTML to inspect the markup
# (the sample <img> tags quoted below were taken from this output).
resp=requests.get(url,headers=headers)
print(resp.text)
'''
<img class="gif" style="min-height: inherit;left: 5px;top:5px" src="//static.doutula.com/img/gif.png" />
<img src="//static.doutula.com/img/loader_170_160.png"
style="margin: 0 auto; min-height: inherit;"
data-original="https://ws2.sinaimg.cn/bmiddle/6af89bc8gw1f8smgrjzkug20af0afmyl.gif"
alt="总爱在我的生活里指手画脚,俗称经验婊和过来人婊" class="img-responsive lazy image_dta"
data-backup="http://img.doutula.com/production/uploads/image//2016/06/10/20160610526577_IvENsd.gif!dta">
'''
# Start parsing (earlier inline experiment kept below, commented out)
#html=etree.HTML(resp.text)
#srcs=html.xpath('.//img/@data-original')
#for src in srcs:
# filename=src.split('/')[-1]
# img=requests.get(src,headers=headers)
#
# with open('D:\Anaconda3\imgs/'+filename,'wb') as file:
# file.write(img.content)
# print(src,filename)
#
#print(len(src))
def download_img(src, save_dir='D:\Anaconda3\imgs/'):
    """Download one image and save it into *save_dir*.

    :param src: image URL taken from the page's ``data-original``
        attribute; may be scheme-relative (``//host/path``) as seen in
        the sample markup above.
    :param save_dir: directory the file is written to; defaults to the
        original hard-coded path for backward compatibility.
    :raises requests.HTTPError: when the server answers with an error
        status (prevents saving an HTML error page as an image).
    """
    # Scheme-relative URLs need an explicit scheme before requests can fetch them.
    if src.startswith('//'):
        src = 'http:' + src
    # Last path segment doubles as the local file name.
    filename = src.split('/')[-1]
    img = requests.get(src, headers=headers)
    img.raise_for_status()
    # save_dir ends with a separator, so plain concatenation is safe here.
    with open(save_dir + filename, 'wb') as file:
        file.write(img.content)
    print(src, filename)
def get_page(url):
    """Fetch one listing page, download every image on it, and return
    the next-page link(s).

    :param url: URL of the listing page to scrape.
    :return: list of ``href`` values from the ``<a rel="next">`` element
        — empty on the last page, so callers can use it as a loop
        condition.
    """
    resp = requests.get(url, headers=headers)
    print(resp, url)
    html = etree.HTML(resp.text)
    # Every image carries its real URL in data-original (src is a lazy-load placeholder).
    srcs = html.xpath('.//img/@data-original')
    for src in srcs:
        download_img(src)
    next_link = html.xpath('.//a[@rel="next"]/@href')
    # BUG FIX: the original returned the literal list ['next_link'],
    # which is always truthy and never reflects the page's actual next link.
    return next_link
# Driver: walk the article listing pages, downloading images from each.
next_link_base='http://www.doutula.com/article/list/?page='
# BUG FIX: the original read `html.xpath(...)` here, but `html` only
# exists inside get_page (the module-level assignment is commented out
# above), so the script crashed with NameError.  Prime the loop with a
# truthy value instead; the real link list is fetched on the first pass.
next_link=True
current_num=1
while next_link:
    time.sleep(0.2)  # small delay between requests to be polite to the server
    current_num+=1
    next_link=get_page(next_link_base+str(current_num))
    if current_num>=4:  # safety cap for testing: stop after page 4 (site has ~581 pages)
        break
'''
http://www.doutula.com/article/list/?page=581
'''
# --- Stray text copied along with the blog post; commented out so the file parses. ---
# Original post title: "python 爬取表情包——斗图啦" (scraping meme images from doutula)
# Reposted from: blog.csdn.net/lhy2239705435/article/details/84023503
# (Page chrome captured by accident: "猜你喜欢" / "今日推荐" / "周排行" — site nav labels.)