import urllib.request
from lxml import etree
# Hotlink-protection workaround: every request made through this opener
# carries a Referer header naming the first page of the thread.
opener = urllib.request.build_opener()
headers = (
    'Referer',
    'https://tieba.baidu.com/p/4640092720?pn=1',
)
# Assigning (rather than appending) replaces the opener's default header list.
opener.addheaders = [headers]
# Install it as urllib.request's default global opener as well.
urllib.request.install_opener(opener)
# Crawl pages 1-13 of the thread: save every post image to disk and append
# the text of every post to one output file.
for page in range(1, 14):
    url = "https://tieba.baidu.com/p/4640092720?pn=" + str(page)
    # Bytes that are not valid UTF-8 are dropped instead of aborting the crawl.
    response = opener.open(url).read().decode("utf-8", "ignore")
    html = etree.HTML(response)

    # Source URLs of all <img class="BDE_Image"> elements on the page
    # (standard '//' descendant prefix instead of the original '///').
    imagelist = html.xpath('//img[@class="BDE_Image"]/@src')
    print(len(imagelist))
    for index, image_url in enumerate(imagelist):
        # Keep page and index apart with "_": plain concatenation made
        # page 1 / image 10 and page 11 / image 0 both map to "110.jpg",
        # so later downloads silently overwrote earlier ones.
        target = "D://python//baidu_bl/" + str(page) + "_" + str(index) + ".jpg"
        urllib.request.urlretrieve(image_url, filename=target)

    # Direct text nodes of each post-content <div>.
    textlist = html.xpath('//div[@class="d_post_content j_d_post_content "]/text()')
    print(len(textlist))
    # Open the output once per page and let `with` close it even if a write
    # fails. Explicit UTF-8 keeps characters outside the platform's default
    # encoding from raising UnicodeEncodeError.
    with open("D:/python/baidu_bl/baidu_bl.doc", "a", encoding="utf-8") as fh1:
        for data in textlist:
            fh1.write(data + '\n')
# The multithreaded version follows:
import urllib.request
import threading
from lxml import etree
# Hotlink-protection workaround: requests sent through this opener carry a
# Referer header that names the first page of the thread.
opener = urllib.request.build_opener()
headers = (
    'Referer',
    'https://tieba.baidu.com/p/4640092720?pn=1',
)
# Plain assignment replaces whatever header list the opener started with.
opener.addheaders = [headers]
# Also make it urllib.request's default global opener.
urllib.request.install_opener(opener)
class Img(threading.Thread):
    """Thread that downloads the post images from a range of thread pages.

    Args:
        first_page: First page number to fetch (inclusive). Defaults to 1.
        last_page: Last page number to fetch (inclusive). Defaults to 2,
            which matches the previously hard-coded ``range(1, 3)``.
    """

    def __init__(self, first_page=1, last_page=2):
        super().__init__()
        self.first_page = first_page
        self.last_page = last_page

    @staticmethod
    def _image_path(page, index):
        """Return the destination path for image number *index* of *page*.

        Page and index are separated by "_" so that, for example, page 1 /
        image 10 and page 11 / image 0 no longer share the name "110.jpg".
        """
        return "D://python//baidu_bl/" + str(page) + "_" + str(index) + ".jpg"

    def run(self):
        """Fetch each page through the module-level opener and save its images."""
        for page in range(self.first_page, self.last_page + 1):
            url = "https://tieba.baidu.com/p/4640092720?pn=" + str(page)
            # Bytes that are not valid UTF-8 are dropped, not fatal.
            response = opener.open(url).read().decode("utf-8", "ignore")
            html = etree.HTML(response)
            # Source URLs of all <img class="BDE_Image"> elements (standard
            # '//' descendant prefix instead of the original '///').
            imagelist = html.xpath('//img[@class="BDE_Image"]/@src')
            print(len(imagelist))
            for index, image_url in enumerate(imagelist):
                urllib.request.urlretrieve(
                    image_url, filename=self._image_path(page, index)
                )
class Text(threading.Thread):
    """Thread that appends the post text from a range of thread pages to a file.

    Args:
        first_page: First page number to fetch (inclusive). Defaults to 1.
        last_page: Last page number to fetch (inclusive). Defaults to 2,
            which matches the previously hard-coded ``range(1, 3)``.
    """

    def __init__(self, first_page=1, last_page=2):
        super().__init__()
        self.first_page = first_page
        self.last_page = last_page

    @staticmethod
    def _append_lines(path, lines):
        """Append each string in *lines* to *path*, one per line.

        The file is opened once and closed by ``with`` even if a write fails.
        Explicit UTF-8 keeps characters outside the platform's default
        encoding from raising UnicodeEncodeError.
        """
        with open(path, "a", encoding="utf-8") as out:
            for line in lines:
                out.write(line + '\n')

    def run(self):
        """Fetch each page through the module-level opener and store its text."""
        for page in range(self.first_page, self.last_page + 1):
            url = "https://tieba.baidu.com/p/4640092720?pn=" + str(page)
            # Bytes that are not valid UTF-8 are dropped, not fatal.
            response = opener.open(url).read().decode("utf-8", "ignore")
            html = etree.HTML(response)
            # Direct text nodes of each post-content <div>.
            textlist = html.xpath('//div[@class="d_post_content j_d_post_content "]/text()')
            self._append_lines("D:/python/baidu_bl/baidu_bl.doc", textlist)
# Create the image and text workers, then start both threads.
t1, t2 = Img(), Text()
t1.start()
t2.start()