import time
from lxml import etree
import requests
import os
import gevent
import threading
def down_image(url: str, kw: str) -> None:
    """Download a single image and store it as ``./<kw>/<name>.jpg``.

    ``<name>`` is the last 10 characters of ``url``. A failed download is
    reported on stdout and skipped instead of writing an error page to disk.

    Args:
        url: Absolute URL of the image to fetch.
        kw: Name of the directory (relative to the current working
            directory) that receives the file; created when missing.
    """
    try:
        # Without a timeout a stalled connection blocks this call forever,
        # and with it any caller waiting for this download to finish.
        response = requests.get(url, timeout=30)
        # Reject 4xx/5xx answers so an HTML error body is never saved as .jpg.
        response.raise_for_status()
    except requests.RequestException as exc:
        print("下载失败%s...%s" % (url, exc))
        return

    # NOTE(review): only the URL tail is used as the file name, so a URL that
    # already ends in ".jpg" produces "….jpg.jpg"; kept as-is so file names
    # stay compatible with earlier runs.
    name = url[-10:]
    print(name)
    print("正在下载%s...%s" % (name, url))
    print(url)

    target_dir = "./" + kw
    # exist_ok=True replaces the exists()/mkdir() pair, which raced when this
    # function ran concurrently and two callers saw the directory as missing.
    os.makedirs(target_dir, exist_ok=True)
    with open("%s/%s.jpg" % (target_dir, name), "wb") as f:
        f.write(response.content)
def load_image(link_list, kw: str) -> None:
    """Download every ``BDE_Image`` picture found in the given threads.

    Each entry of ``link_list`` is appended to ``https://tieba.baidu.com`` and
    fetched; every matching ``<img>`` ``src`` is downloaded by ``down_image``
    in its own thread. The function returns after all started threads have
    finished and the elapsed time has been printed.

    Args:
        link_list: Iterable of thread hrefs relative to the site root.
        kw: Directory name passed through to ``down_image``.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko)"}
    workers = []
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during the run.
    begin = time.perf_counter()
    print("开始下载图片...")

    for thread_href in link_list:
        thread_url = "https://tieba.baidu.com" + thread_href
        try:
            response = requests.get(thread_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable thread page should not abort the whole batch.
            print("获取帖子失败%s...%s" % (thread_url, exc))
            continue

        tree = etree.HTML(response.text)
        if tree is None:
            # Guard for a response the parser produced no root element for.
            continue

        # Distinct names here: the original reused ``link_list``/``link`` for
        # the image URLs while still iterating the parameter of that name.
        image_urls = tree.xpath("//div/img[@class = 'BDE_Image']/@src")
        for image_url in image_urls:
            worker = threading.Thread(target=down_image, args=(image_url, kw))
            worker.start()
            workers.append(worker)

    # Wait for every download before reporting the total time.
    for worker in workers:
        worker.join()

    end = time.perf_counter()
    print("下载完成共耗时%f秒" % (end - begin))
def load_page(url: str, payload: dict) -> None:
    """Fetch one listing page and pass its thread links to ``load_image``.

    Args:
        url: Listing URL to request.
        payload: Query parameters sent with the request; ``payload['kw']`` is
            also forwarded to ``load_image`` as the download directory name.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko)"}
    try:
        # Bounded wait plus status check: a hung or failed request is reported
        # instead of blocking forever or being parsed as if it were a listing.
        response = requests.get(url, params=payload, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("获取页面失败%s...%s" % (url, exc))
        return

    tree = etree.HTML(response.text)
    # NOTE(review): the class value 't_con cleafix' is kept exactly as in the
    # original selector — presumably it mirrors the site's markup; verify
    # before "correcting" the spelling.
    thread_hrefs = (
        tree.xpath("//div[@class = 't_con cleafix']//div[@class = 'threadlist_lz clearfix']/div/a/@href")
        if tree is not None
        else []
    )

    # The hrefs are passed on unchanged: load_image prepends the site root
    # itself. The former loop here only rebound its loop variable and never
    # altered the list, so it has been removed.
    load_image(thread_hrefs, payload['kw'])
def main() -> None:
    """Prompt for a forum name and a page number, then crawl that page.

    Invalid page input (non-integer or smaller than 1) is reported on stdout
    and the function returns without issuing any request.
    """
    kw = input("请输入贴吧名称:")
    try:
        pagenum = int(input("请输入要爬取的页码:"))
    except ValueError:
        # Previously a non-numeric answer ended the script with a traceback.
        print("页码必须是整数")
        return
    if pagenum < 1:
        # Page 0 or below would produce a negative 'pn' offset.
        print("页码必须大于等于1")
        return

    # 'pn' is sent as an offset of 50 per page — presumably the number of
    # threads the site lists per page; TODO confirm.
    payload = {'kw': kw, 'pn': (pagenum - 1) * 50}
    url = "https://tieba.baidu.com/f?ie=utf-8"
    load_page(url, payload)
# Module-level list; no function visible in this file reads this name.
thread_list = list()

# Start the crawler only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# 爬取贴吧图片
# 猜你喜欢
# 转载自 blog.csdn.net/sdzhr/article/details/80921115
# 今日推荐
# 周排行