#coding=utf-8
import os
import random
import requests
from lxml import etree
from urllib.parse import urlparse
import urllib.request as urllib
from bs4 import BeautifulSoup
# Pool of complete browser User-Agent strings; one is picked at random per
# request to make the crawler look like a normal browser.
# NOTE: in the original source most UAs were accidentally split into TWO list
# elements (a comma between the "AppleWebKit/…" half and the "(KHTML…" half),
# so random.choice often returned half a User-Agent. Each entry below is one
# full UA string.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
class Crawler(object):
    """Crawl a Baidu Tieba forum's featured ("good") posts and download
    every large inline image into one folder per thread.

    The original paste had lost all indentation; structure is restored here
    from the code's logic. Also fixed: bare ``except`` clauses narrowed, an
    off-by-one in the per-thread page count, and a crash when a page fails
    to download (``etree.HTML('')`` returns ``None``).
    """

    def __init__(self, start_url, save_dir=r'D:\pic'):
        """
        :param start_url: featured-posts listing URL, e.g.
            https://tieba.baidu.com/f/good?kw=xx&ie=utf-8&cid=0&pn=0
        :param save_dir: base directory for downloads (one sub-folder per
            thread). Defaults to the original hard-coded ``D:\\pic``.
        """
        self.index = 1          # image counter within the current thread
        self.tag = 0            # index into self.tagname: current thread
        self.tagname = []       # thread titles, used as folder names
        self.start_url = start_url
        self.save_dir = save_dir
        # scheme://host of start_url, used to absolutize relative links.
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    @staticmethod
    def request(url, **kwargs):
        """GET *url* and return the response text, or '' on any request error
        (best-effort: a failed page is skipped, not fatal)."""
        try:
            page = requests.get(url, **kwargs)
            return page.text
        except requests.RequestException:
            return ''

    def get_max_page(self, html, regular):
        """Return the first string matched by XPath *regular* in *html*,
        or '1' when *html* is empty/unparsable or nothing matches."""
        try:
            matches = etree.HTML(html).xpath(regular)
        except (AttributeError, etree.XMLSyntaxError):
            # etree.HTML('') returns None -> AttributeError on .xpath
            return '1'
        return matches[0] if matches else '1'

    def get_good_pages(self, html):
        """Yield the listing-page URLs of the featured section, e.g.
        https://tieba.baidu.com/f?kw=xx&ie=utf-8&tab=good&cid=&pn=50
        """
        max_page = self.get_max_page(html, '//a[@class="last"]//@href')
        if max_page != '1':
            # The "last page" href ends in ...&pn=<offset>; keep the offset.
            max_page = max_page.split('=')[-1]
        max_page = int(max_page) + 1
        for offset in range(0, max_page, 50):  # 50 threads per listing page
            yield self.start_url.split('&')[0] + '&ie=utf-8&cid=0&pn={}'.format(offset)

    def get_good_urls(self, html):
        """Yield the absolute URL of every featured thread, e.g.
        https://tieba.baidu.com/p/3868212854
        """
        for page in self.get_good_pages(html):
            tree = etree.HTML(self.request(page))
            if tree is None:  # listing page failed to download; skip it
                continue
            # Remember thread titles; they become the download folder names.
            self.tagname.extend(tree.xpath('//a[@class = "j_th_tit"]//@title'))
            for href in tree.xpath('//a[@class = "j_th_tit"]//@href'):
                yield self.domain + href.split('?')[0]
                # Resumed after the caller finished this thread:
                # advance to the next title, restart the image counter.
                self.tag += 1
                self.index = 1

    def get_post_urls(self, url, max_page):
        """Yield the page URLs 1..max_page of one thread, e.g.
        https://tieba.baidu.com/p/3868212854?pn=2
        """
        for page_no in range(1, max_page + 1):
            yield '{}?pn={}'.format(url, page_no)

    def get_single_urls(self, html):
        """For each featured thread, yield a generator over its page URLs."""
        for url in self.get_good_urls(html):
            thread_html = self.request(url)
            max_page = self.get_max_page(thread_html, '//li[@class = "l_reply_num"]//@max-page')
            # max-page is already the thread's page count; the original
            # added 1 here and fetched one page past the end.
            yield self.get_post_urls(url, int(max_page))

    def get_imgs(self, html):
        """Download every large image of one thread page into the thread's
        folder under self.save_dir."""
        jpgdir = os.path.join(self.save_dir, self.tagname[self.tag])
        if not os.path.exists(jpgdir):
            os.makedirs(jpgdir)
        tree = etree.HTML(html)
        if tree is None:  # page failed to download; nothing to do
            return
        # Only full-size post images; the width filter skips emoticons
        # and thumbnails.
        img_urls = tree.xpath('//img[@class = "BDE_Image" and @width > "400"]//@src')
        for src in img_urls:
            print("正在下载第{}张图片".format(self.index))
            urllib.urlretrieve(src, os.path.join(jpgdir, '{}.jpg'.format(self.index)))
            self.index += 1

    def run(self, html):
        """Crawl every page of every featured thread and download images."""
        for thread_pages in self.get_single_urls(html):
            for page_url in thread_pages:
                # Random User-Agent per request to look like a browser.
                headers = {'User-Agent': random.choice(user_agent_list)}
                self.get_imgs(self.request(page_url, headers=headers))
# Script entry point: ask for a forum name, check it is reachable, then
# crawl all of its featured posts' images.
# (The pasted original had lost all indentation; restored here.)
if __name__ == '__main__':
    post_bar = input("请输入贴吧名称:")
    start_url = 'https://tieba.baidu.com/f/good?kw={}&ie=utf-8&cid=0&pn=0'.format(post_bar)
    crawler = Crawler(start_url)
    html = crawler.request(start_url)
    if '本吧暂不开放' in html:
        # Forum blocked for legal/policy reasons.
        print("抱歉,根据相关法律法规和政策,本吧暂不开放。")
    elif 'page404' in html:
        # Forum does not exist.
        print('很抱歉,您要访问的页面不存在。')
    else:
        print("开始爬取{}吧所有精品帖子图片".format(post_bar))
        crawler.run(html)
# Crawls the photos of all featured posts of a Baidu Tieba forum.
# Adapted from: blog.csdn.net/qq523176585/article/details/78554459
# (The remaining scraped page text — "猜你喜欢", "今日推荐", "周排行" — was
# blog-page chrome accidentally pasted into the file and has been removed:
# it was not Python and made the file unrunnable.)