Using urllib in Python 3 to crawl images from Baidu Tieba:
import os
import re
import urllib.request
from urllib import parse
import lxml.etree


# Fetch the post/follower counts shown on a tieba (forum) landing page.
def gettiebalistnumbers(name):  # Given a forum name, return (post count, follower count).
    url = "https://tieba.baidu.com/f?"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}  # headers as a dict
    word = {"kw": name}  # "kw" query parameter: the forum name
    word = parse.urlencode(word)  # URL-encode it into a query string
    url = url + word  # build the full URL
    request = urllib.request.Request(url, headers=headers)  # build the request
    # A specific header can also be added/changed with Request.add_header()
    request.add_header("Connection", "keep-alive")  # keep the connection alive
    response = urllib.request.urlopen(request)  # send the request
    data = response.read().decode("utf-8")  # read the response body
    print(response.getcode())  # print the HTTP status code

    restr = r'<span class="card_infoNum">([\s\S]*?)</span>'  # regex: total number of posts in the forum
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(data)  # find every match on the page
    tienumbers = mylist[0].replace(",", "")  # strip the thousands separators
    tienumbers = int(tienumbers)  # convert the string to an int

    restr = r'<span class="card_menNum">([\s\S]*?)</span>'  # regex: number of users following the forum
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(data)  # find every match on the page
    Peoplenumbers = mylist[0].replace(",", "")  # strip the thousands separators
    Peoplenumbers = int(Peoplenumbers)  # convert the string to an int

    return tienumbers, Peoplenumbers


def gettiebalist(name):  # Given a forum name, return the URLs of all its listing pages.
    numberstuple = gettiebalistnumbers(name)  # (post count, follower count)
    tienumbers = numberstuple[0]  # number of posts (first element of the tuple)
    word = {"kw": name}  # "kw" query parameter: the forum name
    word = parse.urlencode(word)  # URL-encode it into a query string
    tiebalist = []
    # Each listing page shows 50 threads and the pn parameter advances in steps of 50.
    if tienumbers % 50 == 0:
        pages = tienumbers // 50
    else:
        pages = tienumbers // 50 + 1
    for i in range(pages):
        tiebalist.append("https://tieba.baidu.com/f?" + word + "&pn=" + str(i * 50))
    # print(tiebalist)
    return tiebalist


def geturllistformpage(url):  # Given one listing-page URL, return the URLs of every thread on that page.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib.request.Request(url, headers=headers)  # build the request
    # A specific header can also be added/changed with Request.add_header()
    response = urllib.request.urlopen(request)
    data = response.read().decode("utf-8", "ignore")  # send the request and read the body
    # print(response.getcode())  # print the HTTP status code

    restr = r'<ul id="thread_list" class="threadlist_bright j_threadlist_bright">([\s\S]*?)<div class="thread_list_bottom clearfix">'  # regex: capture only the thread-list block
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(data)
    # print(mylist[0])  # the whole thread-list block

    restr = r'href="/p/(\d+)"'  # regex: capture only the numeric thread id
    regex = re.compile(restr, re.IGNORECASE)
    urltitlelist = regex.findall(data)  # the thread ids that vary between URLs
    urllist = []
    for title in urltitlelist:
        urllist.append("http://tieba.baidu.com/p/" + title)  # rebuild the full thread URL
    return urllist


def urllistfrompage(url):  # Given one thread URL, return the URLs of every page of that thread.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}  # headers as a dict
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    data = response.read()  # .decode("utf-8", "ignore")
    mytree = lxml.etree.HTML(data)
    # The last <span> inside the l_reply_num element holds the thread's page count.
    numbers = int(mytree.xpath('//*[@class="l_reply_num"]//span[last()]/text()')[0])
    urllist = []
    for i in range(1, numbers + 1):
        urllist.append(url + "?pn=" + str(i))
    return urllist


def getjpglistfrompage(url):  # Given one thread-page URL, return the URLs of all images on that page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}  # headers as a dict
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    data = response.read()
    mytree = lxml.etree.HTML(data)
    jpgurllist = mytree.xpath('//*[@class="BDE_Image"]/@src')  # images posted in a thread carry the BDE_Image class
    return jpgurllist


name = "关晓彤"
jpgnumbers = 0
os.makedirs("jpg", exist_ok=True)  # urlretrieve does not create the target directory by itself
for souurl in gettiebalist(name):  # every listing page of the forum
    sousurl = geturllistformpage(souurl)  # every thread on that listing page
    for fenurl in sousurl:
        jpgallurl = urllistfrompage(fenurl)  # every page of that thread
        for rev in jpgallurl:
            for jpgurl in getjpglistfrompage(rev):  # every image on that page
                jpgnumbers += 1
                urllib.request.urlretrieve(jpgurl, "jpg/" + str(jpgnumbers) + ".jpg")  # save as jpg/1.jpg, jpg/2.jpg, ...
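
One caveat: the main loop above stops at the first network error or at the first page whose markup no longer matches the regexes/XPath (Tieba's HTML may have changed since this was written). Below is a minimal, optional sketch of a fault-tolerant version of that loop; the function name download_all and the outdir parameter are illustrative and not part of the original script, and it only reuses the functions defined above plus standard urllib calls.

import os
import urllib.error
import urllib.request

def download_all(name, outdir="jpg"):
    # Same crawl order as the script above, but failures are skipped instead of aborting.
    os.makedirs(outdir, exist_ok=True)
    count = 0
    for listing_url in gettiebalist(name):                      # every listing page of the forum
        for thread_url in geturllistformpage(listing_url):      # every thread on that page
            try:
                page_urls = urllistfrompage(thread_url)          # every page of the thread
            except (urllib.error.URLError, IndexError, ValueError):
                continue                                         # skip threads that fail to load or parse
            for page_url in page_urls:
                try:
                    jpg_urls = getjpglistfrompage(page_url)
                except urllib.error.URLError:
                    continue
                for jpg_url in jpg_urls:
                    count += 1
                    try:
                        urllib.request.urlretrieve(jpg_url, os.path.join(outdir, str(count) + ".jpg"))
                    except urllib.error.URLError:
                        count -= 1                               # download failed, reuse the index
    return count

# Example: download_all("关晓彤") returns the number of images actually saved.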