# -*- coding:utf-8 -*-
# NOTE(review): this module is Python 2 only (urllib2 / raw_input); porting to
# Python 3 (urllib.request, input) is needed for modern interpreters.

import urllib
import urllib2
from lxml import etree


class My_Spider(object):
    """Scrape the images out of the posts of a Baidu Tieba forum using lxml/XPath."""

    def __init__(self, kw, start_page, end_page):
        # kw: name of the forum (tieba) to visit.
        # start_page / end_page: requested page range.
        #   NOTE(review): currently stored but never used -- loadpage() only
        #   fetches the first listing page; pagination is a TODO.
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        # BUG FIX: the header key must be "User-Agent" (it was "UserAgent",
        # which servers ignore); also closed the unbalanced parenthesis in
        # the UA value.
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;)"}
        self.url = "http://tieba.baidu.com"
        # Running counter used to number the downloaded image files.
        self.filename = 0

    def loadpage(self):
        """Fetch the forum's first listing page and return its raw HTML."""
        # URL-encode the forum name so non-ASCII keywords are legal in the URL.
        kw = urllib.urlencode({"kw": self.kw})
        fullurl = self.url + "/f?" + kw
        request = urllib2.Request(fullurl, headers=self.headers)
        response = urllib2.urlopen(request)
        return response.read()

    def lxml_html(self, html):
        """Parse a listing page and return the absolute URLs of its posts.

        html -- raw HTML of the forum listing page (as returned by loadpage).
        Returns a list of full post URLs.
        """
        content = etree.HTML(html)
        # Each matched @href is a relative post link such as "/p/123456";
        # the class names ("cleafix", trailing space in "j_th_tit ") are
        # copied verbatim from the site's markup.
        link_list = content.xpath(
            '//div[@class="t_con cleafix"]//a[@class="j_th_tit "]/@href')
        # Prefix every relative link with the site root.
        return [self.url + link for link in link_list]

    def load_paste(self, fulllink_list):
        """Visit every post URL and return the image URLs found inside them.

        fulllink_list -- list of absolute post URLs (from lxml_html).
        Returns a flat list of image src URLs.
        """
        image_list = []
        for link in fulllink_list:
            request = urllib2.Request(link, headers=self.headers)
            response = urllib2.urlopen(request)
            content = etree.HTML(response.read())
            # "BDE_Image" marks user-posted images inside a post; extend the
            # accumulator instead of the original element-by-element append.
            image_list.extend(content.xpath('//img[@class="BDE_Image"]/@src'))
        return image_list
87 #将获得的图片链接写入本地 88 def write_image(self,image_list): 89 90 #记得要对图片链接进行请求 91 for image in image_list: 92 request = urllib2.Request(image) 93 response = urllib2.urlopen(request) 94 picture = response.read() 95 self.filename += 1 96 with open("第" + str(self.filename) + "张.jpg","wb") as f: 97 f.write(picture) 100 101 if __name__ =="__main__": 103 #提供访问贴吧名的接口 104 kw = raw_input("请输入要访问的贴吧名(最好是李毅吧哈哈哈哈):") 105 start_page = int(raw_input("请输入要查看的起始页码(int):")) 106 end_page = int(raw_input("请输入要查看的结束页码(int):")) 107 108 #创建爬虫对象 109 myspider = My_Spider(kw, start_page, end_page) 110 111 #访问要查询的贴吧页面,调用loadpage方法,返回贴吧html源码 112 html = myspider.loadpage() 113 114 #用lxml对页面内容进行转换html dom并且用xpath规则进行解析并得到贴吧帖子的链接列表 115 fulllink_list = myspider.lxml_html(html) 116 118 #对链接列表进行遍历并补全每个帖子的url链接 119 image_list = myspider.load_paste(fulllink_list) 120 #myspider.load_paste(fulllink_list) 121 #进入每个帖子,并将页面html进行lxml转换为html dom,并用xpath规则进行解析返回图片链接的列表url 122 #image_list = myspider.lxml_html_paste(html_paste) 123 124 #对每个图片url发送请求获得响应后,写入本地 125 myspider.write_image(image_list)