In today's hands-on tutorial we cover scraping multi-level pages, using the complaint listings on Chezhiwang (12365auto.com) as an example.
1. Preparation
First, define the fields to scrape. They span two pages: the complaint ID, brand, car model, and so on come from the list page, while the date, the detailed complaint text, and the manufacturer's reply come from each complaint's detail page.
Inspecting the page shows that the "typical problem" column is loaded dynamically via JS and identified by numeric codes rather than plain text. We handle this first by converting the code-to-description data into a Python dictionary, so each code can be looked up directly.
The converted data looks like the sketch below; save it as chezhi.py in the same directory as items.py.
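A minimal sketch of chezhi.py; the two entries here are placeholders, and the real dictionary should hold every code-to-description pair taken from the site's JS data:

# chezhi.py -- maps the numeric codes used by the "typical problem"
# column to their text descriptions. The entries below are placeholders;
# fill in the full mapping extracted from the site's JS data.
listcode = {
    '303': 'engine abnormal noise',        # placeholder entry
    '304': 'transmission abnormal noise',  # placeholder entry
}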
2. Starting the Project
Create a project:
scrapy startproject chezhi
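This generates the standard Scrapy project layout (recent Scrapy versions also add a middlewares.py):

chezhi/
    scrapy.cfg
    chezhi/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py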
Define the fields to scrape in items.py:
import scrapy


class ChezhiItem(scrapy.Item):
    id = scrapy.Field()
    brand = scrapy.Field()
    car = scrapy.Field()
    car_style = scrapy.Field()
    content = scrapy.Field()
    question = scrapy.Field()
    state = scrapy.Field()
    time = scrapy.Field()
    detail_content = scrapy.Field()
    Reply = scrapy.Field()
Write the spider code:
# -*- coding: utf-8 -*-
import scrapy

from chezhi.items import ChezhiItem
from chezhi.chezhi import listcode


class TousuSpider(scrapy.Spider):
    name = 'tousu'
    allowed_domains = ['www.12365auto.com']

    def start_requests(self):
        url = 'http://www.12365auto.com/zlts/0-1878-0-0-0-0_0-0-{0}.shtml'
        for i in range(1, 51):  # pages 1-50 of the complaint list
            yield scrapy.Request(url.format(i), callback=self.parse)

    def parse(self, response):
        trs = response.xpath('//table//tr')
        for i in trs[1:]:  # skip the table header row
            items = ChezhiItem()
            items['id'] = i.xpath("td[1]/text()")[0].extract()
            items['brand'] = i.xpath("td[2]/text()")[0].extract()
            items['car'] = i.xpath("td[3]/text()")[0].extract()
            items['car_style'] = i.xpath("td[4]/text()")[0].extract()
            items['content'] = i.xpath("td[5]/a/text()")[0].extract()
            url = i.xpath("td[5]/a/@href")[0].extract()
            question_list = i.xpath("td[6]//text()")[0].extract()
            codes = question_list.split(',')
            # Translate each numeric code via the listcode dict; the final
            # comma leaves a trailing empty piece, so drop the last entry
            items['question'] = str([listcode.get(j) for j in codes][:-1])
            items['state'] = i.xpath("td[8]/em/text()")[0].extract()
            # Hand the partially filled item to the detail page via meta;
            # dont_filter=True keeps the request from being deduplicated
            yield scrapy.Request(url=url, meta={'items': items},
                                 callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        items = response.meta['items']
        items['time'] = response.xpath('//div[@class="jbqk"]/ul/li[5]/text()')[0].extract()
        items['detail_content'] = response.xpath('//div[@class="tsnr"]/p/text()')[0].extract()
        items['Reply'] = response.xpath('//div[@class="tshf"]/p/text()')[0].extract()
        yield items
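One fragile spot worth noting: indexing XPath results with [0] raises IndexError whenever a field is missing from a row. If you want to harden the spider, recent Scrapy versions offer SelectorList.get() (extract_first() in older releases), which returns None instead of raising:

# Safer extraction: .get() returns None when the XPath matches nothing,
# instead of the IndexError raised by selector_list[0].extract()
items['state'] = i.xpath("td[8]/em/text()").get()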
Write the pipeline code in pipelines.py:
import pymysql
import pymysql.cursors


class ChezhiPipeline(object):
    def process_item(self, item, spider):
        DBKWARGS = spider.settings.get('DBKWARGS')
        con = pymysql.connect(**DBKWARGS)  # connect to the database
        cur = con.cursor()  # get a cursor
        sql = 'insert into chezhi values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        lis = (item['id'], item['brand'], item['car'], item['car_style'], item['content'],
               item['question'], item['state'], item['time'], item['detail_content'], item['Reply'])
        try:
            cur.execute(sql, lis)  # insert the row
        except Exception as e:
            print("insert err:", e)
            con.rollback()  # roll back on failure
        else:
            con.commit()
        cur.close()  # close the cursor
        con.close()  # close the connection
        return item
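The pipeline assumes a chezhi table with ten columns already exists in the test database. Here is a one-off helper to create it; the column types and lengths are assumptions mirroring the item fields, so adjust them as needed:

# Run once before crawling. Column names follow the insert order in the
# pipeline; the types/lengths are assumptions, not taken from the original.
import pymysql

con = pymysql.connect(db='test', user='root', passwd='12345',
                      host='localhost', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS chezhi (
        id VARCHAR(32), brand VARCHAR(64), car VARCHAR(64),
        car_style VARCHAR(64), content VARCHAR(255), question VARCHAR(255),
        state VARCHAR(32), time VARCHAR(64), detail_content TEXT,
        Reply TEXT
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()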
Edit the settings file (settings.py):
BOT_NAME = 'chezhi'

SPIDER_MODULES = ['chezhi.spiders']
NEWSPIDER_MODULE = 'chezhi.spiders'

# MySQL connection parameters read by ChezhiPipeline
DBKWARGS = {'db': 'test', 'user': 'root', 'passwd': '12345', 'host': 'localhost',
            'use_unicode': True, 'charset': 'utf8'}

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
]

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Connection': 'keep-alive',
    # Hardcoded session cookie captured from the browser; replace it with a
    # fresh one if requests start failing
    'Cookie': 'uuid=384279260814621; UM_distinctid=1624818bb6a404-0fe2919d52c83c-546d3974-100200-1624818bb9730a; uuid=384279260814621; ASP.NET_SessionId=eyhvs1zwzzbeonr05b41o455; CNZZDATA2115974=cnzz_eid%3D1359293862-1521625094-null%26ntime%3D1521684499; Hm_lvt_d08ac25fd4c45e85e8d9f1c32e97a0eb=1521628003,1521684855; Hm_lpvt_d08ac25fd4c45e85e8d9f1c32e97a0eb=1521689053',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Host': 'www.12365auto.com',
    'Referer': 'http://www.12365auto.com/zlts/0-1878-0-0-0-0_0-0-1.shtml',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# Enable the pipeline so that scraped items are written to MySQL
ITEM_PIPELINES = {'chezhi.pipelines.ChezhiPipeline': 300}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chezhi (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
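Note that USER_AGENTS is just a plain list; Scrapy ignores it unless a downloader middleware reads it from the settings. A minimal sketch of such a middleware (the class name and wiring are my own, not part of the original project, and it would still need an entry in DOWNLOADER_MIDDLEWARES to take effect):

# middlewares.py -- picks a random User-Agent for every outgoing request.
# Class name and wiring are assumptions, not taken from the original post.
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read the USER_AGENTS list defined in settings.py
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)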
At this point the spider is ready to run. Interested readers can optimize it further, for instance by opening the MySQL connection once per spider instead of once per item.
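Run it from the project root:

scrapy crawl tousu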
Finally, take a look at the scraped data.
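For a quick check, here is a small query sketch using pymysql (it reuses the same connection parameters, and the column names match the insert order in the pipeline):

import pymysql

con = pymysql.connect(db='test', user='root', passwd='12345',
                      host='localhost', charset='utf8')
cur = con.cursor()
cur.execute("SELECT id, brand, car, question FROM chezhi LIMIT 5")
for row in cur.fetchall():
    print(row)  # one scraped complaint per row
cur.close()
con.close()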