1. items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DemoItem(scrapy.Item):
    title = scrapy.Field()      # article title
    viewcount = scrapy.Field()  # view count
    content = scrapy.Field()    # article body
    zhaiyao = scrapy.Field()    # summary / abstract
    times = scrapy.Field()      # publish time
2. abc.py (the spider file)
# -*- coding: utf-8 -*-
import scrapy
import re
from Demo.items import DemoItem


class AbcSpider(scrapy.Spider):
    name = 'qr_yifeng'
    allowed_domains = ['www.cnyifeng.net']
    # start_urls = ['http://abc.com/']

    # Build the start URL
    baseURL = "http://www.cnyifeng.net/news/48/{}.html"
    offset = 1
    start_urls = [baseURL.format(offset)]

    # 1. Loop over all category links in the left navigation
    def parse(self, response):
        node_typelist = response.xpath("//div[@class='lnav']/ul/li//@href").extract()
        for typenode in node_typelist:
            main_url = "http://www.cnyifeng.net" + typenode
            yield scrapy.Request(main_url, callback=self.parse_type)

    # 2. Loop over one category (the list page)
    def parse_type(self, response):
        node_list = response.xpath("//div[@class='news_con']/dl[@class='news_dl']")
        for node in node_list:
            item = DemoItem()
            if len(node.xpath(".//a[@class='dt_1']//text()")):
                item['title'] = node.xpath(".//a[@class='dt_1']//text()").extract()[0]
            else:
                item['title'] = ''
            if len(node.xpath("./dd//text()")):
                item['zhaiyao'] = node.xpath("./dd//text()").extract()[0]
            else:
                item['zhaiyao'] = ''
            item['times'] = node.xpath(".//span//text()").extract()[0]
            mainUrl = 'http://www.cnyifeng.net'
            erUrl = mainUrl + node.xpath(".//a[@class='dt_1']/@href").extract()[0]
            # Pass the item on to the detail-page callback via meta
            yield scrapy.Request(erUrl, callback=self.parse_detail_info, meta={'item': item})

        # Pagination: follow the last link until the "next page" button is disabled
        if len(response.xpath("//div[@class='flickr']//span[@class='disabled']")) == 0:
            # Middle page: both "previous" and "next" are clickable
            url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
            yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse_type)
        else:
            to_next = response.xpath("//div[@class='flickr']//span[@class='disabled']//text()").extract()[0]
            if to_next != '下一页»':  # the disabled button is not "next page", so more pages remain
                url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
                yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse_type)

    # 3. Detail page
    def parse_detail_info(self, response):
        item = response.meta['item']  # item handed over from the list page
        item['viewcount'] = '90'
        # Alternative: extract the body with XPath instead of a regex:
        # if len(response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()")):
        #     content_list = response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()").extract()
        #     content_str = ''
        #     for model in content_list:
        #         content_str = content_str + str(model).strip()
        #     item['content'] = content_str
        # else:
        #     item['content'] = ''
        pattern = re.compile(r'<div class="content_arc">([\s\S]*?)</div>')
        content_list = pattern.findall(response.text)
        item['content'] = content_list[0].strip() if content_list else ''
        yield item
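Each of the pipelines below only runs if it is registered in the project's settings.py. A minimal sketch of that registration (the module path Demo.pipelines.DemoPipeline is assumed from the Demo project name used in the imports above):

# settings.py -- enable the pipeline (path assumed from the Demo project name)
ITEM_PIPELINES = {
    'Demo.pipelines.DemoPipeline': 300,
}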
3. Saving data to Excel with openpyxl
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from openpyxl import Workbook


class DemoPipeline(object):
    def __init__(self):
        # Create the workbook and write the header row
        # (columns: title, summary, time, view count, content)
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['标题', '摘要', '时间', '查看人数', '详细内容'])

    def process_item(self, item, spider):
        # Append one row per item and save the workbook
        row = (item['title'], item['zhaiyao'], item['times'], item['viewcount'], item['content'])
        self.ws.append(row)
        self.wb.save('yftxt.xlsx')
        return item
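Note that wb.save('yftxt.xlsx') runs once per item, rewriting the whole file every time. A possible refinement (a sketch using the same openpyxl calls, not the original post's code) is to save only once, when the spider closes:

    def process_item(self, item, spider):
        row = (item['title'], item['zhaiyao'], item['times'], item['viewcount'], item['content'])
        self.ws.append(row)
        return item

    def close_spider(self, spider):
        # Write the workbook a single time after the crawl finishes
        self.wb.save('yftxt.xlsx')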
4. Saving data to MySQL with pymysql
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


# Approach 1: build the connection in a helper function
# def dbHandle():
#     conn = pymysql.connect(
#         host='127.0.0.1',
#         port=3306,
#         user='qrcode',
#         passwd='123123',
#         db='test',
#         charset='utf8',
#         use_unicode=False
#     )
#     return conn


class DemoPipeline(object):
    def __init__(self):
        # Approach 2: open the connection when the pipeline is created
        self.client = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='qrcode',
            passwd='123123',
            db='test',
            charset='utf8',
            use_unicode=False
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        # Approach 2: parameterized insert, one commit per item
        sql = 'insert into t_table(title,zhaiyao,times,viewcount,content) values(%s,%s,%s,%s,%s)'
        params = (item['title'], item['zhaiyao'], item['times'], item['viewcount'], item['content'])
        self.cur.execute(sql, params)
        self.client.commit()

        # Approach 1:
        # dbObject = dbHandle()
        # cursor = dbObject.cursor()
        # sql = 'insert into t_table(title,zhaiyao,times) VALUES(%s,%s,%s)'
        # try:
        #     cursor.execute(sql, (item['title'], item['zhaiyao'], item['times']))
        #     dbObject.commit()
        # except Exception:
        #     print('error')
        #     dbObject.rollback()
        return item

    def close_spider(self, spider):
        # Approach 2: close the cursor and the connection when the spider finishes
        self.cur.close()
        self.client.close()
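The insert statement assumes a t_table table already exists in the test database. The post does not show the schema, so the following one-off setup script is only an assumption whose columns are inferred from the INSERT:

# setup_table.py -- hypothetical schema, inferred from the INSERT columns above
import pymysql

client = pymysql.connect(host='127.0.0.1', port=3306, user='qrcode',
                         passwd='123123', db='test', charset='utf8')
with client.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS t_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            zhaiyao VARCHAR(500),
            times VARCHAR(50),
            viewcount VARCHAR(20),
            content TEXT
        ) DEFAULT CHARSET=utf8
    """)
client.commit()
client.close()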
5. Saving to a text file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from Demo.myEncoding import myEncoder


class DemoPipeline(object):
    def __init__(self):
        self.f = open("yf.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line
        content = json.dumps(dict(item), ensure_ascii=False, cls=myEncoder) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        self.f.close()
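The pipelines import myEncoder from Demo/myEncoding.py, which the post does not show. A minimal sketch of such a custom JSON encoder, assuming its job is to make bytes values JSON-serializable, could look like this:

# Demo/myEncoding.py -- hypothetical sketch; the original module is not shown in the post
import json


class myEncoder(json.JSONEncoder):
    def default(self, obj):
        # Decode bytes returned by extractors or DB drivers before serializing
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        # Fall back to the default behaviour for anything else
        return json.JSONEncoder.default(self, obj)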