1. items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DemoItem(scrapy.Item):
    title = scrapy.Field()      # article title
    viewcount = scrapy.Field()  # view count
    content = scrapy.Field()    # article body
    zhaiyao = scrapy.Field()    # summary / abstract
    times = scrapy.Field()      # publish time
2. abc.py (the spider file)
# -*- coding: utf-8 -*-
import scrapy
import re
from Demo.items import DemoItem


class AbcSpider(scrapy.Spider):
    name = 'qr_yifeng'
    allowed_domains = ['www.cnyifeng.net']
    # start_urls = ['http://abc.com/']

    # Build the start URL
    baseURL = "http://www.cnyifeng.net/news/48/{}.html"
    offset = 1
    start_urls = [baseURL.format(offset)]

    # 1. Loop over all category links in the left navigation
    def parse(self, response):
        node_typelist = response.xpath("//div[@class='lnav']/ul/li//@href").extract()
        for typenode in node_typelist:
            main_url = "http://www.cnyifeng.net" + typenode
            yield scrapy.Request(main_url, callback=self.parse_type)

    # 2. Loop over one category (the list page)
    def parse_type(self, response):
        node_list = response.xpath("//div[@class='news_con']/dl[@class='news_dl']")
        for node in node_list:
            item = DemoItem()
            if len(node.xpath(".//a[@class='dt_1']//text()")):
                item['title'] = node.xpath(".//a[@class='dt_1']//text()").extract()[0]
            else:
                item['title'] = ''
            if len(node.xpath("./dd//text()")):
                item['zhaiyao'] = node.xpath("./dd//text()").extract()[0]
            else:
                item['zhaiyao'] = ''
            item['times'] = node.xpath(".//span//text()").extract()[0]
            mainUrl = 'http://www.cnyifeng.net'
            erUrl = mainUrl + node.xpath(".//a[@class='dt_1']/@href").extract()[0]
            # Pass the item on to the detail-page callback via meta
            yield scrapy.Request(erUrl, callback=self.parse_detail_info, meta={'item': item})

        # Pagination: follow the last link until the "next page" button is disabled
        if len(response.xpath("//div[@class='flickr']//span[@class='disabled']")) == 0:
            # Middle page: both "previous" and "next" are clickable
            url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
            yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse_type)
        else:
            to_next = response.xpath("//div[@class='flickr']//span[@class='disabled']//text()").extract()[0]
            if to_next != '下一页»':  # the disabled button is not "next page", so more pages remain
                url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
                yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse_type)

    # 3. Detail page
    def parse_detail_info(self, response):
        item = response.meta['item']  # item handed over from the list page
        item['viewcount'] = '90'
        # Alternative: extract the body with XPath instead of a regex:
        # if len(response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()")):
        #     content_list = response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()").extract()
        #     content_str = ''
        #     for model in content_list:
        #         content_str = content_str + str(model).strip()
        #     item['content'] = content_str
        # else:
        #     item['content'] = ''
        pattern = re.compile(r'<div class="content_arc">([\s\S]*?)</div>')
        content_list = pattern.findall(response.text)
        item['content'] = content_list[0].strip() if content_list else ''
        yield item
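Each of the pipelines below only runs if it is registered in the project's settings.py. A minimal sketch of that registration (the module path Demo.pipelines.DemoPipeline is assumed from the Demo project name used in the imports above):

# settings.py -- enable the pipeline (path assumed from the Demo project name)
ITEM_PIPELINES = {
    'Demo.pipelines.DemoPipeline': 300,
}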
3. Saving data to Excel with openpyxl
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from openpyxl import Workbook


class DemoPipeline(object):
    def __init__(self):
        # Create the workbook and write the header row
        # (columns: title, summary, time, view count, content)
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['标题', '摘要', '时间', '查看人数', '详细内容'])

    def process_item(self, item, spider):
        # Append one row per item and save the workbook
        row = (item['title'], item['zhaiyao'], item['times'], item['viewcount'], item['content'])
        self.ws.append(row)
        self.wb.save('yftxt.xlsx')
        return item
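Note that wb.save('yftxt.xlsx') runs once per item, rewriting the whole file every time. A possible refinement (a sketch using the same openpyxl calls, not the original post's code) is to save only once, when the spider closes:

    def process_item(self, item, spider):
        row = (item['title'], item['zhaiyao'], item['times'], item['viewcount'], item['content'])
        self.ws.append(row)
        return item

    def close_spider(self, spider):
        # Write the workbook a single time after the crawl finishes
        self.wb.save('yftxt.xlsx')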
4. Saving data to MySQL with pymysql
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


# Approach 1: build the connection in a helper function
# def dbHandle():
#     conn = pymysql.connect(
#         host='127.0.0.1',
#         port=3306,
#         user='qrcode',
#         passwd='123123',
#         db='test',
#         charset='utf8',
#         use_unicode=False
#     )
#     return conn


class DemoPipeline(object):
    def __init__(self):
        # Approach 2: open the connection when the pipeline is created
        self.client = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='qrcode',
            passwd='123123',
            db='test',
            charset='utf8',
            use_unicode=False
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        # Approach 2: parameterized insert, one commit per item
        sql = 'insert into t_table(title,zhaiyao,times,viewcount,content) values(%s,%s,%s,%s,%s)'
        params = (item['title'], item['zhaiyao'], item['times'], item['viewcount'], item['content'])
        self.cur.execute(sql, params)
        self.client.commit()

        # Approach 1:
        # dbObject = dbHandle()
        # cursor = dbObject.cursor()
        # sql = 'insert into t_table(title,zhaiyao,times) VALUES(%s,%s,%s)'
        # try:
        #     cursor.execute(sql, (item['title'], item['zhaiyao'], item['times']))
        #     dbObject.commit()
        # except Exception:
        #     print('error')
        #     dbObject.rollback()
        return item

    def close_spider(self, spider):
        # Approach 2: close the cursor and the connection when the spider finishes
        self.cur.close()
        self.client.close()
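The insert statement assumes a t_table table already exists in the test database. The post does not show the schema, so the following one-off setup script is only an assumption whose columns are inferred from the INSERT:

# setup_table.py -- hypothetical schema, inferred from the INSERT columns above
import pymysql

client = pymysql.connect(host='127.0.0.1', port=3306, user='qrcode',
                         passwd='123123', db='test', charset='utf8')
with client.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS t_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            zhaiyao VARCHAR(500),
            times VARCHAR(50),
            viewcount VARCHAR(20),
            content TEXT
        ) DEFAULT CHARSET=utf8
    """)
client.commit()
client.close()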
5. Saving to a text file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from Demo.myEncoding import myEncoder


class DemoPipeline(object):
    def __init__(self):
        self.f = open("yf.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line
        content = json.dumps(dict(item), ensure_ascii=False, cls=myEncoder) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        self.f.close()
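The pipelines import myEncoder from Demo/myEncoding.py, which the post does not show. A minimal sketch of such a custom JSON encoder, assuming its job is to make bytes values JSON-serializable, could look like this:

# Demo/myEncoding.py -- hypothetical sketch; the original module is not shown in the post
import json


class myEncoder(json.JSONEncoder):
    def default(self, obj):
        # Decode bytes returned by extractors or DB drivers before serializing
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        # Fall back to the default behaviour for anything else
        return json.JSONEncoder.default(self, obj)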