Python beginner learning record: writing a crawler with Scrapy to scrape the tag links in the right-hand column of gushiwen.org

# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from hcrca.items import HcrcaItem


class PoemSpiderSpider(scrapy.Spider):
    name = 'poem_spider'
    # allowed_domains takes bare domain names, not full URLs
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://www.gushiwen.org/']

    def parse(self, response):
        print("=" * 40)
        result = response.text
        # An earlier regex-based (re.findall) extraction was abandoned in favour of XPath.
        html = etree.HTML(result)
        # Text of every tag link in the right-hand column of the homepage
        titles = html.xpath('//div[@class="main3"]//div[@class="right"]//a[@href and @target]/text()')
        print("=" * 40, "yield start")
        for title in titles:
            item = HcrcaItem(name=title)
            yield item
        print("=" * 40, "yield end")
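The XPath above grabs the text of every tag link in the right-hand column of the homepage in one query. If the selector ever stops matching (the site's markup can change), it helps to test it outside Scrapy first. Below is a minimal standalone sketch using requests and lxml; the User-Agent header is my assumption, not something the original post shows, since some sites reject clients without one:

import requests
from lxml import etree

# Standalone check of the spider's XPath selector.
# The User-Agent value is an assumption, not from the original post.
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get("https://www.gushiwen.org/", headers=headers)
resp.encoding = "utf-8"

html = etree.HTML(resp.text)
titles = html.xpath('//div[@class="main3"]//div[@class="right"]'
                    '//a[@href and @target]/text()')
for title in titles:
    print(title)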
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class HcrcaPipeline(object):
    def __init__(self):
        # Write one JSON object per line (JSON Lines)
        self.fp = open("name.json", "w", encoding="utf-8")

    def open_spider(self, spider):
        print("spider started crawling")

    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + "\n")
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("spider finished crawling")
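As the boilerplate comment above says, the pipeline only runs if it is registered in the project's settings.py. A minimal sketch of the relevant settings, assuming the project module is named hcrca as the imports suggest (the priority 300 and the ROBOTSTXT_OBEY value are my assumptions, not shown in the original post):

# settings.py (excerpt)
BOT_NAME = 'hcrca'

SPIDER_MODULES = ['hcrca.spiders']
NEWSPIDER_MODULE = 'hcrca.spiders'

# Assumption: the original post does not show this flag. Scrapy defaults to True,
# and the spider yields nothing if robots.txt disallows the crawl.
ROBOTSTXT_OBEY = False

# Register the pipeline so process_item() is actually called (the priority is arbitrary)
ITEM_PIPELINES = {
    'hcrca.pipelines.HcrcaPipeline': 300,
}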

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class HcrcaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()

from scrapy import cmdline

cmdline.execute("scrapy crawl poem_spider".split())
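This helper (often saved as start.py at the project root) is just a convenience wrapper so the spider can be launched from an IDE instead of typing scrapy crawl poem_spider in a terminal. After a run, name.json holds one JSON object per line, as written by the pipeline; the tag names below are illustrative placeholders, not actual scraped output:

{"name": "tag-one"}
{"name": "tag-two"}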

Appendix: source code download: http://1172118044.top/file/download.html

Reposted from www.cnblogs.com/jswf/p/12322369.html