# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse
from hcrca.items import HcrcaItem
import re


class PoemSpiderPySpider(scrapy.Spider):
    """Spider that scrapes poem titles from the gushiwen.org front page.

    Yields one HcrcaItem (field: name) per title link found in the
    right-hand column of the landing page.
    """

    name = 'poem_spider.py'
    # BUG FIX: allowed_domains must hold bare domain names, not URLs.
    # The original value 'https://www.gushiwen.org/' never matches any
    # request host, so OffsiteMiddleware would filter every follow-up
    # request made by this spider.
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://www.gushiwen.org/']

    def parse(self, response):
        """Parse the landing page and yield one item per poem title.

        :param response: the downloaded start-URL page.
        :yields: HcrcaItem with ``name`` set to each title's link text.
        """
        print("="*40)
        result = response.text
        # Parse with lxml directly; titles are the texts of the anchor
        # tags (having both href and target) in the right-hand column.
        html = etree.HTML(result)
        titles = html.xpath('//div[@class="main3"]//div[@class="right"]//a[@href and @target]/text()')
        print("=" * 40, "yield开始")
        for title in titles:
            item = HcrcaItem(name=title)
            yield item
        print("=" * 40, "yield结束")
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class HcrcaPipeline(object):
    """Pipeline writing each scraped item to ``name.json``, one JSON object per line."""

    def __init__(self):
        # FIX: do not open the output file at construction time; defer to
        # open_spider() so the handle's lifetime matches the crawl (the
        # pattern recommended by the Scrapy item-pipeline docs).
        self.fp = None

    def open_spider(self, spider):
        """Open the output file once, when the spider starts."""
        self.fp = open("name.json", "w", encoding="utf-8")
        print("spider开始爬取")

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and pass the item on.

        ensure_ascii=False keeps the Chinese titles human-readable.
        """
        items_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(items_json + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file when the crawl finishes."""
        self.fp.close()
        print("spider结束爬取")
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class HcrcaItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() name = scrapy.Field()
from scrapy import cmdline


def main():
    """Launch the poem spider through the Scrapy command-line entry point."""
    # NOTE: the spider name here must match PoemSpiderPySpider.name
    # ('poem_spider.py'), so this string is kept exactly as-is.
    cmdline.execute("scrapy crawl poem_spider.py".split())


# FIX: guard the crawl behind __main__ so importing this module does not
# start a crawl as a side effect.
if __name__ == "__main__":
    main()