截取爬虫文本内容

# -*- coding: utf-8 -*-
import scrapy
import re
import scrapy.cmdline
from a_detail.items import ADetailItem
from peewee import *
from a_detail.a_db import AList
# Peewee MySQL connection for the 'a_plus' database.
# NOTE(review): credentials (root / plaintext password) are hard-coded here —
# move them to an environment variable or config file before sharing/deploying.
database = MySQLDatabase('a_plus', **{'charset': 'utf8', 'sql_mode': 'PIPES_AS_CONCAT', 'use_unicode': True, 'user': 'root', 'password': '2008zxzx'})


class ASpiderSpider(scrapy.Spider):
    """Spider that fetches the detail pages queued in the ``AList`` table
    and extracts the main body text of each page into an ``ADetailItem``.
    """

    name = 'a_spider'

    def start_requests(self):
        """Yield one request per uncollected ``AList`` row, in id batches of 100.

        BUG FIX: the original combined the two filter conditions with the
        Python ``and`` operator.  Peewee expressions are always truthy, so
        ``expr1 and expr2`` evaluates to ``expr2`` and the id-range filter
        was silently dropped.  Peewee requires the bitwise ``&`` operator
        (with parentheses) to AND query conditions.
        """
        for page in range(0, 149):
            low, high = 1 + page * 100, 100 + page * 100
            detail_db = AList.select(AList.detail_url, AList.id).where(
                (AList.id.between(low, high))
                & (AList.collect_state == 0)
            )
            for row in detail_db:
                # Carry the DB primary key through meta so parse() can
                # associate the scraped content with its AList row.
                yield scrapy.Request(
                    url=row.detail_url,
                    meta={"id": row.id},
                    callback=self.parse,
                )

    def parse(self, response):
        """Extract the article body from ``#bodyContent``.

        Takes every element following the first ``<script>`` node, strips
        HTML tags with a crude regex, and drops everything from the last
        '参看' ("see also") marker onward.
        """
        item = ADetailItem()
        content_text = response.xpath(
            "//*[@id='bodyContent']/script/following-sibling::*"
        ).extract()
        content_str = '\n'.join(content_text)
        # Raw string for the regex pattern; non-greedy match strips tags only.
        content_str2 = re.sub(r"<(.*?)>", "", content_str)
        # Keep everything before the final '参看' section.  If the marker is
        # absent, split() returns a single element and the result is '' —
        # identical to the original behaviour.
        content = "参看".join(content_str2.split("参看")[:-1])
        item["id"] = response.meta["id"]
        item["content"] = content
        yield item

def main():
    """Launch the 'a_spider' crawl via Scrapy's command-line entry point."""
    scrapy.cmdline.execute(['scrapy', 'crawl', 'a_spider'])


# Guard the entry point: the original called main() unconditionally, which
# started a crawl as a side effect of merely importing this module.
if __name__ == '__main__':
    main()

以上代码主要是为了记录spider中的Xpath路径;

以及在遇到复杂源码时截取所需要的content,所用到的函数。

(仅供自己学习)

猜你喜欢

转载自www.cnblogs.com/jokerisme/p/12666321.html