# -*- coding: utf-8 -*-
"""Scrapy spider that fetches detail pages recorded in the AList table.

For each uncollected row (collect_state == 0) it requests `detail_url`,
strips the markup from the page body, cuts off the trailing "参看"
("see also") section, and yields an ADetailItem carrying the row id.
"""
import re

import scrapy
import scrapy.cmdline
from peewee import MySQLDatabase

from a_detail.a_db import AList
from a_detail.items import ADetailItem

# NOTE(review): credentials are hard-coded; move them to settings or the
# environment before sharing this file.
database = MySQLDatabase(
    'a_plus',
    **{
        'charset': 'utf8',
        'sql_mode': 'PIPES_AS_CONCAT',
        'use_unicode': True,
        'user': 'root',
        'password': '2008zxzx',
    },
)

# Matches any remaining HTML/XML tag so it can be stripped from the text.
# Compiled once at module level instead of recompiling per response.
_TAG_RE = re.compile(r"<(.*?)>")


class ASpiderSpider(scrapy.Spider):
    """Crawl the detail URLs stored in AList and extract their body text."""

    name = 'a_spider'

    def start_requests(self):
        """Walk AList in pages of 100 ids and request each uncollected URL.

        Yields:
            scrapy.Request: one request per pending row, with the row's
            primary key passed along in ``meta["id"]``.
        """
        for page in range(0, 149):
            # BUG FIX: the original combined the two filters with the Python
            # `and` operator, which simply returns its right operand — the
            # id-range condition was silently dropped. peewee expressions
            # must be combined with `&` (and each side parenthesized).
            pending_rows = AList.select(AList.detail_url, AList.id).where(
                AList.id.between(1 + page * 100, 100 + page * 100)
                & (AList.collect_state == 0)
            )
            for row in pending_rows:
                yield scrapy.Request(
                    url=row.detail_url,
                    meta={"id": row.id},
                    callback=self.parse,
                )

    def parse(self, response):
        """Extract the article body and yield it as an ADetailItem.

        The XPath takes every element following the first <script> inside
        the ``#bodyContent`` container; residual tags are stripped with a
        regex, and everything from the last "参看" marker onward is cut off.
        """
        item = ADetailItem()
        fragments = response.xpath(
            "//*[@id='bodyContent']/script/following-sibling::*"
        ).extract()
        joined = '\n'.join(fragments)
        plain_text = _TAG_RE.sub("", joined)
        # Keep everything before the final "参看" ("see also") section;
        # re-joining with the same marker preserves any earlier occurrences.
        content = "参看".join(plain_text.split("参看")[:-1])
        item["id"] = response.meta["id"]
        item["content"] = content
        yield item


def main():
    """Launch this spider through the scrapy command-line entry point."""
    scrapy.cmdline.execute(['scrapy', 'crawl', 'a_spider'])


if __name__ == "__main__":
    # Guarded so that merely importing this module (as scrapy does when it
    # loads spiders) does not recursively re-launch the crawl.
    main()
# 以上代码主要是为了记录spider中的Xpath路径;
# 以及在遇到复杂源码时截取所需要的content,所用到的函数。
# (仅供自己学习)