Python 伪代码之爬取冰川财富 P2P 信息运行代码持续更新:【内向即蛆虫--屠雅倩】

# -*- coding:utf-8 -*-
import re

import scrapy

from p2pinvestment.utils import (transform_capital,
                                 transform_rate, transform_term_of_investment,
                                 transform_time, transform_time1, transfrom_phone)

# Base URL that site-relative links scraped from the pages are appended to.
MAIN_URL = 'http://cg.51bccf.com/finance_user'
# Product list page; the crawl starts here.
LIST_URL = MAIN_URL + '/product_list.html'


class BccfSpider(scrapy.Spider):
    """Scrape product listings and per-product tender records from cg.51bccf.com.

    Crawl flow:
      1. ``start_requests`` fetches the product list page.
      2. ``parse_target_list`` builds one item per ``<li id*="rptList">`` row,
         requests that row's detail page, and submits the paging form while
         the page text contains the list's "next" control name.
      3. ``parse_detail`` adds ``TargetStartTime``/``TargetEndTime`` and
         re-requests the same URL for ``parse_phone``.
      4. ``parse_phone`` yields one item per tender row and follows the tender
         table's own paging form.
    """

    name = 'bccf'
    company_name = '冰川财富'
    # Misspelled original attribute name, kept as an alias so any external
    # reference to it keeps working.
    company_nmae = company_name
    custom_settings = {
        'CONCURRENT_REQUESTS': 8,
        # Custom request headers sent with every request.
        'DEFAULT_REQUEST_HEADERS': {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36',
            'Upgrade-Insecure-Requests': '1',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        },
    }

    # Matches everything from the first ';' up to and including the next '?'
    # (the ";jsessionid=...?" part of a form action).
    _SESSION_SUFFIX_RE = re.compile(r';.*?\?')

    @staticmethod
    def _absolute_url(relative):
        """Return MAIN_URL followed by ``relative`` with its first '.' removed.

        NOTE(review): this presumably expects links of the form ``./page.html``;
        a link without a leading dot would lose a later dot instead — confirm
        against the live markup.
        """
        return MAIN_URL + relative.replace('.', '', 1)

    def _form_action(self, response):
        """Return the ``action`` of ``<form id="form">``, or None if it is absent.

        A missing form is logged instead of raising, so the items already
        yielded from the page are not followed by an unhandled IndexError.
        """
        actions = response.xpath('//form[@id="form"]/@action').extract()
        if not actions:
            self.logger.warning('Paging form not found on %s', response.url)
            return None
        return actions[0]

    def start_requests(self):
        """Start the crawl at the product list page."""
        yield scrapy.Request(url=LIST_URL, callback=self.parse_target_list)

    def parse_target_list(self, response):
        """Yield a detail request per listed product, then the next list page.

        Each detail request carries the partially filled item in
        ``meta['item']``.
        """
        for row in response.xpath('//li[contains(@id,"rptList")]'):
            item = {}
            # string()/normalize-space() always evaluate to exactly one value,
            # so indexing [0] on these results cannot fail.
            item['TargetName'] = row.xpath(
                'normalize-space(string(.//a))').extract()[0].replace('\xa0', '')
            item['RatePercentage'] = transform_rate(
                row.xpath('normalize-space(string(.//td[1]))').extract()[0].replace('往期年化收益率', ''))
            item['TermOfInvestment'] = transform_term_of_investment(
                row.xpath('normalize-space(string(.//td[2]))').extract()[0].replace('出借期限', ''))
            item['TargetCapital'] = transform_capital(
                row.xpath('normalize-space(string(.//td[3]))').extract()[0].replace('出借金额', ''))

            hrefs = row.xpath('./a/@href').extract()
            if not hrefs:
                # One malformed row must not abort the remaining rows or the
                # pagination request below.
                self.logger.warning('List row without detail link on %s', response.url)
                continue
            yield scrapy.Request(url=self._absolute_url(hrefs[0]), method='post',
                                 meta={'item': item}, callback=self.parse_detail,
                                 priority=2, dont_filter=True)

        if 'conRptListTainer:conPaging:paging:next' in response.text:
            action = self._form_action(response)
            if action is None:
                return
            pageturn_url = self._absolute_url(action)
            if 'jsessionid' in pageturn_url:
                # Drop the session segment (which swallows the '?'), then put
                # the '?' back after '.html'.
                pageturn_url = self._SESSION_SUFFIX_RE.sub('', pageturn_url).replace('.html', '.html?')
            yield scrapy.FormRequest(url=pageturn_url,
                                     formdata={'conRptListTainer:conPaging:paging:next': 'x'},
                                     callback=self.parse_target_list, dont_filter=True, priority=1)

    def parse_detail(self, response):
        """Add start/end times to the item and request the same URL for tender rows."""
        item = response.meta['item'].copy()
        item['TargetStartTime'] = transform_time1(
            response.xpath('string(//span[@id="publishDate"])').extract()[0])
        # NOTE(review): transform_time() is called without arguments; it
        # presumably returns the current time — confirm in p2pinvestment.utils.
        item['TargetEndTime'] = transform_time()
        yield scrapy.Request(url=response.url, meta={'item': item}, callback=self.parse_phone,
                             priority=3, dont_filter=True)

    def parse_phone(self, response):
        """Yield one item per tender row, then request the next tender page.

        Every row gets its own copy of the product-level item, so yielded
        items never share (and later overwrite) one dict.
        """
        base_item = response.meta['item']
        for row in response.xpath('//tr[contains(@id,"tender")]'):
            record = base_item.copy()
            record['PhoneNumber'] = transfrom_phone(row.xpath('string((./td)[2])').extract()[0])
            record['Time'] = transform_time1(row.xpath('string((./td)[4])').extract()[0])
            # Overwrites the product-level TargetCapital taken from the list page
            # with the value from this row's third cell (as the original did).
            record['TargetCapital'] = transform_capital(row.xpath('string((./td)[3])').extract()[0])
            record['CompanyName'] = self.company_name
            yield record

        if 'conJkDescription:conTenderRecord:paging:next' in response.text:
            action = self._form_action(response)
            if action is None:
                return
            # Un-escape '&amp;' remnants and target the info form's listener
            # instead of the main form's.
            turn_url = self._absolute_url(action).replace('amp;', '').replace(
                'IFormSubmitListener-body-form', 'IFormSubmitListener-body-infoForm')
            yield scrapy.FormRequest(url=turn_url,
                                     formdata={'conJkDescription:conTenderRecord:paging:next': 'x'},
                                     meta={'item': base_item}, callback=self.parse_phone,
                                     priority=4, dont_filter=True)
Python 伪代码之爬取冰川财富 P2P 信息运行代码持续更新:【内向即蛆虫--屠雅倩】

猜你喜欢