# -*- coding:utf-8 -*-
"""Scrapy spider that collects product and tender records from cg.51bccf.com."""
import re

import scrapy

from p2pinvestment.utils import (transform_capital, transform_rate,
                                 transform_term_of_investment, transform_time,
                                 transform_time1, transfrom_phone)

MAIN_URL = 'http://cg.51bccf.com/finance_user'
LIST_URL = 'http://cg.51bccf.com/finance_user/product_list.html'

# Matches everything from the first ';' up to and including the next '?'.
# Used to cut the ';jsessionid=...?' fragment out of a form action URL.
_JSESSIONID_RE = re.compile(r';.*?\?')


class BccfSpider(scrapy.Spider):
    """Crawl the product list, each product's detail page and its tender rows.

    Flow: ``start_requests`` -> ``parse_target_list`` (one request per listed
    product, plus list pagination) -> ``parse_detail`` -> ``parse_phone``
    (one yielded dict per tender row, plus tender-table pagination).
    """

    name = 'bccf'
    # NOTE: the attribute name is misspelled ("nmae"); it is kept as-is because
    # code outside this file may read it.
    company_nmae = '冰川财富'
    custom_settings = {
        'CONCURRENT_REQUESTS': 8,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept-Language': 'zh-CN,zh;q=0.8',  # custom request headers
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36',
            'Upgrade-Insecure-Requests': '1',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        },
    }

    def start_requests(self):
        """Seed the crawl with the first page of the product list."""
        yield scrapy.Request(url=LIST_URL, callback=self.parse_target_list)

    def _form_action_url(self, response):
        """Return MAIN_URL joined with the page form's ``action``, or None.

        The first '.' of the action attribute is dropped before joining
        (presumably the action is relative, e.g. './page.html?...' -- confirm
        against the live markup). None is returned when the page has no
        ``<form id="form">`` with an ``action`` attribute.
        """
        action = response.xpath('//form[@id="form"]/@action').extract_first()
        if action is None:
            return None
        return MAIN_URL + action.replace('.', '', 1)

    def parse_target_list(self, response):
        """Parse one product-list page.

        Yields a POST request to the detail page of every listed product
        (carrying the partially filled item in ``meta``) and, when the page
        contains the "next" paging control, a form request for the next page.
        """
        for row in response.xpath('//li[contains(@id,"rptList")]'):
            # string(...) XPath expressions always produce exactly one result,
            # so indexing [0] cannot fail for these four fields.
            item = {
                'TargetName': row.xpath(
                    'normalize-space(string(.//a))').extract()[0].replace('\xa0', ''),
                'RatePercentage': transform_rate(
                    row.xpath('normalize-space(string(.//td[1]))').extract()[0]
                    .replace('往期年化收益率', '')),
                'TermOfInvestment': transform_term_of_investment(
                    row.xpath('normalize-space(string(.//td[2]))').extract()[0]
                    .replace('出借期限', '')),
                'TargetCapital': transform_capital(
                    row.xpath('normalize-space(string(.//td[3]))').extract()[0]
                    .replace('出借金额', '')),
            }

            href = row.xpath('./a/@href').extract_first()
            if href is None:
                # A row without a link must not abort the rest of the page.
                self.logger.warning(
                    'List row without a detail link on %s; row skipped',
                    response.url)
                continue

            detail_url = MAIN_URL + href.replace('.', '', 1)
            yield scrapy.Request(url=detail_url, method='post',
                                 meta={'item': item},
                                 callback=self.parse_detail,
                                 priority=2, dont_filter=True)

        if 'conRptListTainer:conPaging:paging:next' not in response.text:
            return

        pageturn_url = self._form_action_url(response)
        if pageturn_url is None:
            self.logger.warning(
                'Paging control present but no form action on %s',
                response.url)
            return

        if 'jsessionid' in pageturn_url:
            # Removing ';jsessionid=...?' also eats the '?', so put it back
            # right after '.html'.
            pageturn_url = _JSESSIONID_RE.sub('', pageturn_url).replace(
                '.html', '.html?')

        yield scrapy.FormRequest(
            url=pageturn_url,
            formdata={'conRptListTainer:conPaging:paging:next': 'x'},
            callback=self.parse_target_list,
            dont_filter=True, priority=1)

    def parse_detail(self, response):
        """Add the start/end times to the item and request the tender rows.

        The same URL is requested again (without the POST method) and handled
        by ``parse_phone``.
        """
        phone_url = response.url
        item = response.meta['item'].copy()
        item['TargetStartTime'] = transform_time1(
            response.xpath('string(//span[@id="publishDate"])').extract()[0])
        item['TargetEndTime'] = transform_time()
        yield scrapy.Request(url=phone_url, meta={'item': item},
                             callback=self.parse_phone,
                             priority=3, dont_filter=True)

    def parse_phone(self, response):
        """Yield one record per tender row and follow tender-table paging.

        Every record is an independent copy of the product item extended with
        ``PhoneNumber``, ``Time``, ``TargetCapital`` (overwritten with the
        row's third cell) and ``CompanyName``.
        """
        base_item = response.meta['item']

        for row in response.xpath('//tr[contains(@id,"tender")]'):
            # Copy per row: yielding one shared dict and mutating it on the
            # next iteration would alias every record already handed out.
            record = base_item.copy()
            record['PhoneNumber'] = transfrom_phone(
                row.xpath('string((./td)[2])').extract()[0])
            record['Time'] = transform_time1(
                row.xpath('string((./td)[4])').extract()[0])
            record['TargetCapital'] = transform_capital(
                row.xpath('string((./td)[3])').extract()[0])
            record['CompanyName'] = self.company_nmae
            yield record

        if 'conJkDescription:conTenderRecord:paging:next' not in response.text:
            return

        turn_url = self._form_action_url(response)
        if turn_url is None:
            self.logger.warning(
                'Tender paging control present but no form action on %s',
                response.url)
            return

        # Drop leftover 'amp;' fragments and target the tender-record form
        # listener instead of the page's main form listener.
        turn_url = turn_url.replace('amp;', '').replace(
            'IFormSubmitListener-body-form',
            'IFormSubmitListener-body-infoForm')

        yield scrapy.FormRequest(
            url=turn_url,
            formdata={'conJkDescription:conTenderRecord:paging:next': 'x'},
            meta={'item': base_item},
            callback=self.parse_phone,
            priority=4, dont_filter=True)
python伪代码之爬取冰川财富p2p信息运行代码持续更新:【内向即蛆虫--屠雅倩】
猜你喜欢
转载自blog.csdn.net/qq_37995231/article/details/79248126
今日推荐
周排行