pyspider Framework Learning Notes 2

Today at work I ran into a rather fiddly site that involves Ajax and the `params` argument of pyspider's self.crawl. The Ajax requests are analyzed the same way as in note 1; the difference is that this site uses GET instead of POST, so there is no request body to pass through `data`. Instead, the query parameters are supplied via `params` in self.crawl; see the official documentation or the Chinese docs (http://www.pyspider.cn/book/pyspider/self.crawl-16.html) for details. The code itself is fairly simple, if a bit tedious, so rather than walk through it step by step I will just post it.
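As a quick illustration of how `params` behaves (the endpoint, parameter names, and values below are placeholders I made up, not the ones used against the real site): self.crawl URL-encodes the dictionary and appends it to the URL as a query string, which is the GET counterpart of sending a POST body through `data`.

from pyspider.libs.base_handler import *


class ParamsSketch(BaseHandler):
    # Minimal sketch: issue a GET request whose query string is built from `params`.
    # The URL and parameter names are hypothetical placeholders.
    def on_start(self):
        params = {"currpage": 1, "cid": 123}  # fetched as ...search.jsp?currpage=1&cid=123
        self.crawl('http://example.com/search.jsp',
                   params=params,
                   callback=self.index_page)

    def index_page(self, response):
        # response.url carries the encoded query string into the callback
        return {"url": response.url}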

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-06 14:10:21
# Project: zhejiang

from pyspider.libs.base_handler import *
from pymongo import MongoClient
import datetime
import re

DB_IP = '127.0.0.1'
DB_PORT = 27017
DB_NAME = 'research'
DB_COL = 'zhejiang'
client = MongoClient(host=DB_IP, port=DB_PORT)
db = client[DB_NAME]
col = db[DB_COL]


class Handler(BaseHandler):
    url = 'http://zfxxgk.zj.gov.cn/web1/site/col/col62/index.html'
    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (X11;Linux x86_64) AppleWebKit/537.36 (KHTML, likeGecko) Chrome/66.0.3359.181 Safari/537.36"
        }
    }

    def format_date(self, date):
        return datetime.datetime.strptime(date, '%Y-%m-%d')

    @every(minutes=24 * 60)
    def on_start(self):
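        # fetch_type='js' renders the page with PhantomJS, so links inserted by Ajax are present in the response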
        self.crawl(self.url, fetch_type='js', callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        page = response.etree

        department_list = page.xpath("//td//a/text()")[8:]
        department_url = page.xpath("//td//a/@href")[12:]
        print len(department_list), len(department_url)

        for title, address in zip(department_list, department_url):
            print title, address
            save = {"depart_url": address,
                    "depart_title": title
                    }

            # print save["depart_title"]
            address = 'http://zfxxgk.zj.gov.cn' + address

            self.crawl(address, callback=self.parse_department, fetch_type='js', save=save)

    def parse_department(self, response):
        page = response.etree

        # pagination: read the total number of pages from the list header
        page_num_str = page.xpath("//table[@class='tb_title']/tbody/tr/td[1]/text()")[0].encode('utf-8')
        page_num = int(re.findall('共(\d+)页', page_num_str)[0])
        base_url = 'http://zfxxgk.zj.gov.cn/xxgk/jcms_files/jcms1' + response.save["depart_url"] + 'zfxxgk/search.jsp?'

        # get cid from the hidden search-form field
        cid = int(page.xpath("//form[@id='searchform']/input[@name='cid']/@value")[0])
        print cid
        # get jdid
        jdid = int(page.xpath("//form[@id='searchform']/input[@name='jdid']/@value")[0])
        print jdid
        # get divid
        divid = page.xpath("//form[@id='searchform']/input[@name='divid']/@value")[0]
        print divid

        # query-string parameters, passed to self.crawl via params
        params = {"showsub": 0,
                  "orderbysub": 0,
                  "cid": cid,
                  "vc_title": "",
                  "vc_number": "",
                  "binlay": "",
                  "c_issuetime": "",
                  "jdid": jdid,
                  "divid": divid,
                  "vc_keyword": "",
                  "vc_abs": "",
                  "vc_ztfl": "",
                  "vc_service": "",
                  "c_createtime": ""
                  }

        save = {"categories": response.save["depart_title"]}

        for each in range(1, page_num + 1):
            page_url = base_url + 'currpage={}&'.format(each)

            # print page_url

            self.crawl(page_url, callback=self.parse_page, params=params, save=save)

    def parse_page(self, response):
        page = response.etree

        categories = [response.save["categories"]]

        content_list = page.xpath("//tr[@class='tr_main_value_odd' or @class='tr_main_value_even']")

        for each in content_list:
            content_title = each.xpath("./td[1]/a/text()")[0].encode('utf-8')
            content_url = each.xpath("./td[1]/a/@href")[0]
            content_date = each.xpath("./td[2]/text()")[0]

            print content_title, content_url, content_date

            save = {"title": content_title,
                    "url": content_url,
                    "date": content_date,  ### 在这里不要格式化日期,因为save数据在传输的时候会被序列化,到下个函数再用的时候,会变成字符串
                    "categories": categories
                    }

            self.crawl(content_url, callback=self.parse_body, save=save)

    def parse_body(self, response):
        page = response.etree

        body_list = page.xpath("//text()")

        body = ''
        for each in body_list:
            body += each.strip().encode('utf-8')

        result = {"title": response.save["title"],
                  "categories": response.save["categories"],
                  "date": self.format_date(response.save["date"]),
                  "url": response.save["url"],
                  "body": body,
                  "update_time": datetime.datetime.now(),
                  "source": "浙江省人民政府"
                  }

        yield result

    def on_result(self, result):
        if result is None:
            return
        # print result

        update_key = {
            'date': result['date'],
            'title': result['title']
        }
        col.update(update_key, {'$set': result}, upsert=True)
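One note on the storage call above: on pymongo 3.x, Collection.update still works but is deprecated in favor of update_one / update_many. Assuming pymongo 3.x, the equivalent upsert keyed on date and title would be:

col.update_one(update_key, {'$set': result}, upsert=True)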


Reposted from blog.csdn.net/qq_36653505/article/details/80600497