创建项目文件夹
scrapy startproject myproject
进入 myproject
scrapy genspider -t crawl crawl_baidu baidu.com
创建成功后在 myproject 文件夹中创建 run_baidu.py 运行文件
在 spiders 文件夹中找到 crawl_baidu.py 进行编写代码
例:以拉勾网为例
# -*- coding: utf-8 -*-
"""CrawlSpider example: crawl lagou.com listing pages and scrape job details."""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from myproject.items import LagouItem


class LagouSpider(CrawlSpider):
    """Follows zhaopin/ category pages and yields a LagouItem per job page."""

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['http://lagou.com/']

    # Each Rule's `allow` regex selects URLs found on a page.
    # `callback` names the method that parses matching responses;
    # follow=True keeps extracting links from matched pages,
    # follow=False stops link extraction there.
    rules = (
        # Category/listing pages: only followed to discover job links,
        # so no callback is registered for them.
        Rule(LinkExtractor(allow=r'https://www\.lagou\.com/zhaopin/\w+/'),
             follow=True),
        # Job detail pages: parsed by parse_detail, not followed further.
        # Fix: the dot before "html" is escaped so it matches a literal '.'
        # instead of any character.
        Rule(LinkExtractor(allow=r'https://www\.lagou\.com/jobs/\d+\.html'),
             callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        """Placeholder listing-page callback (not wired to any rule above)."""
        print('列表页')

    def parse_detail(self, response):
        """Extract job fields from a detail page with XPath and yield a LagouItem.

        NOTE(review): the XPath selectors assume lagou.com's markup
        (span.name, div.company, span.salary, dd.job-advantage, dd.job_bt)
        — verify against the live page before relying on them.
        """
        title = response.xpath('//span[@class="name"]/text()').extract_first()
        company = response.xpath('//div[@class="company"]/text()').extract_first()
        salary = response.xpath('//span[@class="salary"]/text()').extract_first()
        # join() flattens the list of extracted text nodes into one string
        job_adv = ''.join(response.xpath('//dd[@class="job-advantage"]//text()').extract())
        job_bt = ''.join(response.xpath('//dd[@class="job_bt"]//text()').extract())
        # Populate the item; it is handed to the item pipelines for storage.
        item = LagouItem()
        item['title'] = title
        item['company'] = company
        item['salary'] = salary
        item['job_adv'] = job_adv
        item['job_bt'] = job_bt
        yield item
        print('详情页')