创建项目文件夹
scrapy startproject myproject
进入 myproject
scrapy genspider -t crawl crawl_baidu baidu.com
创建成功后在 myproject 文件夹中创建 run_baidu.py 运行文件
在 spiders 文件夹中找到 crawl_baidu.py 进行编写代码
例:以拉勾网为例
# -*- coding: utf-8 -*-
"""CrawlSpider example: crawl lagou.com listing pages and scrape job details."""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from myproject.items import LagouItem


class LagouSpider(CrawlSpider):
    """Follows zhaopin/ category pages and yields a LagouItem per job page."""

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['http://lagou.com/']

    # Each Rule's `allow` regex selects URLs found on a page.
    # `callback` names the method that parses matching responses;
    # follow=True keeps extracting links from matched pages,
    # follow=False stops link extraction there.
    rules = (
        # Category/listing pages: only followed to discover job links,
        # so no callback is registered for them.
        Rule(LinkExtractor(allow=r'https://www\.lagou\.com/zhaopin/\w+/'),
             follow=True),
        # Job detail pages: parsed by parse_detail, not followed further.
        # Fix: the dot before "html" is escaped so it matches a literal '.'
        # instead of any character.
        Rule(LinkExtractor(allow=r'https://www\.lagou\.com/jobs/\d+\.html'),
             callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        """Placeholder listing-page callback (not wired to any rule above)."""
        print('列表页')

    def parse_detail(self, response):
        """Extract job fields from a detail page with XPath and yield a LagouItem.

        NOTE(review): the XPath selectors assume lagou.com's markup
        (span.name, div.company, span.salary, dd.job-advantage, dd.job_bt)
        — verify against the live page before relying on them.
        """
        title = response.xpath('//span[@class="name"]/text()').extract_first()
        company = response.xpath('//div[@class="company"]/text()').extract_first()
        salary = response.xpath('//span[@class="salary"]/text()').extract_first()
        # join() flattens the list of extracted text nodes into one string
        job_adv = ''.join(response.xpath('//dd[@class="job-advantage"]//text()').extract())
        job_bt = ''.join(response.xpath('//dd[@class="job_bt"]//text()').extract())
        # Populate the item; it is handed to the item pipelines for storage.
        item = LagouItem()
        item['title'] = title
        item['company'] = company
        item['salary'] = salary
        item['job_adv'] = job_adv
        item['job_bt'] = job_bt
        yield item
        print('详情页')