# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.html import remove_tags

from zhaopin_project.items import LagouItem

# Patterns used by LagouSpider.parse_detail, compiled once at import time.
_TITLE_RE = re.compile(r'<span class="name">(.*?)</span>')
_SALARY_RE = re.compile(r'<span class="salary">(.*?)</span>')
# Matches every "<span>... /</span>" entry; parse_detail reads the first
# three matches as location, experience and education, in that order.
_SLASH_SPAN_RE = re.compile(r'<span>(.*) /</span>')
# Two groups separated by a single space: publish time and publishing site.
_PUBLISH_RE = re.compile(r'<p class="publish_time">(.*) (.*)</p>')
# re.S so that the description may span multiple lines.
_JOB_DESC_RE = re.compile(r'<dd class="job_bt">(.*?)</dd>', re.S)


class LagouSpider(CrawlSpider):
    """Crawl lagou.com and yield one ``LagouItem`` per job detail page.

    Links matching the ``/zhaopin/<word>/`` pattern are followed without a
    callback; links matching ``/jobs/<digits>.html`` are handed to
    ``parse_detail`` and not followed further.
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['http://lagou.com/']

    rules = (
        Rule(
            LinkExtractor(allow=r'https://www\.lagou\.com/zhaopin/\w+/'),
            follow=True,
        ),
        Rule(
            # The dot before "html" is escaped so it only matches a literal ".".
            LinkExtractor(allow=r'https://www\.lagou\.com/jobs/\d+\.html'),
            callback='parse_detail',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Print a marker message; not referenced by any rule in this class."""
        print('寡人来到了列表页')

    def parse_detail(self, response):
        """Extract job fields from a detail page and yield a ``LagouItem``.

        Fields are pulled out of ``response.text`` with regular expressions.
        If any expected fragment is missing, the page is skipped with a
        logged warning instead of raising.

        Args:
            response: The response for a job detail page.

        Yields:
            A ``LagouItem`` with ``title``, ``salary``, ``position``,
            ``jingyan``, ``xueli``, ``shijian``, ``fabu`` and ``job_bt`` set.
        """
        html_str = response.text

        try:
            # Job title
            title = _TITLE_RE.findall(html_str)[0]
            # Monthly salary
            salary = _SALARY_RE.findall(html_str)[0]
            # Location, experience requirement, education requirement:
            # the first three "<span>... /</span>" matches, scanned once.
            slash_spans = _SLASH_SPAN_RE.findall(html_str)
            position = slash_spans[0]
            jingyan = slash_spans[1]
            xueli = slash_spans[2]
            # Publish time and publishing site
            shijian, fabu = _PUBLISH_RE.findall(html_str)[0]
            # Job description, with HTML tags stripped
            job_bt = remove_tags(_JOB_DESC_RE.findall(html_str)[0])
        except IndexError:
            # A pattern did not match (layout change, anti-bot page, ...).
            # Skip this page but leave a trace rather than failing silently.
            self.logger.warning(
                'Skipping %s: expected job fields not found', response.url
            )
            return

        print('--' * 50)

        item = LagouItem()
        item['title'] = title
        item['salary'] = salary
        item['position'] = position
        item['jingyan'] = jingyan
        item['xueli'] = xueli
        item['shijian'] = shijian
        item['fabu'] = fabu
        item['job_bt'] = job_bt
        yield item
拉钩
猜你喜欢
转载自www.cnblogs.com/lxh777/p/9581008.html
今日推荐
周排行