import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from TencentSpider.items import TencentspiderItem,TencentDetailItem
class TencentSpider(CrawlSpider):
    """Crawl Tencent HR job listings and scrape job-detail pages.

    NOTE(review): this whole class is duplicated later in the same file;
    the later definition shadows this one at import time. The duplicate
    copy should be removed.
    """

    name = 'tencent'
    # Restrict the crawl to this domain so that links on other sites which
    # happen to match the rules below are NOT followed.
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?&start=0']

    # LinkExtractor(allow=...) yields every matching link on a page; each
    # Rule sends a request per link, keeps following new pages
    # (follow=True), and dispatches responses to the named callback.
    rules = [
        # Rule(LinkExtractor(allow='start=\d+'), callback='parse_tencent', follow=True),
        Rule(LinkExtractor(allow='position_detail.php'), callback='parse_info', follow=True),
    ]

    def parse_tencent(self, response):
        """Parse one listing page: yield one item per job row.

        Rows alternate between class 'even' and class 'odd'; the union
        xpath selects both kinds in document order.
        """
        link_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for each in link_list:
            item = TencentspiderItem()
            item['position_name'] = each.xpath("./td[1]/a/text()").extract()[0]
            item['position_link'] = each.xpath("./td[1]/a/@href").extract()[0]
            position_type = each.xpath("./td[2]/text()").extract()[0]
            if not position_type:
                # Placeholder when the category cell is empty ("为空" = "empty").
                position_type = "为空"
            item['position_type'] = position_type
            item['position_need'] = each.xpath("./td[3]/text()").extract()[0]
            item['position_place'] = each.xpath("./td[4]/text()").extract()[0]
            item['position_time'] = each.xpath("./td[5]/text()").extract()[0]
            yield item

    def parse_info(self, response):
        """Parse a job-detail page and yield the detail item."""
        item = TencentDetailItem()
        item['position_name'] = response.xpath('//*[@id="sharetitle"]').extract()[0]
        # BUG FIX: this copy of the class built the item but never yielded
        # it, so parse_info produced no output at all.
        yield item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from TencentSpider.items import TencentspiderItem,TencentDetailItem
class TencentSpider(CrawlSpider):
    """Spider for Tencent HR: follows job-detail links and scrapes each page."""

    name = 'tencent'
    # Only links within this domain are followed; matching links found on
    # any other site are discarded.
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?&start=0']

    # Each Rule requests every link its LinkExtractor matches, continues
    # following discovered pages (follow=True), and hands the responses to
    # the callback named below.
    rules = [
        # Rule(LinkExtractor(allow='start=\d+'), callback='parse_tencent', follow=True),
        Rule(LinkExtractor(allow='position_detail.php'), callback='parse_info', follow=True),
    ]

    def parse_tencent(self, response):
        """Yield one TencentspiderItem per job row on a listing page."""
        # Job rows carry class 'even' or 'odd'; the union xpath grabs both.
        rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for row in rows:
            item = TencentspiderItem()
            item['position_name'] = row.xpath("./td[1]/a/text()").extract()[0]
            item['position_link'] = row.xpath("./td[1]/a/@href").extract()[0]
            kind = row.xpath("./td[2]/text()").extract()[0]
            # Fall back to the placeholder ("为空" = "empty") for blank cells.
            item['position_type'] = kind if kind else "为空"
            item['position_need'] = row.xpath("./td[3]/text()").extract()[0]
            item['position_place'] = row.xpath("./td[4]/text()").extract()[0]
            item['position_time'] = row.xpath("./td[5]/text()").extract()[0]
            yield item

    def parse_info(self, response):
        """Yield a TencentDetailItem scraped from a job-detail page."""
        item = TencentDetailItem()
        item['position_name'] = response.xpath('//*[@id="sharetitle"]').extract()[0]
        yield item

# Other settings do not differ much from other write-ups.