爬取腾讯社招职位信息地址
https://hr.tencent.com/position.php
三个文件代码如下:
spider.py
# -*- coding: utf-8 -*-
# author : pengshiyu
# date : 2018-04-19
import scrapy
from scrapy.selector import Selector
from tencent_position_item import TencentPositionItem
import sys
# Python 2-only hack: force the interpreter's default string encoding to
# UTF-8 so implicit str<->unicode conversions of the Chinese field values do
# not raise UnicodeDecodeError. `reload(sys)` is required because
# sys.setdefaultencoding is deleted at startup. Neither exists on Python 3 —
# remove this entirely when porting.
reload(sys)
sys.setdefaultencoding("utf-8")
class TencentPositionSpider(scrapy.Spider):
    """Scrape job postings from the Tencent HR listing pages.

    Yields one TencentPositionItem per table row and follows the
    "next page" link until the last page is reached.
    """

    name = "tencent_position"
    allowed_domains = ["tencent.com"]
    # Route items from this spider through the project pipeline only.
    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.tencent_position_spider.tencent_position_pipeline.TencentPositionPipeline": 100,
        }
    }
    start_urls = [
        "https://hr.tencent.com/position.php",
    ]

    def parse(self, response):
        """Parse one listing page: yield an item per row, then the next page.

        :param response: scrapy Response for a position.php listing page.
        """
        # Each job row carries class "even" or "odd".
        # XPath alternative: response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for row in response.css(".even, .odd"):
            position_link = row.xpath("./td[1]/a/@href").get()

            item = TencentPositionItem()
            item["position_name"] = row.xpath("./td[1]/a/text()").get()
            # response.urljoin resolves the relative href against the page
            # URL; the guard avoids the TypeError that `base_url + None`
            # raised when a row had no link.
            item["position_link"] = response.urljoin(position_link) if position_link else None
            item["position_type"] = row.xpath("./td[2]/text()").get()
            item["position_number"] = row.xpath("./td[3]/text()").get()
            item["work_location"] = row.xpath("./td[4]/text()").get()
            item["publish_time"] = row.xpath("./td[5]/text()").get()

            # Debug trace of the extracted fields (print() form works on
            # both Python 2 and 3).
            print(u"*" * 30)
            print(item["position_name"])
            print(item["position_link"])
            print(item["position_type"])
            print(item["position_number"])
            print(item["work_location"])
            print(item["publish_time"])

            yield item

        # Pagination: on the last page the "next" link degrades to
        # "javascript:;"; it may also be absent, hence the None guard.
        next_href = response.css("#next::attr(href)").get()
        if next_href and next_href != u"javascript:;":
            next_url = response.urljoin(next_href)
            print(u"下一页: {0}".format(next_url))
            yield scrapy.Request(url=next_url, callback=self.parse)
        else:
            print(u"最后一页了 {0}".format(next_href))
item.py
# -*- coding:utf-8 -*-
import scrapy
class TencentPositionItem(scrapy.Item):
    """One Tencent HR job posting scraped from the listing table."""

    # Position title (first table column).
    position_name = scrapy.Field()
    # URL of the position's detail page.
    position_link = scrapy.Field()
    # Job category (second column).
    position_type = scrapy.Field()
    # Number of openings (third column).
    position_number = scrapy.Field()
    # Office location (fourth column).
    work_location = scrapy.Field()
    # Publication date (fifth column).
    publish_time = scrapy.Field()
pipeline.py
# -*- coding: utf-8 -*-
import json
import os
# Directory containing this module. The original os.path.abspath(__file__)
# yielded the *file's* path, not its directory, despite the BASE_DIR name.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
class TencentPositionPipeline(object):
    """Write each scraped position item to a local file, one JSON object per line."""

    def __init__(self):
        # Handle stays open for the whole crawl; closed in close_spider.
        self.f = open("tencent_position.txt", "w")
        # Number of items written so far.
        self.count = 0

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged.

        ensure_ascii=False keeps the Chinese field values human-readable
        in the output file.
        """
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.f.write(content)
        self.count += 1
        return item

    def close_spider(self, spider):
        """Report the item count and release the output file."""
        # Single-argument print(...) is valid on both Python 2 and 3; the
        # original `print "..."` statement was Python-2-only syntax.
        print("爬取信息条数:{count}".format(count=self.count))
        self.f.close()