RDC Summer Project: Scraping Job Postings from Nowcoder
This is a studio collaboration project for the summer: scrape computer-industry job postings from Nowcoder. It was my first time using Scrapy on a team project, so the code has rough edges; feedback is welcome!
Preface
Author: Ray
Data source: Nowcoder (https://www.nowcoder.com/)
Search filters: internship, Guangzhou
Pages crawled: because of how Nowcoder's search works, only the first 5 result pages are fetched, to keep the results as accurate as possible
Search terms: 前端 (front end), 后端 (back end), 后台 (server side), Android, 大数据 (big data), 算法 (algorithms)
(Most of the search results are ByteDance JDs; Nowcoder may have a partnership with them.)
Field descriptions (24 fields in total):
projectID: project ID
jobID: ID of the job within its project
job: job title
job_type: job category
company: company name
location: work location
salary: pay
attendance: required attendance (days per week)
regular_chance: chance of conversion to a full-time offer
company_type: company type
fund: funding status
center: headquarters location
duty: job responsibilities
requirement: job requirements
hrID: an unidentified extra field
publisher_ID: ID of the user who posted the listing
publisher_name: username of the user who posted the listing
average_resume_processing_rate: average resume processing rate
current_resume_processing_rate: current resume processing rate
average_resume_processing_time: average resume processing time
current_resume_processing_time: current resume processing time
test: number of written-test questions on Nowcoder
interview_comment: number of short interview comments
interview_experience: number of interview-experience posts
Code
spider:
# Author: Ray
# Data source: Nowcoder (https://www.nowcoder.com/)
# Run with: scrapy crawl recruit -o recruit.csv
import re

import scrapy

from nowcoder.items import NowcoderItem


class RecruitSpider(scrapy.Spider):
    name = 'recruit'
    allowed_domains = ['nowcoder.com']
    start_urls = ['https://www.nowcoder.com']

    def start_requests(self):
        # Internships in Guangzhou; results use Nowcoder's default ranking.
        base_url = 'https://www.nowcoder.com/intern/center?city=广州&recruitType=1&query='
        # searchbox = ['前端', '后端', '后台', 'Android', '大数据', '算法']
        search = input('Enter a search term (e.g. 前端): ')  # input() already returns str
        request_url = base_url + search + '&page='
        for page in range(1, 6):  # only the first 5 pages are fetched
            yield scrapy.Request(request_url + str(page), callback=self.parse)

    def parse(self, response):
        # Collect the detail-page link of every job card on the listing page.
        for href in response.xpath('//a[@class="reco-job-title"]/@href').extract():
            detail_url = 'https://www.nowcoder.com' + href
            # Throttling belongs in settings.py (DOWNLOAD_DELAY, see the sketch
            # at the end of the post); a blocking time.sleep() here would stall
            # the whole reactor.
            yield scrapy.Request(detail_url, callback=self.parse_job)

    def parse_job(self, response):
        item = NowcoderItem()
        # The IDs live in an inline <script> block, so they are pulled out with regex.
        item['projectID'] = re.findall(r"projectId: '(\d+)'", response.text)[0]
        item['jobID'] = re.findall(r"originalJobIds: '(\d+)'", response.text)[0]
        item['job'] = response.xpath('//h2/a[@class="js-unfold"]/text()').extract_first()
        item['job_type'] = response.xpath('//span[@class="rec-job-item js-nc-title-tips"]/text()').extract_first()
        item['company'] = response.xpath('//h3[@class="teacher-name js-company-name"]/text()').extract_first()
        item['location'] = response.xpath('//span[@class="rec-job-item js-nc-title-tips"]/text()').extract()[1]
        # Salary is either a daily range ("薪资") or negotiable ("薪酬:面议").
        salary = re.findall(r'<p>薪资:(\d+-\d+元/天)', response.text)
        if salary:
            item['salary'] = salary[0]
        else:
            item['salary'] = re.findall(r'<p>薪酬:(面议)', response.text)[0]
        item['attendance'] = re.findall(r' 实习要求:(\d天/周)', response.text)[0]
        item['regular_chance'] = re.findall(r'转正机会:(.+)</p>', response.text)[0]
        if response.xpath('//p[@class="com-type"]/text()'):
            item['company_type'] = response.xpath('//p[@class="com-type"]/text()').extract_first()
        if response.xpath('//p[@class="com-price"]/text()'):
            item['fund'] = response.xpath('//p[@class="com-price"]/text()').extract_first()
        item['center'] = response.xpath('//p[@class="com-lbs"]/text()').extract_first()
        # Duties and requirements share one container; strip the embedded newlines.
        contents = response.xpath('//div[@class="nc-post-content js-duty-content"]/text()').extract()
        item['duty'] = contents[0].replace('\n', '')
        item['requirement'] = contents[1].replace('\n', '')
        # The four resume-processing stats all share one class, in page order.
        stats = response.xpath('//span[@class="font-green"]/text()').extract()
        item['average_resume_processing_rate'] = stats[0]
        item['current_resume_processing_rate'] = stats[1]
        item['average_resume_processing_time'] = stats[2]
        item['current_resume_processing_time'] = stats[3]
        records = response.xpath('//a[@class="record-num"]/text()').extract()
        if records:
            item['test'] = records[0]
            item['interview_comment'] = records[1]
            item['interview_experience'] = records[2]
        publisher = response.xpath('//a[@class="link-green js-send-message nc-req-auth nc-req-active"]')
        if publisher.xpath('@data-receiver-id'):
            item['publisher_ID'] = publisher.xpath('@data-receiver-id').extract_first()
            item['publisher_name'] = publisher.xpath('@data-receiver-name').extract_first()
        item['hrID'] = re.findall(r"hrId: '(\d+)'", response.text)[0]  # an unidentified extra field
        yield item
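A caveat on robustness: parse_job indexes extract() results directly (contents[1], stats[3], records[2]), so any detail page missing one of those nodes raises IndexError and the whole item is lost. A minimal defensive accessor could look like the sketch below; the nth helper is hypothetical, my own addition rather than part of the project.

def nth(selector_list, index, default=''):
    # Return the index-th extracted value, or `default` when the page
    # doesn't have that many matching nodes (hypothetical helper).
    values = selector_list.extract()
    return values[index] if len(values) > index else default

# Usage inside parse_job, replacing bare list indexing:
# stats = response.xpath('//span[@class="font-green"]/text()')
# item['average_resume_processing_rate'] = nth(stats, 0)
# item['current_resume_processing_time'] = nth(stats, 3)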
items:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class NowcoderItem(scrapy.Item):
    '''
    The 24 fields scraped for each job posting:

    projectID: project ID
    jobID: ID of the job within its project
    job: job title
    job_type: job category
    company: company name
    location: work location
    salary: pay
    attendance: required attendance (days per week)
    regular_chance: chance of conversion to a full-time offer
    company_type: company type
    fund: funding status
    center: headquarters location
    duty: job responsibilities
    requirement: job requirements
    hrID: an unidentified extra field
    publisher_ID: ID of the user who posted the listing
    publisher_name: username of the user who posted the listing
    average_resume_processing_rate: average resume processing rate
    current_resume_processing_rate: current resume processing rate
    average_resume_processing_time: average resume processing time
    current_resume_processing_time: current resume processing time
    test: number of written-test questions on Nowcoder
    interview_comment: number of short interview comments
    interview_experience: number of interview-experience posts
    '''
    projectID = scrapy.Field()
    jobID = scrapy.Field()
    job = scrapy.Field()
    job_type = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    salary = scrapy.Field()
    attendance = scrapy.Field()
    regular_chance = scrapy.Field()
    company_type = scrapy.Field()
    fund = scrapy.Field()
    center = scrapy.Field()
    duty = scrapy.Field()
    requirement = scrapy.Field()
    hrID = scrapy.Field()  # an unidentified extra field
    publisher_ID = scrapy.Field()
    publisher_name = scrapy.Field()
    average_resume_processing_rate = scrapy.Field()
    current_resume_processing_rate = scrapy.Field()
    average_resume_processing_time = scrapy.Field()
    current_resume_processing_time = scrapy.Field()
    test = scrapy.Field()
    interview_comment = scrapy.Field()
    interview_experience = scrapy.Field()
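Several of the extracted fields still carry stray spaces and newlines. One tidy place to normalize them all is an item pipeline; the sketch below is my own addition (CleanTextPipeline is a made-up name, not part of the original project):

# pipelines.py: strip surrounding whitespace from every string field
class CleanTextPipeline:
    def process_item(self, item, spider):
        for key, value in item.items():
            if isinstance(value, str):
                item[key] = value.strip()
        return item

Enable it in settings.py with ITEM_PIPELINES = {'nowcoder.pipelines.CleanTextPipeline': 300}.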
Fixing garbled text in Scrapy's exported files
Add one of the following to settings.py:
FEED_EXPORT_ENCODING = 'utf-8'     # for JSON/txt output, when text shows up as \uXXXX escapes
# or
FEED_EXPORT_ENCODING = 'gb18030'   # for CSV output, when Chinese characters appear garbled (e.g. in Excel)
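Two related settings.py additions, sketched below assuming Scrapy 2.1+ for FEEDS: DOWNLOAD_DELAY takes over the throttling that a blocking time.sleep(1) in the spider would otherwise do, and FEEDS declares the output file and its encoding in one place instead of passing -o on the command line (recruit.csv is just an example path).

# settings.py (sketch)
DOWNLOAD_DELAY = 1               # ~1 second between requests, instead of time.sleep(1)
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay (0.5x to 1.5x) to be gentler on the site

# Scrapy 2.1+ only: configure the export file and its encoding together
FEEDS = {
    'recruit.csv': {'format': 'csv', 'encoding': 'gb18030'},
}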