# Scrape 4,000 bank job listings from Liepin's finance industry category and save them to a CSV file
# requests and lxml are third-party libraries and must be installed separately
# A revised version follows below
import requests, csv, time
from lxml import etree
from requests.exceptions import RequestException
url = 'https://www.liepin.com/zhaopin/?headckid=4ba8c02991d96408&industries=130'
num = 1  # page counter, used when printing errors
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return etree.HTML(response.text)
        return None
    except RequestException:
        return None
def parse_one_page(html):  # locate each field with XPath
    for job in html.xpath('//div[@class="sojob-item-main clearfix"]'):
        city = job.xpath('div/p/a/text()')[0].strip()
        name = job.xpath('div/h3/a/text()')[0].strip()
        job_url = job.xpath('div/h3/a/@href')[0].strip()  # 'job_url' avoids shadowing the global url
        firm = job.xpath('div//p[@class="company-name"]/a/text()')[0].strip()
        salary = job.xpath('div/p/span/text()')[0].strip()
        exper = job.xpath('div/p//span[3]/text()')[0].strip()
        edu = job.xpath('div/p//span[2]/text()')[0].strip()
        pub_time = job.xpath('div//p[@class="time-info clearfix"]/time/text()')[0].strip()  # 'pub_time' avoids shadowing the time module
        yield {
            '城市': city,
            '职位': name,
            '网址': job_url,
            '公司': firm,
            '薪酬': salary,
            '工作经验要求': exper,
            '学历要求': edu,
            '发布时间': pub_time
        }
def init_csv():  # initialize the CSV file with a title row (crawl target, URL, crawl time) and column headers
    crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    with open('result.csv', 'a', newline='') as my_csv:  # the with statement closes the file automatically
        my_writer = csv.writer(my_csv)
        my_writer.writerow(['爬取对象:猎聘网金融行业银行', url, crawl_time])
        my_writer.writerow(['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间'])
def write_to_csv(content):  # append one record to the CSV file
    with open('result.csv', 'a', newline='') as my_csv:
        fieldnames = ['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间']
        my_writer = csv.DictWriter(my_csv, fieldnames=fieldnames)
        my_writer.writerow(content)
def main(offset):
    crawl_url = url + '&curPage=' + str(offset)
    html = get_one_page(crawl_url)
    for item in parse_one_page(html):
        write_to_csv(item)
if __name__ == '__main__':
    init_csv()
    for i in range(1, 100):
        try:
            main(i)
            time.sleep(1)  # pause between requests to avoid getting the IP banned
        except BaseException as e:
            print(num, e)
        finally:
            num += 1
The run produced the following output:
============== RESTART: C:/Users/Administrator/Desktop/test.py ==============
50 'gbk' codec can't encode character '\xa0' in position 11: illegal multibyte sequence
66 'gbk' codec can't encode character '\xa0' in position 11: illegal multibyte sequence
95 list index out of range
96 list index out of range
>>>
Four exceptions were raised: two 'gbk' codec errors, because the page text contains the non-breaking space '\xa0', which cannot be encoded when the CSV file is opened with Windows' default GBK encoding, and two 'list index out of range' errors, because an XPath query matched nothing and indexing [0] into the empty result list failed.
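As a side note, the encoding errors could also be fixed at the source rather than caught, by opening result.csv with an explicit UTF-8 encoding (utf-8-sig adds a BOM so Excel displays the Chinese headers correctly). A minimal sketch, reusing the fieldnames from above; the sample row is made up for illustration:

import csv

fieldnames = ['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间']
with open('result.csv', 'a', newline='', encoding='utf-8-sig') as my_csv:  # UTF-8 can encode '\xa0'
    my_writer = csv.DictWriter(my_csv, fieldnames=fieldnames)
    my_writer.writerow({'城市': '上海\xa0浦东', '职位': 'demo'})  # no UnicodeEncodeError; missing fields are left blank

utf-8-sig is chosen over plain utf-8 only for Excel compatibility; either would avoid the codec error. The revised version below keeps the default encoding and instead logs the page and item number of every record that fails to parse or write.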
The revised version:
# Scrape 4,000 bank job listings from Liepin's finance industry category and save them to a CSV file
# requests and lxml are third-party libraries and must be installed separately
import requests, csv, time
from lxml import etree
from requests.exceptions import RequestException
url = 'https://www.liepin.com/zhaopin/?headckid=4ba8c02991d96408&industries=130'
num_1, num_2 = 0, 0  # current page number and item number within the page, for error reporting
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return etree.HTML(response.text)
        return None
    except RequestException:
        return None
def parse_one_page(html):  # locate each field with XPath
    global num_2
    for job in html.xpath('//div[@class="sojob-item-main clearfix"]'):
        num_2 += 1
        try:
            city = job.xpath('div/p/a/text()')[0].strip()
            name = job.xpath('div/h3/a/text()')[0].strip()
            job_url = job.xpath('div/h3/a/@href')[0].strip()  # 'job_url' avoids shadowing the global url
            firm = job.xpath('div//p[@class="company-name"]/a/text()')[0].strip()
            salary = job.xpath('div/p/span/text()')[0].strip()
            exper = job.xpath('div/p//span[3]/text()')[0].strip()
            edu = job.xpath('div/p//span[2]/text()')[0].strip()
            pub_time = job.xpath('div//p[@class="time-info clearfix"]/time/text()')[0].strip()  # 'pub_time' avoids shadowing the time module
            yield {
                '城市': city,
                '职位': name,
                '网址': job_url,
                '公司': firm,
                '薪酬': salary,
                '工作经验要求': exper,
                '学历要求': edu,
                '发布时间': pub_time,
                '页面': num_1,
                '条目': num_2
            }
        except BaseException as e:  # a failed lookup skips only this listing; its page and item number are logged
            print(num_1, num_2, e)
            yield {
                '页面': num_1,
                '条目': num_2
            }
def init_csv():  # initialize the CSV file with a title row (crawl target, URL, crawl time) and column headers
    crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    with open('result.csv', 'a', newline='') as my_csv:  # the with statement closes the file automatically
        my_writer = csv.writer(my_csv)
        my_writer.writerow(['爬取对象:猎聘网金融行业银行', url, crawl_time])
        my_writer.writerow(['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间', '页面', '条目'])
def write_to_csv(content):  # append one record to the CSV file
    with open('result.csv', 'a', newline='') as my_csv:
        fieldnames = ['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间', '页面', '条目']
        my_writer = csv.DictWriter(my_csv, fieldnames=fieldnames)
        try:
            my_writer.writerow(content)
        except BaseException as e:  # an encoding failure skips only this record
            print(num_1, num_2, e)
def main(offset):
    global num_1, num_2
    num_1, num_2 = num_1 + 1, 0
    crawl_url = url + '&curPage=' + str(offset)
    html = get_one_page(crawl_url)
    if html is None:  # the request failed; skip this page instead of crashing on html.xpath
        return
    for item in parse_one_page(html):
        write_to_csv(item)
if __name__ == '__main__':
    init_csv()
    for i in range(1, 100):
        main(i)
        time.sleep(1)  # pause between requests to avoid getting the IP banned
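One limitation remains: when a single XPath lookup fails, the revised parse_one_page discards every field of that listing and records only its page and item number. If a blank cell is acceptable, a small helper could keep the rest of the record; first_or_blank below is a hypothetical name, sketched here rather than taken from the code above:

# Hypothetical helper: return the first XPath match, stripped, or '' when nothing matches
def first_or_blank(node, path):
    result = node.xpath(path)
    return result[0].strip() if result else ''

# Inside parse_one_page it would replace the bare [0] indexing, e.g.:
# exper = first_or_blank(job, 'div/p//span[3]/text()')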