我们以腾讯社招页面来做演示:http://hr.tencent.com/position.php?&start=10#a
使用BeautifulSoup4解析器,将招聘网页上的职位名称、职位类别、招聘人数、工作地点、发布时间,以及每个职位详情的点击链接存储出来。
import json  # the scraped postings are stored as JSON
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def _parse_row(row, base_url):
    """Extract one job posting from a listing table row.

    Args:
        row: A BeautifulSoup tag for one ``<tr>`` of the listing table.
        base_url: URL that the row's relative detail link is resolved against.

    Returns:
        A dict with the keys ``name``, ``detailLink``, ``catalog``,
        ``recruitNumber``, ``publishTime`` and ``workLocation``, or ``None``
        when the row lacks a link with an ``href`` or has fewer than five
        cells.
    """
    # Query the row once instead of re-running the selector for every field.
    cells = row.select('td')
    links = row.select('td a')
    if len(cells) < 5 or not links:
        return None

    link = links[0]
    href = link.attrs.get('href')
    if href is None:
        return None

    return {
        'name': link.get_text(),
        # urljoin copes with hrefs that are absolute or start with a slash,
        # where plain string concatenation would build a broken URL.
        'detailLink': urljoin(base_url, href),
        'catalog': cells[1].get_text(),
        'recruitNumber': cells[2].get_text(),
        'publishTime': cells[4].get_text(),
        'workLocation': cells[3].get_text(),
    }


def tencent():
    """Scrape one page of the job listing and save it to ``tencent.json``.

    Fetches ``position.php?&start=10#a`` from ``http://hr.tencent.com/``,
    parses the rows whose class is ``even`` or ``odd`` (even rows first, then
    odd rows), and writes the postings as a single UTF-8 encoded JSON array
    to ``tencent.json`` in the current working directory. Rows that do not
    have the expected cells are skipped.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        OSError: If the output file cannot be written.
    """
    url = 'http://hr.tencent.com/'
    request = urllib.request.Request(url + 'position.php?&start=10#a')
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(request) as response:
        res_html = response.read()

    html = BeautifulSoup(res_html, 'lxml')

    # CSS selectors: the listing rows alternate between the two classes.
    result = html.select('tr[class="even"]')
    result += html.select('tr[class="odd"]')
    print(result)

    parsed_rows = (_parse_row(row, url) for row in result)
    items = [item for item in parsed_rows if item is not None]

    # The file is opened only after parsing succeeded, so a failed run does
    # not leave behind an empty or truncated file. ensure_ascii=False keeps
    # non-ASCII text readable; the file itself is encoded as UTF-8.
    with open('tencent.json', 'w', encoding='utf-8') as output:
        output.write(json.dumps(items, ensure_ascii=False))


if __name__ == "__main__":
    tencent()
爬取结果tencent.json
[ { "detailLink": "http://hr.tencent.com/position_detail.php?id=40821&keywords=&tid=0&lid=0", "workLocation": "北京", "catalog": "职能类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "S2-MIG风险管理经理(北京)" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40825&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "市场类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "19116-互联网+医疗行业经理(深圳)" }, {"detailLink": "http://hr.tencent.com/position_detail.php?id=40815&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "技术类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "24012-H5游戏开发工程师(深圳)" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40818&keywords=&tid=0&lid=0", "workLocation": "北京", "catalog": "设计类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "23674-视觉设计(北京)" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40820&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "设计类", "publishTime": "2018-05-24", "recruitNumber": "2", "name": "24491-高级多媒体设计师(深圳)" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40824&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "技术类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "26564-后台开发工程师(深圳)" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40828&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "市场类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "MIG15-腾讯叮当高级销售经理" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40817&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "产品/项目类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "26564-项目经理(深圳)" }, { "detailLink": "http://hr.tencent.com/position_detail.php?id=40819&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "产品/项目类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "24491-游戏英文文案翻译(深圳)" }, { "detailLink": 
"http://hr.tencent.com/position_detail.php?id=40822&keywords=&tid=0&lid=0", "workLocation": "深圳", "catalog": "产品/项目类", "publishTime": "2018-05-24", "recruitNumber": "1", "name": "SD3-海外PM(日语方向)" } ]