import urllib.request # 用于请求打开网页
from bs4 import BeautifulSoup # 网页解析获取数据
import re # 引入正则表达式
import xlwt # 保存文件为excel
# Compiled regex patterns for extracting job fields. 51job embeds the
# result data as a JSON blob inside a JS variable (not in HTML tags),
# so each field is pulled out of the raw page text with a regex.
jobHref = re.compile(r'"job_href":"(.*?)"', re.S)  # job posting URL
jobName = re.compile(r'"job_name":"(.*?)"', re.S)  # job title
comHref = re.compile(r'"company_href":"(.*?)"', re.S)  # company page URL
comName = re.compile(r'"company_name":"(.*?)"')  # company name
salary = re.compile(r'"providesalary_text":"(.*?)"')  # salary range
companytype = re.compile(r'"companytype_text":"(.*?)"')  # company type
attribute = re.compile(r'"attribute_text":\[(.*?)\]', re.S)  # job requirements
workarea = re.compile(r'"workarea_text":"(.*?)"', re.S)  # work location
companysize = re.compile(r'"companysize_text":"(.*?)"', re.S)  # company size
companyind = re.compile(r'"companyind_text":"(.*?)"', re.S)  # main business
jobwelf = re.compile(r'"jobwelf":"(.*?)"', re.S)  # benefits
def main():
    """Prompt for a job keyword, scrape the first 10 pages of 51job
    search results for Guangzhou (area code 030200), and save them to
    an .xls file named after the keyword."""
    key = input("请输入您想查询的岗位:")
    # getData() inserts the page number between these two URL halves.
    baseurl1 = "https://search.51job.com/list/030200,000000,0000,00,9,99," + key + ",2,"
    # BUG FIX: the original had "cotype=99°reefrom=99" -- the HTML
    # entity "&deg" in "&degreefrom" had been decoded into "°" when the
    # code was copied from a web page; restored to "&degreefrom=99".
    baseurl2 = (".html?lang=c&postchannel=0000&workyear=99&cotype=99"
                "&degreefrom=99&jobterm=99&companysize=99&ord_field=0"
                "&dibiaoid=0&line=&welfare=")
    savePath = f".\\{key}_广州.xls"
    # 1. Crawl the result pages.
    datalist = getData(baseurl1, baseurl2)
    # 2. Persist to Excel.
    saveData(datalist, savePath)
# 模拟浏览器,包装url,
def askUrl(url):
    """Fetch *url* pretending to be a desktop browser and return the
    response body decoded as GBK (51job serves GBK-encoded pages).

    BUG FIX: the original never closed the HTTP response; the
    ``with`` block guarantees the connection is released.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=head)
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("gbk")
    return html
# 爬取网页 ,注意这一步我们所需要的数据不在标签中,而是在Js里面
def getData(baseurl1, baseurl2):
    """Crawl result pages 1-10 and return a flat list of records.

    Each record is a list of 11 fields in the same order as the
    spreadsheet header written by saveData(): job link, job name,
    company link, company name, salary, company type, requirements,
    location, company size, main business, benefits.

    The data sits in a JS variable as JSON, so fields are extracted
    with the module-level regexes rather than by tag navigation.
    """
    datalist = []
    for page in range(1, 11):
        # Page number goes between the two fixed URL halves.
        url = baseurl1 + str(page) + baseurl2
        html = askUrl(url)
        soup = BeautifulSoup(html, "html.parser")
        # Only text nodes matching the job_href pattern hold the blob.
        for item in soup.find_all(text=jobHref):
            job_href = re.findall(jobHref, item)
            job_name = re.findall(jobName, item)
            com_href = re.findall(comHref, item)
            com_name = re.findall(comName, item)
            salary_level = re.findall(salary, item)
            company_type = re.findall(companytype, item)
            condition = re.findall(attribute, item)
            work_area = re.findall(workarea, item)
            # BUG FIX: "main business" was extracted with the
            # companysize regex, duplicating the company-size column;
            # it now uses the companyind pattern as intended.
            company_size = re.findall(companysize, item)
            company_service = re.findall(companyind, item)
            job_welf = re.findall(jobwelf, item)
            # zip truncates to the shortest field list, so one page
            # with a missing field cannot raise IndexError.
            for record in zip(job_href, job_name, com_href, com_name,
                              salary_level, company_type, condition,
                              work_area, company_size, company_service,
                              job_welf):
                datalist.append(list(record))
    return datalist
# 保存数据
def saveData(datalist, savePath):
# 创建表格,添加sheet
workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
sheet = workbook.add_sheet('java最新招聘情况', cell_overwrite_ok=True)
# 将表头写入表中
col = ["岗位链接", "岗位名称", "公司链接", "公司名称", "薪资水平", "公司类型", "招聘条件", "工作地点", "公司规模",
"主要业务", "福利待遇"]
for i in range(0, len(col)):
sheet.write(0, i, col[i]) # 表示将col[i]写入第0行第i列
# 将每条数据写入表中
for i in range(len(datalist)):
print(f"第{i + 1}条数据")
data = datalist[i]
for j in range(0, 11):
sheet.write(i + 1, j, data[j]) # 表示将data[j]写入第i+1行第j列
workbook.save(savePath)
# Script entry point: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
    print("爬虫执行完毕!")
# 前程无忧51Job岗位爬取~~新手路过,请多指教
# 转载自 blog.csdn.net/baidu_41833099/article/details/118764679