from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common import exceptions
import csv
# 创建保存文件
f = open('C:\\Users\\2239670\\Downloads\\AIGC_boss.csv', mode='a', encoding='utf-8-sig', newline='')
csv_writer = csv.DictWriter(f, fieldnames=["职位名称",
"地区",
"薪水",
"标签",
"能力要求",
"公司名字",
"公司介绍",
"福利待遇",
"职位描述",
"企业类型",
"工作地址",
"详情链接"])
csv_writer.writeheader() # 写入表头
options = ChromeOptions()
# 反检测
options.add_experimental_option("excludeSwitches", ["enable-automation"])
driver = Chrome(options=options)
# 设置隐性等待时间为10s
driver.implicitly_wait(10)
# https://www.zhipin.com/web/geek/job?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&city=101010100&page=1
driver.get('https://www.zhipin.com/web/geek/job?query=AIGC&city=101010100')
for page in range(1, 4): # 爬取3页
k = 0 # 用来设置每页爬取的数量,每页有30条数据,因全部爬取用selenium较慢,为测试效果每页只爬取5条
sleep(2)
# 滚动条滚到底部
driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
li_lists = driver.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
print(len(li_lists))
for li in li_lists:
job_name = li.find_element(By.CLASS_NAME, 'job-name').text
job_area = li.find_element(By.CLASS_NAME, 'job-area').text
salary = li.find_element(By.CLASS_NAME, 'salary').text
job_tag = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-left .tag-list').text.replace('\n', ',')
job_ability = li.find_element(By.XPATH, './div[2]/ul').text
company_name = li.find_element(By.CLASS_NAME, 'company-name').text
welfare = li.find_element(By.CLASS_NAME, 'info-desc').text
link = li.find_element(By.CLASS_NAME, 'job-card-left').get_attribute('href')
# 点击详情页
clic = li.find_element(By.CSS_SELECTOR, '.job-card-left')
driver.execute_script('arguments[0].click()', clic)
# 窗口切换到最新打开的页面
driver.switch_to.window(driver.window_handles[-1])
sleep(2)
job_des = driver.find_element(By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text.replace('\n',
' ')
try: # 有的公司没有公司介绍
company_info = driver.find_element(By.CSS_SELECTOR,
'.job-body-wrapper .company-info-box .fold-text').text.replace('\n', ' ')
except exceptions.NoSuchElementException:
company_info = ''
try:
company_type = driver.find_element(By.CLASS_NAME, 'company-type').text.replace('企业类型\n', '')
except exceptions.NoSuchElementException:
company_type = ''
address = driver.find_element(By.CLASS_NAME, 'location-address').text
dic = {
"职位名称": job_name,
"地区": job_area,
"薪水": salary,
"标签": job_tag,
"能力要求": job_ability,
"公司名字": company_name,
"公司介绍": company_info,
"福利待遇": welfare,
"职位描述": job_des,
"企业类型": company_type,
"工作地址": address,
"详情链接": link
}
# 写入数据
csv_writer.writerow(dic)
k += 1
print('ddddddddddddddddddddddicccccccccc')
print(dic)
driver.close()
# 窗口切换到第一个页面
driver.switch_to.window(driver.window_handles[0])
if k == 5: # 每页爬取5条数据
break
sleep(2)
# 点击下一页,这里下一页的按钮不好定位,用XPATH的话只对1-4页有用,第五页后面要改成a[11]
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
driver.execute_script('arguments[0].click()', c)
driver.close()
driver.quit()
from selenium.webdriver import Chrome
猜你喜欢
转载自blog.csdn.net/sinat_37574187/article/details/130818712
今日推荐
周排行