#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#
import json
from multiprocessing import Process, JoinableQueue as Queue
from random import randint, choice
from time import sleep

import pymongo
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Scrape company profile pages.
def company(q):
    """Scrape the profile page of every company id taken from *q*.

    A Chrome session obtained from ``login_lagou`` opens
    ``https://www.lagou.com/gongsi/<id>.html`` for each id, reads the profile
    fields through fixed XPaths and inserts one document per company into the
    ``company1`` collection.  An id whose page fails is put back on the queue
    and retried after a 10 second pause.  The loop ends as soon as the queue
    is observed empty.

    :param q: ``multiprocessing.JoinableQueue`` holding company ids.
    """
    # (document key, XPath) pairs, read in this order from the profile page.
    # NOTE(review): 'inancing' looks like a typo of 'financing'; the key is
    # kept unchanged because documents already stored may use it.
    fields = (
        ('name', '/html/body/div[3]/div/div/div[1]/h1/a'),
        ('job_num', '/html/body/div[3]/div/div/div[2]/ul/li[1]/strong'),
        ('efficiency', '/html/body/div[3]/div/div/div[2]/ul/li[2]/strong'),
        ('time_consuming', '/html/body/div[3]/div/div/div[2]/ul/li[3]/strong'),
        ('last_login', '/html/body/div[3]/div/div/div[2]/ul/li[5]/strong'),
        ('introduction', '/html/body/div[6]/div[1]/div/div[2]/div[2]/div[2]/span[1]'),
        ('inancing', '/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[2]/span'),
        ('scale', '/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[3]/span'),
    )
    br = login_lagou(20)
    try:
        br.set_window_rect(602, 0, 600, 800)
        db = get_mongodb()
        # Scraped documents go to the ``company1`` collection.
        company_table = db.company1
        while not q.empty():
            # Fetch outside the ``try`` so the handler below always re-queues
            # the id that actually failed, never a stale or unbound one.
            company_id = q.get()
            try:
                br.get('https://www.lagou.com/gongsi/' + str(company_id) + '.html')
                company_info = {
                    key: br.find_element(By.XPATH, xpath).text
                    for key, xpath in fields
                }
                company_table.insert_one(company_info)
                # Review scraping is disabled:
                # company_comment(company_id, db, br)
                print('%d公司信息已采集入库' % company_id)
            except Exception as e:
                print('遇到异常', e)
                # Re-queue before acknowledging, so the unfinished-task count
                # never drops to zero while a retry is still pending.
                q.put(company_id)
                sleep(10)
            finally:
                # Every get() needs exactly one task_done(); without it the
                # q.join() below would block forever after the first retry.
                q.task_done()
            sleep(1)
    finally:
        # quit() also stops the chromedriver process; close() only closes
        # the current window.
        br.quit()
    q.join()
# Scrape a company's interview reviews (currently disabled).
# def company_comment(id, db, br):
#     """
#     :param id: company id
#     :param db: database handle
#     :param br: browser driver
#     :return:
#     """
#     url = 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId='+str(id)
#     br.get(url)
# Scrape the detail text of each job posting.
def work(q):
    """Scrape the posting page of every position id taken from *q*.

    A Chrome session obtained from ``login_lagou`` opens
    ``https://www.lagou.com/jobs/<id>.html`` for each id, reads the text of
    one fixed XPath and inserts ``{'id': ..., 'content': ...}`` into the
    ``job1`` collection.  An id whose page fails is put back on the queue and
    retried after a 10 second pause.  The loop ends as soon as the queue is
    observed empty.

    :param q: ``multiprocessing.JoinableQueue`` holding position ids.
    """
    br = login_lagou(20)
    try:
        br.set_window_rect(101, 0, 600, 600)
        db = get_mongodb()
        job_table = db.job1
        while not q.empty():
            # Fetch outside the ``try`` so the handler below always re-queues
            # the id that actually failed, never a stale or unbound one.
            job_id = q.get()
            try:
                br.get('https://www.lagou.com/jobs/' + str(job_id) + '.html')
                content = br.find_element(
                    By.XPATH, '/html/body/div[5]/div[1]/dl[1]/dd[2]').text
                job_table.insert_one({'id': job_id, 'content': content})
                print('%d招聘启事具体内容已入库' % job_id)
            except Exception as e:
                print('遇到异常', e)
                # Re-queue before acknowledging, so the unfinished-task count
                # never drops to zero while a retry is still pending.
                q.put(job_id)
                sleep(10)
            finally:
                # Every get() needs exactly one task_done(); without it the
                # q.join() below would block forever after the first retry.
                q.task_done()
            sleep(1)
    finally:
        # quit() also stops the chromedriver process; close() only closes
        # the current window.
        br.quit()
    q.join()
# Create a Chrome WebDriver (headless mode is currently disabled).
def get_chrome():
    """Start Chrome and return its WebDriver.

    The headless flags are commented out, so the browser opens a visible
    window.

    :return: a new ``selenium.webdriver.Chrome`` instance.
    """
    options = Options()
    # Uncomment to run without a visible window:
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    # ``options=`` is the supported keyword; ``chrome_options=`` is deprecated.
    br = webdriver.Chrome(options=options)
    return br
# Obtain a handle on the scraper's MongoDB database.
def get_mongodb():
    """Connect to MongoDB and return the ``xxx`` database.

    The credentials are passed to ``MongoClient`` instead of calling
    ``Database.authenticate``, which is deprecated since PyMongo 3.5 and
    removed in PyMongo 4.0.  ``authSource`` is ``xxx`` so the user is checked
    against the same database that is returned.

    NOTE: ``MongoClient`` connects lazily, so wrong credentials or an
    unreachable server are reported by the first database operation rather
    than by this function.

    :return: ``pymongo.database.Database`` for ``xxx``.
    """
    cli = pymongo.MongoClient(
        host='192.168.12.244',
        port=27017,
        username='ss',
        password='123456',
        authSource='xxx',
    )
    return cli.xxx
# Collect the browser session's cookies.
def get_cookie(br):
    """Load the Lagou home page in *br* and return its cookies as a dict.

    :param br: WebDriver whose current cookies are read.
    :return: mapping of cookie name to cookie value.
    """
    br.get('https://www.lagou.com/')
    jar = {}
    # Flatten the driver's list of cookie records into name -> value.
    for cookie in br.get_cookies():
        jar[cookie['name']] = cookie['value']
    return jar
# Open Chrome on the Lagou login page with the credentials typed in.
def login_lagou(sec, mobile='15324818121', password='123456'):
    """Start Chrome, fill in the Lagou login form and wait *sec* seconds.

    The submit button is not clicked by this function (that step is
    commented out below); presumably the pause lets an operator finish the
    login by hand -- confirm with the author.

    :param sec: seconds to sleep after the credentials have been typed.
    :param mobile: account phone number typed into the first form field.
    :param password: password typed into the second form field.
    :return: the Chrome WebDriver, still open on the login flow.
    :raises Exception: whatever Selenium raises; the browser is shut down
        first so a failed login does not leave a Chrome process behind.
    """
    br = get_chrome()
    try:
        br.get('https://www.lagou.com/frontLogin.do')
        mobi = br.find_element(
            By.XPATH, '/html/body/section/div[2]/div[1]/div[2]/form/div[1]/input')
        pwd = br.find_element(
            By.XPATH, '/html/body/section/div[2]/div[1]/div[2]/form/div[2]/input')
        mobi.send_keys(mobile)
        pwd.send_keys(password)
        sleep(sec)
        # Disabled: open two spare tabs.
        # br.execute_script('window.open("https://www.lagou.com/")')
        # br.execute_script('window.open("https://www.lagou.com/")')
        # Disabled: click the submit button.
        # br.find_element(By.XPATH, '/html/body/section/div[2]/div[1]/div[2]/form/div[5]/input').click()
    except Exception:
        br.quit()
        raise
    return br
def main():
    """Walk Lagou's Python job listing and feed the two detail scrapers.

    Pages 1-30 of the ``positionAjax.json`` listing are requested with the
    cookies of a logged-in browser.  Each page's postings are stored in the
    ``work1`` collection; company ids go to the queue consumed by ``company``
    and position ids to the queue consumed by ``work``, each running in its
    own process.

    A page that raises, or whose response reports ``success`` as false, is
    retried with fresh cookies and a new User-Agent, at most ``max_attempts``
    times before it is skipped.

    NOTE(review): the workers stop once they observe an empty queue, so they
    can finish early if this loop falls behind them.  A sentinel-based
    shutdown would remove that race but needs a coordinated change in
    ``company`` and ``work``.
    """
    list_url = ('https://www.lagou.com/jobs/positionAjax.json'
                '?px=default&needAddtionalResult=false')
    last_page = 30
    max_attempts = 5

    # Queue of company ids for the company-profile worker.
    companies = Queue()
    # Queue of position ids for the job-detail worker.
    jobs = Queue()
    br = login_lagou(20)
    try:
        br.set_window_rect(0, 0, 200, 600)
        # Worker process scraping company profiles.
        c = Process(target=company, args=(companies,))
        c.start()
        sleep(20)
        # Worker process scraping job postings.
        jo = Process(target=work, args=(jobs,))
        jo.start()
        # Request headers for the listing endpoint.
        header = {
            'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%85%A8%E5%9B%BD',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        user_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:65.0) Gecko/20100101 Firefox/65.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7',
        ]
        cookies = get_cookie(br)
        db = get_mongodb()
        header['User-Agent'] = choice(user_agents)
        # A company usually has several postings; queue each id only once.
        seen_companies = set()
        page = 1
        attempts = 0
        while page <= last_page:
            data = {'first': 'false', 'pn': page, 'kd': 'Python'}
            content = None
            succeeded = False
            try:
                # The timeout keeps a stalled connection from hanging the crawl.
                response = requests.post(list_url, data=data, headers=header,
                                         cookies=cookies, timeout=10)
                content = json.loads(response.text)
                if not content['success']:
                    # A rejected request must be retried, not counted as done.
                    raise ValueError('listing request was not successful')
                results = content['content']['positionResult']['result']
                # Store the postings in the ``work1`` collection;
                # insert_many() rejects an empty list, so guard it.
                if results:
                    db.work1.insert_many(results)
                for item in results:
                    company_id = item['companyId']
                    if company_id not in seen_companies:
                        seen_companies.add(company_id)
                        companies.put(company_id)
                    jobs.put(item['positionId'])
                succeeded = True
            except Exception as e:
                attempts += 1
                print('遇到异常', e, content)
                if attempts < max_attempts:
                    # Refresh the cookies and User-Agent, then retry this page.
                    cookies = get_cookie(br)
                    header['User-Agent'] = choice(user_agents)
                    print('更换身份,正在重试')
                else:
                    print('列表第%d页连续失败%d次, 已跳过' % (page, attempts))
                    page += 1
                    attempts = 0
            sleep_time = randint(1, 3)
            if succeeded:
                print('列表第%d页已完成, 打算睡%d秒' % (page, sleep_time), )
                page += 1
                attempts = 0
            sleep(sleep_time)
    finally:
        # quit() also stops the chromedriver process; close() only closes
        # the current window.
        br.quit()
    c.join()
    jo.join()


if __name__ == '__main__':
    main()
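# Scraping job and company information from Lagou (lagou.com).
# Reposted from www.cnblogs.com/imshun/p/10513049.html
# (Blog page navigation captured with the code: "You may also like",
# "Today's picks", "Weekly ranking".)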