#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: ligang
@contact: [email protected]
@software: PyCharm IDEA
@file: company_position_count.py
@create at: 2018-09-04 10:40
"""
from mf_utils.core import BaseInitCore
from mf_utils.logger import Logger
from mf_utils.decorates import cls_catch_exception
from datetime import datetime
from mf_utils.sql.redis_m import get_redis_client
from mf_utils.sql.mysql import MysqlHandle
from conf import settings
import json, re, gevent
from gevent import monkey
monkey.patch_all()
# Module-level Redis connection shared by every worker greenlet; all
# connection parameters come from conf.settings (see settings.py notes
# at the bottom of this file).
redis_client = get_redis_client(
    host=settings.REDIS_HOST,
    port=settings.REDIS_PORT,
    db=settings.REDIS_DB,
    password=settings.REDIS_PASSWORD
)
class CompanyPosition(BaseInitCore):
    """Search a company's job positions on Zhilian (zhaopin.com, site key
    ``ZHI_LIAN``) and 51job.com (site key ``FIVE_ONE``), and fetch
    per-position detail pages.

    Relies on ``self.html_downloader`` / ``self.html_parser`` provided by
    BaseInitCore (not visible here — presumably requests + BeautifulSoup
    style helpers; confirm against mf_utils.core).
    """

    def __init__(self):
        super(CompanyPosition, self).__init__()
        self.logger = Logger.file_logger()
        # NOTE(review): MySQL credentials are hard-coded; consider moving
        # them into conf.settings next to the Redis configuration.
        self.mysql_handle = MysqlHandle(host='127.0.0.1', user="root", passwd='mysql', db='ligang',
                                        port=3306, charset='utf8')

    def get_zhi_lian_position_list(self, company_name, start=0, res_lst=None):
        """Recursively page through Zhilian search results for a company.

        :param company_name: company keyword (``kt=2`` searches by company).
        :param start: result offset; a multiple of the 60-item page size.
        :param res_lst: accumulator list; created here when omitted.
        :return: list of dicts with keys 'site', 'city', 'jobName'
                 (capped at 60 entries). On any error the partial list is
                 returned after logging the exception.
        """
        # Bug fix: the original crashed with a TypeError/AttributeError
        # when called without an explicit res_lst ([] was never created).
        if res_lst is None:
            res_lst = []
        try:
            url = 'https://fe-api.zhaopin.com/c/i/sou?' \
                  'start={start}&pageSize=60&cityId=489' \
                  '&kw={company_name}' \
                  '&kt=2'.format(start=start, company_name=company_name)
            res = self.html_downloader.download(url)
            # Parse the JSON payload once (the original parsed it twice).
            payload = json.loads(res.text).get('data')
            data_lst = payload.get('results')
            total = int(payload.get('numFound'))
            # Recover the page number from the final URL's start= offset;
            # // keeps integer semantics on both Python 2 and 3.
            current_page = int(re.findall(
                r'(?<=start=).*?(?=&)', res.url)[0]) // 60 + 1
            self.logger.debug('current_page - %s' % current_page)
            for data in data_lst:
                position_info = dict()
                position_info['site'] = 'ZHI_LIAN'
                position_info['city'] = data.get('city').get('display')
                position_info['jobName'] = data.get('jobName')
                if len(res_lst) >= 60:  # hard cap: keep at most 60 results
                    return res_lst
                res_lst.append(position_info)
            start = current_page * 60
            # Recurse while pages remain; the 60-result cap above bounds
            # the recursion depth regardless of `total`.
            if (current_page - 1) * 60 < total:
                self.get_zhi_lian_position_list(
                    company_name, start=start, res_lst=res_lst)
            return res_lst
        except Exception as e:
            self.logger.exception(e)
            return res_lst

    def get_five_one_position_list(self, company_name, page=1, res_lst=None):
        """Recursively page through 51job.com search results for a company.

        :param company_name: company name embedded in the search URL path.
        :param page: 1-based page number.
        :param res_lst: accumulator list; created here when omitted.
        :return: list of dicts with keys 'site', 'jobName' (capped at 30
                 entries). On any error the partial list is returned after
                 logging the exception.
        """
        # Bug fix: same missing-accumulator crash as the Zhilian method.
        if res_lst is None:
            res_lst = []
        try:
            url = 'https://search.51job.com/list/000000,' \
                  '000000,0000,00,9,99,{company_name}' \
                  ',1,{page}.html'.format(company_name=company_name, page=page)
            res = self.html_downloader.download(url)
            soups = self.html_parser.parser(res.content)
            # Pager widget: the <li class="on"> is the current page; the
            # <span class="td"> text contains the total page count.
            current_page = int(soups.find(
                'div', class_='p_in').find('li', class_='on').text)
            total_page = int(re.findall(r'\d+', soups.find(
                'div', class_='p_in').find('span', class_='td').text)[0])
            self.logger.debug('current_page - %s' % current_page)
            # First .el row is the table header — skip it.
            data_lst = soups.find(
                'div', id='resultList').find_all('div', class_='el')[1:]
            for data in data_lst:
                position_info = dict()
                position_info['site'] = 'FIVE_ONE'
                position_info['jobName'] = data.find('a').get('title')
                if len(res_lst) >= 30:  # hard cap: keep at most 30 results
                    return res_lst
                res_lst.append(position_info)
            if current_page < total_page:
                page += 1
                self.get_five_one_position_list(
                    company_name, page=page, res_lst=res_lst)
            return res_lst
        except Exception as e:
            self.logger.exception(e)
            return res_lst

    @cls_catch_exception
    def get_zhi_lian_position_detail(self, job_id):
        """Return the stripped description text of one Zhilian posting.

        The ZP_OLD_FLAG cookie pins the old page layout that contains the
        ``pos-ul`` description element.
        """
        url = 'https://jobs.zhaopin.com/{}.htm'.format(job_id)
        headers = {
            'Cookie': 'ZP_OLD_FLAG=false;'
        }
        res = self.html_downloader.download(url, headers=headers)
        self.logger.debug('get detail {}'.format(job_id))
        soups = self.html_parser.parser(res.content)
        position_desc = soups.find('div', class_='pos-ul').text.strip()
        return position_desc

    @cls_catch_exception
    def get_five_one_position_detail(self, job_id):
        """Return (city, experience, degree, description) for one 51job
        posting. 51job pages are GBK-encoded, hence gbk_parser."""
        url = 'https://jobs.51job.com/all/{}.html'.format(job_id)
        res = self.html_downloader.download(url)
        self.logger.debug('get detail {}'.format(job_id))
        soups = self.html_parser.gbk_parser(res.content)
        # The header line is "city | experience | degree | ..." separated
        # by pipes; strip the non-breaking padding characters first.
        city, exp, degree = soups.find(
            'p', class_='msg ltype').text.strip().replace(u' ', '').split('|')[:3]
        # When the third field starts with 招 it is a headcount
        # ("招N人"), not a degree — the posting lists no degree.
        if u'招' in degree:
            degree = ''
        position_desc_lst = soups.find(
            'div', class_='bmsg job_msg inbox').find_all('p', recursive=False)
        position_desc = ''.join(
            map(lambda x: x.text.strip(), position_desc_lst)).replace('\n', ' ')
        return city, exp, degree, position_desc
def main():
    """Worker loop: pop ``SITE|company_name`` tasks from the Redis list
    ``lg``, scrape the matching positions and persist them into MySQL.

    Runs forever; every iteration's errors are logged and swallowed so a
    single bad task cannot kill the greenlet.
    """
    key = 'lg'
    cp = CompanyPosition()
    cp.logger.info('start company position search. redis_queue: {}'.format(key))
    while True:
        try:
            task = redis_client.lpop(key)
            # Bug fix: lpop returns None on an empty queue; the original
            # called None.split('|') and busy-spun logging AttributeErrors.
            if task is None:
                gevent.sleep(1)
                continue
            site, company_name = task.split('|')
            cp.logger.info('start_task: {} | {}'.format(site, company_name))
            if site == "ZHI_LIAN":
                res_lst = cp.get_zhi_lian_position_list(company_name, res_lst=[])
            elif site == "FIVE_ONE":
                res_lst = cp.get_five_one_position_list(company_name, res_lst=[])
            else:
                res_lst = []
            sql = 'insert into ligang.lg_position_2(company_name,city, position, source,publis_time) values(%s,%s,%s,%s,%s)'
            for res in res_lst:
                print(res)
                data = (company_name, res.get('city'), res.get('jobName'), site, datetime.now())
                cp.mysql_handle.save(sql=sql, data=data)
            print(json.dumps(res_lst, ensure_ascii=False, indent=4))
            cp.logger.info(
                'match position : {},TOTAL'
                '{}'.format(site, len(res_lst)))
        except Exception as e:
            cp.logger.exception(e)
if __name__ == '__main__':
    # Launch COROUTINE_NUM concurrent worker greenlets, all consuming the
    # same Redis queue, and block until every one of them finishes.
    workers = [gevent.spawn(main) for _ in range(settings.COROUTINE_NUM)]
    gevent.joinall(workers)
将任务数据写入 Redis 队列的辅助脚本（单独运行，向 ``lg`` 队列推入 ``SITE|公司名`` 任务）:

# encoding=utf-8
from company_position import redis_client

positions = [
    'FIVE_ONE|天津津天连达贸易有限公司',
    'FIVE_ONE|上海丽享贸易有限公司'
]
for task in positions:
    redis_client.lpush('lg', task)
print redis_client.llen('lg')
settings.py 配置文件:

REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PASSWORD = ''
REDIS_DB = 0
COROUTINE_NUM = 5