Lagou Project
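
This post walks through logging in to Lagou with a requests.Session and then querying its position-search API. The login is a four-step flow: fetch login.html for the initial cookies and anti-forge tokens, post the credentials to login.json, request grant.html for a service-ticket redirect, and follow that redirect to complete the session. Once logged in, positionAjax.json is queried for listings matching the chosen filters.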

import re

import requests
session = requests.Session()

# Step 1: request login.html to obtain the initial cookies and anti-forge tokens
r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    },
)

# The anti-forge tokens are embedded in the page as JavaScript assignments
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r1.text)[0]
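
# The bare [0] index raises IndexError if the login page layout ever changes.
# A slightly more defensive variant (a sketch, not part of the original flow;
# extract_token is a hypothetical helper):
def extract_token(name, html):
    # Pull a window.<name> = '...'; assignment out of the login page
    m = re.search(r"window\.%s = '(.*?)';" % name, html)
    if m is None:
        raise RuntimeError('%s not found; the login page may have changed' % name)
    return m.group(1)

# X_Anti_Forge_Token = extract_token('X_Anti_Forge_Token', r1.text)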

# Step 2: log in, carrying the cookies from step 1; the backend authorizes the jsessionid in the cookie
r3 = session.post(
    url='https://passport.lagou.com/login/login.json',
    data={
        'isValidate': True,
        # 'username': '[email protected]',
        # 'password': 'xxxxxxxxxxxxx',
        'username': '17610990101',
        'password': 'xxxxxxxxxxxxxx',
        'request_form_verifyCode': '',
        'submit': '',
    },
    headers={
        # The 'Anit' misspelling is deliberate: it matches the header names the site expects
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        "Referer": "https://passport.lagou.com/login/login.html",
        "Host": "passport.lagou.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    },
)
print(r3.text)  # the JSON result of the login attempt
# print(r3.headers)

# Step 3: request the service-ticket grant; the response is a redirect,
# so disable allow_redirects to read the Location header ourselves
r4 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    allow_redirects=False,
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    },
)

# print(r4.headers)
location = r4.headers['Location']
# print(location)

# Step 4: follow the redirect to obtain the final logged-in session
r5 = session.get(
    location,
    allow_redirects=True,
    headers={
        'Host': 'www.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html?',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    },
)

# print(r5.headers)

# =============== login flow complete ===============
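
# Before scraping, it can help to confirm the session actually accumulated
# login cookies across the four steps (the exact cookie names are set by the
# site, so treat them as unknowns):
print(session.cookies.get_dict())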

# Scrape position listings
# Filter positions according to the parameters passed in
from urllib.parse import urlencode

keyword = 'python爬虫'
url_encode = urlencode({'k': keyword}, encoding='utf-8')
# Build the position-search URL from the user's keyword
url = 'https://www.lagou.com/jobs/list_%s?labelWords=&fromSearch=true&suginput=' % url_encode.split('=')[1]
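# For keyword = 'python爬虫' this produces
# https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?labelWords=&fromSearch=true&suginput=
# and is used below only as the Referer for the JSON API request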

def search_position(keyword,
                    pn=1,
                    city='北京',
                    district=None,
                    bizArea=None,
                    isSchoolJob=None,
                    xl=None,
                    jd=None,
                    hy=None,
                    yx=None,
                    needAddtionalResult=False,
                    px='default'):
    # Build the query string once; requests drops parameters whose value is None
    params = {
        'city': city,                # work location, e.g. 北京
        'district': district,        # district, e.g. 朝阳区
        'bizArea': bizArea,          # business area, e.g. 望京
        'isSchoolJob': isSchoolJob,  # job nature, e.g. fresh graduate
        'xl': xl,                    # education requirement, e.g. 大专
        'jd': jd,                    # funding stage, e.g. 天使轮, A轮
        'hy': hy,                    # industry, e.g. 移动互联网
        'yx': yx,                    # salary range, e.g. 10k-15k
        'needAddtionalResult': needAddtionalResult,  # 'Addtional' is the spelling the API expects
        'px': px,                    # sort order
    }
    r8 = session.post('https://www.lagou.com/jobs/positionAjax.json',
                      params=params,
                      headers={
                          'Host': 'www.lagou.com',
                          'Origin': 'https://www.lagou.com',
                          'Referer': url,
                          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
                          'X-Anit-Forge-Code': '0',
                          'X-Anit-Forge-Token': '',
                          'X-Requested-With': 'XMLHttpRequest',
                          'Accept': 'application/json, text/javascript, */*; q=0.01',
                      },
                      data={
                          'first': True,
                          'pn': pn,       # page number
                          'kd': keyword,  # search keyword
                      })
    print(r8.status_code)
    print(r8.json())
    return r8.json()


# Look for a 10-15k Python crawler job in 朝阳区, Beijing
keyword = 'python爬虫'
yx = '10k-15k'
city = '北京'
district = '朝阳区'
isSchoolJob = '0'  # fresh graduate or internship

response = search_position(keyword=keyword, yx=yx, city=city, district=district, isSchoolJob=isSchoolJob)
results = response['content']['positionResult']['result']
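
# If the request is throttled or the session has expired, the JSON may lack a
# 'content' key and the line above raises KeyError. A guarded variant (the exact
# shape of the error payload is an assumption) could read:
#
#     content = response.get('content')
#     if not content:
#         raise SystemExit('Unexpected response from positionAjax.json: %s' % response)
#     results = content['positionResult']['result']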

# Print the details of each position
def get_company_info(results):
    for res in results:
        info = '''
        Company : %s
        Location : %s, %s
        Posted : %s
        Position : %s
        Position type : %s, %s
        Job nature : %s
        Salary : %s
        Perks : %s
        Experience required : %s
        Company size : %s
        Detail page : https://www.lagou.com/jobs/%s.html
        ''' % (
            res['companyFullName'],
            res['city'],
            res['district'],
            res['createTime'],
            res['positionName'],
            res['firstType'],
            res['secondType'],
            res['jobNature'],
            res['salary'],
            res['positionAdvantage'],
            res['workYear'],
            res['companySize'],
            res['positionId']
        )
        print(info)
        # Every detail page follows the pattern https://www.lagou.com/jobs/2653020.html,
        # where the number is the position id
        # print('Company full name [%s], short name [%s]' % (res['companyFullName'], res['companyShortName']))

get_company_info(results)
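
# The API pages through results via pn. To collect several pages, a loop like
# the sketch below could sit on top of search_position (the empty-page stop
# condition and the sleep interval are assumptions, not documented behaviour):
import time

def search_all_pages(keyword, max_pages=5, **filters):
    # Walk pn = 1..max_pages, stopping early once a page comes back empty
    all_results = []
    for pn in range(1, max_pages + 1):
        resp = search_position(keyword=keyword, pn=pn, **filters)
        page = resp['content']['positionResult']['result']
        if not page:  # assumed: an empty list marks the last page
            break
        all_results.extend(page)
        time.sleep(2)  # throttle between pages to reduce the chance of a block
    return all_results

# e.g. positions = search_all_pages('python爬虫', city='北京', yx='10k-15k')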

Reposted from www.cnblogs.com/nick477931661/p/9166169.html