import requests
from lxml import etree
import chardet
def get_one_page(url, timeout=10):
    """Download *url* and return the response body decoded to text.

    The request is sent with a desktop Chrome ``User-Agent`` header. The
    response encoding is taken from ``chardet``'s detection over the raw
    bytes rather than from the HTTP headers.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded page source as a string.

    Raises:
        requests.exceptions.RequestException: On connection failures or
            when the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Override the header-declared encoding with the one sniffed from the
    # payload, so that ``response.text`` decodes with it.
    response.encoding = chardet.detect(response.content)['encoding']
    return response.text
# Converters for "low-high<unit>/<period>" salary labels, keyed by
# (unit character, period character); each returns the monthly amount.
_SALARY_RANGE_CONVERTERS = {
    ('万', '月'): lambda amount: amount * 10000,
    ('万', '年'): lambda amount: round((amount * 10000) / 12, 1),
    ('千', '月'): lambda amount: amount * 1000,
}


def _parse_salary(text):
    """Turn one salary label into a ``(low, high)`` pair, ``(0, 0)`` if unrecognised."""
    parts = text.strip().split('-')
    # A range label carries its unit on the upper bound; a single-value
    # label carries it on the only part.
    last = parts[1] if len(parts) > 1 else parts[0]
    if len(last) < 3:
        # Too short to hold "<unit>/<period>"; indexing [-3] would fail.
        return 0, 0
    unit = (last[-3], last[-1])
    try:
        if len(parts) > 1:
            convert = _SALARY_RANGE_CONVERTERS.get(unit)
            if convert is not None:
                return convert(float(parts[0])), convert(float(last[:-3]))
        elif unit == ('元', '天'):
            # Daily wage: kept as the per-day figure (not scaled to a
            # month), but returned as a number like every other branch.
            daily = float(last[:-3])
            return daily, daily
    except ValueError:
        # Non-numeric amount: treat it like any other unrecognised label.
        pass
    return 0, 0


def parse_one_page(html, year='2018'):
    """Extract the listing columns from one result page.

    This is a generator that yields exactly one dict. Each value is a list
    built from the ``div`` elements whose ``class`` attribute is exactly
    ``"el"``:

    * ``t1``   - stripped link texts under ``p/span/a``
    * ``t2``   - link texts under ``span[@class="t2"]``
    * ``t3``   - texts of ``span[@class="t3"]``
    * ``t4``   - full string value of each ``span[@class="t4"]`` (the raw
      salary label; may be an empty string)
    * ``t5``   - texts of ``span[@class="t5"]`` prefixed with ``"<year>-"``
    * ``href`` - ``href`` attributes of the ``p/span/a`` links
    * ``xz_low`` / ``xz_height`` - lower and upper salary parsed from
      ``t4``; ``0`` when the label is empty or not understood

    NOTE(review): every column comes from its own XPath query, so a row
    that lacks one of the text nodes makes that column shorter than the
    others. Callers that index all columns by the length of ``t1`` should
    be aware of this.

    Args:
        html: Page source to parse.
        year: Year prepended to each ``t5`` value. The default keeps the
            previously hard-coded ``2018``.

    Yields:
        The dict described above.
    """
    tree = etree.HTML(html)

    # Insertion order matters: callers print the dict as-is.
    item = {}
    item['t1'] = [
        title.strip()
        for title in tree.xpath('//div[@class="el"]/p/span/a/text()')
    ]
    item['t2'] = tree.xpath('//div[@class="el"]/span[@class="t2"]/a/text()')
    item['t3'] = tree.xpath('//div[@class="el"]/span[@class="t3"]/text()')
    # string(.) yields a value for every span, even an empty one, so this
    # column always has one entry per matched span.
    item['t4'] = [
        span.xpath('string(.)')
        for span in tree.xpath('//div[@class="el"]/span[@class="t4"]')
    ]
    date_prefix = str(year) + '-'
    item['t5'] = [
        date_prefix + posted
        for posted in tree.xpath('//div[@class="el"]/span[@class="t5"]/text()')
    ]
    item['href'] = tree.xpath('//div[@class="el"]/p/span/a/@href')

    salaries = [_parse_salary(label) for label in item['t4']]
    item['xz_low'] = [low for low, _ in salaries]
    item['xz_height'] = [high for _, high in salaries]

    yield item
def write_to_mysql(content):
    """Print every column value of *content*, one row at a time.

    Despite its name, this function performs no database access; it only
    writes to standard output.

    Args:
        content: Mapping with the list-valued keys ``t1``, ``t2``, ``t3``,
            ``xz_low``, ``xz_height``, ``t5`` and ``href``. The number of
            rows is taken from ``content['t1']``.

    Raises:
        IndexError: If any other column is shorter than ``t1``.
    """
    # Columns in the order they are printed for each row.
    columns = ('t1', 't2', 't3', 'xz_low', 'xz_height', 't5', 'href')
    row_count = len(content['t1'])
    for row in range(row_count):
        for column in columns:
            print(content[column][row])
def main(page):
    """Fetch search-result page number *page*, parse it and print the result.

    Each item yielded by ``parse_one_page`` is printed as a whole and then
    handed to ``write_to_mysql``.

    Args:
        page: Page number inserted into the search URL.
    """
    # The search keyword in the path is the double-URL-encoded form of
    # "数据分析师"; the page number is appended just before ".html".
    url_prefix = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,'
    page_url = ''.join([url_prefix, str(page), '.html'])
    page_html = get_one_page(page_url)
    for parsed in parse_one_page(page_html):
        print(parsed)
        write_to_mysql(parsed)
if __name__ == '__main__':
    # Crawl result pages 1 through 19, in order.
    first_page, last_page = 1, 19
    for page_number in range(first_page, last_page + 1):
        main(page_number)