Crawl every Python position in the Beijing area and store the results in a SQLite file
Screenshot of the final crawl results
18,804 records crawled in total
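Once the crawl finishes, the count can be checked directly against the SQLite file. A minimal sketch, assuming the lago_python.db file and lago table created by the code below:

import sqlite3

conn = sqlite3.connect('lago_python.db')
count = conn.execute('select count(*) from lago').fetchone()[0]
print('total records: %d' % count)  # prints 18804 for the run shown above
conn.close()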
Create the database file
import os
import sqlite3

class Crawldb:
    def __init__(self):
        self.db_file = 'lago_python.db'
        self.table = 'lago'
        self.create_db_file()
        self.conn = sqlite3.connect(self.db_file)
        self.create_table()
Create the database file
    def create_db_file(self):
        # Create an empty file for the database if it does not exist yet
        if not os.path.exists(self.db_file):
            f = open(self.db_file, 'w+')
            f.close()
        return True
Create the table
    def create_table(self):
        cursor = self.conn.cursor()
        # Start from a clean table on every run
        sql = "DROP TABLE IF EXISTS {table}".format(table=self.table)
        cursor.execute(sql)
        sql = "create table %s (id integer primary key not null, " \
              "district varchar(25) not null, biz_area varchar(25) not null, createTime varchar(25) not null, " \
              "companyShortName varchar(25) not null, companySize varchar(25) not null, industryField varchar(25) not null, " \
              "positionName varchar(25) not null, firstType varchar(25) not null, secondType varchar(25) not null, " \
              "salary varchar(25) not null, workYear varchar(25) not null, education varchar(25) not null, positionId varchar(25) not null)" % self.table
        cursor.execute(sql)
        cursor.close()
        self.conn.commit()
        return True
Insert data
    def insert_data(self, args):
        sql = "insert into `lago` (`district`, `biz_area`, `createTime`, " \
              "`companyShortName`, `companySize`, `industryField`, " \
              "`positionName`, `firstType`, `secondType`, " \
              "`salary`, `workYear`, `education`, `positionId`) values "
        sql += "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % args
        cursor = self.conn.cursor()
        row = cursor.execute(sql).rowcount
        cursor.close()
        self.conn.commit()
        return row
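Building the values clause with %s string formatting breaks as soon as a field value contains a single quote, and it is open to SQL injection. A safer variant, sketched here rather than taken from the original, passes the tuple through sqlite3's ? placeholders:

    def insert_data(self, args):
        # sqlite3 escapes each of the 13 values itself when given ? placeholders
        sql = ("insert into lago (district, biz_area, createTime, "
               "companyShortName, companySize, industryField, "
               "positionName, firstType, secondType, "
               "salary, workYear, education, positionId) "
               "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
        cursor = self.conn.cursor()
        row = cursor.execute(sql, args).rowcount
        cursor.close()
        self.conn.commit()
        return row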
Crawling Lagou
import json
import queue
import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

class Crawl:
    def __init__(self):
        self.city = '北京'
        self.db = Crawldb()
        self.district = dict()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        }
        # Pool of User-Agent strings, rotated on each request to reduce blocking
        self.browsers = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) ",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
        ]
Crawl administrative-district information
    def get_administrative(self):
        url = "https://www.lagou.com/jobs/list_python?city={city}&cl=false&fromSearch=true&labelWords=&suginput=".format(city=self.city)
        html = requests.get(url, headers=self.headers).content.decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        res = soup.find_all('div', class_='content')
        for tag in res[0].find_all('a'):
            # Skip the "不限" (unrestricted) filter; register each district with an empty business-area list
            if tag.string != "不限":
                self.district.setdefault(tag.string, [])
Crawl business-area information
    def get_business(self):
        for i in self.district.keys():
            url = 'https://www.lagou.com/jobs/list_python?px=default&city={city}&district={district}#filterBox'.format(city=self.city, district=i)
            html = requests.get(url, headers=self.headers).content.decode('utf-8')
            soup = BeautifulSoup(html, 'lxml')
            res = soup.find_all('li', class_='detail-bizArea-area')
            for tag in res[0].find_all('a'):
                if tag.string != "不限":
                    self.district[i].append(tag.string)
            print(self.district[i])
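After these two steps, self.district maps each district name to its list of business areas. The shape below is illustrative only; the actual names come from the page:

# {'朝阳区': ['望京', '国贸', ...],
#  '海淀区': ['中关村', '五道口', ...]}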
Crawl position data for each business area
def crawl_positions(self, district, biz_area):
url = "https://www.lagou.com/jobs/positionAjax.json?px=new&city="+self.city+"&district="+district+"&bizArea="+biz_area+"&needAddtionalResult=false"
referer_url = "https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?px=new&city="+quote(self.city)+"&district="+quote(district)+"&bizArea="+quote(biz_area)
        # Rotate the User-Agent and set a matching Referer for the Ajax endpoint
        self.headers['User-Agent'] = self.browsers[random.randint(0, 5)]
        self.headers['Referer'] = referer_url
page = 1
        flag = 'true'  # the endpoint is sent first='true' on page 1 only
result = queue.Queue()
while True:
if page != 1:
flag = 'false'
data = {
'first':flag,
'pn':page,
'kd':'python'
}
res = self.analysis_data(url,data)
if not res['content']['positionResult']['result']:
break
for i in res['content']['positionResult']['result']:
data = district, biz_area, i['createTime'],\
i['companyShortName'],i['companySize'],i['industryField'], \
i['positionName'], i['firstType'], i['secondType'],\
i['salary'], i['workYear'], i['education'], i['positionId']
result.put(data)
page = page + 1
return result
Fetch data from a URL through a proxy IP
    def get_data(self, url, data):
        try:
            # Pull a fresh proxy from the local proxy-pool service
            response = requests.get('http://localhost:5555/random')
            proxy = {'http': 'http://' + response.text}
            res = requests.post(url, headers=self.headers, proxies=proxy, data=data, timeout=5)
        except requests.RequestException:
            print('Proxy failed, fetching a new one and retrying')
            res = self.get_data(url, data)
        return res
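The recursion above retries forever if the pool keeps returning dead proxies. A bounded variant (a sketch; the retries parameter is an addition, not part of the original) fails loudly after a few attempts:

    def get_data(self, url, data, retries=5):
        for attempt in range(retries):
            try:
                response = requests.get('http://localhost:5555/random')
                proxy = {'http': 'http://' + response.text}
                return requests.post(url, headers=self.headers, proxies=proxy, data=data, timeout=5)
            except requests.RequestException:
                print('Proxy failed (attempt %d), fetching a new one' % (attempt + 1))
        raise RuntimeError('no working proxy after %d attempts' % retries)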
Check the returned data for a KeyError
    def analysis_data(self, url, data):
        res = self.get_data(url, data)
        res = json.loads(res.content.decode('utf-8'))
        try:
            # A missing key means the response carried no job results, so request again
            if not res['content']['positionResult']['result']:
                pass
        except KeyError:
            print('KeyError, retrying the request')
            res = self.analysis_data(url, data)
        return res
Main routine for crawling position data
    def crawl(self):
        for i in self.district.keys():
            for biz_area in self.district[i]:
                res_queue = self.crawl_positions(i, biz_area)
                print('%s %s: %d positions crawled' % (i, biz_area, res_queue.qsize()))
                while not res_queue.empty():
                    data = res_queue.get()
                    row = self.db.insert_data(data)
                    if not row:
                        print('Insert failed')
Running the crawler
    def run(self):
        self.get_administrative()
        self.get_business()
        self.crawl()
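A minimal entry point tying the pieces together (this assumes the proxy-pool service on localhost:5555 is already running):

if __name__ == '__main__':
    crawler = Crawl()
    crawler.run()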