from bs4 import BeautifulSoup
from urllib import request
import pandas as pd
import numpy as np
import urllib.parse as urp
import time
import json
# Corners of the rectangle to cover, each given as [latitude, longitude]:
# loc_1 holds the smaller coordinates, loc_2 the larger ones.
loc_1 = [30.389814,103.801536]
loc_2 = [30.836448,104.299382]
# Edge length of one grid cell, in degrees; picked by trial as a reasonable value.
step = 0.02
# Number of cells along each axis: enough whole steps from loc_1 to reach loc_2
# (the last row/column may extend past loc_2).
n_lat = int((loc_2[0] - loc_1[0]) / step + 1)
n_lon = int((loc_2[1] - loc_1[1]) / step + 1)
# Tile the rectangle between loc_1 and loc_2 with `step`-sized cells.  Every
# entry of loc_fin is one cell written as "lat_min,lon_min,lat_max,lon_max".
loc_fin = []
for a in range(1, n_lat + 1):
    print(a)
    for b in range(1, n_lon + 1):
        print(b)
        # Upper corner of cell (a, b); rounding to 6 decimals keeps float
        # noise out of the generated strings.
        lat_1 = round((loc_1[0] + step * a), 6)
        lon_1 = round((loc_1[1] + step * b), 6)
        # Lower corner is exactly one step below the upper corner.
        lat_2 = round((lat_1 - step), 6)
        lon_2 = round((lon_1 - step), 6)
        loc_fin.append(','.join(map(str, (lat_2, lon_2, lat_1, lon_1))))
# One row per grid cell: the bounds string plus a sequential integer id.
df = pd.DataFrame({'loc': loc_fin, 'id': np.arange(len(loc_fin))})
class RestrantInfo:
    """Collect food POIs for a grid of bounding boxes from the Baidu Place search API.

    Parameters
    ----------
    data : pandas.DataFrame
        Grid table.  ``getinfo`` reads its ``'loc'`` column; each value is
        inserted verbatim as the ``bounds`` query parameter of a request.
    ak : str, optional
        API key appended to every request as the ``ak`` query parameter.  It
        can also be passed (or overridden) per call to ``getinfo``.

    All rows collected so far are kept in one DataFrame that ``getinfo``
    extends and returns.
    """

    # Column layout of the result frame.
    _COLUMNS = ['name', 'area', 'address', 'lng', 'lat', 'price', 'com_num',
                'key_word', 'tag', 'type', 'children', 'rating']
    # Value sent as the page_size query parameter; also used to derive the
    # number of pages from the reported total.
    _PAGE_SIZE = 20

    def __init__(self, data, ak=None):
        self._df = data
        self._ak = ak
        self._df_final = pd.DataFrame([], columns=self._COLUMNS)

    def _getjson(self, url, timeout=30):
        """Fetch ``url`` and return its body parsed as JSON.

        The response is closed when the ``with`` block exits.  ``timeout`` is
        in seconds, so a stalled connection cannot block the crawl forever.
        """
        with request.urlopen(url, timeout=timeout) as resp:
            return json.loads(resp.read().decode())

    def _getfield(self, record, key):
        """Return ``record[key]``, or NaN when the key is missing or ``record`` is not subscriptable by it."""
        try:
            return record[key]
        except (KeyError, TypeError):
            return np.nan

    def _getrating(self, detail):
        """Return ``detail['overall_rating']`` or NaN."""
        return self._getfield(detail, 'overall_rating')

    def _getchild(self, detail):
        """Return ``detail['children']`` or NaN.

        NOTE(review): the previous code looked up the misspelled key
        'childeren', so this column was always NaN; 'children' matches the
        column name used by this class -- confirm against the API schema.
        """
        return self._getfield(detail, 'children')

    def _getcom_num(self, detail):
        """Return ``detail['comment_num']`` or NaN."""
        return self._getfield(detail, 'comment_num')

    def _getword(self, detail):
        """Return ``detail['di_review_keyword']`` or NaN."""
        return self._getfield(detail, 'di_review_keyword')

    def _gettag(self, detail):
        """Return ``detail['tag']`` or NaN."""
        return self._getfield(detail, 'tag')

    def _gettype(self, detail):
        """Return ``detail['type']`` or NaN."""
        return self._getfield(detail, 'type')

    def _getprice(self, detail):
        """Return ``detail['price']`` or NaN."""
        return self._getfield(detail, 'price')

    def _build_url(self, bounds, page_num, ak):
        """Build the search URL for one page of one bounding box."""
        return ('http://api.map.baidu.com/place/v2/search?query=' + urp.quote('美食')
                + '&bounds=' + bounds
                + '&output=json&scope=2&page_size=' + str(self._PAGE_SIZE)
                + '&page_num=' + str(page_num) + '&ak=' + ak)

    def _parse_poi(self, poi):
        """Flatten one entry of a response's ``results`` list into a row dict.

        Any field that is absent becomes NaN, so one incomplete POI no longer
        causes its whole page to be discarded.
        """
        location = self._getfield(poi, 'location')
        detail = self._getfield(poi, 'detail_info')
        return {
            'name': self._getfield(poi, 'name'),
            'area': self._getfield(poi, 'area'),
            'address': self._getfield(poi, 'address'),
            'lng': self._getfield(location, 'lng'),
            'lat': self._getfield(location, 'lat'),
            'price': self._getprice(detail),
            'com_num': self._getcom_num(detail),
            'key_word': self._getword(detail),
            'tag': self._gettag(detail),
            'type': self._gettype(detail),
            'children': self._getchild(detail),
            'rating': self._getrating(detail),
        }

    def _store(self, rows):
        """Append ``rows`` (a list of row dicts) to the accumulated result frame."""
        if not rows:
            return
        fresh = pd.DataFrame(rows, columns=self._COLUMNS)
        if self._df_final.empty:
            self._df_final = fresh
        else:
            # DataFrame.append no longer exists in pandas 2.x; concat is the
            # supported way to stack frames.
            self._df_final = pd.concat([self._df_final, fresh], ignore_index=True)

    def getinfo(self, start, end, ak=None):
        """Query the grid cells ``self._df.index[start:end]`` and collect their POIs.

        Parameters
        ----------
        start, end : int
            Positional slice bounds applied to the grid table's index.
        ak : str, optional
            API key for this call; falls back to the key given to the
            constructor.

        Returns
        -------
        pandas.DataFrame
            Every row collected by this instance so far, including this call.

        Raises
        ------
        ValueError
            If no API key is available.
        KeyError
            If the first response for a cell has no ``'total'`` field.

        Errors raised by the first request of a cell propagate to the caller.
        Rows gathered before such an error are still stored.  A failure on a
        later page is reported and that page is skipped.
        """
        my_ak = ak if ak is not None else self._ak
        if not my_ak:
            raise ValueError('a Baidu Map API key (AK) is required: pass ak= to '
                             'RestrantInfo(...) or to getinfo(...)')
        rows = []
        try:
            for i in self._df.index[start:end]:
                print(i)
                bounds = self._df.loc[i, 'loc']
                first = self._getjson(self._build_url(bounds, 0, my_ak))
                total = first['total']
                if total <= 0:
                    continue
                # Ceiling division: an exact multiple of the page size must
                # not trigger a request for an extra, empty page.
                n_pages = (total + self._PAGE_SIZE - 1) // self._PAGE_SIZE
                for j in range(n_pages):
                    try:
                        # Page 0 is already in hand; do not fetch it twice.
                        page = first if j == 0 else self._getjson(
                            self._build_url(bounds, j, my_ak))
                        page_rows = [self._parse_poi(poi) for poi in page['results']]
                    except (OSError, ValueError, KeyError, TypeError) as err:
                        # Best effort: a broken page must not abort the crawl.
                        print('skipping cell %s page %s: %r' % (i, j, err))
                        continue
                    rows.extend(page_rows)
        finally:
            # Keep whatever was gathered even if a request above raised.
            self._store(rows)
        return self._df_final
# Chengdu food information crawler
# You may also like
# Reposted from blog.csdn.net/weixin_41968760/article/details/80927248
# Today's recommendations
# Weekly ranking