总体思路为:
1、从网站上获取每个园区的id
2、从每个园区的网页上获取地图的iframe
3、保存加载地图iframe的网页,因为里面有polygon数据
4、从html网页中使用正则表达式提取polygon,并将数据处理成arcgis所能识别的数据格式
5、使用arcgis将点转线,再将线转面,最后导出为shp文件
最终成果如下图:
import os
from datetime import datetime
from urllib import request
import pandas as pd
import re
# 浏览器的请求头
from bs4 import BeautifulSoup
from utils.read_write import writeOneCSV, readTXT, writeTXT, writeOneTXT
# Browser-like request headers; the site rejects urllib's default User-Agent.
headers = {"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"}
opener = request.build_opener()
# Bug fix: the OpenerDirector attribute is ``addheaders`` (a list of
# (name, value) tuples). The original assigned to a nonexistent
# ``add_headers`` attribute, so the custom User-Agent was silently never sent.
opener.addheaders = list(headers.items())
request.install_opener(opener)
def getCodeList():
    """Return the list of province ids previously scraped to disk."""
    return readTXT('D:\project\jianguiyuan\data\省份id.txt')
# Send an HTTP request.
def requerts_url(url, i, start_page):
    """Download ``url`` and return its body decoded as UTF-8.

    On failure, print the current crawl position (province index ``i`` and
    page ``start_page``) plus a timestamp, then restart the crawl from that
    position by re-entering ``bian`` -- note this retry is recursive and this
    call then implicitly returns None to its original caller.
    """
    try:
        return request.urlopen(url).read().decode("utf-8")
    # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit are
    # no longer swallowed and the crawl can actually be stopped.
    except Exception:
        print(datetime.now())
        print(i)
        print(start_page)
        print(url)
        bian(i, start_page)
# The site lists about 26,123 parks in total.
def bian(start_x, start_page):
    """Crawl the park listings for every region code and save each map page.

    For each region code in the global ``codeList`` (starting at index
    ``start_x``): fetch the first listing page, save the map iframe HTML of
    every park on it, read the total count from the "收录N个" banner
    (20 parks per page), then walk the remaining pages from ``start_page``.
    ``start_x``/``start_page`` let a crashed run be resumed.
    """
    for i in range(start_x, len(codeList)):
        print(codeList[i])
        firsturl = "https://f.qianzhan.com/yuanqu/diqu/" + codeList[i] + '/'
        data = requerts_url(firsturl, i, start_page)
        _save_parks_on_page(data, i, start_page)
        # Total park count drives the page count: 20 rows per listing page.
        end = re.findall(r"收录(.+?)个", data)
        page = int(int(end[0]) / 20) + 2
        for x in range(start_page, page):
            print(x)
            second = firsturl + '?pg=' + str(x)
            data1 = requerts_url(second, i, start_page)
            _save_parks_on_page(data1, i, start_page, verbose=True)


def _save_parks_on_page(data, i, start_page, verbose=False):
    """Save the map-iframe HTML for every park row on one listing page.

    ``data`` is the listing-page HTML. Each table row after the header links
    to a park detail page (e.g. /yuanqu/item/eb42f331cc0f3062.html) whose
    first <iframe> points at the map page that embeds the polygon data; that
    map page is saved under the global ``savePath``. When ``verbose`` is true
    the save path is echoed (matches the original pagination loop's output).
    This helper replaces two verbatim copies of the same scraping loop.
    """
    soup = BeautifulSoup(data, 'lxml')
    tr = soup.table.find_all('tr')
    for row in tr[1:]:
        a = row.find_all('a')
        href = a[0].attrs['href']
        url = "https://f.qianzhan.com" + href
        detail = requerts_url(url, i, start_page)
        soup = BeautifulSoup(detail, 'lxml')
        iframe = soup.findAll('iframe')
        # NOTE(review): 'bus_taxi' looks like a site-specific iframe attribute
        # carrying the map URL (e.g. /yuanqu/yqmap?center=...&yid=...); a plain
        # 'src' would be the usual attribute -- confirm against the live HTML.
        src = iframe[0].attrs['bus_taxi']
        mapUrl = "https://f.qianzhan.com" + src
        detail = requerts_url(mapUrl, i, start_page)
        writeOneTXT(detail, savePath + href)
        if verbose:
            print(savePath + href)
if __name__ == '__main__':
    # Raw strings make the Windows paths explicit instead of relying on
    # invalid escape sequences ('\d', '\m') passing through literally,
    # which is deprecated Python behaviour. Values are unchanged.
    codeList = getCodeList()
    savePath = r'D:\da\map'  # each park's map-iframe HTML lands under here
    path = r'D:\dat区\\'  # NOTE(review): unused in this script -- confirm before removing
    bian(0, 2)
如需数据或帮忙处理数据请私聊我。。。