# [Beginner code — please excuse the quality; it will be refactored later.]
# Data source: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/
import urllib.request
from bs4 import BeautifulSoup
import time
import random
def url_open(url):
    """Fetch *url* and return its body decoded as GBK text.

    The stats.gov.cn region-code pages are GBK-encoded, hence the
    explicit ``decode('gbk')`` instead of UTF-8.

    Parameters:
        url: absolute URL of the page to fetch.

    Returns:
        The decoded HTML as a ``str``.

    Raises:
        urllib.error.URLError / HTTPError on network failure,
        UnicodeDecodeError if the page is not valid GBK.
    """
    # Original code url-encoded the UA and sent it as a POST body under
    # the malformed key 'User - Agent'; it belongs in the request
    # *headers*, and these are plain static pages fetched with GET.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/65.0.3325.181 Safari/537.36'),
    }
    req = urllib.request.Request(url=url, headers=headers)
    # Context manager guarantees the socket is closed even on decode errors.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('gbk')
def bs(html: str, url: str) -> None:
    """Walk the stats.gov.cn 2019 region-code pages and append one
    province,city,county,town,village CSV line per village to
    D:/地址/address.txt (also echoed to stdout).

    Parameters:
        html: GBK-decoded HTML of the top-level province-list page.
        url:  base URL; relative hrefs found on each page are joined to it.

    NOTE(review): SOURCE arrived with its indentation stripped; the
    nesting below is reconstructed from the data flow (each level uses
    the link extracted by the level above) — verify against the original.
    """
    # `i` is a single skip counter shared by ALL nesting levels: each
    # level increments it after following a link so that subsequent
    # non-link <td> cells (code columns, whitespace nodes) are skipped.
    # Start at 0 so the first province cell (Beijing) is processed.
    i = 0
    soup = BeautifulSoup(html, "html.parser")
    # Province level: one <tr class="provincetr"> per row of province links.
    for li in soup.find_all('tr', class_="provincetr"):
        for a in li:
            if i > 0:
                i = i - 1
            else:
                province = a.find('a').get_text()
                # print(province)
                # Link to the next level down (the province's city page).
                b = a.find('a')
                plink = b.get('href')
                # print(plink)
                i = 1
                # Fetch the city-level page for this province.
                citylink = url_open(url + plink)
                # print(citylink)
                citysoup = BeautifulSoup(citylink, "html.parser")
                for citytr in citysoup.find_all('tr', class_="citytr"):
                    for citya in citytr:
                        # time.sleep(random.randint(3000, 7000) / 1000)  # random 3-7 s throttle (disabled)
                        city = citya.find('a').get_text()
                        # print(city)
                        if i > 0:
                            pass
                            i = i - 1
                        else:
                            # Link to the county level.
                            c = citya.find('a')
                            plink2 = c.get('href')
                            # print(plink2)
                            i = i+1
                            # Fetch the county-level page.
                            countylink = url_open(url + plink2)
                            # print(plink[0:1])
                            countsoup = BeautifulSoup(countylink, 'html.parser')
                            for countytr in countsoup.find_all('tr', class_="countytr"):
                                for countya in countytr:
                                    # time.sleep(random.randint(1000, 3000) / 1000)  # random 1-3 s throttle (disabled)
                                    # Some county cells carry plain text instead of a link
                                    # (e.g. district rows with no deeper page).
                                    if countya.find('a') is None:
                                        count = countya.string
                                        # print(count)
                                    else:
                                        county = countya.find('a').get_text()
                                        count = county
                                        # print(count)
                                    if i > 0:
                                        pass
                                        i = i - 1
                                    else:
                                        # Link to the town/street level.
                                        d = countya.find('a')
                                        plink3 = d.get('href')
                                        # print(plink3)
                                        i = i + 1
                                        # Town pages live one directory deeper, so the 2-char
                                        # province-code directory (plink2[0:2]) is re-prefixed.
                                        townlink = url_open(url + plink2[0:2]+'/'+plink3)
                                        townsoup = BeautifulSoup(townlink, 'html.parser')
                                        for towntr in townsoup.find_all('tr', class_='towntr'):
                                            for towna in towntr:
                                                town = towna.find('a').get_text()
                                                # print(town)
                                                # time.sleep(random.randint(700, 1500) / 1000)  # random 0.7-1.5 s throttle (disabled)
                                                if i > 0:
                                                    pass
                                                    i = i - 1
                                                else:
                                                    # Link to the village/committee level.
                                                    e = towna.find('a')
                                                    plink4 = e.get('href')
                                                    # print(plink4)
                                                    # Village rows have two leading code cells,
                                                    # so bump the skip counter by 2.
                                                    i = i + 2
                                                    # Village pages are two directories deep:
                                                    # province dir + county dir prefixes re-added.
                                                    villagelink = url_open(url + plink2[0:2]+'/'+plink3[0:2]+'/' + plink4)
                                                    villagesoup = BeautifulSoup(villagelink, 'html.parser')
                                                    for villagetr in villagesoup.find_all('tr', class_='villagetr'):
                                                        for villagetd in villagetr:
                                                            time.sleep(random.randint(500, 1000) / 1000)  # throttle 0.5-1 s per cell (original comment said 2-7 s)
                                                            villagetd.find_all('td')
                                                            # print(villagetd.string)
                                                            if i > 0:
                                                                pass
                                                                i = i - 1
                                                            else:
                                                                village = villagetd.string
                                                                # Skip the next two code cells of the following row.
                                                                i = i + 2
                                                                print(province+','+city+',' + count + ',' + town + ',' + village)
                                                                # Append one CSV line per village.
                                                                with open('D:/地址/address.txt', 'a') as f:
                                                                    f.write(province+',' + city + ',' + count + ','+town+',' + village)
                                                                    f.write('\n')
                                                    # NOTE(review): reset the skip counter for the next town cell;
                                                    # placement reconstructed from stripped indentation — confirm.
                                                    i = 1
# Script entry point: fetch the 2019 province index and crawl downward.
# Guarded so that importing this module does not trigger network I/O.
if __name__ == "__main__":
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/"
    html = url_open(url)
    bs(html, url)
    # print(html)