全国行政区划代码
因为要用地区编码,找了一遍,不是太旧,就是要收费,所以自己去爬一下,需要可以自取
- 数据来源 国家统计局
- 数据范围: 省、市、区、镇、居委会 5级行政代码,其中四个直辖市是4级
- 获取数据时间:2023/4/5
- 数据量:662892条
- 写入excel容量:19.3M
- 此版本使用单线程抓取数据,此过程极其耗时,抓取耗时75分钟(此过程可用线程池加速),写入excel 耗时5分钟
# encoding:utf-8
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from openpyxl.workbook import Workbook
class Entry:
    """A single administrative-division record plus its scraped children.

    Instances form a tree: the crawler fills ``arr`` with child entries,
    and the writer walks that tree when dumping rows to Excel.
    """

    def __init__(self, code: str = None, desc: str = None, url: str = '',
                 parent_code: str = '000000000000', level: int = 1):
        # Division code and display name as scraped from the page.
        self.code = code
        self.desc = desc
        # Relative URL of the page listing this entry's children ('' = leaf).
        self.url = url
        # 12-digit code of the parent division ('000000000000' = top level).
        self.parent_code = parent_code
        # Administrative depth: 1 = province ... 5 = neighborhood committee.
        self.level = level
        # Child Entry objects, appended by the crawler.
        self.arr = []

    def __repr__(self):
        return f'({self.code} {self.desc} {self.level} {self.url})'
class China:
    """Crawl China's administrative-division codes from stats.gov.cn and
    dump them to an Excel workbook.

    All work happens in ``__init__``: scrape the province index, recurse
    depth-first through city/county/town/village pages, then write every
    record to '中华人民共和国行政区划代码.xlsx'. The crawl is single-threaded
    and therefore slow (could be parallelised with a thread pool).

    Progress counters ``read_num``/``write_num`` are module-level globals,
    kept that way for compatibility with the script's top-level definitions.
    """

    # CSS class of the <tr> rows that list an entry's children, keyed by
    # the parent entry's level (1 = province page lists cities, etc.).
    _LEVEL_KEYS = {1: 'citytr', 2: 'countytr', 3: 'towntr', 4: 'villagetr'}

    def __init__(self):
        global read_num, write_num
        print('开始抓取数据,当前时间:', datetime.now().strftime('%X'), '\n')
        self.se = requests.Session()
        # Scrape the province list from the dataset index page.
        self.province = []
        for a in self.__parse_url('index.html', 'td a:not([class])'):
            # The site is served with a mis-detected encoding; round-trip
            # through raw_unicode_escape to recover the original text.
            desc = a.text.encode('raw_unicode_escape').decode()
            url = a['href']
            # Province code = 2-digit href prefix padded to 12 digits.
            code = '%d0000000000' % int(url[0:2])
            self.province.append(Entry(code, desc, url, '000000000000', 1))
        if len(self.province) == 31:
            # The index only lists the 31 mainland provinces; add the rest
            # (no detail pages exist for them, hence url='').
            self.province.append(Entry('710000000000', '台湾省', '', '000000000000', 1))
            self.province.append(Entry('810000000000', '香港特别行政区', '', '000000000000', 1))
            self.province.append(Entry('820000000000', '澳门特别行政区', '', '000000000000', 1))
        # Crawl each province subtree (this is the ~75-minute part).
        for obj in self.province:
            self.__parse_data(obj)
        print('\n抓取数据,完成时间:', datetime.now().strftime('%X'), '\n')
        print(f'读取数据:{read_num}条')
        # Column headers: code, name, parent code, administrative level.
        th = ('code', 'desc', 'parent_code', 'level')
        workbook = Workbook()
        sheet = workbook.active
        sheet.title = '汇总数据'
        sheet.append(th)
        self.sheet = sheet
        for data in self.province:
            self.__write(data)
        workbook.save('中华人民共和国行政区划代码.xlsx')
        print('\n写入数据,完成时间:', datetime.now().strftime('%X'), '\n')
        print(f'写入数据:{write_num}条')

    def __write(self, data: Entry):
        """Recursively append *data* and its subtree to the worksheet."""
        global write_num
        # Municipalities insert placeholder '市辖区'/'县' rows that carry no
        # useful information: skip the row itself but keep its children,
        # re-parented one level up (they inherit this row's parent and level).
        if data.desc in ('市辖区', '县'):
            for obj in data.arr:
                obj.parent_code = data.parent_code
                obj.level = data.level
                self.__write(obj)
            return
        write_num += 1
        self.sheet.append((data.code, data.desc, data.parent_code, data.level))
        for obj in data.arr:
            obj.level = data.level + 1
            self.__write(obj)

    def __parse_url(self, index, selector):
        """Fetch *index* under the 2022 dataset root and CSS-select elements.

        Returns the list of tags matching *selector*.
        """
        url = f'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{index}'
        # Finite timeout so a stalled connection cannot hang the crawl
        # forever (previously timeout=None).
        resp = self.se.get(url, timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")
        return soup.select(selector)

    def __parse_data(self, obj: Entry):
        """Scrape the children of *obj* into ``obj.arr`` and recurse."""
        global read_num
        if obj.url == '':
            # Leaf entry (or Taiwan/HK/Macau): nothing to fetch.
            return
        level = obj.level
        key = self._LEVEL_KEYS.get(level, '')
        arr = obj.arr
        parent_code = obj.code
        for tr in self.__parse_url(obj.url, f'tr[class={key}]'):
            read_num += 1
            a_arr = tr.select('a')
            if not a_arr:
                # Row without a link = leaf. Code is the first <td>; the
                # name is the 2nd td for 2-column rows, the 3rd otherwise
                # (village rows carry an extra classification column).
                tds = tr.select('td')
                code = tds[0].text
                url = ''
                name_idx = 1 if len(tds) == 2 else 2
                desc = tds[name_idx].text.encode('raw_unicode_escape').decode()
            else:
                a = a_arr[0]
                code = a.text
                url = a['href']
                desc = a_arr[1].text.encode('raw_unicode_escape').decode()
            # Placeholder district rows with no detail page are useless.
            if desc == '市辖区' and url == '':
                continue
            if url != '':
                # hrefs are relative to the current page; prepend the
                # province (and city) directory segments so deeper pages
                # can be fetched from the dataset root.
                pre = url[url.index('/') + 1:]
                if level == 2:
                    url = f'{pre[0:2]}/{url}'
                elif level == 3:
                    url = f'{pre[0:2]}/{pre[2:4]}/{url}'
            arr.append(Entry(code, desc, url, parent_code, level + 1))
        for child in arr:
            self.__parse_data(child)
# Module-level progress counters, mutated by China via `global`:
# rows read from the site / rows written to the workbook.
read_num = 0
write_num = 0

if __name__ == '__main__':
    China()