import time,os
from selenium import webdriver
from lxml import etree
import requests
def downloads_excel(excel_url, year, name, save_root='C:/', timeout=30):
    """Download one Excel file and store it as ``<save_root>/<year>/<name>``.

    The per-year folder is created when it is missing. A file that already
    exists is left untouched, and no request is sent for it.

    Args:
        excel_url: URL of the Excel file to fetch.
        year: Yearbook year; used as the name of the sub-folder.
        name: File name to store the download under.
        save_root: Folder under which the per-year folders are created.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and written; False if it was skipped
        because it already exists or the server answered with an error status.
    """
    save_file_path = os.path.join(str(save_root), str(year))
    save_file_name = os.path.join(save_file_path, str(name))

    # Check before requesting, so re-running the script does not fetch
    # every file again only to throw the response away.
    if os.path.exists(save_file_name):
        return False

    # Without a timeout a stalled server would block the crawl forever.
    response = requests.get(str(excel_url), timeout=timeout)
    if not response.ok:
        # Do not store an HTML error page under an .xls name.
        print('Skipped %s: HTTP %s' % (excel_url, response.status_code))
        return False

    # exist_ok avoids the check-then-create race on the folder.
    os.makedirs(save_file_path, exist_ok=True)
    with open(save_file_name, 'wb') as fp:
        fp.write(response.content)
    return True
def get_content_by_selenium(url, start_year=2016, end_year=2019):
    """Download the Excel tables linked from each yearly index page.

    For every year from ``start_year`` to ``end_year`` (inclusive) the page
    ``<url><year>/lefte.htm`` is opened in Chrome, its ``ul/li/a`` links are
    collected, and every link that is neither an ``.htm`` page nor an
    explanatory note is handed to ``downloads_excel``.

    Args:
        url: Base URL; the year and ``/lefte.htm`` are appended to it.
        start_year: First year to download.
        end_year: Last year to download (inclusive).
    """
    from urllib.parse import urljoin

    # 1. Create the driver (this opens a regular Chrome window).
    driver = webdriver.Chrome()
    try:
        for year in range(start_year, end_year + 1):
            # 2. Build the URL of this year's index page.
            complete_url = url + str(year) + '/lefte.htm'
            # 3. Load the page.
            driver.get(complete_url)
            # 4. Parse the rendered page.
            page = etree.HTML(driver.page_source)

            # Read href and caption from the same <a> element. Two separate
            # XPath queries paired by index go out of step as soon as one
            # anchor has no text or more than one text node.
            for anchor in page.xpath('//ul/li/a'):
                href = anchor.get('href')
                file_name = ''.join(anchor.xpath('text()')).strip()
                if not href or not file_name:
                    continue
                # Skip HTML pages and the explanatory notes.
                if ('.htm' in href
                        or '主要统计指标解释' in file_name
                        or '简要说明' in file_name):
                    continue
                downloads_excel(
                    urljoin(complete_url, href),  # download link of the Excel file
                    str(year),                    # folder name
                    file_name + '.xls',           # file name of the download
                )
            print(str(year)+'年的已全部下载!')
            time.sleep(1)
    finally:
        # quit() ends the whole driver session, not just the window, and
        # the finally makes sure this happens even when a download fails.
        driver.quit()
# Base URL of the yearbook pages; the year and page name are appended to it.
base_url = 'http://tjj.cq.gov.cn/tjnj/'
get_content_by_selenium(base_url)
# Reference: "Python-selenium-爬取河南省统计年鉴信息" (scraping the Henan Province statistical yearbook with Python and Selenium)