版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_29622761/article/details/84251057
爬虫地址
http://per.spdb.com.cn/professional_investment_research/preferential_merchants/
爬虫技术
参照其他几篇文章:
爬虫-中国银行卡-优惠商户活动数据(2018-11-15)
爬虫-新浪财经-信用卡优惠商店数据(2018-11-15)
爬虫代码
# -*-coding:utf-8-*-
import os
import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
def get_page(url, timeout=10):
    """Fetch *url* and return the ``requests`` Response on HTTP 200, else None.

    Args:
        url: absolute URL to fetch.
        timeout: seconds before the request is aborted. New parameter with a
            default, so existing callers are unchanged — the original had no
            timeout and could hang forever on a stalled connection.

    Returns:
        The Response object on success; None on any network error, timeout,
        or non-200 status code (callers test for None and skip the page).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response
        # Explicit: a non-200 page is treated as "no data here".
        return None
    except requests.RequestException:
        # Narrowed from bare `except Exception` — only network-level
        # failures are expected and silently skipped.
        return None
def parse_page(page):
    """Extract one merchant-offer record from a parsed detail page.

    Args:
        page: parsed HTML tree exposing ``.xpath()`` (an lxml etree element
            built by the caller from the detail page's HTML).

    Returns:
        A list of 7 strings in fixed order:
        [time, target, content, rule, address, telephone, introduction].
        Fields absent from the page become '' instead of raising IndexError
        (the original only guarded fields 4-7 with copy-pasted try/except
        blocks and crashed on pages with fewer than 3 text nodes; it also
        wrapped the whole body in a `for i in range(len(...))` loop whose
        index was never used).
    """
    fields = page.xpath('//div[@class="content_text_l fl"]/ul/li/p/text()')

    def _field(i):
        # Detail pages differ in how many <p> entries they carry.
        return fields[i] if i < len(fields) else ''

    # The first field contains non-breaking spaces from the page layout.
    time = str(_field(0)).replace('\xa0', '')
    info_list = [time, _field(1), _field(2), _field(3),
                 _field(4), _field(5), _field(6)]
    print(info_list)
    return info_list
def write_data(sheet, row, lst):
    """Write every record in *lst* into *sheet*, one record per row.

    Args:
        sheet: writable xlwt-style sheet exposing ``write(row, col, value)``.
        row: zero-based row index at which the first record is written.
        lst: iterable of records; each record is an iterable of cell values.
    """
    current_row = row
    for record in lst:
        for column, value in enumerate(record):
            sheet.write(current_row, column, value)
        current_row += 1
def save(file_name, data):
    """Append *data* rows to the .xls file *file_name*, creating it if needed.

    Args:
        file_name: path of the workbook to create or extend.
        data: list of records as produced by parse_page() — 7 cells each.

    On first use the workbook is created with a header row; on later calls
    the existing workbook is copied (xlwt cannot append in place) and the
    new rows are written after the last occupied row.
    """
    if os.path.exists(file_name):
        # Open the existing workbook; formatting_info keeps cell styles.
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # First free row = number of rows already present.
        start_row = rb.sheets()[0].nrows
        # xlwt can only write, so work on a writable copy of the workbook.
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        write_data(sheet, start_row, data)
        # wb.save() overwrites the file in place — the original deleted it
        # first with os.remove(), which was redundant.
        wb.save(file_name)
    else:
        # Column names match the 7-element records built by parse_page().
        # (The original header listed 5 unrelated bank-branch columns,
        # apparently copied from a different scraper.)
        header = ['time', 'target', 'content', 'rule',
                  'address', 'telephone', 'introduction']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('浦发银行-信用卡数据')
        for col, title in enumerate(header):
            sheet.write(0, col, title)
        # Data rows start immediately below the header.
        write_data(sheet, 1, data)
        book.save(file_name)
def main():
    """Crawl SPDB preferential-merchant listings and save them to an .xls file.

    Walks three category listings ('dnxb', 'sqshq', 'yxjnh'), up to 3 index
    pages each, follows every detail link, parses it with parse_page(), and
    appends the records via save().
    """
    print('*' * 80)
    # Bug fix: the original banner printed a pingan.com URL even though this
    # scraper targets spdb.com.cn.
    print('\t\t\t\t浦发银行-信用卡数据下载,下载地址为:'
          'http://per.spdb.com.cn/professional_investment_research/preferential_merchants/')
    print('作者:谢华东 2018.11.19')
    print('--------------')
    path = input('请输入要保存的地址(例如:C:\\Users\\xhdong1\\Desktop\\),不输入则保存到当前地址:\n')
    # os.path.join tolerates both "dir" and "dir\" input (empty input keeps
    # the file in the current directory, as before).
    file_name = os.path.join(path, '浦发银行-信用卡数据.xls')
    print(file_name)

    base_url = ('http://per.spdb.com.cn/professional_investment_research/'
                'preferential_merchants/{catolog}/index_{num_page}.shtml')
    categories = ['dnxb', 'sqshq', 'yxjnh']
    for catalog in categories:
        # Page 0 is index.shtml; later pages are index_1.shtml, index_2.shtml.
        # NOTE(review): the page count (3) is hard-coded — confirm it still
        # covers each category.
        for page_num in range(0, 3):
            if page_num == 0:
                url = ('http://per.spdb.com.cn/professional_investment_research/'
                       'preferential_merchants/{catolog}/index.shtml').format(catolog=catalog)
            else:
                url = base_url.format(catolog=catalog, num_page=page_num)
            response = get_page(url)
            if response is None:
                # Bug fix: the original printed this message unconditionally
                # for every URL, even successful ones; it belongs here.
                print('该' + url + '下没有数据')
                continue
            page = response.content.decode('utf-8')
            html = etree.HTML(page)
            all_detail_url = html.xpath('//ul[@class="ul_list"]/li/a/@href')
            all_info_list = []
            for href in all_detail_url:
                # hrefs are relative like "./xxx.shtml"; drop the leading dot.
                detail_url = ('http://per.spdb.com.cn/professional_investment_research/'
                              'preferential_merchants/' + catalog + href[1:])
                print(detail_url)
                detail_response = get_page(detail_url)
                if detail_response is None:
                    # Bug fix: the original dereferenced a possibly-None
                    # response and crashed on unreachable detail pages.
                    continue
                detail_page = etree.HTML(detail_response.content.decode('utf-8'))
                all_info_list.append(parse_page(detail_page))
            save(file_name, all_info_list)
if __name__ == '__main__':
    main()
致谢
感谢这美好的生活。时光很长,代码要慢慢写。