关于使用什么模块写爬虫做数据解析,并没有刻板的规定,鉴于正在学习仍使用xpath
分析站长素材网的免费简历模板,需求分析不再赘述,且看编码过程
#!/usr/bin/env python
# encoding: utf-8
"""
@file: 解析站长素材_免费简历模板.py
@time: 2020/2/29 14:30
"""
import requests
from lxml import etree
import random
import os
def resume():
    """Scrape free resume templates from sc.chinaz.com.

    Prompts the user for how many listing pages to crawl, then for every
    template card on each page follows the detail link, picks a random
    download mirror, and saves the template's .rar archive into
    ./免费简历模板, named after the card's img alt text.

    Raises:
        ValueError: if the user's input is not an integer.
        requests.RequestException: on network failures.
    """
    index = int(input('您要下载几页:'))
    file = './免费简历模板'
    # makedirs(exist_ok=True) replaces the exists-check + mkdir pair
    # (no race, no error if the directory is already there).
    os.makedirs(file, exist_ok=True)
    headers = {
        'Connection': 'close',  # release the connection after each request
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.116 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free_%d.html'
    # BUG FIX: range(1, index) skipped the last requested page
    # (asking for 1 page downloaded nothing); include page `index`.
    for page in range(1, index + 1):
        if page == 1:
            # The first listing page has its own URL, without the _%d suffix.
            new_url = 'http://sc.chinaz.com/jianli/free.html'
        else:
            # Build the URL for page 2 onward.
            new_url = url % page
        # timeout prevents the scraper hanging forever on a dead connection.
        response = requests.get(url=new_url, headers=headers, timeout=30)
        # Force UTF-8 so the Chinese template names decode correctly.
        response.encoding = 'utf-8'
        list_tree = etree.HTML(response.text)
        # Each resume card is a direct child div of #container.
        div_list = list_tree.xpath('//div[@id="container"]/div')
        for div in div_list:
            # Detail-page URL of this template.
            detail_url = div.xpath('./a/@href')[0]
            # The img alt text doubles as the saved file's name.
            resume_name = div.xpath('./a/img/@alt')[0]
            detail_page = requests.get(url=detail_url, headers=headers, timeout=30).text
            # Separate name: the original shadowed the listing `tree` here.
            detail_tree = etree.HTML(detail_page)
            download_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            if not download_list:
                # Layout changed or no mirrors listed: random.choice([])
                # would raise IndexError, so skip this template instead.
                continue
            # Random mirror spreads requests and dodges per-IP throttling.
            download_url = random.choice(download_list)
            data = requests.get(url=download_url, headers=headers, timeout=30).content
            filename = resume_name + '.rar'
            filepath = file + '/' + filename
            with open(filepath, 'wb') as fp:
                fp.write(data)
            print(filename, '下载成功')
# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    resume()
    print('Over!!!')