# 目标网站:https://sc.chinaz.com/jianli/free.html
import os

import requests
from lxml import etree
# 获取网页源代码:
def get_page_source_code(url):
    """Fetch *url* with a desktop browser User-Agent and return its HTML text.

    The response encoding is forced to UTF-8 because the target site
    serves UTF-8 pages without always declaring it in the headers.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36"
    }
    # Context manager releases the connection even if decoding raises,
    # unlike the manual resp.close() which leaked on exceptions.
    with requests.get(url=url, headers=headers) as resp:
        resp.encoding = "utf-8"
        return resp.text
# 解析子页面数据,用于在analyze_target_data()中被调用
# Parse a detail page's data; called from analyze_target_data().
def analyze_child_data_and_save_data(child_name, child_html):
    """Extract the download link from a detail page and save the file.

    The file is written to ./6/<child_name>.<ext>, where <ext> is taken
    from the download URL (e.g. "rar" or "zip").
    """
    tree = etree.HTML(child_html)
    # The first <li> under ul.clearfix carries the download anchor.
    download_url = tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href')[0]
    with requests.get(download_url) as resp:
        content = resp.content
    # Create the output directory up front so open() cannot fail on a
    # missing folder.
    os.makedirs("./6", exist_ok=True)
    suffix = download_url.split('.')[-1]
    with open(f"./6/{child_name}.{suffix}", "wb") as f:
        f.write(content)
# 解析数据:
# Parse the listing page's data:
def analyze_target_data(main_html):
    """Parse the listing page: for each entry, print its name/URL, fetch
    the detail page, and hand it to analyze_child_data_and_save_data().
    """
    tree = etree.HTML(main_html)
    # 1. Narrow the search down to the per-item <div> nodes.
    div_list = tree.xpath('//div[@id="main"]/div/div')
    for div in div_list:
        names = div.xpath('./a/img/@alt')
        hrefs = div.xpath('./a/@href')
        if not names or not hrefs:
            # Skip layout divs that carry no thumbnail/link; the bare
            # [0] index previously raised IndexError on these.
            continue
        child_page_name = names[0]
        # hrefs are protocol-relative ("//sc.chinaz.com/...").
        child_page_url = "https:" + hrefs[0]
        print(child_page_url, child_page_name)
        # Request the detail page:
        child_html = get_page_source_code(child_page_url)
        # Parse the detail page:
        analyze_child_data_and_save_data(child_page_name, child_html)
def main():
    """Prompt for a 1-based page range and crawl each listing page."""
    start = int(input("请输入您要爬取的开始页:"))
    end = int(input("请输入您要爬取的结束页:"))
    if start <= 0 or end <= 0:
        print("请输入正确的页码(从1开始)")
        return
    for page in range(start, end + 1):
        print(f"正在爬取第{page}页:")
        # Page 1 uses a different URL pattern than the later pages.
        if page == 1:
            url = "https://sc.chinaz.com/jianli/free.html"
        else:
            url = f"https://sc.chinaz.com/jianli/free_{page}.html"
        main_html = get_page_source_code(url)
        analyze_target_data(main_html)


if __name__ == '__main__':
    main()
# 运行结果:
# - xpath表达式中可以使用运算符:
#   a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')