# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os
def _fetch_tree(page_url, headers, timeout=10):
    """Download ``page_url`` and return it parsed as an lxml HTML tree.

    Args:
        page_url: Absolute URL of the page to fetch.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The element tree produced by ``etree.HTML`` for the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get(url=page_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Decode the body as UTF-8 up front. The earlier approach decoded with
    # whatever charset requests picked and then repaired each string via
    # encode('iso-8859-1').decode('utf-8'), which raises UnicodeEncodeError
    # as soon as the response is decoded with any other charset.
    # NOTE(review): assumes the site serves UTF-8, as the old repair did.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def main():
    """Print the archive name and first download link of each listed entry.

    Fetches the listing page, follows the link of every entry found under
    ``//div[@id="main"]/div/div``, and reads the entry title and the first
    download link from its detail page.  Entries whose pages lack the
    expected nodes, or whose detail page cannot be fetched, are skipped
    instead of aborting the whole run.
    """
    # Local imports: only this entry point needs them.
    import sys
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'

    # Output directory; nothing in this script writes into it yet.
    os.makedirs('./muban', exist_ok=True)

    listing_tree = _fetch_tree(url, headers)

    templates = []
    for div in listing_tree.xpath('//div[@id="main"]/div/div'):
        hrefs = div.xpath('./a/@href')
        if not hrefs:
            # Not every div under #main necessarily wraps a link.
            continue
        # Resolve relative and protocol-relative ("//host/...") links;
        # absolute links pass through urljoin unchanged.
        detail_url = urljoin(url, hrefs[0])

        try:
            detail_tree = _fetch_tree(detail_url, headers)
        except requests.RequestException as err:
            print('skipping', detail_url, err, file=sys.stderr)
            continue

        titles = detail_tree.xpath(
            '//div[@class="bread clearfix"]/a[3]/text()')
        links = detail_tree.xpath(
            '//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')
        if not titles or not links:
            continue

        # Keep the link as a plain string (the xpath result is a list).
        templates.append(
            (titles[0] + '.rar', urljoin(detail_url, links[0])))

    for name, download_url in templates:
        print(name, download_url)


if __name__ == '__main__':
    main()
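# Article title: "Python crawler study (15): downloading templates with XPath"
# Reposted from blog.csdn.net/haimian_baba/article/details/103820485
# (Blog page navigation labels followed here: "You may also like",
# "Today's picks", "Weekly ranking".)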