1-11 requests模块之爬取简历模板(xpath解析)

# 目标网站:https://sc.chinaz.com/jianli/free.html
from lxml import etree
import requests


def get_page_source_code(url):
    """Fetch a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to request.

    Returns:
        The response body as a string, decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        # Browser-like user agent sent with every page request.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36"
    }
    # The context manager releases the connection even if reading the body
    # raises; the timeout keeps a stalled server from hanging the crawl.
    with requests.get(url=url, headers=headers, timeout=30) as resp:
        resp.encoding = "utf-8"
        return resp.text


def analyze_child_data_and_save_data(child_name, child_html):
    """Parse a detail page, download the linked file and save it under ./6/.

    Called from analyze_target_data() once per detail page.

    Args:
        child_name: Name used as the saved file's base name.
        child_html: HTML source of the detail page.

    Raises:
        requests.RequestException: If the download fails or times out.
        OSError: If the output directory or file cannot be written.
    """
    # Local import so this function stays self-contained in the file.
    import os

    tree = etree.HTML(child_html)
    # The download address is the first link of the "clearfix" list.
    download_links = tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href')
    if not download_links:
        # A page without a download link should not abort the whole crawl.
        print(f"未找到下载地址,跳过:{child_name}")
        return
    download_url = download_links[0]

    # The context manager closes the response even if reading the body fails.
    with requests.get(download_url, timeout=60) as resp:
        content = resp.content

    # The saved file reuses whatever follows the last dot of the download URL.
    extension = download_url.split('.')[-1]
    # open() does not create missing directories, so make sure ./6 exists.
    os.makedirs("./6", exist_ok=True)
    with open(f"./6/{child_name}.{extension}", "wb") as f:
        f.write(content)


def analyze_target_data(main_html):
    """Parse a listing page and process every detail page it links to.

    For each entry found, prints its URL and name, fetches the detail page
    with get_page_source_code() and hands it to
    analyze_child_data_and_save_data().

    Args:
        main_html: HTML source of a listing page.
    """
    tree = etree.HTML(main_html)
    # Narrow the search to the per-entry div elements of the main container.
    for div in tree.xpath('//div[@id="main"]/div/div'):
        names = div.xpath('./a/img/@alt')
        hrefs = div.xpath('./a/@href')
        if not names or not hrefs:
            # Skip divs that lack a linked image instead of raising
            # IndexError and aborting the remaining entries.
            continue
        child_page_name = names[0]
        # NOTE(review): assumes hrefs are protocol-relative ("//host/path"),
        # since the scheme is simply prepended — confirm against the site.
        child_page_url = "https:" + hrefs[0]
        print(child_page_url, child_page_name)
        # Request the detail page, then parse it and save the download.
        child_html = get_page_source_code(child_page_url)
        analyze_child_data_and_save_data(child_page_name, child_html)


def main():
    """Prompt for a page range and crawl every listing page in it.

    Reads the start and end page numbers from standard input. Invalid input
    (non-numeric, non-positive, or start greater than end) prints a message
    and returns without crawling.
    """
    try:
        start = int(input("请输入您要爬取的开始页:"))
        end = int(input("请输入您要爬取的结束页:"))
    except ValueError:
        # Non-numeric input is reported like any other invalid page number.
        print("请输入正确的页码(从1开始)")
        return

    if start <= 0 or end <= 0:
        print("请输入正确的页码(从1开始)")
        return
    if start > end:
        # range(start, end + 1) would be empty; say so instead of exiting
        # silently.
        print("开始页不能大于结束页")
        return

    for page in range(start, end + 1):
        print(f"正在爬取第{page}页:")
        # The first listing page has no numeric suffix in its URL.
        if page == 1:
            url = "https://sc.chinaz.com/jianli/free.html"
        else:
            url = f"https://sc.chinaz.com/jianli/free_{page}.html"
        main_html = get_page_source_code(url)
        analyze_target_data(main_html)


# Start the crawler only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()

运行结果:
(原文此处为运行结果截图,图片未随文本保留)

  • xpath表达式中可以使用运算符,例如用 `|` 合并多个路径表达式的匹配结果:
	a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')

猜你喜欢

转载自blog.csdn.net/ungoing/article/details/124116979