A Python web crawler you can copy and use directly

The script below crawls a website and saves its pages and static resources (CSS, JS, images) into a local directory tree. Copy it and use it directly.
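The only third-party dependencies are requests and beautifulsoup4; assuming a standard Python 3 environment with pip available, they can be installed with:

pip install requests beautifulsoup4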

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

# Starting URL, request timeout, and recognized image extensions
base_url = "URL to crawl"  # replace with the site you want to crawl
timeout = (5, 30)  # (connect timeout, read timeout) in seconds for requests.get
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.svg', '.webp']


# Crawl a page and mirror it into a local directory structure
def crawl_site(url, defaultindex="index.html", current_dir="currentfiles"):
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        parsed_url = urlparse(url)
        current_dir = os.path.join(current_dir, parsed_url.netloc)
        # Create the directory for the current page
        os.makedirs(current_dir, exist_ok=True)

        # Save the current page's HTML; skip if the file already exists
        html_file_path = os.path.join(current_dir, defaultindex)
        if not os.path.exists(html_file_path):
            with open(html_file_path, 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)


        # Handle CSS files referenced by <link rel="stylesheet">
        for css_link in soup.find_all('link', rel='stylesheet', href=True):
            css_url = urljoin(url, css_link['href'])
            parsedcss_url = urlparse(css_url)
            cssfilename = os.path.basename(parsedcss_url.path)
            if not cssfilename:
                continue  # skip stylesheet links without a usable file name
            # Create the css/ directory
            css_directory_path = os.path.join(current_dir, "css")
            os.makedirs(css_directory_path, exist_ok=True)

            # Download the CSS file; skip if it already exists
            css_filename = os.path.join(css_directory_path, cssfilename)
            if not os.path.exists(css_filename):
                css_response = requests.get(css_url, timeout=timeout)
                with open(css_filename, 'wb') as file:
                    file.write(css_response.content)


        # Handle static resources (JS and images)
        for link in soup.find_all(['script', 'img'], src=True):
            resource_url = urljoin(url, link.get('src'))
            if resource_url:
                # Parse the link and extract its path
                parsed_url = urlparse(resource_url)
                resource_path = parsed_url.path
                # os.path.basename returns the last path segment, e.g. "logo.png"
                filename = os.path.basename(resource_path)
                if not filename:
                    continue  # skip resources without a usable file name
                directory_path = os.path.dirname(resource_path)
                # basename of the parent directory, e.g. "images"
                images_field = os.path.basename(directory_path)
                current_dir2 = ""

                if images_field:
                    current_dir2 = os.path.join(current_dir, images_field)
                else:
                    _, file_extension = os.path.splitext(resource_path)
                    if file_extension.lower() in image_extensions:
                        current_dir2 = os.path.join(current_dir, "images")
                if not current_dir2:
                    current_dir2 = current_dir  # fall back to the page directory

                # Create the directory structure
                os.makedirs(current_dir2, exist_ok=True)

                save_url = os.path.join(current_dir2, filename)
                # Download the file; skip if it already exists
                if not os.path.exists(save_url):
                    resource_response = requests.get(resource_url, timeout=timeout)
                    with open(save_url, 'wb') as file:
                        file.write(resource_response.content)


        # Handle external links to other sites
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if href.startswith(('http://', 'https://')):
                # Parse the link and extract its path
                parsed_url = urlparse(href)
                directory_name = parsed_url.netloc
                # Use the last path segment as the local file name, defaulting to index.html
                defaultindename = parsed_url.path.split("/")[-1] or "index.html"
                # Skip hosts that start with "www."
                if directory_name.startswith('www.'):
                    continue

                crawl_site(href, defaultindename)

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")

# Start crawling
crawl_site(base_url)
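
To try it, replace the base_url placeholder at the top with a real URL and run the file. With the default current_dir of "currentfiles", each crawled host gets its own folder; the sketch below shows the expected layout for a hypothetical host (the host and file names are illustrative only, not part of the original script):

# currentfiles/
# └── example.com/
#     ├── index.html     # the downloaded page HTML
#     ├── css/           # stylesheets found via <link rel="stylesheet"> tags
#     └── images/        # bare image URLs; other <script>/<img> resources go into
#                        # a subdirectory named after the last segment of their URL path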


Reposted from blog.csdn.net/weixin_45047825/article/details/134489777