Ready to copy and run
Crawl a website with a script you can copy and run as-is. It saves each page's HTML into a folder named after the host, along with the page's CSS files, scripts, and images.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Starting URL and timeout settings
base_url = "https://example.com"  # replace with the site you want to crawl
timeout = (5, 30)  # (connect, read) timeout; requests measures these in seconds, not milliseconds
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.svg', '.webp']
visited = set()  # URLs already crawled, so mutually linked sites cannot recurse forever
# Crawl a page and mirror its directory structure on disk
def crawl_site(url, defaultindex="index.html", current_dir="crawled_files"):
    if url in visited:  # skip pages we have already fetched
        return
    visited.add(url)
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        parsed_url = urlparse(url)
        current_dir = os.path.join(current_dir, parsed_url.netloc)
        # Create a directory for the current page
        os.makedirs(current_dir, exist_ok=True)
        # Save the page's HTML; skip if the file already exists
        html_file_path = os.path.join(current_dir, defaultindex)
        if not os.path.exists(html_file_path):
            with open(html_file_path, 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)
        # Handle CSS files
        for css_link in soup.find_all('link', rel='stylesheet', href=True):
            css_url = urljoin(url, css_link['href'])
            css_basename = os.path.basename(urlparse(css_url).path)
            if not css_basename:  # stylesheet URL ends in "/", nothing to name the file
                continue
            css_directory_path = os.path.join(current_dir, "css")
            os.makedirs(css_directory_path, exist_ok=True)
            # Download the CSS file; skip if it already exists
            css_filename = os.path.join(css_directory_path, css_basename)
            if not os.path.exists(css_filename):
                css_response = requests.get(css_url, timeout=timeout)
                with open(css_filename, 'wb') as file:
                    file.write(css_response.content)
        # Handle static resources loaded via src (scripts and images)
        for tag in soup.find_all(['script', 'img'], src=True):
            resource_url = urljoin(url, tag.get('src'))
            if resource_url:
                # Parse the link and extract its path component
                resource_path = urlparse(resource_url).path
                # os.path.basename gives the last path component, e.g. "logo.png"
                filename = os.path.basename(resource_path)
                if not filename:
                    continue
                directory_path = os.path.dirname(resource_path)
                # basename of the directory gives its last component, e.g. "images"
                images_field = os.path.basename(directory_path)
                if images_field:
                    save_dir = os.path.join(current_dir, images_field)
                else:
                    _, file_extension = os.path.splitext(resource_path)
                    if file_extension.lower() in image_extensions:
                        save_dir = os.path.join(current_dir, "images")
                    else:
                        save_dir = current_dir  # root-level resources sit next to the HTML file
                # Create the directory structure
                os.makedirs(save_dir, exist_ok=True)
                save_path = os.path.join(save_dir, filename)
                # Download the file; skip if it already exists
                if not os.path.exists(save_path):
                    resource_response = requests.get(resource_url, timeout=timeout)
                    with open(save_path, 'wb') as file:
                        file.write(resource_response.content)
        # Follow third-party external links
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if href.startswith(('http://', 'https://')):
                # Parse the link to get its host and a filename for its index page
                parsed_link = urlparse(href)
                index_name = parsed_link.path.split("/")[-1] or "index.html"  # fall back when the path ends in "/"
                if parsed_link.netloc.startswith('www.'):
                    continue  # skip www.-prefixed hosts
                crawl_site(href, index_name)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
# Start crawling
crawl_site(base_url)
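
The script fetches pages back-to-back; a common refinement is to pause between requests so the crawler stays polite to the servers it visits. Here is a minimal sketch of that idea. polite_get, CRAWL_DELAY, and the one-second pause are illustrative assumptions, not part of the script above:

import time
import requests

CRAWL_DELAY = 1.0  # hypothetical pause between requests, in seconds; tune per site
TIMEOUT = (5, 30)  # (connect, read) timeout, mirroring the main script

def polite_get(url, **kwargs):
    # requests.get plus a fixed pause, so repeated fetches do not hammer one host
    response = requests.get(url, timeout=TIMEOUT, **kwargs)
    time.sleep(CRAWL_DELAY)
    return response

To use it, replace each requests.get(..., timeout=timeout) call inside crawl_site with polite_get(...). A fixed delay is the simplest policy; honoring a Crawl-delay entry from the site's robots.txt, where one exists, is the better-behaved option.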