Python爬虫-美图

概述

最近刚学习python爬虫，requests，正则，bs4，xpath等知识模块。于是想着动手实战综合一下。

分析思路一

首先当然是打开网页，按下F12，审查网页元素

通过观察，我们发现图片分类放在class为"menu"的ul中，也就是说，我们第一步就是要取到这些分类链接以及它们的名称（用来作保存的目录）

接着我们随便打开一个分类，在网页底部的分页栏处可以看到这个图片分类的总页数。因此，在爬取首页分类的同时，我们还要进入各个分类，拿到每个分类的总页数。下面开始动手写第一步的代码：

# Request headers attached to every HTTP call made by this script:
# a desktop Chrome User-Agent string and a fixed Cookie string.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/78.0.3904.108 Safari/537.36"
)
_COOKIE = "; ".join([
    "UM_distinctid=16f17eb1956204-08f1bf21c6e9dc-2393f61-144000-16f17eb195774a",
    "CNZZDATA1255357127=1932079525-1576651463-https%253A%252F%252Fwww.baidu.com%252F%7C1577411949",
])

headers = {"User-Agent": _USER_AGENT, "Cookie": _COOKIE}

# Fetch a page and return its HTML as a string
def parse_html(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers``.  The raw bytes
    are decoded with the default codec of ``bytes.decode()`` (UTF-8).
    """
    response = requests.get(url, headers=headers)
    raw_body = response.content
    return raw_body.decode()

# Parse the home page into category records (URL, name, total pages) and
# save them to a file
def _get_total_page(category_url):
    """Return how many listing pages the category at *category_url* has."""
    category_page = etree.HTML(parse_html(category_url))
    # The second-to-last link of the pager carries the last page number.
    # The original code looked for the pager under id 'page' in one place
    # and under id 'pages' in another, so both ids are accepted here.
    pager_texts = category_page.xpath(
        "//div[@id='page' or @id='pages']/a[last()-1]/text()")
    # Some categories have a single page and therefore no pager at all.
    return int(pager_texts[0]) if pager_texts else 1


def get_category_url_list(home_url="https://www.meitulu.com/"):
    """Collect every category linked from the home page and save it as JSON.

    Each record is ``{"href": ..., "name": ..., "total_page": ...}`` where
    ``total_page`` is always an ``int``.  The records are written to
    ``meitulu.json`` so later runs do not have to visit the home page again.

    Args:
        home_url: Address of the home page to parse.
            NOTE(review): the original code never fetched the home page
            (it parsed an unassigned variable); this default is inferred
            from the "meitulu" naming used in the file -- confirm.

    Returns:
        The list of category records that was written to the file.
    """
    # Parse the home page once and keep it in its own variable: the category
    # pages fetched below must not replace the document the links come from.
    home_page = etree.HTML(parse_html(home_url))
    links = (home_page.xpath("//ul[@class='menu']//a")            # top menu
             + home_page.xpath("//div[@class='mode1_source']//a"))  # bottom links

    category_list = []
    for link in links:
        hrefs = link.xpath("./@href")
        names = link.xpath("./text()")
        # Skip links that are useless as a category: no target, no text,
        # or the "home" entry itself.
        if not hrefs or not names or names[0] == '首页':
            continue
        category_list.append({
            "href": hrefs[0],
            "name": names[0],
            "total_page": _get_total_page(hrefs[0]),
        })

    # Save to a file for reuse (avoids visiting the home page repeatedly).
    with open("meitulu.json", "w", encoding="utf-8") as f:
        json.dump(category_list, f, ensure_ascii=False, indent=2)
    return category_list

# Script entry point: collect the categories and write meitulu.json
if __name__ == "__main__":
    get_category_url_list()

分析思路二

拿到了所有分类及总页数，接下来当然就是遍历分类，然后挨个去爬图片

没想到，链接还挺规律的，这就好办了，a标签都不用拿了，只要拿img标签就行了，取出它的src链接，还有alt里的内容，方括号里的41也就是图片的个数

代码示例二

class Meitulu:
    """Download every album of every category listed in the saved JSON file.

    Args:
        save_root: Directory under which the category/album folders are
            created.  Defaults to the path the script always used.
        json_path: File holding the category records
            (``href``, ``name``, ``total_page``).
    """

    def __init__(self, save_root="D:/pythonTest/美图录/", json_path="meitulu.json"):
        self.category_url_list = None
        self.save_root = save_root
        self.json_path = json_path

    @staticmethod
    def _page_url(href, page):
        """Return the URL of listing page *page* (1-based) of a category.

        Page 1 is the category URL itself; later pages append ``_<page>``.
        NOTE(review): the suffix scheme is kept from the original code and
        is not verified against the site -- confirm.
        """
        return href if page == 1 else href + "_" + str(page)

    def _album_dir(self, index, category_name, album_name):
        """Return the directory of one album, e.g. ``<root>/01<category>/<album>/``."""
        return "{}/{:02d}{}/{}/".format(
            self.save_root.rstrip("/"), index, category_name, album_name)

    def run(self):
        """Walk all categories and pages and save every image to disk."""
        # 1. Load the category records saved earlier.
        with open(self.json_path, "r", encoding="utf-8") as f:
            self.category_url_list = json.load(f)

        # 2. Visit each category; the 1-based position prefixes its folder.
        for index, category in enumerate(self.category_url_list, start=1):
            href = category["href"]  # second-level URL
            category_name = category["name"]
            # The saved value may be a string, so normalise it.
            total_page = int(category["total_page"])

            # Pages are numbered 1..total_page inclusive.
            for page in range(1, total_page + 1):
                # 3. Parse the album names and image URLs of this page.
                img_url_list = parse_img_url_list(self._page_url(href, page))
                print("此页详情数: ", len(img_url_list))

                # 4. Save every image of every album.
                for album in img_url_list:
                    urls = album['urls']
                    if not urls:
                        continue
                    album_dir = self._album_dir(index, category_name, album['name'])
                    os.makedirs(album_dir, exist_ok=True)
                    for url in urls:
                        # e.g. D:/pythonTest/美图录/01女神/xxx/1.jpg
                        file_path = album_dir + url.rsplit("/", 1)[-1]
                        save_img(url, file_path)

# Parse one listing page and build the image URLs of every album on it
def parse_img_url_list(url):
    """Return the albums found on the listing page at *url*.

    Each album is a dict ``{"name": <alt text>, "urls": [<image url>, ...]}``.
    The image URLs are derived from the cover image: its file name has
    ``0.`` swapped for ``1.``, ``2.``, ... up to the picture count read from
    the album's ``alt`` text.

    NOTE(review): the count is taken from the first ``[<digits>]`` group in
    the alt text, and images are assumed to be numbered 1..count; the
    original pattern was missing, so confirm against real page data.
    """
    img_url_list = []
    page = etree.HTML(parse_html(url))
    for img in page.xpath("//ul[@class='img']/li/a/img"):
        alts = img.xpath("./@alt")
        srcs = img.xpath("./@src")
        if not alts or not srcs:
            # Without a name or a cover URL nothing can be built.
            continue
        name = alts[0]  # album name
        counts = re.findall(r"\[(\d+)\]", name)
        if not counts:
            # No picture count in the alt text: skip instead of crashing.
            continue
        count = int(counts[0])

        # Only rewrite the file name, never the host or directory part.
        folder, sep, cover_file = srcs[0].rpartition("/")
        url_list = [
            folder + sep + cover_file.replace("0.", "%d." % i, 1)
            for i in range(1, count + 1)  # inclusive of the last image
        ]
        img_url_list.append({"name": name, "urls": url_list})
    return img_url_list

# Save one image
def save_img(url, file_path):
    """Stream *url* into *file_path* while printing a one-line progress display.

    When the response announces a positive ``content-length`` a 50-cell bar
    with a percentage is shown; otherwise (header missing, unparsable or
    zero) only the number of bytes received so far is printed.

    Args:
        url: Address of the image to download.
        file_path: Destination file; it is opened in binary write mode.
    """
    bar_width = 50
    chunk_size = 1024  # bytes read from the response per iteration
    with closing(requests.get(url, headers=headers, stream=True)) as response:
        # The header may be absent or zero; never index it directly or
        # divide by it unchecked.
        try:
            content_length = int(response.headers.get('content-length', 0))
        except ValueError:
            content_length = 0
        data_count = 0  # bytes written so far
        with open(file_path, "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                data_count = data_count + len(data)
                if content_length > 0:
                    # Clamp: the received size can exceed the announced one.
                    ratio = min(data_count / content_length, 1.0)
                    done_block = int(ratio * bar_width)
                    print("\r%s：[%s%s] %d%%" % (
                        file_path, done_block * '█',
                        ' ' * (bar_width - done_block), ratio * 100), end=" ")
                else:
                    print("\r%s：%d bytes" % (file_path, data_count), end=" ")
    print("\r")

# 开始运行

if __name__ == "__main__":
    # The category-collection step stays disabled here:
    # get_category_url_list()
    Meitulu().run()

如果嫌爬得太多，可以把meitulu.json里的数据删掉一些

原文链接：https://blog.csdn.net/mb1791592482/article/details/103739411

猜你喜欢