Python Crawler: Scraping Pretty Pictures

Overview

I recently learned the basics of Python web scraping: requests, regular expressions, bs4, XPath and so on. So I decided to put them all together in a hands-on project.

Analysis, Part 1

First, of course, open the web page, press F12, and inspect the page elements.

Looking at the page, we find that the image categories live in a ul with class "menu". So the first step is to grab these category links along with their names (the names will be used as the directories for saving).

Next, open any category: the pagination bar at the bottom of the page shows that category's total number of pages. So while scraping the categories from the homepage, we also need to visit each category to get its total page count. Let's write the first piece of code.

import json

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Cookie": "UM_distinctid=16f17eb1956204-08f1bf21c6e9dc-2393f61-144000-16f17eb195774a; CNZZDATA1255357127=1932079525-1576651463-https%253A%252F%252Fwww.baidu.com%252F%7C1577411949"
}


# Fetch a URL and return the HTML as a string
def parse_html(url):
    resp = requests.get(url, headers=headers)
    return resp.content.decode()

 

 

# Parse the homepage, collect every category's URL, name and total page count,
# and save the result to a file
def get_category_url_list():
    category_list = []

    index_url = "..."  # homepage of the site, fill in the actual address
    html_str = parse_html(index_url)
    home_html = etree.HTML(html_str)

    # Parse the links in the top menu
    menu_a = home_html.xpath("//ul[@class='menu']//a")
    for i in menu_a:
        # Skip links that are of no use to us
        if len(i.xpath("./text()")) == 0 or i.xpath("./text()")[0] == '首页':
            continue
        href = i.xpath("./@href")[0]
        name = i.xpath("./text()")[0]

        # Open the category page and parse its total page count
        html_str = parse_html(href)
        html = etree.HTML(html_str)
        # Some categories have only one page and therefore no pagination bar;
        # in that case the conditional expression falls back to 1
        total_page = html.xpath("//div[@id='page']/a[last()-1]/text()")[0] if len(
            html.xpath("//div[@id='page']/a[last()-1]/text()")) > 0 else 1

        item = {"href": href, "name": name, "total_page": int(total_page)}
        category_list.append(item)

    # Parse the links at the bottom of the homepage
    menu_b = home_html.xpath("//div[@class='mode1_source']//a")
    for j in menu_b:
        href = j.xpath("./@href")[0]
        name = j.xpath("./text()")[0]

        # Open the category page and parse its total page count
        html_str = parse_html(href)
        html = etree.HTML(html_str)
        total_page = html.xpath("//div[@id='pages']/a[last()-1]/text()")[0] if len(
            html.xpath("//div[@id='pages']/a[last()-1]/text()")) > 0 else 1

        item = {"href": href, "name": name, "total_page": int(total_page)}
        category_list.append(item)

    # Save to a file so it can be reused without requesting the homepage again
    with open("meitulu.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(category_list, ensure_ascii=False, indent=2))

 

# Run it
if __name__ == '__main__':
    get_category_url_list()
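
Before moving on, it may help to see what each entry in meitulu.json looks like. The snippet below simply reloads the file and prints the first entry; the value shown in the comment is made up, since the real hrefs and names depend on the site:

import json

with open("meitulu.json", "r", encoding="utf-8") as f:
    categories = json.load(f)

# Each entry carries the three keys built above, e.g. (made-up values):
# {"href": "<category url>", "name": "<category name>", "total_page": 12}
print(categories[0])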

Analysis, Part 2

Now that we have all the categories and their total page counts, the next step is obviously to loop over the categories and scrape the images one by one.

It turns out the links are quite regular, which makes things easy: we don't even need the a tags, only the img tags. Take the src link and the alt text; the number in square brackets (41 here) is the number of images in the album.
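
Extracting that bracketed number can be done with a small regular expression; a minimal sketch, assuming the alt text ends with the count in square brackets (the sample string below is made up):

import re

alt = "some album title [41]"  # made-up alt text in the assumed format
count = int(re.findall(r"\[(\d+)\]", alt)[0])
print(count)  # 41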

Code Example 2

import json
import os
import re
from contextlib import closing

import requests
from lxml import etree

# headers and parse_html() are the same as in the first code example


class Meitulu:
    def __init__(self):
        self.category_url_list = None

    def run(self):
        # 1. Load the json data saved earlier
        with open("meitulu.json", "r", encoding="utf-8") as f:
            self.category_url_list = json.load(f)

        # 2. Iterate to get each category's name, URL and total page count
        for category_url in self.category_url_list:
            href = category_url["href"]  # second-level url
            category_name = category_url["name"]
            total_page = category_url["total_page"]
            for i in range(1, total_page + 1):
                # 3. Request the category page and parse all image links and names
                img_url_list = parse_img_url_list(href + "_" + str(i)) if i != 1 else parse_img_url_list(href)
                print("albums on this page: ", len(img_url_list))

                # Build the directory prefix (zero-padded category index)
                index = self.category_url_list.index(category_url) + 1
                if len(str(index)) == 1:
                    index = "0" + str(index)

                # 4. Save the images
                for img_url in img_url_list:
                    urls = img_url['urls']
                    name = img_url['name']
                    for url in urls:
                        dic_path = "D:/pythonTest/美图录/{}{}/{}/".format(index, category_name, name)
                        if not os.path.exists(dic_path):
                            os.makedirs(dic_path)
                        # e.g. D:/pythonTest/美图录/01女神/xxx/1.jpg
                        file_path = dic_path + url[url.rfind("/") + 1:len(url)]
                        save_img(url, file_path)

 

# Parse a category page and build the list of image URLs for every album on it
def parse_img_url_list(url):
    img_url_list = []
    html = etree.HTML(parse_html(url))

    img_xpath = html.xpath("//ul[@class='img']/li/a/img")

    for img in img_xpath:
        url_and_name = {}
        name = img.xpath("./@alt")[0]
        url_and_name["name"] = name  # album name
        # The number in square brackets in the alt text is the number of images
        count = re.findall(r"\[(\d+)\]", name)[0]
        url_list = []
        for i in range(1, int(count) + 1):
            url = img.xpath("./@src")[0].replace("0.", str(i) + ".")  # image address
            url_list.append(url)
        url_and_name["urls"] = url_list
        img_url_list.append(url_and_name)

    return img_url_list

 

# Download a single image, printing a simple progress bar while streaming
def save_img(url, file_path):
    with closing(requests.get(url, headers=headers, stream=True)) as response:
        chunk_size = 1024  # maximum size per chunk
        content_length = int(response.headers['content-length'])  # file size
        data_count = 0  # bytes transferred so far
        with open(file_path, "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                data_count = data_count + len(data)
                done_block = int((data_count / content_length) * 50)
                now_jd = (data_count / content_length) * 100  # current progress
                print("\r%s[%s%s] %d%%" % (
                    file_path, '█' * done_block, ' ' * (50 - done_block), now_jd), end=" ")
            print("\r")

Run it

if __name__ == '__main__':
    # get_category_url_list()
    meitulu = Meitulu()
    meitulu.run()

If you think it scrapes too much, you can delete some of the data in meitulu.json.
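
A minimal sketch of that idea, assuming you only want to keep the first few categories (the number three below is arbitrary):

import json

# Hypothetical helper: keep only the first three categories in meitulu.json
with open("meitulu.json", "r", encoding="utf-8") as f:
    categories = json.load(f)

with open("meitulu.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(categories[:3], ensure_ascii=False, indent=2))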

Original post: https://blog.csdn.net/mb1791592482/article/details/103739411
