爬取图片并按标题建立文件夹存图

import requests,re
import json
from urllib import request
import os

# Search endpoint for the URL-encoded keyword "街拍"; the `offset`
# placeholder selects which page of results (20 per page) is returned.
SEARCH_URL_TEMPLATE = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
}

# Root folder that receives one sub-folder per article title.
DOWNLOAD_ROOT = 'download'
# Seconds to wait on each HTTP request made through `requests`, so a
# stalled connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10
# The article page embeds its gallery as `gallery: JSON.parse("<json string>"),`.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')
# Characters that are path separators or are rejected in file names on
# common file systems, plus ASCII control characters.
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_dir_name(title):
    """Return *title* turned into a single, valid directory name.

    Unsafe characters are replaced with ``_`` and surrounding spaces/dots
    are stripped; an empty result falls back to ``'untitled'``.
    """
    cleaned = _UNSAFE_NAME_CHARS.sub('_', str(title)).strip(' .')
    return cleaned or 'untitled'


for i in range(1):
    url = SEARCH_URL_TEMPLATE.format(20 * i)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of an obscure JSON decode error.
    response.raise_for_status()

    res = response.json()
    # `data` may be absent or null; treat both as "no results".
    data_list = res.get('data') or []

    for data_item in data_list:
        # Entries without an article URL cannot be crawled.
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']
        title = data_item.get('title') or ''

        try:
            article_response = requests.get(
                article_url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable article must not abort the whole crawl.
            print('skipping {}: {}'.format(article_url, exc))
            continue

        match = GALLERY_PATTERN.search(article_response.text)
        if not match:
            # Not a gallery article: move on to the next search result.
            continue

        try:
            # The captured text is a JSON string literal whose content is
            # itself JSON, hence the two decoding passes.
            gallery = json.loads(json.loads(match.group(1)))
        except (TypeError, ValueError):
            continue
        if not isinstance(gallery, dict):
            continue
        sub_images = gallery.get('sub_images') or []
        if not sub_images:
            continue

        # Create download/<title>/ only once there is something to store;
        # makedirs also creates the missing `download` root.
        target_dir = os.path.join(DOWNLOAD_ROOT, _safe_dir_name(title))
        os.makedirs(target_dir, exist_ok=True)

        for image in sub_images:
            image_url = image['url']
            print(image_url)
            # Name the file after the last URL path segment.
            name = image_url.split('/')[-1] + '.jpg'
            filename = os.path.join(target_dir, name)

            # Download the image; a single failure is reported and skipped.
            try:
                request.urlretrieve(image_url, filename)
            except OSError as exc:
                print('failed to download {}: {}'.format(image_url, exc))

猜你喜欢

转载自blog.csdn.net/weixin_42958164/article/details/81750663