"""Download the gallery images of Toutiao search results.

The script pages through the search API (three pages of 20 results, for the
URL-encoded keyword baked into ``SEARCH_URL``), opens every result that has
an ``article_url``, extracts the gallery JSON embedded in the article HTML
and saves each gallery image into ``DOWNLOAD_DIR``.
"""
import json
import os
import re
from urllib import request

import requests

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Backward-compatible alias for the original (misspelled) module-level name.
hearders = HEADERS

# ``{}`` is replaced by the result offset of the requested page.
SEARCH_URL = (
    'https://www.toutiao.com/search_content/?offset={}&format=json'
    '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1'
    '&from=search_tab'
)
DOWNLOAD_DIR = 'download11'
# Seconds to wait for a server before giving up; requests never times out
# unless told to.
REQUEST_TIMEOUT = 10
# The article page embeds its gallery as ``gallery: JSON.parse("..."),``.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')


def _fetch_article_urls(offset):
    """Return the article URLs listed on the search page starting at *offset*.

    Raises:
        requests.RequestException: the HTTP request failed or timed out.
        ValueError: the response body is not valid JSON.
    """
    search_response = requests.get(
        SEARCH_URL.format(offset), headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    # response.json() gives the decoded object (a dict) directly.
    payload = search_response.json()
    # 'data' may be missing or null; treat both as "no results".
    items = payload.get('data') or []
    # Only entries that carry an 'article_url' are of interest.
    return [item['article_url'] for item in items if 'article_url' in item]


def _extract_image_urls(html):
    """Return the gallery image URLs embedded in *html*.

    Returns None when the page has no gallery marker or the embedded data
    cannot be decoded into the expected structure.
    """
    match = GALLERY_PATTERN.search(html)
    if match is None:
        return None
    try:
        # The captured text is a JSON string literal whose content is itself
        # a JSON document, hence the two decoding passes.
        gallery = json.loads(json.loads(match.group(1)))
        return [image['url'] for image in gallery['sub_images']]
    except (ValueError, KeyError, TypeError):
        return None


def _download_image(image_url):
    """Save *image_url* into DOWNLOAD_DIR as '<last URL segment>.jpg'."""
    print(image_url)
    filename = os.path.join(DOWNLOAD_DIR, image_url.split('/')[-1] + '.jpg')
    try:
        request.urlretrieve(image_url, filename)
    except (OSError, ValueError) as err:
        # OSError covers URLError/HTTPError; ValueError is raised for URLs
        # urllib cannot interpret. One bad image must not stop the run.
        print('failed to download {}: {}'.format(image_url, err))


def main():
    """Walk the search result pages and download every gallery image."""
    # Create the target directory once instead of checking per article.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    for offset in range(0, 60, 20):
        try:
            article_urls = _fetch_article_urls(offset)
        except (requests.RequestException, ValueError) as err:
            print('failed to load search page at offset {}: {}'.format(offset, err))
            continue

        for article_url in article_urls:
            try:
                article_response = requests.get(
                    article_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
                )
            except requests.RequestException as err:
                print('failed to load article {}: {}'.format(article_url, err))
                continue

            image_urls = _extract_image_urls(article_response.text)
            if image_urls is None:
                # Message kept from the original script: printed when the
                # article has no usable gallery.
                print('哈哈哈')
                continue

            for image_url in image_urls:
                _download_image(image_url)


if __name__ == '__main__':
    main()