"""Download gallery images from Toutiao search results.

The script pages through the Toutiao search endpoint (offsets 0, 20, 40),
opens every result that carries an ``article_url``, extracts the
``gallery: JSON.parse(...)`` payload embedded in the article page and saves
each listed image under ``download/<title>/``.
"""
import json
import os
import re
from urllib import request

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4620.400 QQBrowser/9.7.13014.400",
    "cookie": "tt_webid=6590204151702865412; tt_webid=6590204151702865412; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16541746a22ab-0980602f6a1886-34497b51-100200-16541746a23108; __tasessionId=qtlh48pt41534422027645; CNZZDATA1259612802=1044114658-1534399527-https%253A%252F%252Fwww.baidu.com%252F%7C1534421128; csrftoken=d2b46370bc8e2e2b6ba59384b3292811; tt_webid=6590204151702865412",
}

# Proxy configuration.
# NOTE(review): the previously hard-coded proxy URL had been mangled by e-mail
# obfuscation ("[email protected]") and embedded a plaintext password, so it
# could not work as written. Supply the proxy through the TOUTIAO_PROXY
# environment variable instead, e.g. "http://user:password@host:6666".
# When it is unset no explicit proxy dict is passed to requests.
_proxy_url = os.environ.get('TOUTIAO_PROXY')
proxy = {'http': _proxy_url, 'https': _proxy_url} if _proxy_url else None

# Search endpoint; ``{}`` is filled with the paging offset.
SEARCH_URL = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
# Root directory that receives one sub-directory per article.
DOWNLOAD_ROOT = 'download'
# Seconds to wait for a server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Captures the argument of the ``gallery: JSON.parse(...)`` call in a page.
GALLERY_PATTERN = re.compile(r"gallery: JSON\.parse\((.*)\),")
# Characters that cannot be used safely inside a directory name.
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_dirname(title):
    """Return *title* made safe for use as a single directory name."""
    cleaned = _UNSAFE_PATH_CHARS.sub('_', title).strip()
    return cleaned or 'untitled'


def _fetch(url):
    """GET *url* with the shared headers, proxy settings and timeout."""
    return requests.get(
        url, headers=headers, proxies=proxy, timeout=REQUEST_TIMEOUT
    )


def _download_gallery(article_url, title):
    """Download every gallery image of one article into its own folder.

    Prints a diagnostic and returns when the page has no gallery marker;
    returns silently when the marker is present but its payload is empty.
    """
    # ``response.text`` is the decoded page as ``str`` (not bytes).
    html = _fetch(article_url).text
    match = GALLERY_PATTERN.search(html)
    if not match:
        print('报错了, 不应该来我这')
        return

    raw_payload = match.group(1)
    if not raw_payload:
        return

    # The payload is a JSON string literal that itself contains JSON:
    # the first loads yields a str, the second yields the dict.
    gallery = json.loads(json.loads(raw_payload))
    # List of image descriptors, each carrying a 'url' key.
    images = gallery['sub_images']

    # Create the target folder (and the download root) if missing.
    target_dir = os.path.join(DOWNLOAD_ROOT, _safe_dirname(title))
    os.makedirs(target_dir, exist_ok=True)
    print(title)

    for image in images:
        image_url = image['url']
        # File name: last URL path segment plus a .jpg extension.
        filename = os.path.join(target_dir, image_url.split('/')[-1] + '.jpg')
        request.urlretrieve(image_url, filename)


def main():
    """Walk the search result pages and download each article's gallery."""
    for offset in range(0, 60, 20):
        # response.json() returns the already-decoded object (a dict).
        index_dict = _fetch(SEARCH_URL.format(offset)).json()
        # 'data' may be absent or null; treat both as "no results".
        for item in index_dict.get('data') or []:
            if 'article_url' in item:
                _download_gallery(item['article_url'], item['title'])


if __name__ == '__main__':
    main()
今日头条图片
猜你喜欢
转载自blog.csdn.net/q810935819/article/details/81750351
今日推荐
周排行