前言
需要大量图片做数据采集时,就需要用 Python 来获取图片,本篇以 Python 多线程获取图片为例。
一、查看页面元素
查看页面源代码。
二、请求url查看
按 F12 打开浏览器开发者工具,查看图片搜索的请求 URL,其请求参数如下:
tn: resultjson_com
logid: 12339447258259285711
ipn: rj
ct: 201326592
is:
fp: result
fr:
word: 动漫图片
queryWord: 动漫图片
cl: 2
lm: -1
ie: utf-8
oe: utf-8
adpicid:
st: -1
z:
ic:
hd:
latest:
copyright:
s:
se:
tab:
width:
height:
face: 0
istype: 2
qc:
nc: 1
expermode:
nojc:
isAsync:
pn: 60
rn: 30
gsm: 3c
1669373933133:
三、代码实现
# -*- coding: utf-8 -*-
import os
import re
import time
from multiprocessing import Pool
import requests
from multiprocessing.dummy import Pool as ThreadPool # 线程池
def _fetch_and_save(n, image_url, header, save_dir):
    """Fetch one image and write it to disk; intended to run inside a pool worker."""
    response = requests.get(url=image_url, headers=header, timeout=10)
    # Do not save an HTML error page under a .jpg name; let the pool's
    # error_callback report the failure instead.
    response.raise_for_status()
    download(n, response.content, save_dir)


def get_image(keyword, page_num, save_dir):
    """Query Baidu image search and queue every thumbnail found for download.

    Requests ``page_num`` result pages of 30 entries each for ``keyword``,
    extracts every ``thumbURL`` from the response text and submits one
    download task per URL to the module-level ``pool``. Task failures are
    reported through the module-level ``func.err_call_back``.

    Args:
        keyword: Search term sent as both ``word`` and ``queryWord``.
        page_num: Number of result pages to request.
        save_dir: Directory the images are written to; created if missing.

    Note:
        This function only queues work. The caller owns ``pool`` and must
        close and join it before exiting so that queued downloads finish.
    """
    # Browser-like User-Agent so the request is not rejected as a bot.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    # Search endpoint.
    url = 'https://image.baidu.com/search/acjson?'
    page_size = 30  # sent as 'rn'; the offset 'pn' must advance by the same amount
    os.makedirs(save_dir, exist_ok=True)
    n = 0  # running index used to build unique file names
    for page in range(page_num):
        # Request parameters, mirroring what the browser sends.
        param = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': '',
            'hd': 1,
            'latest': '',
            'copyright': '',
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': '1',
            'fr': '',
            'expermode': '',
            'force': '',
            'cg': '',
            # Offset of the first result on this page: 0, 30, 60, ...
            # (the original 1, 30, 59, ... skipped the first result and
            # repeated one image on every page boundary).
            'pn': page * page_size,
            'rn': str(page_size),
            'gsm': '1e',
            '1669373933133': ' '
        }
        response = requests.get(url=url, headers=header, params=param, timeout=10)
        if response.status_code != 200:
            print('request failed: HTTP {}'.format(response.status_code))
            continue
        print('success.')
        response.encoding = 'utf-8'
        html = response.text
        image_url_list = re.findall('"thumbURL":"(.*?)",', html, re.S)
        for image_url in image_url_list:
            # Fetch inside the worker so the downloads themselves run
            # concurrently, not just the file writes.
            pool.apply_async(_fetch_and_save, args=(n, image_url, header, save_dir),
                             error_callback=func.err_call_back)
            n += 1
class Func(object):
    """Small holder for the callbacks passed to the worker pool."""

    def __init__(self):
        # A lambda stands in for an attribute that cannot be pickled.
        # A more common real-world version of this mistake is opening a
        # long-lived database connection here.
        self.num = lambda: None

    def work(self, num=None):
        """Store ``num`` on the instance and return it."""
        self.num = num
        return self.num

    @staticmethod
    def call_back(res):
        """Success callback: print the task result."""
        # The original string lacked the f prefix and printed "{res}" literally.
        print(f'Hello,World! {res}')

    @staticmethod
    def err_call_back(err):
        """Error callback: print the exception raised by a task."""
        print('出错啦:[{}]'.format(err))
def download(n, image_data, save_dir):
    """Write ``image_data`` to ``save_dir`` as ``1_<n, zero-padded to 6 digits>.jpg``.

    Args:
        n: Integer index used in the file name.
        image_data: Raw image bytes to write.
        save_dir: Existing directory that receives the file.
    """
    path = os.path.join(save_dir, "{}_{:06d}.jpg".format("1", n))
    # The context manager closes the handle even if write() raises.
    with open(path, 'wb') as fp:
        fp.write(image_data)
if __name__ == '__main__':
    # get_image() reads `func` and `pool` as module-level globals, so both
    # must be bound here before it is called.
    func = Func()
    keyword = '动漫图片'
    save_dir = keyword
    page_num = int(input("页数:"))
    # Thread pool with 10 workers: the work is network and disk bound, so
    # threads are sufficient and avoid pickling task arguments.
    pool = ThreadPool(10)
    try:
        get_image(keyword, page_num, save_dir)
    finally:
        # Stop accepting tasks and wait for every queued one to finish;
        # without this the script could exit while downloads were pending.
        pool.close()
        pool.join()
    print('完成')
点赞 收藏 关注
见善如不及,见不善如探汤。