Baidu Crawler
Fetching Names
# coding: utf-8
import requests
from lxml import etree

# Fill in proxy settings here if you are behind a proxy; empty means direct access.
proxies = {
}
def getName(link):
    """Fetch one listing page and append every name on it to Name.txt.
    Returns the number of names found (0 signals the last page)."""
    print(link)
    NameList = []
    try:
        r = requests.get(link, proxies=proxies)
        r.encoding = 'gbk'  # the site is GBK-encoded
        html = etree.HTML(r.text)
        NameList = html.xpath('//div[@class="i_cont_s"]/a/text()')
        with open('Name.txt', 'a+', encoding='utf-8') as f:
            f.write('\n'.join(NameList) + '\n')
    except Exception:
        print('failed: ', link)
    return len(NameList)
# Crawl the A-Z index pages (alternative entry point, kept for reference):
'''
baselink = 'http://www.manmankan.com/dy2013/mingxing/'
for i in range(ord('A'), ord('Z') + 1):
    link = baselink + chr(i) + '/'
    getName(link)
    page = 2
    while True:
        slink = link + 'index_' + str(page) + '.shtml'
        lens = getName(slink)
        page += 1
        if lens < 1:  # an empty page means we have run past the last one
            break
'''
Links = [
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/riben/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/hanguo/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/riben/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/hanguo/'
]

for link in Links:
    getName(link)
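The twelve category lists overlap (many stars appear both as actor and as singer), so Name.txt usually ends up with duplicates. A minimal, order-preserving dedup sketch, assuming Name.txt was produced by the crawler above:

# coding: utf-8
# Deduplicate Name.txt in place, keeping first-seen order.
# Assumes one name per line, as written by getName() above.
with open('Name.txt', 'r', encoding='utf-8') as f:
    names = f.read().splitlines()

seen = set()
unique = []
for n in names:
    if n and n not in seen:   # skip blanks and repeats
        seen.add(n)
        unique.append(n)

with open('Name.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(unique) + '\n')

print('%d names, %d unique' % (len(names), len(unique)))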
Crawling Photos
# coding: utf-8
import requests
import os
import multiprocessing

proxies = {
    "http": "http://d84105117:@[email protected]:8080/",   # note: the trailing '/' is required
    "https": "http://d84105117:@[email protected]:8080/"
}
def getManyPages(keyword, pages):
    """Build the query parameters for Baidu's image-search JSON API and
    return the 'data' list of each result page (30 images per page)."""
    params = []
    for i in range(0, 30 * pages, 30):
        params.append({
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,     # result offset
            'rn': 30,    # results per page
            'gsm': '1e',
            '1536131285172': ''   # timestamp-like key sent by the site, kept verbatim
        })
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for param in params:
        try:
            r = requests.get(url, params=param, proxies=proxies)   # one request per page
            print(r.url)
            urls.append(r.json().get('data'))
        except Exception as e:
            print(e)
    return urls
def getImg(dataList, localPath):
    """Download every thumbnail in dataList into localPath, numbering the files."""
    if not os.path.exists(localPath):   # create the target folder
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        if not page:                    # a failed page yields None; skip it
            continue
        for i in page:
            if i.get('thumbURL') is not None:
                print('downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'), proxies=proxies, timeout=15, verify=False)
                with open(localPath + '%d.jpg' % x, 'wb') as f:
                    f.write(ir.content)
                x += 1
def spider(keyword):
    print('processing', keyword)
    dataList = getManyPages(keyword, 3)   # arg 1: keyword; arg 2: number of pages to download
    getImg(dataList, keyword + '/')       # arg 2: destination directory
    with open('ok.txt', 'a+') as f:       # record the keyword as done
        f.write(keyword + '\n')

if __name__ == '__main__':
    with open('Name.txt', 'r', encoding='utf-8') as f:
        NameList = f.read().splitlines()
    with open('ok.txt', 'a+') as f:       # 'a+' creates the file on the first run
        f.seek(0)
        OkList = f.read().splitlines()
    pool = multiprocessing.Pool(processes=4)
    for keyword in NameList:
        if keyword in OkList:
            print(keyword + ' is already ok, continue-----')
            continue
        pool.apply_async(spider, args=(keyword, ))
    pool.close()
    pool.join()
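Because getImg passes verify=False, requests (via urllib3) prints an InsecureRequestWarning for every download. If the console noise bothers you, the warning can be silenced up front; an optional snippet using urllib3's standard API:

# Optional: silence the InsecureRequestWarning triggered by verify=False.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)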
Cropping Faces
# coding: utf-8
import mxnet as mx
from mtcnn_detector import MtcnnDetector
import cv2
import os
import numpy as np

detector = MtcnnDetector(model_folder='model', ctx=mx.cpu(0), num_worker=4, accurate_landmark=False)

base_dirs = 'D:\\code\\python\\china\\'
dirlist = os.listdir(base_dirs)
for dirs in dirlist:
    savedir = 'D:\\data\\' + dirs
    if not os.path.exists(savedir):   # create the output folder
        os.mkdir(savedir)
    base_dir = base_dirs + dirs + '\\'
    index = 0
    imagelist = os.listdir(base_dir)
    while index < len(imagelist):
        imagecp = base_dir + imagelist[index]
        print(imagecp)
        # np.fromfile + imdecode handles non-ASCII (Chinese) file names on Windows
        img = cv2.imdecode(np.fromfile(imagecp, dtype=np.uint8), 1)
        results = detector.detect_face(img)
        if results is not None and len(results[0]) == 1:   # keep only single-face images
            total_boxes = results[0]
            points = results[1]
            b = total_boxes[0]
            print(b)
            try:
                # Expand the detected box by half its height/width on every side
                # (the crop is roughly 2x the detection), clipped to the image.
                bound0 = (b[3] - b[1]) / 2
                bound1 = (b[2] - b[0]) / 2
                b[1] -= bound0
                b[0] -= bound1
                if b[1] < 0: b[1] = 0
                if b[0] < 0: b[0] = 0
                b[2] += bound1
                b[3] += bound0
                if b[2] > img.shape[1]: b[2] = img.shape[1]
                if b[3] > img.shape[0]: b[3] = img.shape[0]
                imageok = img[int(b[1]):int(b[3]), int(b[0]):int(b[2])]
                if imageok.shape[0] > 100 and imageok.shape[1] > 100:   # skip tiny crops
                    diss = savedir + '\\' + imagelist[index]
                    # imencode + tofile, again for non-ASCII paths
                    cv2.imencode('.jpg', imageok)[1].tofile(diss)
            except Exception:
                pass
        index += 1
# Optional visualization of the detection (kept for reference):
'''
draw = img.copy()
cv2.rectangle(draw, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))
for p in points:
    for i in range(5):
        cv2.circle(draw, (p[i], p[i + 5]), 1, (0, 0, 255), 2)
cv2.imshow("img", draw)
key = cv2.waitKey(0)
'''
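The crop arithmetic above is easier to see in isolation. A minimal sketch of the same expand-and-clip step as a standalone function (the name expand_box is mine, not from the script):

# coding: utf-8
# Hypothetical helper illustrating the expand-and-clip step used above:
# grow a detection box [x1, y1, x2, y2] by half its width/height on every
# side, then clip to the image, so the crop is roughly 2x the detection.
def expand_box(b, img_w, img_h):
    half_h = (b[3] - b[1]) / 2
    half_w = (b[2] - b[0]) / 2
    x1 = max(b[0] - half_w, 0)
    y1 = max(b[1] - half_h, 0)
    x2 = min(b[2] + half_w, img_w)
    y2 = min(b[3] + half_h, img_h)
    return [int(x1), int(y1), int(x2), int(y2)]

# e.g. a 100x100 detection centered in a 400x400 image becomes a 200x200 crop:
print(expand_box([150, 150, 250, 250], 400, 400))   # [100, 100, 300, 300]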
MTCNN: https://github.com/pangyupo/mxnet_mtcnn_face_detection
Reference: https://blog.csdn.net/qq_32166627/article/details/60882964