【前言】这几天研究了验证码的处理,解决方案大致有三种。第一、手工输入,即先把验证码图片保存下来,再由我们手工输入;第二、使用cookie,但不好弄,还得解析各种数据包;第三、图像处理方案,我读研究生时也做过相关课题,所以就用了这种。
一、处理思路
直接上代码。亲测可用,能够自动登录中国期货市场监控中心的网站:https://investorservice.cfmmc.com/
#!/usr/bin/python
# encoding: utf-8

import base64
import hashlib
import hmac
import random
import re
import sys
import time
import urllib
import urllib2

from selenium import webdriver
from PIL import Image
from pytesseract import *
import PIL.ImageOps
import requests


# Approach 1: online recognition. The captcha image is downloaded to disk
# first, but because the captcha is dynamic the image fetched here is not the
# one shown on the login page, so the recognised text never matches.
def yanzheng_online():
    """Download a captcha image, binarise it and OCR it with Tencent Cloud.

    Python 2 only: relies on ``urllib2`` and on byte-string HMAC signing.

    Returns:
        The alphanumeric characters of the last ``itemstring`` value found in
        the OCR response, or ``0`` when the response contains no such value.
    """
    image_path = "G:/360Downloads/pic/yanzhengma.png"

    # --- fetch the captcha image -----------------------------------------
    user_agent = ("User-Agent",
                  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36")
    opener = urllib2.build_opener()
    opener.addheaders = [user_agent]
    urllib2.install_opener(opener)

    # To build a data set instead, loop i over range(1, 1500), append str(i)
    # to the URL below and save each response as "<i>.png", pausing 1 s
    # between requests.
    captcha_url = "https://investorservice.cfmmc.com/veriCode.do?t=1531728079700"
    # A timeout keeps a stalled connection from hanging the script forever.
    data = urllib2.urlopen(captcha_url, timeout=30).read()
    with open(image_path, 'wb') as image_file:
        image_file.write(data)
    time.sleep(1)

    # --- image processing --------------------------------------------------
    # Greyscale, then threshold at 200: brighter pixels become black (0),
    # everything else white (255). The result overwrites the download.
    grey = Image.open(image_path).convert('L')
    binary = grey.point(lambda x: 0 if x > 200 else 255)
    binary.save(image_path)

    # --- Tencent Cloud OCR -------------------------------------------------
    # NOTE: credentials are hard-coded placeholders; substitute your own
    # account values and avoid committing real secrets.
    appid = "1257XX2374"  # your Tencent Cloud app id (masked here)
    secret_id = "AKIDGKXXXXXXXXX1XnnWyA5sFgz"  # your account's secret id
    secret_key = "EDwRggaXXXXXXXXXXysY0CA"  # your account's secret key
    expired = time.time() + 2592000  # now + 2592000 seconds (30 days)
    current = time.time()
    rdm = ''.join(random.choice("0123456789") for _ in range(10))

    # The bucket field is intentionally left out of the signed string.
    info = "a=" + appid + "&k=" + secret_id + "&e=" + str(expired) + "&t=" + str(current) + "&r=" + str(
        rdm) + "&u=0&f="

    signindex = hmac.new(secret_key, info, hashlib.sha1).digest()  # HMAC-SHA1
    sign = base64.b64encode(signindex + info)  # base64-encode digest + text

    url = "http://recognition.image.myqcloud.com/ocr/general"
    headers = {'Host': 'recognition.image.myqcloud.com',
               "Authorization": sign,
               }
    # Open the upload inside ``with`` so the handle is closed after the POST.
    with open(image_path, 'rb') as upload:
        files = {'appid': (None, appid),
                 'image': ('yanzhengma.png', upload, 'image/jpeg'),
                 }
        r = requests.post(url, files=files, headers=headers, timeout=30)
    responseinfo = r.content

    # Regex-scrape every itemstring value; the matches can contain stray
    # symbols, so only letters and digits of the last match are kept.
    result = re.findall(r'itemstring":"(.*?)"', responseinfo)
    if not result:
        return 0
    return ''.join(ch for ch in result[-1] if ch.isalnum())


# Approach 2: crop the captcha out of a screenshot of the login page and
# recognise that crop. The captcha matches with this approach.
# Defaults shared by the screenshot-based captcha flow below.
_CAPTCHA_IMAGE_PATH = 'G:/360Downloads/pic/yanzhengma.png'
# (left, upper, right, lower) of the captcha inside the page screenshot;
# adjust to wherever the captcha appears on your screen.
_CAPTCHA_BOX = (526, 247, 623, 273)


def _binarize_captcha(image_path):
    """Convert the image at *image_path* to black/white in place."""
    grey = Image.open(image_path).convert('L')
    # Threshold at 200: brighter pixels become black (0), the rest white (255).
    binary = grey.point(lambda x: 0 if x > 200 else 255)
    binary.save(image_path)


def _tencent_ocr(image_path):
    """Send *image_path* to Tencent Cloud general OCR; return the cleaned text.

    Returns the alphanumeric characters of the last ``itemstring`` value in
    the response, or ``0`` when the response contains none.
    """
    # NOTE: credentials are hard-coded placeholders; substitute your own
    # account values and avoid committing real secrets.
    appid = "1257XXX374"  # your Tencent Cloud app id (masked here)
    secret_id = "AKIDGKXXXXXXXXnnWyA5sFgz"  # your account's secret id
    secret_key = "EDwRggaXXXXXXXXtVrysY0CA"  # your account's secret key
    expired = time.time() + 2592000  # now + 2592000 seconds (30 days)
    current = time.time()
    rdm = ''.join(random.choice("0123456789") for _ in range(10))

    # The bucket field is intentionally left out of the signed string.
    info = "a=" + appid + "&k=" + secret_id + "&e=" + str(expired) + "&t=" + str(current) + "&r=" + str(
        rdm) + "&u=0&f="

    signindex = hmac.new(secret_key, info, hashlib.sha1).digest()  # HMAC-SHA1
    sign = base64.b64encode(signindex + info)  # base64-encode digest + text

    url = "http://recognition.image.myqcloud.com/ocr/general"
    headers = {'Host': 'recognition.image.myqcloud.com',
               "Authorization": sign,
               }
    # Open the upload inside ``with`` so the handle is closed after the POST;
    # the timeout keeps a stalled connection from hanging the login flow.
    with open(image_path, 'rb') as upload:
        files = {'appid': (None, appid),
                 'image': ('yanzhengma.png', upload, 'image/jpeg'),
                 }
        r = requests.post(url, files=files, headers=headers, timeout=30)
    responseinfo = r.content

    # Regex-scrape every itemstring value and keep only the letters and
    # digits of the last match.
    result = re.findall(r'itemstring":"(.*?)"', responseinfo)
    if not result:
        return 0
    return ''.join(ch for ch in result[-1] if ch.isalnum())


def yanzheng_local(image_path=_CAPTCHA_IMAGE_PATH, box=_CAPTCHA_BOX):
    """Recognise the captcha contained in a previously saved page screenshot.

    The screenshot at *image_path* is cropped to *box*, overwritten with the
    crop, binarised, and then sent to Tencent Cloud OCR.

    Args:
        image_path: Screenshot file; it is overwritten with the processed crop.
        box: ``(left, upper, right, lower)`` crop rectangle in pixels.

    Returns:
        The recognised alphanumeric text, or ``0`` if OCR returned nothing.
    """
    screenshot = Image.open(image_path)
    region = screenshot.crop(box)  # ``region`` is a new image object
    # Do not call region.show() here: displaying the image keeps the file
    # in use, which gets in the way of overwriting it.
    region.save(image_path)

    _binarize_captcha(image_path)
    return _tencent_ocr(image_path)


def login(username, password):
    """Log in to investorservice.cfmmc.com with Selenium and Chrome.

    Fills in the user id and password, screenshots the page so that
    ``yanzheng_local`` can read the captcha, submits the form and prints a
    success message when the resulting URL contains ``login``.

    Args:
        username: Value typed into the ``userID`` field.
        password: Value typed into the ``password`` field.
    """
    url = 'https://investorservice.cfmmc.com'

    # Raw string: the Windows path contains backslashes.
    driver = webdriver.Chrome(
        executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    try:
        driver.get(url)
        name_input = driver.find_element_by_name('userID')  # user-name box
        pass_input = driver.find_element_by_name('password')  # password box
        yanzheng_input = driver.find_element_by_name('vericode')  # captcha box
        login_button = driver.find_element_by_name('imageField2')  # login button

        name_input.clear()
        name_input.send_keys(username)
        time.sleep(0.2)
        pass_input.clear()
        pass_input.send_keys(password)

        # Needed by the screenshot-based approach: save the page that
        # currently shows the captcha.
        driver.get_screenshot_as_file('G:/360Downloads/pic/yanzhengma.png')

        # Approach 2 (crop + recognise); yanzheng_online() is the alternative.
        yzm = yanzheng_local()
        print(yzm)
        yanzheng_input.send_keys(yzm)
        time.sleep(1.2)
        login_button.click()

        time.sleep(1.2)

        # "登录成功" printed means the login worked; otherwise run again.
        # NOTE(review): success is inferred from 'login' appearing in the
        # URL after submitting - confirm this is not inverted for this site.
        if 'login' in driver.current_url:
            print("登录成功")
    finally:
        # Always shut the browser and the chromedriver process down, even
        # when a lookup or the OCR step raises.
        driver.quit()


if __name__ == "__main__":
    # Account credentials
    user = "xxxxxxx"
    pw = "xxxxxxxx"
    login(user, pw)