Scrapy crawler proxies — with the Crawlera service you no longer need to hunt for proxy IPs
When a crawler scrapes the same site for a long time, its IP may get banned. In that case you can switch to proxy IPs to get around the server's IP restriction.
A quick Baidu search for free proxy IPs turns up a number of sites; here we take scraping the Xici (西刺) proxy site as an example.
To verify whether a proxy IP is usable, the idea is to access a target site through the proxy: if the response status code is 200, the proxy works.
Reference: http://www.jianshu.com/p/588241a313e7
Method 1:
import requests

try:
    requests.get('http://wenshu.court.gov.cn/', proxies={"http": "http://121.31.154.12:8123"})
except:
    print('connect failed')
else:
    print('success')
Method 2:
import telnetlib

try:
    telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
except:
    print('connect failed')
else:
    print('success')

Note that method 2 only checks that the host accepts a TCP connection on that port; it does not prove the proxy actually forwards HTTP traffic.
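Both snippets above only report whether the connection succeeded. Tying this back to the 200-status rule described earlier, here is a minimal sketch using the requests library; the is_proxy_alive helper, the proxy address, and the target URL are placeholders of my own, not part of the original post.

import requests

def is_proxy_alive(proxy, url="http://www.baidu.com/", timeout=5):
    # Fetch a page through the proxy and treat HTTP 200 as "usable".
    try:
        r = requests.get(url, proxies={"http": "http://" + proxy}, timeout=timeout)
        return r.status_code == 200   # 200 means the proxy forwarded the request successfully
    except requests.RequestException:
        return False                  # timeout, refused connection, bad proxy, etc.

print(is_proxy_alive("121.31.154.12:8123"))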
Example:
#coding=UTF-8
import urllib.request

def validateIp():
    inFile = open('proxy.txt', 'r')      # one proxy per line, e.g. "http=218.21.169.19:8998"
    f = open("proxy2.txt", "w")          # working proxies get written here
    url = "http://www.baidu.com/"
    for line in inFile.readlines():
        try:
            #print(line)
            line = line.strip('\n')
            proxy_host = '://'.join(line.split('='))     # "http=ip:port" -> "http://ip:port"
            #print(proxy_host)
            proxy_temp = {line.split("=")[0]: proxy_host}
            print(proxy_temp)
            # NOTE: in Python 3, urllib.request.urlopen() has no 'proxies' parameter
            # (that was the Python 2 urllib.urlopen signature), so this call raises a
            # TypeError that the bare except reports as "connect failed" -- which is why
            # every proxy fails in the results below. The later examples install a
            # ProxyHandler instead, which is the correct approach in Python 3.
            urllib.request.urlopen(url, proxies=proxy_temp).read()
            f.write(line + '\n')
        except Exception as e:
            print('%s connect failed' % line)
            continue
    f.close()

if __name__ == '__main__':
    validateIp()
Results:
{'http': 'http://218.21.169.19:8998'}
http=218.21.169.19:8998 connect failed
{'http': 'http://27.46.74.38:9999'}
http=27.46.74.38:9999 connect failed
{'http': 'http://60.173.35.99:808'}
http=60.173.35.99:808 connect failed
{'http': 'http://218.4.95.182:80'}
http=218.4.95.182:80 connect failed
{'http': 'http://218.56.132.155:8080'}
http=218.56.132.155:8080 connect failed
...
Example from the web (Python 2, urllib2):
#coding:utf-8
import urllib2

def url_user_agent(url):
    # Configure the proxy
    proxy = {'http': '27.24.158.155:84'}
    proxy_support = urllib2.ProxyHandler(proxy)
    # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
    # Add a User-Agent header to mimic a browser and avoid 403 Forbidden responses
    # i_headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    req = urllib2.Request(url, headers=i_headers)
    html = urllib2.urlopen(req)
    if url == html.geturl():
        doc = html.read()
        return doc
    return

url = 'http://www.dianping.com/search/category/2/10/g311'
doc = url_user_agent(url)
print doc
A working example I wrote (Python 3, using ProxyHandler):
#coding=UTF-8
import urllib.request
import chardet

def url_user_agent():
    #url = 'http://quote.stockstar.com/stock'
    url = 'http://www.baidu.com/'
    inFile = open('proxy.txt', 'r')
    f = open("available.txt", "wb")
    for line in inFile.readlines():
        #f.write(line+'\n')
        #print(line)
        line = line.strip('\n')
        #proxy_host = '://'.join(line.split('='))
        proxy_host = line.split('=')[1]
        #print(proxy_host)
        proxy_temp = {line.split("=")[0]: proxy_host}
        print(proxy_temp)
        #proxy_temp = {'http':'58.33.37.205:8118'}
        #proxy_temp = {'http':'119.5.0.100:808'}
        # Configure the proxy
        proxy_support = urllib.request.ProxyHandler(proxy_temp)
        # opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPHandler(debuglevel=1))
        opener = urllib.request.build_opener(proxy_support)
        # Add a User-Agent header to mimic a browser and avoid 403 Forbidden responses
        #i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
        #req = urllib.request.Request(url, headers=i_headers)
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
        urllib.request.install_opener(opener)
        try:
            html = urllib.request.urlopen(url, timeout=5)
            content = html.read()
            print(content)
            #print(type(content))
            #print(chardet.detect(content))
            print("==============================")
            if content.strip() != b'':   # content is bytes, so compare against bytes
                line = line + '\n'
                data = line.encode(encoding="UTF-8")
                f.write(data)
        except Exception as e:
            print('%s connect failed' % line)
    f.close()
    print("Test End !")

if __name__ == '__main__':
    url_user_agent()
Checking the proxies one at a time is slow, because every dead proxy eats the full timeout. The multi-threaded example below validates them in parallel:
#coding=UTF-8
import urllib.request
import urllib
import re
import time
import socket
import threading

# Normalize the proxy IP entries
proxys = []
inFile = open('proxy.txt', 'r')
proxy_ip = open('proxy_ip.txt', 'w')      # file that will collect the working IPs
for line in inFile.readlines():
    line = line.strip('\n')
    #proxy_host = '://'.join(line.split('='))
    proxy_host = line.split('=')[1]
    #print(proxy_host)
    proxy_temp = {line.split("=")[0]: proxy_host}
    print(proxy_temp)
    proxys.append(proxy_temp)

lock = threading.Lock()                   # lock used for thread synchronization

# Check whether one proxy IP works
def test(i):
    socket.setdefaulttimeout(5)           # global timeout
    #url = "http://quote.stockstar.com/stock"   # site we plan to crawl
    url = "http://www.baidu.com/"         # site we plan to crawl
    try:
        proxy_support = urllib.request.ProxyHandler(proxys[i])
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
        urllib.request.install_opener(opener)
        res = urllib.request.urlopen(url).read()
        lock.acquire()                    # acquire the lock before touching shared state
        print(proxys[i], 'is OK')
        proxy_ip.write('%s\n' % str(proxys[i]))   # record the working proxy IP
        lock.release()                    # release the lock for the next thread
    except Exception as e:
        lock.acquire()
        print(proxys[i], e)
        lock.release()

# Single-threaded check
'''for i in range(len(proxys)):
    test(i)'''

# Multi-threaded check
threads = []
start = time.clock()   # NOTE: time.clock() was removed in Python 3.8; use time.perf_counter() there
for i in range(len(proxys)):
    thread = threading.Thread(target=test, args=[i])
    threads.append(thread)
    thread.start()
# Block the main thread until all worker threads have finished
for thread in threads:
    thread.join()
proxy_ip.close()       # close the output file
end = time.clock()
print("Start time: %f s" % start)
print("End time: %f s" % end)
print("Proxy check took: %f s" % (end - start))
Results:
...
{'http': '221.197.1.210:14515'} <urlopen error timed out>
{'http': '183.78.183.156:82'} is OK
Start time: 0.000000 s
End time: 6.352310 s
Proxy check took: 6.352309 s
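Since the point of all this validation is to feed the working proxies back into a Scrapy crawler, here is a minimal sketch of that last step: Scrapy's built-in HttpProxyMiddleware honors request.meta['proxy'], so a spider can attach one of the validated addresses to each request. The spider name and the load_proxies helper are illustrative assumptions of mine; also note that proxy_ip.txt as written by the script above stores dict strings, so in practice the loader would need to parse that format.

# Sketch only: assumes proxy_ip.txt has been post-processed to one "ip:port" per line.
import random
import scrapy

def load_proxies(path="proxy_ip.txt"):
    # Hypothetical helper: read one "ip:port" entry per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

class ProxyDemoSpider(scrapy.Spider):
    name = "proxy_demo"
    start_urls = ["http://www.baidu.com/"]

    def start_requests(self):
        proxies = load_proxies()
        for url in self.start_urls:
            # Attach a random validated proxy; HttpProxyMiddleware (enabled by default)
            # reads it from request.meta['proxy'].
            proxy = "http://" + random.choice(proxies)
            yield scrapy.Request(url, meta={"proxy": proxy}, callback=self.parse)

    def parse(self, response):
        self.logger.info("Fetched %s via %s", response.url, response.meta.get("proxy"))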