import re
from urllib.parse import urlparse
import requests
from fake_useragent import UserAgent
class ProxySpider:
    def __init__(self, url_init='https://www.xicidaili.com/nn/'):
        '''
        Initialize the spider.
        :param url_init: seed URL of the high-anonymity proxy list page
        '''
        self.headers = {'User-Agent': UserAgent().random}  # random User-Agent
        self.url_seed = url_init  # high-anonymity proxy list page
        self.timeout = 5  # timeout (seconds) when testing whether a proxy works
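    # Note: fake_useragent supplies real browser User-Agent strings;
    # UserAgent().random picks one at random so the requests below look
    # less like an automated script.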
    def myRequest(self):
        '''
        Issue a plain GET request against the seed URL.
        :return: the requests.Response
        '''
        return requests.get(self.url_seed, headers=self.headers)
    def save_resource(self, path_file):
        '''
        Save the fetched list page to a local file.
        :param path_file: path of the output file
        '''
        res = self.myRequest()
        with open(path_file, 'w', encoding='utf-8') as f:
            f.write(res.content.decode('utf-8'))
    def parse(self):
        '''
        Parse the high-anonymity proxy list page.
        :return: list of proxy URLs such as 'HTTP://ip:port'
        '''
        content_html = self.myRequest().content.decode('utf-8')
        content_html = content_html.replace('\n', '')
        # Each proxy sits in a <tr class="odd"> row: the first two plain <td>s
        # hold the IP and port, a later <td> holds the protocol (HTTP/HTTPS).
        res = re.findall(r'<tr\sclass="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(HTTP|HTTPS)</td>',
                         content_html, re.I)
        return [proxy_parts[2] + '://' + proxy_parts[0] + ':' + proxy_parts[1] for proxy_parts in res]
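    # The list parse() returns looks like this (addresses purely illustrative):
    #   ['HTTP://1.2.3.4:8080', 'HTTPS://5.6.7.8:9999', ...]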
    def url_parts(self, domain):
        '''
        Split a URL into its components.
        :param domain: the URL to split
        :return: a urllib.parse.ParseResult
        '''
        return urlparse(domain)
    def ip_filter_available(self):
        '''
        Filter the proxy list down to one working proxy.
        :return: a proxies dict usable by requests, or None if none works
        '''
        ips = self.parse()
        for ip in ips:
            pr = self.url_parts(ip)
            scheme = pr.scheme  # urlparse lowercases the scheme, e.g. 'http'
            netloc = pr.netloc
            ip_new = scheme + '://' + netloc
            proxies = {scheme: ip_new}
            try:
                # Probe a URL of the same scheme as the proxy: requests ignores
                # an {'http': ...} dict for an https:// URL, so a mismatched
                # probe would "succeed" without ever using the proxy.
                res = requests.get(scheme + '://www.baidu.com/',
                                   proxies=proxies, timeout=self.timeout)
                if res.status_code == 200:
                    return proxies
            except Exception:
                continue
Test:
proxy_spider = ProxySpider()
print(proxy_spider.ip_filter_available())
Result:
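Usage sketch: if a proxy was found, the returned dict plugs straight into requests. (httpbin.org/ip is only an illustrative echo endpoint, not part of the original code.)

proxies = proxy_spider.ip_filter_available()
if proxies:
    scheme = next(iter(proxies))  # 'http' or 'https', matching the proxy
    # Route a request through the proxy; the echo service reports the exit IP.
    res = requests.get(scheme + '://httpbin.org/ip', proxies=proxies, timeout=5)
    print(res.text)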