使用代理ip能提高爬虫的效率。下面花一点时间用python写一个爬取代理ip的小程序,代理ip来源是西刺免费代理网站,爬取一页就够用了。
引入模块
import re

import requests
获得网页
def get_html():
    """Fetch the Xici free-proxy listing page and return its HTML text."""
    target = 'https://www.xicidaili.com/wt/'  # Xici free HTTP proxy list
    # Browser-like User-Agent so the site serves the normal page to the scraper.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }
    response = requests.get(target, headers=headers)
    return response.text
获得ip
def get_ip_and_port(html):
    """Extract proxy tokens from the listing page HTML.

    Returns a flat, alternating list [ip, port, ip, port, ...]; the
    table markup is uniform enough that one regex over the cells works.
    """
    cell = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+|\d+)</td>')  # ip address | port
    return cell.findall(html)
ip格式
def save_proxy_ip(ip_list):
    """Write scraped tokens to proxy_ip.txt, one "ip:port" per line.

    ip_list alternates [ip, port, ip, port, ...] as produced by
    get_ip_and_port: even-indexed tokens are IPs, odd-indexed are ports.

    Improvements over the original: the file is opened with a context
    manager (no leak if a write raises), and the manual 1-based counter
    is replaced by enumerate.
    """
    with open('proxy_ip.txt', 'w') as file_object:
        for index, token in enumerate(ip_list):
            if index % 2 == 1:
                # Odd index -> port: finish the "ip:port" line.
                file_object.write(token + '\n')
            else:
                # Even index -> ip: start the line and add the separator.
                file_object.write(token + ':')
测试保存ip是否可用
def test_ip():
    """Probe every proxy in proxy_ip.txt and save working ones to ok_ip.txt.

    Each "ip:port" line is tried up to three times against httpbin; on the
    first successful response the line is appended to ok_ip.txt and the
    remaining retries are skipped.

    Fixes over the original: `num ++` was a syntax error and `if num < 3`
    never looped, so failures were tried only once; `break` exited the
    outer for-loop over proxies instead of just the retry loop, dropping
    every proxy after the first working one; the trailing newline from the
    file line leaked into the proxy URL; the bare `except:` swallowed
    everything including KeyboardInterrupt; files were not closed on error.
    """
    test_url = 'http://httpbin.org/get'  # simple echo endpoint for testing
    with open('proxy_ip.txt', 'r') as ip_file, open('ok_ip.txt', 'w') as new_file:
        for ip in ip_file:
            # Strip the newline so the proxy URL is well-formed.
            proxy = 'http://' + ip.strip()
            for _attempt in range(3):  # up to three tries per proxy
                try:
                    requests.get(test_url, proxies={'http': proxy}, timeout=1)
                except requests.RequestException:
                    continue  # this attempt failed; try again
                else:
                    new_file.write(ip)  # keep original "ip:port\n" line
                    break  # proxy works; move on to the next one
至此,可用的代理ip已经保存在ok_ip.txt中。
所有代码
"""Scrape free HTTP proxies from xicidaili.com, save them as ip:port lines
in proxy_ip.txt, then keep only the proxies that actually work in ok_ip.txt.

Fixes over the original listing: `import reuqests` typo; `num ++` and
`if __name__ = '__main__'` syntax errors; `r.raise_for_status()` referenced
an undefined name (the response variable is `resp`); `if num < 3` never
retried and `break` exited the loop over all proxies after the first
success; the newline from each file line leaked into the proxy URL; files
were opened without context managers; bare `except:` swallowed everything.
"""
import re

import requests


def get_html():
    """Fetch the Xici free-proxy listing page and return its HTML text."""
    url = 'https://www.xicidaili.com/wt/'  # Xici free HTTP proxy list
    # Browser-like User-Agent so the site serves the normal page.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}
    req = requests.get(url, headers=header)
    return req.text


def get_ip_and_port(html):
    """Return the alternating [ip, port, ip, port, ...] token list.

    The table markup is uniform, so one regex over the cells is enough.
    """
    return re.findall(r'<td>(\d+\.\d+\.\d+\.\d+|\d+)</td>', html)  # ip address | port


def save_proxy_ip(ip_list):
    """Write the tokens to proxy_ip.txt, one "ip:port" per line."""
    with open('proxy_ip.txt', 'w') as file_object:
        for index, token in enumerate(ip_list):
            if index % 2 == 1:
                # Odd index -> port: finish the line.
                file_object.write(token + '\n')
            else:
                # Even index -> ip: start the line with the separator.
                file_object.write(token + ':')


def test_ip():
    """Probe each saved proxy (three tries each); keep working ones in ok_ip.txt."""
    test_url = 'http://httpbin.org/get'  # simple echo endpoint for testing
    with open('proxy_ip.txt', 'r') as ip_file, open('ok_ip.txt', 'w') as new_file:
        for ip in ip_file:
            # Strip the newline so the proxy URL is well-formed.
            proxy = 'http://' + ip.strip()
            for _attempt in range(3):  # up to three tries per proxy
                try:
                    resp = requests.get(test_url, proxies={'http': proxy}, timeout=1)
                    resp.raise_for_status()
                except requests.RequestException:
                    continue  # this attempt failed; try again
                else:
                    new_file.write(ip)  # keep original "ip:port\n" line
                    break  # proxy works; move on to the next one


if __name__ == '__main__':
    html = get_html()
    ip_list = get_ip_and_port(html)
    save_proxy_ip(ip_list)
    test_ip()