爬虫【9】建立自己的IP池
爬虫回顾:
- 爬虫【1】打开网站,获取信息
- 爬虫【2】重构UserAgent
- 爬虫【3】URL地址编码
- 爬虫【4】爬取百度贴吧并生成静态页面
- 爬虫【5】爬虫猫眼电影100榜单并保存到csv
- 爬虫【6】链家二手房信息和图片并保存到本地
- 爬虫【7】链家二手房信息和图片并保存到本地
- 爬虫【8】request.get()参数详解
- 爬虫【9】建立自己的IP池
为什么要建立自己的IP池
在做爬虫的过程中免不了要使用代理IP,但是如果是个人用户的话,直接购买的价格就有点贵了,所以我们建立自己的IP池,这样就可以使用不要钱的代理IP了!
爬取西刺代理
选择国内高匿代理
分析url
这个页面的url非常简单
https://www.xicidaili.com/nn/{}
即可搞定,一共有4000多页,感觉可以爬一天
Xpath 代码
右键解析源代码,发现页面源码全是静态的,通过分析可以得到
ip地址://tr[@class="odd"]/td[position()=2]/text()
port://tr[@class="odd"]/td[position()=3]/text()
类设计
我们设计一个类,类里面开放的接口有update和get,封装好后放入python环境中,以后就可以直接调用了
注: ip写到csv文件中,csv文件一开始不能是空,需初始化如下
ip
222.95.240.13:3000
171.221.79.223:8118
117.88.176.234:3000
117.88.5.41:3000
话不多说,这个爬虫很简单就不多做说明了
"""
获取西刺代理中的ip,并写入csv文件中
"""
from fake_useragent import UserAgent
import pandas as pd
import requests, time, random
from lxml import etree
class ObtainIP:
    """Scrape free proxies from xicidaili.com, validate them, and maintain
    a local ``ip.csv`` pool file (single column ``ip``, values "host:port").

    Public API: ``update()`` crawls and appends fresh proxies, ``sift()``
    re-validates the stored ones, ``get()`` returns the stored list.
    """

    # URL used to probe whether a proxy is alive (any fast, reliable page works).
    _CHECK_URL = 'https://www.baidu.com/?tn=78040160_26_pg&ch=8'

    def __init__(self):
        # Page template for the "national high-anonymity" proxy listing.
        self.url = 'https://www.xicidaili.com/nn/{}'

    def __get_useragent(self):
        """Return a random User-Agent string."""
        ua = UserAgent()
        return ua.random

    def __get_html(self, url):
        """Fetch *url* with a randomized User-Agent and return the body text."""
        return requests.get(url=url, headers={'User-Agent': self.__get_useragent()}).text

    def __parse_html(self, html):
        """Extract parallel (ip_list, port_list) from one listing page.

        NOTE(review): only rows with class="odd" are matched; if the site
        alternates row classes this skips every other row — confirm.
        """
        html = etree.HTML(html)
        ip = html.xpath('//tr[@class="odd"]/td[position()=2]/text()')
        host = html.xpath('//tr[@class="odd"]/td[position()=3]/text()')
        return ip, host

    def __get_ip(self, ip, host):
        """Join parallel ip/port lists into "ip:port" strings.

        Raises ValueError when the lists differ in length (the original used
        ``assert``, which is stripped under ``python -O``).
        """
        if len(ip) != len(host):
            raise ValueError('ip and port lists differ in length')
        return [addr + ':' + port for addr, port in zip(ip, host)]

    def __is_alive(self, item):
        """Return True when proxy *item* ("ip:port") answers within 5 seconds."""
        proxies = {
            'http': 'http://' + item,
            'https': 'https://' + item,
        }
        headers = {'User-Agent': self.__get_useragent()}
        try:
            # Narrowed from a bare ``except:`` so Ctrl-C and real bugs surface.
            requests.get(url=self._CHECK_URL, headers=headers,
                         proxies=proxies, timeout=5)
        except requests.RequestException:
            return False
        return True

    def __check_ip(self, IP, mode='new'):
        """Drop proxies already stored in ip.csv, then drop dead ones.

        The original removed items from ``IP`` while iterating over it
        (twice), which silently skips elements after each removal; build
        new lists instead.
        """
        exist_ip = set(self.get())  # set: O(1) membership tests
        fresh = [item for item in IP if item not in exist_ip]
        return [item for item in fresh if self.__is_alive(item)]

    def __write_into_csv(self, IP):
        """Splice the validated proxies into ip.csv.

        Each new entry is inserted after row index 2, keeping the first
        three rows pinned at the top (presumably the hand-seeded addresses
        from the article's init snippet — TODO confirm why this matters).
        """
        ips = pd.read_csv('ip.csv')
        for item in IP:
            row = pd.DataFrame([item], columns=['ip'])
            up = ips.loc[:2]
            down = ips.loc[3:]
            ips = pd.concat([up, row, down], ignore_index=True)
        ips.to_csv('ip.csv', index=False)

    def update(self):
        """Crawl every listing page, validate the proxies, and persist them."""
        for i in range(1, 4047):
            print('爬虫到第 %i 页' % i)
            url = self.url.format(i)
            html = self.__get_html(url)
            ip, post = self.__parse_html(html)
            ip = self.__get_ip(ip, post)
            ip = self.__check_ip(ip)
            self.__write_into_csv(ip)
            # Be polite to the site: wait 50-60 s between pages.
            time.sleep(random.randint(50, 60))

    def sift(self):
        """Re-validate every stored proxy and drop the dead ones from ip.csv.

        Bug fix: the original called ``ips.drop(axis=dead_rows)``, passing
        the row labels as the *axis* argument (which raises); the labels
        belong in ``index=``.
        """
        ips = pd.read_csv('ip.csv')
        dead = [idx for idx, item in enumerate(ips['ip'])
                if not self.__is_alive(item)]
        ips.drop(index=dead, inplace=True)
        ips.to_csv('ip.csv', index=False)

    # 获取csv文件中的ip地址
    def get(self):
        """Return the proxies currently stored in ip.csv as a list of str."""
        ips = pd.read_csv('ip.csv')
        return list(ips['ip'])
if __name__ == '__main__':
    # Refresh the proxy pool when this file is run as a script.
    pool = ObtainIP()
    pool.update()