换 IP 是使用爬虫时不可避免的一环，建立属于自己的优质 IP 池可以大大提高我们的效率。
下面是一个非常简单的版本，用于初步了解 IP 池的建立流程，以后还会继续更新。
import ast

import requests
from lxml import etree
import pymysql
conn = pymysql.connect() #连接数据库
cursor = conn.cursor()
url = "https://www.xicidaili.com/nn/{}"
headers = {
"Referer": "https://www.xicidaili.com/nn",
"Sec-Fetch-Dest": "style",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
}
#抓取ip
def get_ip(url,num):
proxies_lst = []
for i in range(num):
url = url.format(num)
try: #避免一个网页的请求失败导致整个程序终止
response = requests.get(url,headers=headers).content
html = etree.HTML(response)
ip_lst = html.xpath(".//table[@id='ip_list']//tr/td[2]")
port_lst = html.xpath(".//table[@id='ip_list']//tr/td[3]")
protocol_lst = html.xpath(".//table[@id='ip_list']//tr/td[6]")
for i in range(len(ip_lst)):
protocol = protocol_lst[i]
ip = ip_lst[i]
port = port_lst[i]
agent_ip = "{}://{}:{}".format(protocol,ip,port)
proxy = {protocol:agent_ip}
proxies_lst.append(proxy)
except:
print(url,"failed request")
return proxies_lst #返回抓取的ip列表
#测试ip的可用性
def test_ip(lst):
test_url = "http://ip.chinaz.com/"
for proxies in lst:
try:
response = requests.get(test_url,proxies=proxies)
except:
print(proxies,"无效")
lst.remove(proxies)
print(proxies,response.status_code)
return lst #返回可以用的ip列表
#将可用的ip保存到数据库
def save_ip(callable_lst):
for proxies in callable_lst:
cursor.execute("insert into ipool(ip) values(%s)",str(proxies)) #往数据库中插入数据
conn.commit()
#从数据库中调用ip
def use_ip(num):
cursor.execute("select * from ipool limit %s",num)
conn.commit
ip_tuple = cursor.fetchall()
ip_lst = []
for ip in ip_tuple:
ip_lst.append(dict(ip[1]))
return ip_lst
if __name__ == "__main__":
lst = get_ip(url) #抓取ip
callable_lst = test_ip(lst) #ip清洗
save_ip(callable_lst)
cursor.close()
conn.close()