爬虫代理IP

爬虫获取代理IP

之前说过，因为某些原因，IP被封了，所以回过头排查了一下关于代理IP的问题。

代理IP的作用

代理IP简单来说，就是代替你去访问网站的IP。每台电脑都有自己的IP，在做爬虫的时候，如果直接使用自己的IP，一旦被目标网站检测到，很快就会被封掉；如果从事的是违法活动，甚至可能被定位到你的位置。那么使用爬虫时，怎样才能不暴露自己的IP呢？答案就是使用代理IP。

这里用的代理IP是从西刺的网站上爬下来的。西刺提供的代理IP是高匿的，但并不是所有的都有效，实际可用的很少，所以需要我们先检验、筛选出可用的IP，再拿来使用。

代理IP的获取

import requests
from bs4 import BeautifulSoup

# Request headers sent with the page fetch.
headers = {
    'Host': 'www.xicidaili.com',
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Accept': r'application/json, text/javascript, */*; q=0.01',
    'Referer': r'http://www.xicidaili.com/',
}
# Only the first listing page is fetched here.
# The timeout keeps the script from blocking forever if the site never answers.
html = requests.get(r'http://www.xicidaili.com/nn/', headers=headers, timeout=10)
# Parse the page with BeautifulSoup.
soup = BeautifulSoup(html.text, features="html.parser")
# Collect every <tr> element whose class is "odd".
all_tr = soup.find_all("tr", class_="odd")
# Build the list of "ip:port" strings from the matched rows.
ip_list = []
# NOTE(review): the first matched row is skipped, exactly as the original
# index loop (which started at 1) did -- confirm that row is really a header
# and not a usable proxy entry.
for row in all_tr[1:]:
    # Children 3 and 5 of the row are assumed to be the IP-address and port
    # cells -- confirm against the page markup.
    ip_list.append(row.contents[3].string + ":" + row.contents[5].string)
# Print how many proxies were collected from the first page.
print(len(ip_list))

代理IP的检验

# Working proxies are saved to a file named ip.txt, one per line.
# Number of proxies that passed the check (named so it does not shadow the
# builtin sum()).
valid_count = 0
# The with-block closes the file even if the loop is interrupted.
with open("ip.txt", "w") as f:
    # Iterate the list built by the scraping step (ip_list); i is the
    # 1-based progress counter.
    for i, proxy in enumerate(ip_list, start=1):
        print(i)
        try:
            # timeout=10: give up on a proxy after 10 seconds instead of
            # wasting time on it.
            # http://icanhazip.com/ replies with the IP address that
            # accessed it.
            res = requests.get('http://icanhazip.com/', proxies={"http": "http://" + proxy}, timeout=10)
        except Exception:
            # Best-effort check: any failure just marks this proxy as unusable.
            # Unlike a bare "except:", this lets Ctrl-C (KeyboardInterrupt)
            # stop the loop.
            print("connect failed")
            continue
        # Show the IP address reported by the site.
        print(res.content)
        # A status code of 200 means the request succeeded.
        if res.status_code == 200:
            # Save the working proxy.
            f.write(proxy + '\n')
            valid_count += 1

代理IP的使用

# 118.190.95.43:9001 is one of the scraped proxies; pass it via `proxies` to use it.
# `url` is the page to fetch and is not defined in this snippet -- supply your own.
# NOTE(review): the mapping only has an "http" entry; per the Requests proxy
# documentation the dict is keyed by scheme, so https:// URLs would presumably
# not go through this proxy -- confirm before relying on it.
# The timeout stops a dead or very slow proxy from stalling the request indefinitely.
html = requests.get(url, headers=headers, proxies={"http": "http://118.190.95.43:9001"}, timeout=10)

总的代码

# coding=utf-8
from urllib import request
import requests
from bs4 import BeautifulSoup

# Scrape the proxy list.
def get_ip():
    """Fetch the first xicidaili listing page and return its proxies.

    The page at http://www.xicidaili.com/nn/ is downloaded, every ``<tr>``
    with class ``odd`` is collected, and an ``"ip:port"`` string is built
    from each of those rows except the first. The number of entries found
    is printed before returning.

    Returns:
        A list of ``"ip:port"`` strings (empty if no rows matched).

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the 10-second timeout expires.
    """
    headers = {
        'Host': 'www.xicidaili.com',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Accept': r'application/json, text/javascript, */*; q=0.01',
        'Referer': r'http://www.xicidaili.com/',
    }
    # Fallback kept from the original: if requests.get cannot reach the site,
    # replace the requests.get call below with these three urllib lines.
    # req = request.Request(r'http://www.xicidaili.com/nn/', headers=headers)
    # response = request.urlopen(req)
    # html = response.read().decode('utf-8')

    # The timeout keeps the call from blocking forever if the site never answers.
    html = requests.get(r'http://www.xicidaili.com/nn/', headers=headers, timeout=10)

    soup = BeautifulSoup(html.text, features="html.parser")
    all_tr = soup.find_all("tr", class_="odd")
    ip_list = []
    # NOTE(review): the first matched row is skipped, exactly as the original
    # index loop (which started at 1) did -- confirm that row is really a
    # header and not a usable proxy entry.
    for row in all_tr[1:]:
        # Children 3 and 5 of the row are assumed to be the IP-address and
        # port cells -- confirm against the page markup.
        ip_list.append(row.contents[3].string + ":" + row.contents[5].string)
    print(len(ip_list))
    return ip_list


# Validate the proxies.
def get_best_ip(all_id):
    """Check each proxy and write the working ones to ``ip.txt``.

    Every entry of ``all_id`` is used as an HTTP proxy for a request to
    http://icanhazip.com/ with a 10-second timeout. A proxy counts as working
    when the response status code is 200; it is then appended to ``ip.txt``
    (the file is truncated at the start of the call). Progress and results
    are printed along the way.

    Args:
        all_id: iterable of ``"ip:port"`` strings to test.

    Returns:
        The number of proxies written to ``ip.txt``.
    """
    working = 0
    # The with-block closes the file even if the loop is interrupted.
    with open("ip.txt", "w") as f:
        for index, proxy in enumerate(all_id, start=1):
            # 1-based progress counter.
            print(index)
            try:
                res = requests.get('http://icanhazip.com/', proxies={"http": "http://" + proxy}, timeout=10)
            except Exception:
                # Best-effort check: any failure just marks this proxy as
                # unusable. Unlike a bare "except:", this lets Ctrl-C
                # (KeyboardInterrupt) and SystemExit stop the loop.
                print("connect failed")
                continue
            print(res.content)
            if res.status_code == 200:
                f.write(proxy + '\n')
                working += 1
    return working

if __name__ == "__main__":
    # Scrape the candidates, validate them, then report how many were saved.
    candidates = get_ip()
    usable = get_best_ip(candidates)
    print("成功获取", usable, "个可用代理ip")

爬虫 代理IP