版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/cuicanxingchen123456/article/details/84861717
根据ip看到具体位置
openGPS: https://www.opengps.cn/Data/IP/ipplus.aspx
国内透明代理ip:http://www.xicidaili.com/nt
免费代理ip:http://www.xicidaili.com/
获取免费代理ip(验证哪些ip是有效的)
import re
import time
from queue import Empty
from queue import Queue
from threading import Lock
from threading import Thread

import requests
import urllib3
# All proxy ip entries scraped from the xici site
all_find_list = []
# Scraped proxies are pushed onto this queue; the four worker threads pull candidates from it
gaoni_queue = Queue()
# Proxy ips that connected successfully
success_list = []
# Serializes printing / success_list appends across the worker threads
lock = Lock()
def get_proxy(checking_ip):
    """Build a requests-style ``proxies`` dict from an ``ip:port`` string.

    Returns a mapping with both schemes pointing at the same host, e.g.
    ``{'https': 'https://1.2.3.4:80', 'http': 'http://1.2.3.4:80'}``.
    """
    return {
        'https': 'https://' + checking_ip,
        'http': 'http://' + checking_ip,
    }
def checking_ip():
    """Worker thread body: pull proxy ips off ``gaoni_queue`` and test each one.

    Each candidate is used as a proxy for a test GET; on success it is
    appended to ``success_list``. The thread exits once the queue yields
    nothing for 1 second (all proxies consumed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    while True:
        # If no proxy arrives within 1 second, all proxies have been
        # handed out and this worker can stop.
        try:
            candidate = gaoni_queue.get(True, 1)
        except Empty:
            break
        proxy = get_proxy(candidate)
        url = 'https://www.csdn.net/'
        try:
            # timeout keeps a dead proxy from hanging the worker forever;
            # the response body itself is irrelevant — reachability is the test.
            requests.get(url, headers=headers, proxies=proxy, timeout=5)
        except requests.RequestException:
            with lock:
                print(candidate, '失败')
        else:
            with lock:
                print(candidate, '成功')
                success_list.append(candidate)
        finally:
            # Balance the successful get() above (original code called
            # task_done() only in the Empty branch, where nothing was taken).
            gaoni_queue.task_done()
def get_all(pages=1):
    """Scrape proxy ``ip:port`` entries from the xici high-anonymity pages.

    Fetches ``pages`` listing pages (default 1, matching the old hard-coded
    behavior), accumulates the regex matches in the module-level
    ``all_find_list``, and pushes each ``ip:port`` string onto
    ``gaoni_queue`` for the checker threads.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    global all_find_list
    # Regex for (ip, port, ?, country) cells in the listing table; compiled
    # once outside the page loop.
    pattern = re.compile(
        r'<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>\s+(.*?)\s+</td>\s+<td class="country">(.*?)</td>')
    for i in range(1, pages + 1):
        # High-anonymity listing page i
        url = 'http://www.xicidaili.com/nn/%d' % i
        # timeout so a dead site fails fast instead of hanging startup
        r = requests.get(url, headers=headers, timeout=10)
        all_find_list += pattern.findall(r.text)
    # Combine ip and port into the 'ip:port' form the checkers expect.
    for row in all_find_list:
        gaoni_queue.put(row[0] + ':' + row[1])
if __name__ == '__main__':
    get_all()
    # Original had a bare Python-2 style `print` (a no-op in Python 3) with
    # the qsize() result discarded on the next line; actually print it.
    print(gaoni_queue.qsize())
    # Four checker workers drain the queue concurrently.
    threads = [Thread(target=checking_ip) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Persist the proxies that worked, one per line.
    with open("E:/ip.txt", "w") as f:
        for row in success_list:
            f.write(row + '\n')
使用代理ip去爬取网页
import urllib.request
import random
proxy_list = []
url = 'https://blog.csdn.net/cuicanxingchen123456/article/details/84306382'
# Load the previously validated proxies (one 'ip:port' per line).
with open("E:/ip.txt") as f:
    for line in f:
        line = line.strip('\n')
        if line:
            proxy_list.append(line)
# iplist = ['115.32.41.100:80','58.30.231.36:80','123.56.90.175:3128']
chosen = random.choice(proxy_list)
# BUG FIX: the target url is https, but the original handler only mapped
# the 'http' scheme, so the proxy was silently never used. Register the
# chosen proxy for both schemes.
proxy_support = urllib.request.ProxyHandler({'http': chosen, 'https': chosen})
opener = urllib.request.build_opener(proxy_support)
# opener.addheaders = [('User-Agent','Test_Proxy_Python3.5_maminyao')]
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent','Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)')]
urllib.request.install_opener(opener)
# timeout so an unresponsive proxy fails fast instead of hanging.
response = urllib.request.urlopen(url, timeout=10)
html = response.read().decode('utf-8')
print(html)