Boosting CSDN view counts with multithreaded proxies in Python

Environment: Python 3.7

First, pull the links to all of your own articles:

# coding: UTF-8
from bs4 import BeautifulSoup
import urllib.request as urlrequest


def download(url):  # fetch the raw HTML of the given page
    if url is None:
        print("URL is empty!")
        return None
    response = urlrequest.urlopen(url)
    if response.getcode() != 200:
        print("Request failed!")
        return None
    return response.read()


class Spider(object):
    def __init__(self):
        self.pages = []
        self.datas = []
        self.root = "https://blog.csdn.net/qq_40548741"  # replace with your own CSDN blog URL

    def crawl(self, startpage, endpage):
        for i in range(startpage, endpage + 1):
            self.pages.append(self.root + "/article/list/%d?" % i)
        for url in self.pages:
            self.getDatas(url)

    def getDatas(self, url):  # collect every article link on the current page
        html_cont = download(url)
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='UTF-8')
        articles = soup.find_all('div', class_='article-item-box csdn-tracking-statistics')

        with open('csdnlink.txt', 'a') as csdnFile:
            for article in articles:
                tag_a = article.find('h4').find('a')
                url = tag_a['href']
                csdnFile.write(url + "\n")
        print('%d article links written' % len(articles))


if __name__ == "__main__":
    # truncate any previous link list before crawling
    open('csdnlink.txt', 'w').close()
    spider = Spider()
    spider.crawl(1, 1)
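The run above only fetches page 1 of the article list; if your articles span several pages, a usage sketch with a wider range:

spider = Spider()
spider.crawl(1, 3)  # pages 1 through 3 of /article/list/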

After the crawl finishes, csdnlink.txt holds one article URL per line:
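A hypothetical excerpt (the second ID is a placeholder; your article IDs will differ):

https://blog.csdn.net/qq_40548741/article/details/104664520
https://blog.csdn.net/qq_40548741/article/details/<article_id>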
Then put your proxy IPs into a file named ip.txt, one per line in the same fashion; no real proxy IPs are listed here.
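The file is expected to contain one ip:port pair per line, for example (these addresses are made-up placeholders):

127.0.0.1:8080
10.0.0.2:3128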
Finally, the code that drives up the view count:

# coding: UTF-8
import time
import threading
import re
import requests

proxy_list = []
link_list = []


# Read the proxy IP list from ip.txt
def get_proxy_list():
    global proxy_list
    with open("ip.txt") as f:
        proxy_list = [line.strip() for line in f if line.strip()]


# Read the article link list produced by the crawler
def get_link_list():
    global link_list
    with open("csdnlink.txt") as f:
        link_list = [line.strip() for line in f if line.strip()]


def process_data(ip):
    headers = {
        'Referer': 'https://blog.csdn.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    # Build the proxies dict in the format requests expects
    proxy_ip = 'http://' + ip
    proxy_ips = 'https://' + ip
    proxy = {'https': proxy_ips, 'http': proxy_ip}
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=2)
            # Pull the current view count out of the page, if the read-count span is present
            match = re.search(r'<span.*?read-count.*?(\d+).*?</span>', response.text)
            if match:
                read_num = int(match.group(1))
                print(ip + " -------------proxy-----> " + url + ' current view count:', read_num)
        except requests.exceptions.RequestException:
            print('Proxy failed: ' + ip)
    time.sleep(1)


def start():
    # Keep looping forever
    while 1:
        threads = []
        for ip in proxy_list:
            # Starting requests too fast sometimes means no view gets counted, so wait a second
            time.sleep(1)
            t = threading.Thread(target=process_data, args=(ip,))
            t.start()
            threads.append(t)
        # Wait for all threads to finish
        for t in threads:
            t.join()
        print("Round finished, sleeping for 60 seconds")
        time.sleep(60)


if __name__ == '__main__':
    get_proxy_list()  # load the proxy IP list
    get_link_list()  # load the article link list
    start()
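Dead proxies just burn time in the loop above, so it can help to sanity-check each one before a run. A minimal sketch (check_proxy is my own helper, and the address at the bottom is a made-up placeholder):

import requests

def check_proxy(ip):
    # True if the proxy can reach the CSDN homepage within 2 seconds
    proxy = {'http': 'http://' + ip, 'https': 'https://' + ip}
    try:
        r = requests.get('https://blog.csdn.net', proxies=proxy, timeout=2)
        return r.status_code == 200
    except requests.exceptions.RequestException:
        return False

print(check_proxy('127.0.0.1:8080'))  # placeholder address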

Reposted from blog.csdn.net/qq_40548741/article/details/104664520