环境:python3.7
首先拉取自己的所有文章链接
# coding: UTF-8
from bs4 import BeautifulSoup
import urllib.request as urlrequest
# NOTE(review): opened at import time but never read anywhere; it is only closed
# at the end of the __main__ block. This raises FileNotFoundError on a first run
# where csdnlink.txt does not exist yet — confirm whether it can be removed.
inFile = open('csdnlink.txt')
def download(url):  # Download the raw content of one web page
    """Fetch *url* and return the raw response body as bytes.

    Returns None when *url* is None or the server answers with a
    non-200 status code.
    """
    if url is None:
        print("链接为空!")
        return None
    response = urlrequest.urlopen(url)
    try:
        # anything other than HTTP 200 is treated as a failed fetch
        if response.getcode() != 200:
            print("访问失败!")
            return None
        return response.read()
    finally:
        # the original leaked this handle; always release the connection
        response.close()
class Spider(object):
    """Crawls a CSDN blog's article-list pages and records every article URL."""

    def __init__(self):
        self.pages = []  # list-page URLs built by claw()
        self.datas = []  # unused; kept for backward compatibility
        self.root = "https://blog.csdn.net/qq_40548741"  # replace with your own CSDN blog root

    def claw(self, startpage, endpage):
        """Build list-page URLs for pages startpage..endpage (inclusive), then scrape each."""
        for page_no in range(startpage, endpage + 1):
            self.pages.append(self.root + "/article/list/%d?" % page_no)
        for page_url in self.pages:
            self.getDatas(page_url)

    def getDatas(self, url):  # Collect every article link on one list page
        """Extract all article links from the list page at *url* and append them to csdnlink.txt."""
        html_cont = download(url)
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='UTF-8')
        articles = soup.find_all('div', class_='article-item-box csdn-tracking-statistics')
        # 'with' guarantees the handle is closed even if parsing raises
        # (the original opened it and never closed it)
        with open('csdnlink.txt', 'a') as csdn_file:
            for article in articles:
                link = article.find('h4').find('a')['href']
                csdn_file.write(link + "\n")
        print('共%d' % len(articles) + '篇文章,链接写入完毕')
if __name__ == "__main__":
    # Truncate the output file so each run starts from an empty link list.
    # ('with' replaces the original leaked handle; writing "" is redundant
    # after opening in 'w' mode but kept for clarity.)
    with open('csdnlink.txt', 'w') as tmp:
        tmp.write("")
    spider = Spider()
    spider.claw(1, 1)  # crawl list page 1 only; widen the range for more pages
    inFile.close()
拉取完后
再把代理ip也按上面的格式放入到一个ip.txt文件中(每行一个,格式为 host:port),这里就不放入代理ip了
最后放上刷访问量代码
# coding=gbk
import time
import threading
import re
import requests
proxy_list = []  # proxy addresses ("host:port" strings), filled by get_proxy_list()
link_list = []  # article URLs, filled by get_link_list()
# Load the proxy IP list
def get_proxy_list():
    """Read ip.txt (one "host:port" proxy per line) into the global proxy_list.

    Blank lines anywhere in the file are skipped — the original readline loop
    silently stopped at the first blank line. The handle is closed via 'with'
    even if reading raises. Returns the list for convenience (callers may
    still rely solely on the global).
    """
    global proxy_list
    with open("ip.txt") as ip_file:
        proxy_list = [line.strip('\n') for line in ip_file if line.strip('\n')]
    return proxy_list
# Load the article URL list
def get_link_list():
    """Read csdnlink.txt (one article URL per line) into the global link_list.

    Blank lines anywhere in the file are skipped — the original readline loop
    silently stopped at the first blank line. The handle is closed via 'with'
    even if reading raises. Returns the list for convenience.
    """
    global link_list
    with open("csdnlink.txt") as link_file:
        link_list = [line.strip('\n') for line in link_file if line.strip('\n')]
    return link_list
def process_data(ip):
    """Visit every URL in link_list once through the proxy *ip* ("host:port").

    Prints the current read count when the page exposes one; proxy/network
    failures are logged and skipped so the worker thread keeps running.
    """
    headers = {
        'Referer': 'https://blog.csdn.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    # Proxy mapping: requests picks the entry matching each URL's scheme.
    proxy_ip = 'http://' + ip
    proxy_ips = 'https://' + ip
    proxy = {'https': proxy_ips, 'http': proxy_ip}
    # Raw string avoids the invalid-escape warning on \d; compile once,
    # not once per URL.
    read_count_re = re.compile(r'<span.*?read-count.*?(\d+).*?</span>')
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=2)
            match = read_count_re.search(response.text)
            # The original called .group(1) unconditionally: a page without a
            # read-count span raised AttributeError and killed the thread.
            if match:
                read_num = int(match.group(1))
                if read_num:
                    print(ip + "-------------代理----->" + url + '当前阅读量:', read_num)
        except requests.exceptions.RequestException:
            print('代理出问题啦:' + ip)
        time.sleep(1)
def start():
    """Endlessly fan out one worker thread per proxy, wait for all, rest, repeat."""
    while True:
        workers = []
        for proxy_ip in proxy_list:
            # Stagger thread launches: firing too fast can fail to register a view.
            time.sleep(1)
            worker = threading.Thread(target=process_data, args=(proxy_ip,))
            workers.append(worker)
            worker.start()
        # Block until every worker has finished its pass over the link list.
        for worker in workers:
            worker.join()
        print("执行完毕,休眠60秒")
        time.sleep(60)
if __name__ == '__main__':
    get_proxy_list()  # load proxy IP list from ip.txt
    get_link_list()  # load article URL list from csdnlink.txt
    start()  # loop forever, hitting every article through every proxy