分析雪球网 https://xueqiu.com/#/property
首次打开页面后,第一次 Ajax 请求的参数如下图所示:max_id=-1,count=10。
然后向下滚动页面,触发第二次 Ajax 请求(如下图)。可以发现 URL 中只有 max_id 和 count 两个参数不同:max_id 为前一次 Ajax 响应中最后一条数据的 id,此后每次请求的 count 都是 15,所以需要对 URL 进行拼接。
url拼接如下:
url='https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={}&count={}&category=111'.format(str(max_id), str(count))
接下来是定位数据:借助 Python 的字典、列表以及 json 模块,从响应中提取需要的内容,然后连接数据库,将数据存储到数据库。代码如下:
-
import requests
-
import json
-
import pymysql
-
# Scrape the Xueqiu public timeline (category 111) page by page through its
# Ajax endpoint and store id/title/description/target of every entry in the
# MySQL table `xueqiu`.

# Open the database connection.
# NOTE(review): the credentials are hard-coded; move them to configuration
# before using this anywhere but a local demo.
db = pymysql.connect(host="localhost", user="root", password="8888", database="test")

# Create a cursor object used to run the INSERT statements.
cursor = db.cursor()

# ----------------------------------------------------------------------------

# Endpoint template; the two placeholders are max_id and count, in that order.
URL_TEMPLATE = (
    'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    '?since_id=-1&max_id={}&count={}&category=111'
)

# The headers never change between requests, so build them once. Each value is
# a single line: raw line breaks are not valid inside an HTTP header value.
# NOTE(review): the cookie is a session captured from a browser and will
# expire; refresh it when the endpoint starts rejecting requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    ),
    'Cookie': '; '.join([
        'aliyungf_tc=AQAAABS6+HG0OAQAUhVFeZTrWYKcrmDe',
        'xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4',
        'xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA',
        'xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a',
        'xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs',
        '_ga=GA1.2.2007990410.1534303926',
        '_gid=GA1.2.1454932696.1534303926',
        'u=781534303927452',
        'device_id=0883ecbffed505f2f843656aec9a0524',
        'Hm_lvt_1db88642e346389874251b5a1eded6e3=1534303929,1534303938',
        '_gat_gtag_UA_16079156_4=1',
        'Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534323392',
    ]),
}

# Parameterized INSERT: the driver escapes the values, so quotes inside a
# title or description can neither break the statement nor inject SQL.
INSERT_SQL = (
    "INSERT INTO xueqiu(uid, title, description, target) "
    "VALUES (%s, %s, %s, %s)"
)

count = 10   # the first page is requested with count=10, later pages with 15
max_id = -1  # -1 requests the newest entries

try:
    # i controls how many Ajax requests are made in total (i = 1 .. 9).
    for i in range(1, 10):
        url = URL_TEMPLATE.format(str(max_id), str(count))

        # Bound the wait and fail loudly on an HTTP error instead of trying
        # to parse an error page as JSON.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        msg_dict = json.loads(response.content.decode('utf-8'))

        # The first response carries the cursor for the next page under
        # 'next_id'; later responses carry it under 'next_max_id'.
        if count == 10:
            max_id = msg_dict['next_id']
        else:
            max_id = msg_dict['next_max_id']

        # Iterate over what was actually returned; a page may hold fewer
        # than `count` entries.
        for item in msg_dict['list']:
            # Each entry's 'data' field is itself a JSON-encoded string.
            data_dict = json.loads(item['data'])

            uid = data_dict['id']
            title = data_dict['title']
            description = data_dict['description']
            target = data_dict['target']

            # --------------------- store in the database ---------------------
            try:
                cursor.execute(INSERT_SQL, (uid, title, description, target))
                # Commit each row so one bad row does not discard the others.
                db.commit()
            except pymysql.MySQLError as exc:
                # Best effort: undo the failed row, report it, and carry on.
                db.rollback()
                print('insert failed for uid {}: {}'.format(uid, exc))

        count = 15
finally:
    # Release the cursor and the connection even if a request or parse fails.
    cursor.close()
    db.close()