1. 数据爬取
爬虫部分主要是调用官方API,本次用到的API主要有两个:
获取评论:
http://music.163.com/api/v1/resource/comments/R_SO_4_{歌曲ID}?limit={每页限制数量}&offset={评论数总偏移}
获取评论对应用户的信息:
https://music.163.com/api/v1/user/detail/{用户ID}
工具:
Python3.6
sublime3
MySQL(数据存储)
scrapy(数据清洗)
pyecharts(可视化工具库)
* 关于网易云音乐官方API,后期会整理一下做个汇总放在GitHub上。
1.1 评论爬取
实际操作过程中,网易云官方对于API的请求是有限制的,有条件的可以采用更换代理IP的方式来应对反爬。本次采用的是单线程爬取,所以IP被封得并不太频繁;后面会对代码进行重构,实现多线程+更换IP来加快爬取速度。
根据获取评论的API,请求URL有3个可变部分:歌曲ID、每页限制数limit和评论总偏移量offset。通过API分析得知:当offset=0时,返回的json数据中包含评论总数量total,所以根据API可设计爬虫如下:
# -*- coding:utf8 -*-
# python3.6
from urllib import request
import json
import pymysql
from datetime import datetime
import re
# URL template of the comment API; the three placeholders are filled with
# the song ID, the page size (limit) and the comment offset, in that order.
ROOT_URL = 'http://music.163.com/api/v1/resource/comments/R_SO_4_%s?limit=%s&offset=%s'
LIMIT_NUMS = 50 # number of comments requested per page
DATABASE = '' # database name
TABLE = '' # table name
# The table is designed as follows:
'''
id(int) commentId(varchar)
content(text) likedCount(int)
userId(varchar) time(datetime)
'''
PATTERN = re.compile(r'[\n\t\r\/]') # strips special characters from comments to avoid errors when inserting into the database
def getData(url):
    """Fetch one page of comments from the comment API.

    Args:
        url: Fully formatted comment API URL. A falsy value skips the request.

    Returns:
        A ``(total, datas)`` tuple. ``total`` is the comment count reported
        in the response (``None`` when the response has no ``total`` field)
        and ``datas`` is a list of dicts with the keys ``commentId``,
        ``content``, ``time``, ``likedCount`` and ``userId``.
        ``(None, None)`` is returned when ``url`` is falsy or when the
        download/parsing fails, so callers can always unpack two values.
    """
    if not url:
        return None, None
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        "Host": "music.163.com",
    }
    print('Crawling>>> ' + url)
    try:
        req = request.Request(url, headers=headers)
        # Use the response as a context manager so the socket is released
        # even if reading or decoding fails.
        with request.urlopen(req) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        # Do not let a missing 'total' field throw away the comments of
        # this page; only the first page's total is used by the caller.
        total = js.get('total')
        if total is not None:
            total = int(total)
        datas = [
            {
                'commentId': c['commentId'],
                'content': PATTERN.sub('', c['content']),
                # 'time' is divided by 1000 before being treated as a
                # Unix timestamp in seconds.
                'time': datetime.fromtimestamp(c['time'] // 1000),
                'likedCount': c['likedCount'],
                'userId': c['user']['userId'],
            }
            for c in js['comments']
        ]
        return total, datas
    except Exception as e:
        # Best-effort crawler: report the failure and let the caller move on
        # to the next page instead of aborting the whole run.
        print('Down err>>> ', e)
        return None, None
def saveData(data):
    """Insert a list of crawled comments into the comments table.

    Args:
        data: List of comment dicts with the keys ``commentId``, ``content``,
            ``likedCount``, ``time`` and ``userId``. A falsy value is a no-op.

    Returns:
        None. Rows that fail to insert are reported on stdout and skipped.
    """
    if not data:
        return None
    # The charset must be utf8mb4 so that emoji in comments can be stored.
    conn = pymysql.connect(host='localhost', user='****', passwd='****', db=DATABASE, charset='utf8mb4')
    try:
        cursor = conn.cursor()
        try:
            sql = 'insert into ' + TABLE + ' (id,commentId,content,likedCount,time,userId) VALUES (%s,%s,%s,%s,%s,%s)'
            for d in data:
                try:
                    cursor.execute('SELECT max(id) FROM ' + TABLE)
                    # max(id) is NULL (None) while the table is still empty;
                    # start numbering from 1 in that case.
                    id_ = cursor.fetchone()[0] or 0
                    cursor.execute(sql, (id_ + 1, d['commentId'], d['content'], d['likedCount'], d['time'], d['userId']))
                    conn.commit()
                except Exception as e:
                    # Keep going: one bad row should not stop the batch.
                    print('mysql err>>> ', d['commentId'], e)
        finally:
            cursor.close()
    finally:
        conn.close()
if __name__ == '__main__':
    songId = input('歌曲ID:').strip()
    # The first page (offset 0) also reports the total number of comments.
    # `or (None, None)` keeps the unpacking safe if getData yields nothing.
    total, data = getData(ROOT_URL % (songId, LIMIT_NUMS, 0)) or (None, None)
    saveData(data)
    if total:
        # Walk the remaining pages by offset. Stepping by LIMIT_NUMS up to
        # (but excluding) total avoids requesting an empty trailing page
        # when total is an exact multiple of the page size.
        for offset in range(LIMIT_NUMS, total, LIMIT_NUMS):
            _, data = getData(ROOT_URL % (songId, LIMIT_NUMS, offset)) or (None, None)
            saveData(data)
以上代码实现了单线程爬取网易云音乐某首歌曲的评论并存储进数据库(在这里其实有个坑!!!不过不影响,后面会讲到)。实际上,API返回的不仅仅包含代码中所提到的信息,具体可自行测试,我们还想要得到评论对应的用户的具体信息,但是这个API返回的用户信息不全面,所以接下来,针对评论对应的用户信息进行抓取。
1.2 用户信息爬取
根据获取用户信息的API,请求URL有1个可变部分:用户ID,前一部分已经将每条评论对应的用户ID也存储下来,这里只需要从数据库取用户ID并抓取信息即可,所以根据API可设计爬虫如下:
# -*- coding:utf8 -*-
# python3.6
from urllib import request
import json
import pymysql
import re
# Base URL of the user-detail API; the user ID is appended to it.
ROOT_URL = 'https://music.163.com/api/v1/user/detail/'
DATABASE = '****'
TABLE_USERS = '****'
TABLE_COMMENTS = '****'
# The table is designed as follows:
'''
id(int) userId(varchar)
gender(char) userName(varchar)
age(int) level(int)
city(varchar) sign(text)
eventCount(int) followedCount(int)
followsCount(int) recordCount(int)
avatar(varchar)
'''
PATTERN = re.compile(r'[\n\t\r\/]') # replaces special characters in signatures to avoid errors when inserting into the database
def getData(url):
    """Fetch one user's profile from the user-detail API.

    Args:
        url: Full user-detail API URL. A falsy value skips the request.

    Returns:
        A dict with the keys ``userId``, ``userName``, ``avatar``,
        ``gender``, ``age``, ``level``, ``sign``, ``eventCount``,
        ``followsCount``, ``followedCount``, ``city`` and ``recordCount``,
        or ``None`` when ``url`` is falsy, the response code is not 200, or
        the download/parsing fails.
    """
    if not url:
        return None
    # Imported locally because this script's import block has no datetime.
    from datetime import datetime
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        "Host": "music.163.com",
    }
    print('Crawling>>> ' + url)
    try:
        req = request.Request(url, headers=headers)
        with request.urlopen(req) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        if js['code'] != 200:
            return None
        profile = js['profile']
        birthday = int(profile['birthday'])
        if birthday < 0:
            # Negative birthdays are stored as age 0.
            age = 0
        else:
            # birthday is divided down from milliseconds to whole years
            # since 1970; use the current year rather than a fixed one so
            # the result stays right whenever the crawler is run.
            age = (datetime.now().year - 1970) - birthday // (1000 * 365 * 24 * 3600)
            age = max(age, 0)
        return {
            'userId': profile['userId'],
            'userName': profile['nickname'],
            'avatar': profile['avatarUrl'],
            'gender': profile['gender'],
            'age': age,
            'level': js['level'],
            # A missing signature must not discard the whole profile.
            'sign': PATTERN.sub(' ', profile['signature'] or ''),
            'eventCount': profile['eventCount'],
            # Key names match the table columns that saveData writes.
            'followsCount': profile['follows'],
            'followedCount': profile['followeds'],
            'city': profile['city'],
            'recordCount': js['listenSongs'],
        }
    except Exception as e:
        # Best-effort crawler: report the failure and skip this user.
        print('Down err>>> ', e)
        return None
def saveData(data):
    """Insert one crawled user profile into the users table.

    Args:
        data: Profile dict with the keys ``userName``, ``gender``, ``age``,
            ``level``, ``city``, ``sign``, ``eventCount``, ``followsCount``,
            ``followedCount``, ``recordCount``, ``avatar`` and ``userId``.
            A falsy value is a no-op.

    Returns:
        None. A failed insert is reported on stdout and skipped.
    """
    if not data:
        return None
    # The charset must be utf8mb4 so that emoji in signatures can be stored.
    conn = pymysql.connect(host='localhost', user='****', passwd='****', db=DATABASE, charset='utf8mb4')
    cursor = conn.cursor()
    # TABLE_USERS is the table constant this script defines; it is also the
    # table queried for max(id) below.
    sql = 'insert into ' + TABLE_USERS + ' (id,userName,gender,age,level,city,sign,eventCount,followsCount,followedCount,recordCount,avatar,userId) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    try:
        cursor.execute('SELECT max(id) FROM ' + TABLE_USERS)
        # max(id) is NULL (None) while the table is still empty;
        # start numbering from 1 in that case.
        id_ = cursor.fetchone()[0] or 0
        cursor.execute(sql, (id_ + 1, data['userName'], data['gender'], data['age'], data['level'], data['city'], data['sign'], data['eventCount'], data['followsCount'], data['followedCount'], data['recordCount'], data['avatar'], data['userId']))
        conn.commit()
    except Exception as e:
        # .get() so that reporting cannot itself raise on a malformed dict.
        print('mysql err>>> ', data.get('userId'), e)
    finally:
        cursor.close()
        conn.close()
def getID():
    """Read the user IDs of all stored comments.

    Returns:
        The rows returned by ``cursor.fetchall()`` (each row holds one
        ``userId``), or ``None`` if the query fails.
    """
    conn = pymysql.connect(host='localhost', user='****', passwd='****', db=DATABASE, charset='utf8mb4')
    cursor = conn.cursor()
    # DISTINCT: a user who wrote several comments appears once per comment
    # in the comments table, but only needs to be crawled once.
    sql = 'SELECT DISTINCT userId FROM ' + TABLE_COMMENTS
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        print('get err>>> ', e)
        return None
    finally:
        cursor.close()
        conn.close()
if __name__ == '__main__':
    usersID = getID()
    # getID() returns None when the query fails; iterate over nothing then
    # instead of raising a TypeError.
    for row in usersID or ():
        data = getData(ROOT_URL + row[0].strip())
        saveData(data)
以上代码实现了单线程爬取网易云音乐用户信息并存储进数据库。至此,已经完成了歌曲评论和对应用户信息的抓取。接下来,对抓取到的数据进行清洗及可视化分析。
1.3 数据清洗 & 可视化
未完,待续…