#本文使用Python 3实现,笔者于Python 3.6,Python 3.7平台调试可运行。
#由于爬虫技术特殊性,最后一次调试:Nov.27,2018时仍然可用。
#IDE上,我更偏向于Anaconda。conda的数据分析与科学计算能力比较强、自带许多数学库(方便解析爬来的数据)。conda套件里有一个叫Spyder的软件。本期文章使用它实现。
网易云音乐是一款由网易开发的音乐产品,试想这样一种情况:老师/上司给我布置一份任务,为明天的迎新晚会做准备。需要把节目清单上的歌曲下载下来,交给负责此方面的同事。我们该怎样做呢?对照歌单一个一个在搜索框里搜索-下载?不,这看起来效率并不高。
如果有一个程序,能够实现基于给出播放列表的音乐批量下载,取代现有的人工单个搜索,提高搜索精度,应该会有不少的效率提升。
#根据监管规定,本项目仅用于测试与学习用途。请勿用作商业用途。请在合法范围内使用。
#根据监管规定,本项目仅用于测试与学习用途。请勿用作商业用途。请在合法范围内使用。
#根据监管规定,本项目仅用于测试与学习用途。请勿用作商业用途。请在合法范围内使用。
首先,我们先看一下思路:
爬取网易云这件事,你得建一个工程文件夹,把所有需要的东西都放在里面。既然是工程,就不可避免地有一个main函数。据说许多初涉Python DA的同学不喜欢写main?
本实现基于requests, sys, click, re, base64, binascii, json, os, Crypto.Cipher, http库。
这些库不需要百分百理解,但最好有个8,9成的基础。实践出真知。
hash库后来发现用不到就删掉了。
整个程序整体的框架,在一些疑难点加了注释方便理解。
main:
建立两个类,存放NeteaseMusic的信息和Song的信息(包含歌曲ID,歌曲URL等)。
参考Jack Cherish的算法,使用quiet定位最优结果。
class Netease():
    """Thin wrapper holding the downloader's configuration and API client."""

    def __init__(self, timeout, folder, quiet, cookie_path):
        # 'finder' (defined elsewhere in the project) performs the
        # actual NetEase API requests.
        self.finder = finder(timeout, cookie_path)
        # Default the download directory to the current directory.
        if folder is None:
            self.folder = '.'
        else:
            self.folder = folder
        # quiet: pick the best match automatically instead of prompting.
        self.quiet = quiet
建立一个文件夹放下载好的歌曲
下载的歌曲列表写在一个music_list文本文件中,并保存为文本文档。
只读方式打开这个文档
调用map根据按行读入的文本(忽略换行等无效字符)生成一个list,传给download_song_by_search函数。
download_song_by_search函数根据字符串查询是否有这首歌
def get_song_url(self, song_id, bit_rate=320000):
    """Return the download URL for *song_id*, or None when unavailable.

    bit_rate defaults to 320000 (320 kbps) so the highest quality is
    requested first; the server falls back on its own if necessary.
    """
    url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token='
    csrf = ''
    params = {'ids': [song_id], 'br': bit_rate, 'csrf_token': csrf}
    result = self.post_request(url, params)
    # post_request yields None when the API call failed; without this
    # guard the subscript below would raise TypeError.
    if result is None:
        return None
    # 歌曲下载地址 (song download URL)
    song_url = result['data'][0]['url']
    if song_url is None:
        # A null URL means the song is blocked (usually copyright).
        click.echo('Song {} is not available due to copyright issue.'.format(song_id))
        return None
    return song_url
调用search函数,从网易云搜索API获取需要的结果。
def search(self, search_content, search_type, limit=9):
    """Query the NetEase cloud-search API and return the raw JSON result."""
    api_url = 'http://music.163.com/weapi/cloudsearch/get/web?csrf_token='
    payload = {
        's': search_content,
        'type': search_type,
        'offset': 0,
        'sub': 'false',
        'limit': limit,
    }
    return self.post_request(api_url, payload)
解析 原理较为复杂,初学者可跳过
class Encrypyed():
    """NetEase 'weapi' request encryption.

    Scheme learned from Jack-Cherish's GitHub; the login encryption
    follows https://github.com/stkevintan/nw_musicbox. Beginners may
    skip the details: the payload is AES-encrypted twice and the random
    AES key is protected with textbook RSA.
    """

    def __init__(self):
        # RSA public modulus / exponent and the fixed AES nonce used by
        # the NetEase web client.
        self.modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        self.nonce = '0CoJUm6Qyw8W8jud'
        self.pub_key = '010001'

    def encrypted_request(self, text):
        """Encrypt *text* (a JSON-serialisable object) into POST params."""
        plain = json.dumps(text)
        secret = self.create_secret_key(16)
        # AES twice: first with the fixed nonce, then with the random key.
        double_enc = self.aes_encrypt(self.aes_encrypt(plain, self.nonce), secret.decode('utf-8'))
        enc_key = self.rsa_encrpt(secret, self.pub_key, self.modulus)
        return {'params': double_enc, 'encSecKey': enc_key}

    def aes_encrypt(self, text, secKey):
        """AES-CBC encrypt PKCS#7-padded *text*; return it base64-encoded."""
        padding = 16 - len(text) % 16
        padded = text + chr(padding) * padding
        cipher = AES.new(secKey.encode('utf-8'), AES.MODE_CBC, b'0102030405060708')
        raw = cipher.encrypt(padded.encode('utf-8'))
        return base64.b64encode(raw).decode('utf-8')

    def rsa_encrpt(self, text, pubKey, modulus):
        """Textbook RSA over the byte-reversed key; returns zero-padded hex."""
        reversed_text = text[::-1]
        encrypted = pow(int(binascii.hexlify(reversed_text), 16), int(pubKey, 16), int(modulus, 16))
        return format(encrypted, 'x').zfill(256)

    def create_secret_key(self, size):
        """Return 16 hex characters of OS randomness (as bytes)."""
        return binascii.hexlify(os.urandom(size))[:16]
def post_request(self, url, params):
    """POST encrypted *params* to *url*.

    Returns the decoded JSON dict on success, or None when the API
    replies with a non-200 code (the error is echoed, not raised —
    callers must handle the None).
    """
    data = self.ep.encrypted_request(params)
    resp = self.session.post(url, data=data, timeout=self.timeout)
    result = resp.json()
    if result['code'] != 200:
        # Include the server's code so failures can be diagnosed;
        # a bare 'post_request error' gives no clue what went wrong.
        click.echo('post_request error: code {}'.format(result['code']))
        return None
    return result
返回值为Song类的实例。
class Song():
    """Plain record describing one NetEase song."""

    def __init__(self, song_id, song_name, song_num, song_url=None):
        # song_num is the song's position in the playlist file.
        self.song_id = song_id
        self.song_name = song_name
        self.song_num = song_num
        # Normalise a missing URL to the empty string.
        self.song_url = song_url if song_url is not None else ''
如果返回存在:
调用download_song_by_id函数下载这首歌
def download_song_by_id(self, song_id, song_name, song_num, folder='.'):
    """Resolve *song_id* to a URL and download it into *folder*.

    song_name is sanitised ('/' and '.' removed) so it is safe to use
    as a file name on every platform.
    """
    try:
        url = self.finder.get_song_url(song_id)
        # Strip characters that are invalid or ambiguous in file names.
        song_name = song_name.replace('/', '')
        song_name = song_name.replace('.', '')
        self.finder.get_song_by_url(url, song_name, song_num, folder)
    except Exception as err:
        # A bare 'except:' would also swallow KeyboardInterrupt/SystemExit
        # and hide the real cause; catch Exception and echo the detail.
        click.echo('download_song_by_id error: {}'.format(err))
重点来了,首先我们需要删除文件名字符串中的无效字符。
通过self.download_session.get()方法获取响应,并将返回的数据块逐个写入文件。
def get_song_by_url(self, song_url, song_name, song_num, folder):
    """Stream *song_url* into '<folder>/<song_num>_<song_name>.mp3'.

    The download is skipped when the target file already exists. On
    Windows/Cygwin the name is additionally stripped of characters the
    filesystem forbids.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    fpath = os.path.join(folder, str(song_num) + '_' + song_name + '.mp3')
    if sys.platform == 'win32' or sys.platform == 'cygwin':
        # Windows forbids these characters in file names.
        valid_name = re.sub(r'[<>:"/\\|?*]', '', song_name)
        if valid_name != song_name:
            click.echo('{} will be saved as: {}.mp3'.format(song_name, valid_name))
            fpath = os.path.join(folder, str(song_num) + '_' + valid_name + '.mp3')
    if not os.path.exists(fpath):
        resp = self.download_session.get(song_url, timeout=self.timeout, stream=True)
        # The server may omit Content-Length; int(None) would crash here,
        # so fall back to 0 (the progress bar is then merely cosmetic).
        length = int(resp.headers.get('content-length', 0))
        label = 'Downloading {} {}kb'.format(song_name, int(length / 1024))
        with click.progressbar(length=length, label=label) as progressbar:
            with open(fpath, 'wb') as song_file:
                # Write the response in 1024-byte chunks.
                for chunk in resp.iter_content(chunk_size=1024):
                    if chunk:
                        song_file.write(chunk)
                        # Advance by the actual chunk size: the final
                        # chunk is usually shorter than 1024 bytes, so a
                        # fixed update(1024) would overrun the bar.
                        progressbar.update(len(chunk))
方便理解,给出iter_content方法的源码
def iter_content(self, chunk_size=1, decode_unicode=False):
    """Iterates over the response data. When stream=True is set on the
    request, this avoids reading the content at once into memory for
    large responses. The chunk size is the number of bytes it should
    read into memory. This is not necessarily the length of each item
    returned as decoding can take place.
    chunk_size must be of type int or None. A value of None will
    function differently depending on the value of `stream`.
    stream=True will read data as it arrives in whatever size the
    chunks are received. If stream=False, data is returned as
    a single chunk.
    If decode_unicode is True, content will be decoded using the best
    available encoding based on the response.
    """
    def generate():
        # Special case for urllib3.
        if hasattr(self.raw, 'stream'):
            try:
                for chunk in self.raw.stream(chunk_size, decode_content=True):
                    yield chunk
            # Translate transport-level (urllib3) errors into the
            # corresponding requests exception types.
            except ProtocolError as e:
                raise ChunkedEncodingError(e)
            except DecodeError as e:
                raise ContentDecodingError(e)
            except ReadTimeoutError as e:
                raise ConnectionError(e)
        else:
            # Standard file-like object: read fixed-size chunks until EOF.
            while True:
                chunk = self.raw.read(chunk_size)
                if not chunk:
                    break
                yield chunk
        # Mark the underlying stream as fully drained.
        self._content_consumed = True
    # _content is the bool False only while nothing has been cached yet;
    # consumed + no cache means the body can never be iterated again.
    if self._content_consumed and isinstance(self._content, bool):
        raise StreamConsumedError()
    elif chunk_size is not None and not isinstance(chunk_size, int):
        raise TypeError("chunk_size must be an int, it is instead a %s." % type(chunk_size))
    # simulate reading small chunks of the content
    reused_chunks = iter_slices(self._content, chunk_size)
    stream_chunks = generate()
    # Prefer re-slicing already-cached content; otherwise stream live.
    chunks = reused_chunks if self._content_consumed else stream_chunks
    if decode_unicode:
        chunks = stream_decode_response_unicode(chunks, self)
    return chunks
def iter_lines(self, chunk_size=ITER_CHUNK_SIZE, decode_unicode=None, delimiter=None):
    """Iterates over the response data, one line at a time. When
    stream=True is set on the request, this avoids reading the
    content at once into memory for large responses.
    .. note:: This method is not reentrant safe.
    """
    # 'pending' carries a possibly-incomplete trailing line from one
    # chunk over to the next.
    pending = None
    for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode):
        if pending is not None:
            # Prepend the leftover fragment from the previous chunk.
            chunk = pending + chunk
        if delimiter:
            lines = chunk.split(delimiter)
        else:
            lines = chunk.splitlines()
        # If the chunk did not end exactly on a line boundary, the last
        # element is an incomplete line: hold it back as 'pending'.
        if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
            pending = lines.pop()
        else:
            pending = None
        for line in lines:
            yield line
    # Flush whatever incomplete line remained after the final chunk.
    if pending is not None:
        yield pending
@property
def content(self):
    """Content of the response, in bytes."""
    # _content is False only while the body has never been read/cached.
    if self._content is False:
        # Read the contents.
        if self._content_consumed:
            # Stream already drained without caching: nothing to return.
            raise RuntimeError(
                'The content for this response was already consumed')
        if self.status_code == 0 or self.raw is None:
            self._content = None
        else:
            # Drain the stream and cache the whole body as bytes.
            self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
    self._content_consumed = True
    # don't need to release the connection; that's been handled by urllib3
    # since we exhausted the data.
    return self._content
如果返回不存在:
报错
如果歌单文件不存在
报错
在编写过程中,编写详尽的报错信息尤为重要,它可以极大程度的方便定位错误。
Conda虽强大,Spyder环境稳定性世界闻名。笔者写实现中崩了不下二十次。试想如果每次都是莫名其妙崩了。可能早就失去耐心折腾这玩意了哈哈。
大功告成。