# 爬取喜马拉雅三国中的前十章音频:
#导入requests模块
import requests
#导入正则表达式
import re
# Browser-like User-Agent sent with the track request to get past the
# site's anti-scraping check.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) '
        'Gecko/20100101 Firefox/57.0'
    ),
}
# IDs of the first ten chapters, taken from the page source.
sound_ids = (
    64686514, 64689648, 64695831, 64695832, 3218935,
    3822581, 3419626, 3513844, 3593277, 3773655,
)
# Captures the track id and the "play_path_64" audio URL from a track's
# JSON text. Compiled once instead of on every chapter.
_TRACK_RE = re.compile(r'"id":(.*?),"play_path_64":"(.*?)"')


def get_find_url(page_text=None):
    """Return the (id, audio URL) pairs found in a track's JSON text.

    Args:
        page_text: Text to search. When omitted, the text of the most
            recently fetched track response (module-level ``html``) is
            used, so the original zero-argument call keeps working.

    Returns:
        A list of ``(id, url)`` string tuples, one per regex match; empty
        when nothing matches.
    """
    if page_text is None:
        page_text = html.text
    return _TRACK_RE.findall(page_text)


# One metadata request per chapter. The previous version wrapped the request
# in a second loop over sound_ids that never used its loop variable, so every
# chapter's JSON was fetched ten times.
for s, sound_id in enumerate(sound_ids):
    # URL of this chapter's track metadata.
    url = 'http://www.ximalaya.com/tracks/' + str(sound_id) + '.json'
    # A timeout keeps a stalled connection from hanging the script forever.
    html = requests.get(url, headers=header, timeout=30)

    # The captured id is not needed; only the audio URL is used.
    for _track_id, url_finall in get_find_url(html.text):
        # Fetch the audio content itself.
        m4a = requests.get(url_finall, timeout=30)
        if not m4a.ok:
            # Skip instead of saving an HTTP error body as an audio file.
            print('<下载失败 第', s + 1, '节> ', url_finall)
            continue

        # The last four characters of the URL (i.e. ".m4a") become the
        # file extension.
        m4a_name = url_finall[-4:]
        print('<正在下载第', s + 1, '节> ', url_finall)

        # Save the audio bytes to the current working directory.
        with open('第' + str(s + 1) + '节' + m4a_name, 'wb') as f:
            f.write(m4a.content)