- 点击搜索指定歌手
-
跳转进入歌单界面,如下图:
-
找到通过歌手查看歌曲列表接口
- 点击下图进入新页面
- 可以看到下图页面和对应接口;从中可以看出,该接口返回的 JSON 数据里包含歌曲下载链接
代码
import requests
import re
import json
class BaiduMusic():
    """Scrape a singer's songs from music.taihe.com and save them as files.

    Workflow: fetch the search-result page for a singer, extract the song ids
    from the HTML, ask the song-link endpoint for the download URLs of those
    ids, then download each song into the current working directory.
    """

    # Matches links such as href="/song/123456" and captures the song id.
    _SONG_ID_PATTERN = re.compile(r'href="/song/(.+?)"')
    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')
    # Timeout (seconds) passed to every HTTP request so a stalled server
    # cannot hang the script forever.
    _TIMEOUT = 30

    def __init__(self, artist="林俊杰"):
        """Prepare the endpoints and request headers.

        Args:
            artist: Search keyword (singer name) used to find songs.
        """
        self._getSongUrl = "http://music.taihe.com/search"
        # Sent as the query string; requests takes care of URL-encoding.
        self._searchParams = {"key": artist}
        self._getDownLoadUrl = "http://play.taihe.com/data/music/songlink"
        # Referer: tells the server where the request came from; without it
        #          the song list is not shown.
        # Cookie:  makes the server treat the request as coming from a browser.
        # The values are left empty here and should be filled in with values
        # copied from a real browser session.
        self._header = {
            "User-Agent": "",
            "Referer": "",
            "Cookie": ""
        }

    def GetSongsHtml(self):
        """Download the search-result page and return it as decoded text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        res = requests.get(
            self._getSongUrl,
            params=self._searchParams,
            headers=self._header,
            timeout=self._TIMEOUT,
        )
        res.raise_for_status()
        return res.content.decode()

    def GetSongIdList(self, html):
        """Extract the song ids from the search-result HTML.

        Args:
            html: Text of the search-result page.

        Returns:
            The ids found in ``href="/song/<id>"`` links, in order of first
            appearance and without duplicates.
        """
        found = self._SONG_ID_PATTERN.findall(html)
        # A page may link to the same song several times; dict.fromkeys drops
        # the repeats while keeping the original order.
        return list(dict.fromkeys(found))

    def ConvertToStr(self, songsList):
        """Join the song ids into one comma-separated string."""
        return ",".join(songsList)

    def GetSongsDownloadData(self, songsId):
        """Query the song-link endpoint and return its parsed JSON response.

        Args:
            songsId: Comma-separated song ids.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        data = {
            "type": "m4a,mp3",
            "songIds": songsId
        }
        res = requests.post(
            self._getDownLoadUrl,
            data=data,
            headers=self._header,
            timeout=self._TIMEOUT,
        )
        res.raise_for_status()
        return json.loads(res.content.decode())

    def GetSongsList(self, songsJon):
        """Reduce the endpoint response to name/URL pairs.

        Args:
            songsJon: Parsed JSON returned by ``GetSongsDownloadData``.

        Returns:
            A list of dicts with the keys ``songName`` and ``downLoadUrl``.
            Empty when the response carries no ``data``/``songList``.
        """
        data = songsJon.get("data") or {}
        return [
            {"songName": song["songName"], "downLoadUrl": song["songLink"]}
            for song in data.get("songList") or []
        ]

    def GetSong(self, songUrl):
        """Download one song and return its raw bytes.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        res = requests.get(songUrl, headers=self._header, timeout=self._TIMEOUT)
        res.raise_for_status()
        return res.content

    @classmethod
    def _safeFileName(cls, name):
        """Replace characters that are invalid in file names with ``_``."""
        return cls._UNSAFE_FILENAME_CHARS.sub("_", name) or "unnamed"

    def saveSongs(self, songsList):
        """Download every song and write it to ``<songName>.mp3``.

        Args:
            songsList: Items as produced by ``GetSongsList``. Entries without
                a download URL are skipped.
        """
        for song in songsList:
            downloadUrl = song["downLoadUrl"]
            if not downloadUrl:
                # Nothing to fetch for this entry.
                continue
            # Download first so a failed request does not leave an empty file.
            songFile = self.GetSong(downloadUrl)
            fileName = self._safeFileName(song["songName"]) + ".mp3"
            with open(fileName, "wb") as f:
                f.write(songFile)
def main():
    """Run the whole scrape: search page -> song ids -> links -> files."""
    spider = BaiduMusic()
    songHtml = spider.GetSongsHtml()
    songsIdList = spider.GetSongIdList(songHtml)
    if not songsIdList:
        # Without ids there is nothing to ask the song-link endpoint for;
        # the usual cause is a response that is not the real search page.
        print("No song ids found; check the request headers.")
        return
    songsIdStr = spider.ConvertToStr(songsIdList)
    songsDownloadData = spider.GetSongsDownloadData(songsIdStr)
    songsList = spider.GetSongsList(songsDownloadData)
    spider.saveSongs(songsList)
# Start the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
运行结果
总结
在爬取数据过程中,若用代码发起请求后没有得到预期的返回数据,应优先检查请求头(如 User-Agent、Referer、Cookie)是否设置完整,否则服务器可能认为该请求不是由浏览器发出的。