任务
- 使用任意方法爬取王者荣耀赛程
- 爬取如下图所示数据
网页分析
从图中可以看出浏览器中的页面是经过JavaScript处理数据后生成的结果,这些数据是通过 Ajax 加载的。
对于这种情况,无法直接使用 requests 爬取信息,因为原始的页面最初不会包含某些数据,原始页面加载完成后,会再向服务器请求某个接口获取数据,然后数据才被处理从而呈现到网页上。
使用 selenium 爬取
from selenium import webdriver
import time
class match:
    """Container for one KPL match record scraped with selenium."""

    def __init__(self):
        # Every field starts as an empty string and is filled by the scraper.
        self.time = ''    # match time
        self.status = ''  # match status
        self.place = ''   # host city
        self.team1 = ''   # first team's name
        self.team2 = ''   # second team's name
        self.score = ''   # final score

    def print_match_info(self):
        """Print all six fields on one line, separated by spaces."""
        fields = (self.time, self.status, self.place,
                  self.team1, self.team2, self.score)
        print(*fields)
def main():
    """Scrape the KPL schedule page with selenium and print every match found."""
    # NOTE(review): this uses the selenium-3 find_element(s)_by_* API, which was
    # removed in selenium 4 — confirm the installed selenium version.
    match_info_list = []
    with webdriver.Chrome() as driver:
        driver.implicitly_wait(10) # implicit wait for elements to appear
        driver.get("https://pvp.qq.com/match/kpl/index.shtml")
        # Tabs for each competition stage (e.g. 2020 autumn regular season)
        match_type = driver.find_elements_by_xpath('//ul[@class="kpl_schedule_nav"]/li/a')
        for i,m in enumerate(match_type):
            m.click()
            time.sleep(1)
            # Week links belonging to the i-th stage tab (XPath index is 1-based)
            week = driver.find_elements_by_xpath('//div[@class="kpl_schedule_date clearfix"][%d]/a'%(i+1))
            for j,w in enumerate(week):
                if j+1 >= 8 and j < len(week):
                    # Weeks past the 7th are off-screen: page the week strip to the right.
                    # execute_script is used because the link is covered by an image.
                    driver.execute_script("arguments[0].click();", driver.find_element_by_id('dateNext'))
                print(w.text)
                # Open this week's schedule
                driver.execute_script("arguments[0].click();", w)
                time.sleep(1)
                if i == 1 and j == 0:
                    # First week of the 2020 autumn playoffs: the match list starts
                    # scrolled to the right, so rewind it before reading.
                    for x in range(4):
                        driver.execute_script("arguments[0].click();", driver.find_element_by_id('matchPrev')) # page left
                # Match cards for the currently selected week
                match_info = driver.find_elements_by_xpath('//*[@id="matchList"]/li')
                for n,info in enumerate(match_info):
                    if (n+1) >= 3 and n < len(match_info)-1:
                        driver.execute_script("arguments[0].click();", driver.find_element_by_id('matchNext')) # page right
                    # One record per match card
                    match_info_struct = match()
                    # Host city — presumably encoded in the icon's CSS class; TODO confirm
                    match_info_struct.place = info.find_element_by_xpath('./div[@class="match-date"]/i[1]').get_attribute('class')
                    # Match time: concatenation of the <i> and <span> texts
                    match_time = info.find_element_by_xpath('./div[@class="match-date"]/p[1]')
                    match_info_struct.time = match_time.find_element_by_xpath('./i').text + ' ' + match_time.find_element_by_xpath('./span').text
                    # Match status
                    match_info_struct.status = info.find_element_by_xpath('./div[@class="match-date"]/p[2]').text
                    # The two team names
                    team = info.find_elements_by_xpath('./div[@class="match-team pr"]//p[@class="team-info"]')
                    match_info_struct.team1,match_info_struct.team2 = team[0].text,team[1].text
                    # Score: concatenation of the three <p> texts in the score node
                    score = info.find_elements_by_xpath('./div[@class="match-team pr"]//div[@class="pa match-score"]/p')
                    match_info_struct.score = score[0].text + score[1].text + score[2].text
                    # Save the record
                    match_info_list.append(match_info_struct)
                    # Print the record immediately
                    match_info_struct.print_match_info()
    # Alternative: print all collected matches at the end instead of inline
    # for info in match_info_list:
    # info.print_match_info()
# Run only when executed as a script, not on import.
# Fixed: the original `if __name__ == __name__:` compares the variable with
# itself and is therefore always true, so main() would also run on import.
if __name__ == "__main__":
    main()
使用 requests 爬取
分析Ajax
- 通过分析可以发现begin_time,end_time,stage改变,网页信息就会改变
- 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A+alA=&begin_time={}&end_time={}&seasonid=KPL2020S2&stage={}'
- 我们只要知道begin_time,end_time,stage就可以爬取数据了
- 分析之后发现begin_time,end_time是没有规律的,那就只能把所有的都保存下来
- #秋季常规赛 '1600214400','1600646400','1601251200','1601856000','1602460800','1603065600','1603670400','1604275200','1604880000'
- #秋季赛季后赛 '1606262400','1606694400','1606953600','1607299200','1608336000','1608422400'
获取 html 内容后可以看到这是一个 json 结构存储的数据
把它解析一下就可以得到我们想要的数据了
完整代码
import requests
class match:
    """One parsed KPL match record from the TGA schedule API."""

    def __init__(self):
        # time: match time; status: match state; place: host city;
        # team1/team2: the two team names; score: final score.
        # All fields default to the empty string.
        empty = ''
        self.time = empty
        self.status = empty
        self.place = empty
        self.team1 = empty
        self.team2 = empty
        self.score = empty

    def print_match_info(self):
        """Print all six fields, space separated, on a single line."""
        print(self.time, self.status, self.place,
              self.team1, self.team2, self.score)
def getHtml(begin_time, end_time, stage, timeout=10):
    """Fetch one window of the KPL schedule from the TGA open API.

    Args:
        begin_time: window start as an epoch-seconds string.
        end_time: window end as an epoch-seconds string.
        stage: stage code, e.g. 'cgs' (regular season) or 'jhs' (playoffs).
        timeout: seconds to wait for the HTTP response (new, default 10).
            Fixed: the original call had no timeout, so a stalled server
            would hang the scraper forever.

    Returns:
        The decoded JSON payload (a dict).

    Raises:
        requests.HTTPError: if the API answers with a non-2xx status.
    """
    url = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A+alA=&begin_time={}&end_time={}&seasonid=KPL2020S2&stage={}'.format(begin_time, end_time, stage)
    html = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    html.raise_for_status()
    return html.json()
def parse_page(json):
    """Yield one `match` record per entry in the API payload.

    Args:
        json: decoded response dict as returned by getHtml (may be None).

    Yields:
        match: a populated record with time, status, place, teams and score.
    """
    if not json:
        return
    # Fixed: 'data' can be absent or null on API errors; the original then
    # crashed with `TypeError: 'NoneType' object is not iterable`.
    items = json.get('data') or []
    for item in items:
        match_info = match()
        match_info.time = item.get('match_time')
        match_info.status = item.get('match_state')
        match_info.place = item.get('region')
        match_info.team1 = item.get('hname')   # host team name
        match_info.team2 = item.get('gname')   # guest team name
        match_info.score = str(item.get('host_score')) + ':' + str(item.get('guest_score'))
        yield match_info
def main():
    """Scrape and print the full 2020 KPL autumn season schedule."""
    # Week boundaries as epoch-seconds strings; these values are irregular,
    # so they are listed exhaustively instead of being computed.
    #秋季常规赛
    AutumnRegularSeason = ['1600214400','1600646400','1601251200','1601856000','1602460800','1603065600','1603670400','1604275200','1604880000']
    #秋季赛季后赛
    AutumnPostseason = ['1606262400','1606694400','1606953600','1607299200','1608336000','1608422400']
    stages = ['cgs','jhs']
    #爬取秋季常规赛: consecutive entries form [begin, end) windows
    for begin, end in zip(AutumnRegularSeason, AutumnRegularSeason[1:]):
        print(begin, end)
        for record in parse_page(getHtml(begin, end, stages[0])):
            record.print_match_info()
    #爬取秋季赛季后赛: entries are already (begin, end) pairs
    for begin, end in zip(AutumnPostseason[::2], AutumnPostseason[1::2]):
        print(begin, end)
        for record in parse_page(getHtml(begin, end, stages[1])):
            record.print_match_info()
if __name__ == "__main__":
main()
完成效果
selenium 爬取的结果
requests 爬取的结果
常见错误
- 链接点击不了
  原因:链接被图片挡住
  解决方法:使用 execute_script 来点击
- 使用 selenium 会遇到错误
- 2020秋季赛季后赛那几周会爬不了(网站有bug)
- 2021春季常规赛会报错(没有数据)