"""
Scrape Douyu live-room listings (category and streamer name) with Selenium.

@author: 86135
@file: douyu.py
@time: 2020/10/05
@desc:
Reformat code: Ctrl+Alt+L
Run code: Ctrl+Shift+F10
"""
from selenium import webdriver
import time
class douyuSpider():
    """Crawl the Douyu "all rooms" directory with a headless Chrome driver.

    Pages through the room listing via the "next page" button, collecting
    each room's category and streamer name into a list of dicts.
    """

    def __init__(self):
        # Landing page that lists every live room.
        self.startUrl = 'https://www.douyu.com/directory/all'
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # `chrome_options=` is deprecated in Selenium; `options=` is the
        # supported keyword and behaves identically.
        self.driver = webdriver.Chrome(options=options)

    def parse(self):
        """Extract all rooms from the currently loaded page.

        Returns:
            tuple: ``(content_list, next_url)`` where ``content_list`` is a
            list of dicts with ``'cate'`` and ``'name'`` keys, and
            ``next_url`` is the clickable "next page" element, or ``None``
            on the last page.
        """
        time.sleep(5)  # crude wait for the JS-rendered listing; TODO: use explicit waits
        self.roll()  # lazy-loaded room covers only render after scrolling
        li_list = self.driver.find_elements_by_xpath(
            "//*[@id='listAll']/section[2]/div[2]/ul/li")
        content_list = []
        for li in li_list:
            item = {}
            item["cate"] = li.find_element_by_xpath(
                ".//span[@class = 'DyListCover-zone']").text
            item["name"] = li.find_element_by_xpath(
                ".//div[@class='DyListCover-userName']").text
            print(item)
            content_list.append(item)
        # On the last page the "next" button loses this class, so the
        # lookup legitimately returns an empty list.
        next_url = self.driver.find_elements_by_xpath(
            "//li[@class=' dy-Pagination-next']/span")
        print(len(next_url))
        next_url = next_url[0] if next_url else None
        return content_list, next_url

    def save_content(self, content_list):
        """Persist one page of results. Placeholder — not implemented yet."""
        pass

    def roll(self):
        """Scroll the page in steps so lazy-loaded content renders."""
        for x in range(1, 11, 2):
            time.sleep(0.5)
            fraction = x / 10  # scroll to 10%, 30%, 50%, 70%, 90% of page height
            js = "document.documentElement.scrollTop = document.documentElement.scrollHeight * {}".format(fraction)
            self.driver.execute_script(js)

    def run(self):
        """Crawl page by page until there is no next-page button."""
        try:
            self.driver.get(self.startUrl)
            content_list, next_url = self.parse()
            self.save_content(content_list)
            while next_url is not None:
                next_url.click()
                content_list, next_url = self.parse()
                self.save_content(content_list)
        finally:
            # Always release the browser — the original only quit on normal
            # loop exit, leaking a headless Chrome process on any exception.
            self.driver.quit()
if __name__ == '__main__':
    # Script entry point: build the spider and crawl every listing page.
    spider = douyuSpider()
    spider.run()