1. 下载 chromedriver。注意:chromedriver 的版本必须与本机 Chrome 浏览器的版本相对应。
2. 加载动态页面后模拟鼠标操作,从而获取那些需要点击、悬停等特定操作之后才会动态加载的数据。
3.源码:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
# Path to the downloaded chromedriver binary (this one is the Windows build).
# Written as a raw string so every backslash is literal; the previous literal
# contained '\i', which is an invalid escape sequence. The resulting value is
# unchanged.
CHROME_DRIVER_PATH = r'D:\Code\imgageRecognition\site_scrapy\chromedriver.exe'
def get_dynamic_html(site_url):
    """Load a dynamic page in Chrome, hover its navigation items, and parse it.

    Opens ``site_url`` with a chromedriver-controlled Chrome, moves the mouse
    over every ``.navigation-bar-item`` inside the active second-level
    navigation bar so that hover-triggered content gets loaded, and then
    parses the resulting page source.

    Args:
        site_url: URL of the page to load.

    Returns:
        A ``BeautifulSoup`` object built from the final page source with the
        ``html.parser`` backend.

    Raises:
        Whatever Selenium raises when the navigation bar cannot be located or
        hovered (presumably ``NoSuchElementException`` for a missing element).
        The browser is shut down before the exception propagates.
    """
    print('开始加载',site_url,'动态页面')

    chrome_options = webdriver.ChromeOptions()
    # Run without the sandbox and without relying on /dev/shm.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Headless mode is intentionally left off; add '--headless' to enable it.
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')

    # NOTE(review): `executable_path` follows the Selenium 3 style API this
    # file uses; newer Selenium 4 releases expect a Service object instead --
    # confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=chrome_options)

    # Everything after the browser starts runs under try/finally so the
    # Chrome process is always shut down, even when a lookup below fails.
    try:
        driver.set_page_load_timeout(100)
        try:
            driver.get(site_url)
        except Exception as e:
            # Stop loading once the timeout is exceeded and carry on with
            # whatever part of the page has been rendered so far.
            driver.execute_script('window.stop()')
            print(e, 'dynamic web load timeout')

        women_nav_tag = driver.find_element(
            By.CSS_SELECTOR,
            '.navigation-bar.second-level.clearfix.p_15.active',
        )
        nav_tag_list = women_nav_tag.find_elements(By.CSS_SELECTOR, '.navigation-bar-item')

        for tag in nav_tag_list:
            print(tag.text)
            # Simulate a mouse hover to trigger the dynamically loaded data.
            # A fresh ActionChains per item avoids re-sending the moves that
            # were queued for earlier items.
            ActionChains(driver).move_to_element(tag).perform()
            # Give the hover-triggered content time to load.
            time.sleep(5)

        data = driver.page_source
        return BeautifulSoup(data, 'html.parser')
    finally:
        # Best-effort shutdown: a failure to quit must not mask the result or
        # the original error, but it should not be swallowed silently either.
        try:
            driver.quit()
        except Exception as quit_error:
            print(quit_error, 'failed to quit chromedriver')