问题描述
有些网页的反爬虫做得比较好,href="javascript:void(0);"
或href="javascript:;"
,含义是留在原处不跳转
此时无法直接从href中获取链接,链接直接写进监听事件里,从.js文件中也无从(难以)获取
解决方案
使用Selenium模拟用户点击网页进行爬取
注意!!每次调用driver之前建议调用time.sleep()
,因为程序运行速度远远比浏览器操作快
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def wait(locator, timeout=10):
    """Block until an element matching *locator* is present in the DOM.

    Uses the module-level ``driver``; raises ``TimeoutException`` if the
    element does not appear within *timeout* seconds.
    """
    condition = EC.presence_of_element_located(locator)
    WebDriverWait(driver, timeout).until(condition)
driver = webdriver.Chrome()
driver.get('某链接')  # placeholder: the target page URL
# FIX: sleep_second was used below but never defined (NameError in the original).
sleep_second = 0.01  # seconds to let the page react to a click / go back
locator = (By.CLASS_NAME, '要爬取的类')  # placeholder: class of clickable elements
wait(locator)
# FIX: find_elements_by_class_name was removed in Selenium 4; use find_elements.
elements = driver.find_elements(By.CLASS_NAME, '要爬取的类')
link = []
linkNum = len(elements)
for i in range(linkNum):
    wait(locator)
    # Re-fetch the elements every iteration: after driver.back() the old
    # references go stale (StaleElementReferenceException).
    elements = driver.find_elements(By.CLASS_NAME, '要爬取的类')
    driver.execute_script('arguments[0].click();', elements[i])  # simulate a user click
    time.sleep(sleep_second)  # the script runs far faster than the browser navigates
    print(i, driver.current_url)
    link.append(driver.current_url)
    # Leave time for the history-back navigation; increase on a slow network.
    # driver.implicitly_wait() does not help here (it only covers element lookup).
    time.sleep(0.01)
    driver.back()
driver.quit()
print('共{}条链接'.format(len(link)))
封装函数
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_javascript0_links(url, class_name, sleep_second=0.01):
    """
    Collect the URLs reached by clicking every element of a given class.

    Useful for pages whose anchors carry ``href="javascript:void(0);"`` and
    bind the real link in a JS click handler: a headless Chrome clicks each
    element, records ``driver.current_url``, then navigates back.

    :param url: target page to open
    :param class_name: class of the elements to click
    :param sleep_second: seconds to wait after a click / before going back
    :return: list of URLs reached by clicking each matching element
    """
    def wait(locator, timeout=10):
        """Block until an element matching *locator* is present."""
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))

    options = Options()
    options.add_argument("--headless")  # run without a visible browser window
    # FIX: Selenium 4 removed the chrome_options keyword; the parameter is options.
    driver = webdriver.Chrome(options=options)
    link = []
    try:
        driver.get(url)
        locator = (By.CLASS_NAME, class_name)
        wait(locator)
        # FIX: find_elements_by_class_name was removed in Selenium 4.
        elements = driver.find_elements(By.CLASS_NAME, class_name)
        for i in range(len(elements)):
            wait(locator)
            # Re-fetch each iteration: references go stale after driver.back()
            # (StaleElementReferenceException).
            elements = driver.find_elements(By.CLASS_NAME, class_name)
            driver.execute_script("arguments[0].click();", elements[i])
            time.sleep(sleep_second)  # let the click-triggered navigation happen
            link.append(driver.current_url)
            time.sleep(sleep_second)  # leave time for the history-back navigation
            driver.back()
    finally:
        # FIX: always quit, even on error — otherwise the headless Chrome
        # process is leaked when an exception escapes the loop.
        driver.quit()
    return link
if __name__ == "__main__":
url = "目标页面"
class_name = "模拟点击的类"
link = get_javascript0_links(url, class_name)
for i, _link in enumerate(link):
print(i, _link)