问题描述
有些网页的反爬虫做得比较好,href="javascript:void(0);"
或href="javascript:;"
,含义是留在原处不跳转
此时无法直接从href中获取链接,链接直接写进监听事件里,从.js文件中也无从(难以)获取
解决方案
使用Selenium模拟用户点击网页进行爬取
注意!!每次调用driver之前建议调用time.sleep()
,因为程序运行速度远远比浏览器操作快
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def wait(locator, timeout=10):
    """Block until an element matching *locator* is present in the DOM.

    Uses the module-level ``driver``; raises ``TimeoutException`` if the
    element does not appear within *timeout* seconds.
    """
    condition = EC.presence_of_element_located(locator)
    WebDriverWait(driver, timeout).until(condition)
driver = webdriver.Chrome()
driver.get('某链接')  # placeholder: the target page URL
# FIX: sleep_second was used below but never defined (NameError in the original).
sleep_second = 0.01  # seconds to let the page react to a click / go back
locator = (By.CLASS_NAME, '要爬取的类')  # placeholder: class of clickable elements
wait(locator)
# FIX: find_elements_by_class_name was removed in Selenium 4; use find_elements.
elements = driver.find_elements(By.CLASS_NAME, '要爬取的类')
link = []
linkNum = len(elements)
for i in range(linkNum):
    wait(locator)
    # Re-fetch the elements every iteration: after driver.back() the old
    # references go stale (StaleElementReferenceException).
    elements = driver.find_elements(By.CLASS_NAME, '要爬取的类')
    driver.execute_script('arguments[0].click();', elements[i])  # simulate a user click
    time.sleep(sleep_second)  # the script runs far faster than the browser navigates
    print(i, driver.current_url)
    link.append(driver.current_url)
    # Leave time for the history-back navigation; increase on a slow network.
    # driver.implicitly_wait() does not help here (it only covers element lookup).
    time.sleep(0.01)
    driver.back()
driver.quit()
print('共{}条链接'.format(len(link)))
封装函数
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_javascript0_links(url, class_name, sleep_second=0.01):
    """
    Collect the URLs reached by clicking every element of a given class.

    Useful for pages whose anchors carry ``href="javascript:void(0);"`` and
    bind the real link in a JS click handler: a headless Chrome clicks each
    element, records ``driver.current_url``, then navigates back.

    :param url: target page to open
    :param class_name: class of the elements to click
    :param sleep_second: seconds to wait after a click / before going back
    :return: list of URLs reached by clicking each matching element
    """
    def wait(locator, timeout=10):
        """Block until an element matching *locator* is present."""
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))

    options = Options()
    options.add_argument("--headless")  # run without a visible browser window
    # FIX: Selenium 4 removed the chrome_options keyword; the parameter is options.
    driver = webdriver.Chrome(options=options)
    link = []
    try:
        driver.get(url)
        locator = (By.CLASS_NAME, class_name)
        wait(locator)
        # FIX: find_elements_by_class_name was removed in Selenium 4.
        elements = driver.find_elements(By.CLASS_NAME, class_name)
        for i in range(len(elements)):
            wait(locator)
            # Re-fetch each iteration: references go stale after driver.back()
            # (StaleElementReferenceException).
            elements = driver.find_elements(By.CLASS_NAME, class_name)
            driver.execute_script("arguments[0].click();", elements[i])
            time.sleep(sleep_second)  # let the click-triggered navigation happen
            link.append(driver.current_url)
            time.sleep(sleep_second)  # leave time for the history-back navigation
            driver.back()
    finally:
        # FIX: always quit, even on error — otherwise the headless Chrome
        # process is leaked when an exception escapes the loop.
        driver.quit()
    return link
if __name__ == "__main__":
url = "目标页面"
class_name = "模拟点击的类"
link = get_javascript0_links(url, class_name)
for i, _link in enumerate(link):
print(i, _link)