Selenium 中间件、显式等待与设置图片不加载

https://www.aqistudy.cn/historydata/monthdata.php?city=%E6%B7%B1%E5%9C%B3
上面这个网站的数据都是动态加载的,所以使用 Selenium 中间件来渲染页面,避免自己去分析 JS 代码;设置图片不加载以提高效率;设置显式等待(WebDriverWait),使需要的数据一加载完成程序就立即继续向下运行;设置随机请求头以绕过反爬机制。中间件部分代码如下:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from fake_useragent import UserAgent
from scrapy.http import HtmlResponse

class SeleniumSpiderMiddleware(object):
    """Scrapy downloader middleware that renders pages with PhantomJS.

    Each request is loaded in a fresh PhantomJS instance configured with a
    random User-Agent and with image loading disabled. Once the table cells
    matched by ``WAIT_SELECTOR`` are present, the rendered page source is
    wrapped in an ``HtmlResponse`` and returned.
    """

    # PhantomJS command-line switches: enable the disk cache, skip images.
    SERVICE_ARGS = ('--disk-cache=true', '--load-images=false')
    # Maximum number of seconds to wait for the data cells to appear.
    WAIT_TIMEOUT = 10
    # CSS selector of the elements that must be present before scraping.
    WAIT_SELECTOR = 'tbody > tr > td'
    # Debug screenshot path, used to check that no images were loaded.
    SCREENSHOT_PATH = 'aqi.png'

    def process_request(self, request, spider):
        """Fetch ``request.url`` with PhantomJS and return the rendered page.

        Args:
            request: The Scrapy request being processed; only ``request.url``
                is read, and the request is attached to the returned response.
            spider: The spider issuing the request (unused).

        Returns:
            An ``HtmlResponse`` whose body is the UTF-8 encoded page source.

        Raises:
            selenium.common.exceptions.TimeoutException: If the awaited
                elements do not appear within ``WAIT_TIMEOUT`` seconds.
        """
        # Pick a random User-Agent for this request.
        user_agent = UserAgent().random

        # Copy the default PhantomJS capabilities and override the User-Agent.
        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities["phantomjs.page.settings.userAgent"] = user_agent

        # Apply the User-Agent and the no-image switches at start-up.
        driver = webdriver.PhantomJS(
            desired_capabilities=capabilities,
            service_args=list(self.SERVICE_ARGS),
        )
        try:
            url = request.url
            driver.get(url)

            # Explicit wait: block until every matching <td> is in the DOM.
            locator = (By.CSS_SELECTOR, self.WAIT_SELECTOR)
            WebDriverWait(driver, self.WAIT_TIMEOUT).until(
                EC.presence_of_all_elements_located(locator)
            )

            # Screenshot to verify visually that images were not loaded.
            driver.save_screenshot(self.SCREENSHOT_PATH)
            body = driver.page_source
        finally:
            # quit() ends the session and the PhantomJS process; close() only
            # closes the window. Running it in ``finally`` avoids leaking a
            # browser when the page load or the wait fails.
            driver.quit()

        # HtmlResponse requires a bytes body; encode explicitly as UTF-8 to
        # match the declared response encoding.
        response = HtmlResponse(
            url=url,
            request=request,
            encoding='utf8',
            body=body.encode('utf-8'),
        )
        # Returning a response here passes it on directly, so the page is not
        # downloaded a second time.
        return response

猜你喜欢

转载自blog.csdn.net/newdas123/article/details/78583414