An example of crawling an infinite-scroll site --- fetching the content

# -*- coding: utf-8 -*-
import scrapy
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

from ..items import CsdnItem

class CnSpider(scrapy.Spider):
    name = 'cnspider'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/nav/cloud']

    def parse(self, response):
        # A visible browser window makes it easy to watch the scrolling
        # and confirm that the crawl is going smoothly.
        driver = webdriver.Chrome()

        # To run headless (no visible browser window) instead:
        # options = webdriver.ChromeOptions()
        # options.add_argument('--headless')
        # driver = webdriver.Chrome(options=options)
        driver.get(self.start_urls[0])
        sleep(3)

        seen_urls = set()  # avoid yielding the same article twice across scroll passes
        # Scroll the page 20 times; each scroll triggers more lazy-loaded entries.
        for i in range(20):
            driver.execute_script('document.documentElement.scrollTop=20000')
            sleep(3)  # give the newly loaded entries time to render
            div_list = driver.find_elements(By.XPATH, "//div[@class='list_con']")
            print('Entries loaded so far: %d' % len(div_list))
            for div in div_list:
                link = div.find_element(By.XPATH, "./div[@class='title']/h2/a")
                url = link.get_attribute('href')
                if url in seen_urls:
                    continue  # already yielded on an earlier scroll pass
                seen_urls.add(url)
                item = CsdnItem()
                item['title'] = link.get_attribute('textContent').replace('\n', '')
                item['content'] = div.find_element(By.XPATH, "./div[@class='summary oneline']").get_attribute('textContent').replace('\n', '')
                item['url'] = url
                yield item

        driver.quit()
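
The CsdnItem imported at the top lives in the project's items.py, which this post does not show. A minimal sketch that matches the three fields the spider populates might look like the following (the field set is inferred from the spider, not taken from the original project):

# items.py -- minimal sketch inferred from the fields the spider fills in
import scrapy

class CsdnItem(scrapy.Item):
    title = scrapy.Field()    # article title
    content = scrapy.Field()  # one-line summary
    url = scrapy.Field()      # link to the full article

The spider is then run like any other Scrapy spider, e.g. scrapy crawl cnspider -o result.json to dump the yielded items to a file.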



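Hard-coding 20 scroll iterations works for this page, but a more robust pattern with lazy-loading sites is to keep scrolling until the document height stops growing. A sketch of that alternative (the helper name scroll_to_bottom is illustrative, not from the original post):

from time import sleep

def scroll_to_bottom(driver, pause=3):
    # Keep scrolling until the page height stops growing,
    # i.e. the site has no more content to lazy-load.
    last_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        sleep(pause)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height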

Reposted from blog.csdn.net/jiangwei1102/article/details/80790246