# -*- coding: utf-8 -*-
"""Scrapy spider that drives a real browser (Selenium) to crawl CSDN's
dynamically loaded blog listing, which only appends entries on scroll."""
import scrapy
import re
from ..items import CsdnItem
import requests
from lxml import etree
import selenium
from selenium import webdriver
from time import sleep


class CnSpider(scrapy.Spider):
    """Spider for blog.csdn.net's cloud-category listing page."""
    name = 'cnspider'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/nav/cloud']

    def parse(self, response):
        """Scroll the listing page to trigger lazy loading, then yield one
        CsdnItem (title / content / url) per article entry found.

        Note: uses Selenium instead of the Scrapy `response` because the
        entries are injected by JavaScript as the user scrolls.
        """
        driver = webdriver.Chrome()  # visible browser: convenient for watching the crawl
        # Headless alternative:
        # options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        # driver = webdriver.Chrome(chrome_options=options)
        try:
            driver.get(self.start_urls[0])
            sleep(3)  # allow the initial page render to finish
            # Scroll 20 times, pausing after each scroll so the page's lazy
            # loader has time to append new entries.  (The original fired all
            # scrolls back-to-back with no pause, which loads nothing extra,
            # and its comment claimed "five times" while looping 20.)
            for _ in range(20):
                driver.execute_script('document.documentElement.scrollTop=20000')
                sleep(0.5)
            # NOTE(review): find_elements_by_xpath / find_element_by_xpath were
            # removed in Selenium 4; switch to driver.find_elements(By.XPATH, ...)
            # when upgrading — confirm the pinned selenium version.
            div_list = driver.find_elements_by_xpath("//div[@class='list_con']")
            print('获取到的需要爬取的链接对象有%d个' % len(div_list))
            sleep(3)
            for div in div_list:
                item = CsdnItem()
                # textContent (not .text) also captures text of hidden nodes;
                # strip embedded newlines from the scraped strings.
                item['title'] = div.find_element_by_xpath(
                    "./div[@class='title']/h2/a"
                ).get_attribute('textContent').replace('\n', '')
                item['content'] = div.find_element_by_xpath(
                    "./div[@class='summary oneline']"
                ).get_attribute('textContent').replace('\n', '')
                item['url'] = div.find_element_by_xpath(
                    "./div[@class='title']/h2/a"
                ).get_attribute('href')
                yield item
        finally:
            # The original never closed the driver, leaking a Chrome process
            # per crawl; always quit, even if scraping raises.
            driver.quit()
Example: crawling an infinite-scroll website — extracting the loaded content
猜你喜欢
Adapted from blog.csdn.net/jiangwei1102/article/details/80790246
今日推荐
周排行