# -*- coding: utf-8 -*-
"""Scrapy spider that crawls video links from ixigua.com (Xigua Video).

Scrapy alone cannot render the JavaScript-driven page, so a Selenium
Chrome driver loads the home page, scrolls to trigger lazy loading,
collects detail-page links, and downloads each video with urlretrieve.
"""
import os
import random
import string
from time import sleep
from urllib.request import urlretrieve

import scrapy
from selenium import webdriver


def random_str(size=64):
    """Return a random alphanumeric string of length *size*.

    Used to generate unique filenames for the downloaded videos.
    """
    base_str = string.ascii_letters + string.digits
    # BUG FIX: the original iterated over ``base_str`` itself, which
    # ignored ``size`` and always produced a 62-character string.
    return ''.join(random.choice(base_str) for _ in range(size))


class XgSpider(scrapy.Spider):
    name = 'xg'
    allowed_domains = ['www.ixigua.com']
    start_urls = ['http://www.ixigua.com/']

    def parse(self, response):
        """Render the home page with Selenium, collect video detail-page
        links, and download each video into ``./video/``.

        Returns the list of detail-page URLs that were processed.
        """
        href_list = []
        # Visible browser window; handy for watching the crawl progress.
        driver = webdriver.Chrome()
        # Headless alternative:
        # options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        # driver = webdriver.Chrome(chrome_options=options)
        try:
            driver.get('https://www.ixigua.com/')
            sleep(3)
            # Scroll several times so the page lazy-loads more entries.
            for _ in range(5):
                driver.execute_script('document.documentElement.scrollTop=20000')
            a_list = driver.find_elements_by_xpath(
                '//div[@class="title-box"]/a[@class="link"]')
            print('获取到的需要爬取的链接对象有%d个' % len(a_list))
            sleep(3)
            # Collect detail-page links, e.g.
            # https://www.ixigua.com/group/6562496704937984525/
            href_list = [a.get_attribute('href') for a in a_list]

            # FIX: make sure the output directory exists before downloading;
            # urlretrieve raises FileNotFoundError otherwise.
            os.makedirs('./video', exist_ok=True)
            for href in href_list:
                driver.get(href)
                sleep(2)
                src = driver.find_element_by_tag_name('video').get_attribute('src')
                print('当前爬取视频链接为%s' % src)
                urlretrieve(src, filename='./video/' + random_str(size=5) + '.mp4')
                print('该视频信息保存成功')
            print('全部数据保存完毕')
        finally:
            # FIX: always release the browser, even if crawling fails midway
            # (the original leaked the Chrome process on any exception).
            driver.close()
        return href_list
# Crawls Xigua Video (爬取西瓜视频); target page sections include
# 猜你喜欢 (Guess You Like), 今日推荐 (Today's Picks), 周排行 (Weekly Ranking).
# Adapted from blog.csdn.net/jiangwei1102/article/details/80789933