利用FilesPipeline 下载视频
1.settings.py
"""Scrapy project settings for downloading videos with a FilesPipeline."""
import os
import random

# --- Logging ---------------------------------------------------------------
# Only LOG_LEVEL is active; the remaining logging options (including the log
# file name) are kept as commented-out examples.
LOG_LEVEL = "INFO"
# LOG_STDOUT = True
# LOG_ENCODING = 'utf-8'
# # path: os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
# LOG_FILE = "info.log"

# --- Download delay --------------------------------------------------------
# Sum of two random floats, i.e. a value in [0, 2), evaluated once when the
# settings module is imported.
DOWNLOAD_DELAY = random.random() + random.random()
RANDOMIZE_DOWNLOAD_DELAY = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
)

# --- Video download location -----------------------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MEDIA_ALLOW_REDIRECTS = True
# Directory where downloaded files are stored: "<settings dir>/videos".
FILES_STORE = os.path.join(BASE_DIR, "videos")
# Item field names used by the stock FilesPipeline.
# NOTE(review): the custom download pipeline in pipelines.py overrides
# get_media_requests() and reads item['file_url'] directly, so
# FILES_URLS_FIELD does not appear to be consulted - confirm before relying
# on it.
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'
# 120 days of delay for files expiration
# FILES_EXPIRES = 120  # file expiration time

# Pipelines run in ascending order of their number, so the download pipeline
# (100) handles each item before the pass-through pipeline (300).
ITEM_PIPELINES = {
    'steam_video.pipelines.SteamVideoPipeline': 300,
    'steam_video.pipelines.SteamDownLoadPipeline': 100,  # video download pipeline
}
2.spider.py
import re

import pymysql
import scrapy


def _build_video_name(appid, url):
    """Return the file name to save ``url`` under.

    The name is ``<appid>_<first all-digit path segment>_<basename>``, where
    the basename is the last path component with any query string removed.
    If the URL has no all-digit path segment, that part is omitted instead of
    raising ``IndexError``.
    """
    basename = url.split('/')[-1].split('?')[0]
    parts = [str(appid)]
    digits = re.search(r'/(\d+)/', url)
    if digits:
        parts.append(digits.group(1))
    parts.append(basename)
    return '_'.join(parts)


class VideosSpider(scrapy.Spider):
    """Yield one download item per media URL stored in ``steam_game_video``.

    The start URL is only a trigger for ``parse``; the response itself is not
    inspected. The media URLs come from a local MySQL table.
    """

    name = 'videos'
    # NOTE(review): '.com' is not a real domain name. This spider yields no
    # follow-up requests of its own, so the value looks unused here - confirm
    # it does not filter the pipeline's media requests on your Scrapy version.
    allowed_domains = ['.com']
    start_urls = ['https://www.baidu.com/']

    def parse(self, response):
        """Read media URLs from the database and yield download items.

        Each yielded dict has the keys ``id``, ``appid``, ``file_url`` (the
        URL to download) and ``video_name`` (the name to save the file under).
        Empty/NULL URL columns are skipped.
        """
        db = pymysql.connect(host='localhost', port=3306, database='game',
                             user='root', password='root', charset='utf8',
                             autocommit=True)
        # Fetch every row up front so the connection is closed before the
        # generator starts yielding (it may be suspended for a long time).
        try:
            cursor = db.cursor()
            try:
                cursor.execute(
                    'SELECT id, appid, data_webm_source, data_webm_hd_source, '
                    'data_mp4_source, data_mp4_hd_source, data_poster '
                    'from steam_game_video WHERE id<5')
                rows = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            db.close()

        for row in rows:
            row_id, appid = row[0], row[1]
            # Columns 2..6 hold the webm/mp4 sources and the poster URL.
            for url in row[2:7]:
                if not url:
                    continue
                item = {
                    'id': row_id,
                    'appid': appid,
                    'file_url': url,
                    'video_name': _build_video_name(appid, url),
                }
                self.logger.debug('video item: %r', item)
                yield item
3.pipelines.py
# -*- coding: utf-8 -*-
import hashlib
import os

import scrapy
from scrapy.http import Request
# FilesPipeline is defined in scrapy.pipelines.files; import it from its home
# module rather than relying on scrapy.pipelines.images re-exporting it.
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class SteamVideoPipeline(object):
    """Pass-through pipeline: returns every item unchanged."""

    def process_item(self, item, spider):
        return item


class SteamDownLoadPipeline(FilesPipeline):
    """Download the file referenced by ``item['file_url']``.

    The file is stored under the name given in ``item['video_name']``; when
    no name is supplied, a SHA-1 hash of the URL is used instead.
    """

    def get_media_requests(self, item, info):
        """Return the download requests for ``item``.

        The desired file name travels with the request in
        ``meta['video_name']`` so that ``file_path`` can read it back.
        Items without a ``file_url`` produce no requests.
        """
        file_url = item.get('file_url')
        if not file_url:
            return []
        return [scrapy.Request(file_url,
                               meta={'video_name': item.get('video_name')})]

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path (relative to FILES_STORE) for ``request``.

        Uses ``request.meta['video_name']`` when present. Otherwise falls
        back to ``full/<sha1 of url><extension>``, with the extension taken
        from the URL after stripping any query string.

        ``request`` may also be a bare URL string, in which case the
        hash-based name is always used. ``item`` is accepted for callers that
        pass it as a keyword and is not used.
        """
        if isinstance(request, Request):
            url = request.url
            video_name = request.meta.get('video_name')
        else:
            url = request
            video_name = None

        if video_name:
            return '{}'.format(video_name)

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        # Drop the query string so it does not end up in the extension.
        media_ext = os.path.splitext(url.split('?', 1)[0])[1]
        return 'full/{}{}'.format(media_guid, media_ext)