#!/usr/bin/env python
# -*- coding:utf-8 -*-
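'''
Crawler usage demo.

maiziedu.com protects its videos against hotlinking, so every request must
carry a Referer header; without it the video download comes back as HTTP 403.

demo: python maizi.py http://www.maiziedu.com/course/307/
'''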
import os
import re
import sys
from multiprocessing.pool import Pool
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class MaiZi:
    """Download every lesson video linked from one maiziedu.com course page.

    The course page is scanned for lesson links, each lesson page is scanned
    for its ``$lessonUrl`` video address, and the video is saved in the
    current directory as ``<lesson title>.mp4``.
    """

    # Root used to resolve the lesson links found on the course page.
    BASE_URL = 'http://www.maiziedu.com/'
    # Seconds to wait on connect/read before a request is abandoned;
    # requests waits indefinitely when no timeout is passed.
    TIMEOUT = 30
    # Bytes per chunk while streaming a video to disk.
    CHUNK_SIZE = 1024 * 1024
    # Pulls the video address out of the lesson page's inline script.
    _LESSON_URL_RE = re.compile(r'\$lessonUrl = "(.*?)"')

    def __init__(self, url):
        """Remember the course page *url* and the headers sent with every request."""
        self.url = url
        # The Referer header is what gets past the site's hotlink protection;
        # without it the video request is answered with 403.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Referer": "http://www.maiziedu.com"
        }

    @staticmethod
    def _safe_filename(title):
        """Return *title* with characters that are unsafe in file names replaced."""
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title.strip())
        # Leading/trailing dots and spaces are dropped so '..' cannot survive.
        return cleaned.strip(' .') or 'untitled'

    def parse_next_url(self):
        """Yield the absolute URL of every lesson listed on the course page.

        Raises:
            requests.RequestException: the course page could not be fetched
                or was answered with an HTTP error status.
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.TIMEOUT)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        for link in BeautifulSoup(response.text, 'lxml').select('ul.lesson-lists li a'):
            href = link.get('href')
            if href:
                # urljoin copes with relative, root-relative and absolute hrefs.
                yield urljoin(self.BASE_URL, href)

    def _download(self, video_url, target):
        """Stream *video_url* into the file *target* without buffering it in memory.

        Data is written to ``target + '.part'`` and renamed only after the
        transfer finished, so an interrupted download never leaves a
        truncated ``.mp4`` behind.
        """
        partial = target + '.part'
        try:
            with requests.get(video_url, headers=self.headers, stream=True,
                              timeout=self.TIMEOUT) as video:
                # A 403 body must not be stored as if it were the video.
                video.raise_for_status()
                with open(partial, 'wb') as out:
                    for chunk in video.iter_content(chunk_size=self.CHUNK_SIZE):
                        if chunk:
                            out.write(chunk)
            os.replace(partial, target)
        finally:
            # Only still present when the transfer did not complete.
            if os.path.exists(partial):
                os.remove(partial)

    def parse_content(self, url):
        """Download the video of the lesson page at *url*.

        A lesson that cannot be fetched or parsed is reported and skipped so
        that one bad lesson does not abort the rest of the course.
        """
        try:
            page = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as err:
            print('skip (lesson page unavailable):', url, err)
            return
        page.encoding = page.apparent_encoding

        match = self._LESSON_URL_RE.search(page.text)
        selected = BeautifulSoup(page.text, 'lxml').select('span.selected')
        # The selected lesson entry carries the video name in its "name" attribute.
        title = selected[0].get('name') if selected else None
        if match is None or not title:
            print('skip (no video url or title found):', url)
            return

        print('#'*20)
        print('download...')
        try:
            self._download(match.group(1), self._safe_filename(title) + '.mp4')
        except requests.RequestException as err:
            print('download failed:', url, err)
            return
        print("下载完成:",title)

    def parse_pool(self, processes=4):
        """Download all lessons of the course using *processes* worker processes."""
        # Resolved up front so a failing course page surfaces before workers start.
        lesson_urls = list(self.parse_next_url())
        if not lesson_urls:
            print('no lessons found:', self.url)
            return
        # map() blocks until every lesson is done; the context manager then
        # tears the workers down even if map() raised.
        with Pool(processes) as pool:
            pool.map(self.parse_content, lesson_urls)
if __name__ == '__main__':
    # The course page URL to download is the single command-line argument.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: python {} <course-url>'.format(sys.argv[0]))
    course = sys.argv[1]
    downloader = MaiZi(course)
    downloader.parse_pool()
# (Residue of the blog page's "copy code" button.)
# For more content, visit www.zxb8.cc
# Reposted from: https://juejin.im/post/5d0984fc5188255c636e290e