一. Crawler Fundamentals
What is a crawler: a program that automatically fetches (crawls) data from websites.
What is the Internet: computers interconnected with one another by a mass of network devices.
Why the Internet exists: to transfer and share data.
The whole process of going online:
    Ordinary user:
        open a browser -> send a request to the target site -> receive the response data -> render it on the page
    Crawler program:
        send a request to the target site -> receive the response data -> parse and extract the target data -> save it
The full crawler workflow:
1 Send the request (request libraries)
    requests module
    selenium module
2 Get the response data (whatever the server returns)
3 Parse and extract the data (parsing libraries)
    bs4 (Beautiful Soup 4)
    XPath
4 Save the data
    MongoDB
(Steps 1, 3 and 4 are the ones you write by hand; an end-to-end sketch follows.)
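A minimal end-to-end sketch of those four steps, assuming a MongoDB server running on localhost; the target URL and the title extraction are illustrative placeholders, not part of these notes:

import requests
from bs4 import BeautifulSoup    # bs4 parsing library
from pymongo import MongoClient  # MongoDB driver

# 1 Send the request (request library)
response = requests.get('https://example.com/')

# 2 Get the response data the server returned
html = response.text

# 3 Parse and extract the data (parsing library)
soup = BeautifulSoup(html, 'html.parser')
title = soup.title.string if soup.title else ''

# 4 Save the data (assumes a local mongod on the default port)
client = MongoClient('localhost', 27017)
client['spider']['pages'].insert_one({'url': response.url, 'title': title})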
Crawler frameworks:
    Scrapy (object-oriented; a minimal spider sketch follows)
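For a taste of Scrapy, a minimal spider sketch; the spider name and the link-extraction selector here are illustrative, and the file runs with: scrapy runspider pear_spider.py

import scrapy

class PearSpider(scrapy.Spider):
    # Scrapy drives the request/response cycle; we only write the parsing
    name = 'pear'
    start_urls = ['https://www.pearvideo.com/']

    def parse(self, response):
        # parse() is called with each downloaded response
        for m_id in response.css('a::attr(href)').re(r'video_(\d+)'):
            yield {'detail_url': response.urljoin('video_' + m_id)}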
Crawling Pearvideo:
    1 Analyze the site to find the video source URL
    2 Use requests to send a request to that video source URL
    3 Get the video's binary stream and save it locally
二. Crawling Pearvideo
Request URL:
    http://www.pearvideo.com/
Request method:
    GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36
import requests
import re  # regex module
# uuid.uuid4() generates a practically unique random string
import uuid

# Send a request to the Pearvideo homepage and get the response data
response = requests.get(url='http://www.pearvideo.com/')
print(response.status_code)
print(response.text)

# re.findall('regex pattern', 'text to parse', 'match flags')
# re.S: make . match newlines too, so the whole text is searched
# .  matches any single character
# *? repeats lazily, matching as few characters as possible

'''
# Extract the video detail-page IDs from the homepage
res = re.findall('<a href="video_(.*?)"', response.text, re.S)
print(res)

for m_id in res:
    # Build the detail-page URL
    detail_url = 'https://www.pearvideo.com/video_' + m_id
    print(detail_url)
'''

三. The Crawler in Three Steps

# The three steps of a crawler:
# 1 Send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2 Parse the data
# Parse the homepage to get the video detail-page IDs
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    # print(res)
    detail_url_list = []
    for m_id in res:
        # Build the detail-page URL
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)
    # print(detail_url_list)
    return detail_url_list

# Parse the detail page to get the video URL
def parse_detail(text):
    '''
    (.*?)  captures what the parentheses match
    .*?    matches lazily (as few characters as possible)

    First attempt at the regex: <video.*?src="(.*?)"

    That was the analysis; the page stores the real address in a
    srcUrl variable, which is what we match below.
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3 Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)

    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':

    # 1) Request the homepage
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2) Parse the homepage to get the detail-page URLs
    detail_url_list = parse_index(index_res.text)

    # 3) Request each detail page
    for detail_url in detail_url_list:
        detail_res = get_page(detail_url)
        # print(detail_res.text)

        # 4) Parse the detail page to get the video URL
        movie_url = parse_detail(detail_res.text)
        print(movie_url)

        # 5) Save the video
        save_movie(movie_url)
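To see why re.S and the lazy (.*?) matter here, a tiny standalone demo; the HTML fragment is made up for illustration:

import re

# The src sits on a different line than <video
html = '<video poster="cover.png"\n    src="http://example.com/v.mp4">'

# Without re.S, . cannot cross the newline, so nothing matches
print(re.findall('<video.*?src="(.*?)"', html))        # []
# With re.S, . matches newlines too, and *? stops at the first closing quote
print(re.findall('<video.*?src="(.*?)"', html, re.S))  # ['http://example.com/v.mp4']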
四. A High-Performance Crawler
import requests
import re  # regex module
# uuid.uuid4() generates a practically unique random string
import uuid
# Import the thread pool
from concurrent.futures import ThreadPoolExecutor

# Limit the pool to 50 threads
pool = ThreadPoolExecutor(50)

# The three steps of a crawler

# 1 Send the request
def get_page(url):
    print(f'Starting async task: {url}')
    response = requests.get(url)
    return response

# 2 Parse the data
# Parse the homepage to get the video detail-page IDs
def parse_index(res):
    response = res.result()

    # Extract all the IDs on the homepage
    id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(id_list)

    for m_id in id_list:
        # Build the detail-page URL
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        # Submit the detail-page URL to get_page; parse_detail runs when it finishes
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

# Parse the detail page to get the video URL
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    pool.submit(get_page, movie_url).add_done_callback(save_movie)

# 3 Save the data
def save_movie(res):
    movie_res = res.result()

    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'Download finished: {movie_res.url}')
        f.flush()

if __name__ == '__main__':
    # Submit an async request to get_page and hand the result to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)
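The pattern above chains futures: add_done_callback hands each callback the finished Future, which is why every callback begins with res.result(). A tiny standalone demo of just that mechanism:

from concurrent.futures import ThreadPoolExecutor

demo_pool = ThreadPoolExecutor(2)

def work(x):
    return x * 2

def on_done(fut):
    # The callback receives the Future; .result() is work()'s return value
    print('result:', fut.result())

demo_pool.submit(work, 21).add_done_callback(on_done)
demo_pool.shutdown(wait=True)  # wait for the task to finish before exiting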
五. requests in Detail
'''
Visiting Zhihu Explore
Request URL:
    https://www.zhihu.com/explore
Request method:
    GET
Request headers:
    user-agent:
    cookies
'''

'''
1 Visiting Zhihu without headers
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)  # 400
# print(response.text)  # returns an error page
'''

# # Visit Zhihu again, this time carrying a request header
# import requests
#
# # Request-header dictionary
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
# }
# # Add the user-agent to the GET request
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)  # 200
# # print(response.text)
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)

'''
2 params request parameters
Baidu search URLs for 安徽工程大学 (Anhui Polytechnic University):
http://www.baidu.com/s?wd=安徽工程大学&pn=10   # page 2 of the results
http://www.baidu.com/s?wd=安徽工程大学&pn=20   # page 3 of the results
'''
import requests
from urllib.parse import urlencode

# url = 'https://www.baidu.com/s?ie=UTF-8&wd=%E5%90%B4%E4%B8%96%E5%8B%8B'
# url = 'https://www.baidu.com/s?' + urlencode({"wd": "吴世勋"})
url = 'http://www.baidu.com/s?'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
# print(url)

# Pass the query string via the params argument of the get method
# (note: headers must be a keyword argument, and the search term a quoted string;
#  the commented call below gets both wrong)
# response = requests.get(url, headers, params={"wd": 安徽工程大学})
response = requests.get(url, headers=headers, params={"wd": "安徽工程大学", "pn": "20"})
# print(response.text)
with open('daxue.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

'''
3 Carrying cookies
Carry login cookies to pass a site's login check (the example below uses
cnblogs; the same approach works for sites like GitHub).
Request URL:
    https://home.cnblogs.com/
Request method:
    GET
Request headers:
    user-agent:
    cookie:
'''
# import requests
# # Request URL
# url = 'https://home.cnblogs.com/'
#
# # Request headers, with the cookies spliced directly into the header
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
#     'cookie': '_ga=GA1.2.1494095823.1560514836; _gid=GA1.2.2000239107.1560514836; __gads=ID=3457e99144b27181:T=1560515208:S=ALNI_MbgDkjlTzfuOz-OsDnFUhZiH6iY6Q; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrk-pTyYyzsxPw3nNf-6ASgoRbZafuUr2Dv2G1AmPstLCy2jzaj7RxdZXX6XyO6fTYJrOP8syyUMXgGCF1qMHG97Q-TlTPRKus5V-0ybfTox86x6BO95rYxd4S9CXU2ToqaCcXaHNM5_pQKAlyw6Ft-UzR6tiAGen8R1RR4MFCPiTdhjf0D-mHPmwBxH4WyelObmrmhWQNQ_p67Z-cKt4eBgYUARJN7EUgEOwm9A61_OiX3NvYpX1OJsgBXtLYsv9HAI1LKBc-kcsArkqJTqOYDQiTEufIJdA5J1LnO2IyufR5p7mtE5ndOu_1rfkJQGrrosM_BLW-2kudB-m2qPK6wIGiSXmpRkHIO2Ynnk7K3P3LJ2W3UVefcZi8ltEvihTbSCV_Va5cGmRoN8rJIkjhy11Dxk1vUzozHwnteQMxsQoyDaPccmSUMbpXi-nuSQ7xU; .CNBlogsCookie=422A6425E691922C84536778CC298BCCEF63DA25F78D4D50D5B8CDA6303086AD20B2084D31E8A0BB09C035CA5E89E4BB4CABC57499F6032A2A6B91F92580E1A49CDF85BC1A97DDBC02BAA6783521832ABAFD6763; _gat=1'
# }
#
# boke_res = requests.get(url, headers=headers)

# import requests
# # Alternatively, pass the cookies separately via the cookies argument
# cookies = {
#     'Cookie': '_ga=GA1.2.1494095823.1560514836; _gid=GA1.2.2000239107.1560514836; __gads=ID=3457e99144b27181:T=1560515208:S=ALNI_MbgDkjlTzfuOz-OsDnFUhZiH6iY6Q; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrk-pTyYyzsxPw3nNf-6ASgoRbZafuUr2Dv2G1AmPstLCy2jzaj7RxdZXX6XyO6fTYJrOP8syyUMXgGCF1qMHG97Q-TlTPRKus5V-0ybfTox86x6BO95rYxd4S9CXU2ToqaCcXaHNM5_pQKAlyw6Ft-UzR6tiAGen8R1RR4MFCPiTdhjf0D-mHPmwBxH4WyelObmrmhWQNQ_p67Z-cKt4eBgYUARJN7EUgEOwm9A61_OiX3NvYpX1OJsgBXtLYsv9HAI1LKBc-kcsArkqJTqOYDQiTEufIJdA5J1LnO2IyufR5p7mtE5ndOu_1rfkJQGrrosM_BLW-2kudB-m2qPK6wIGiSXmpRkHIO2Ynnk7K3P3LJ2W3UVefcZi8ltEvihTbSCV_Va5cGmRoN8rJIkjhy11Dxk1vUzozHwnteQMxsQoyDaPccmSUMbpXi-nuSQ7xU; .CNBlogsCookie=422A6425E691922C84536778CC298BCCEF63DA25F78D4D50D5B8CDA6303086AD20B2084D31E8A0BB09C035CA5E89E4BB4CABC57499F6032A2A6B91F92580E1A49CDF85BC1A97DDBC02BAA6783521832ABAFD6763; _gat=1'
# }
#
# boke_res = requests.get(url, headers=headers, cookies=cookies)
# # If we are logged in, the username appears in the page
# print('notfishnot' in boke_res.text)
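A footnote to the cookie examples: requests can also carry cookies across requests automatically through a Session object. A minimal sketch, reusing the cnblogs URL from above:

import requests

session = requests.Session()
session.headers['user-agent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/74.0.3729.131 Safari/537.36')

# Cookies set by this response are stored on the session...
first = session.get('https://home.cnblogs.com/')
print(first.cookies.get_dict())

# ...and sent automatically with every later request on the same session
second = session.get('https://home.cnblogs.com/')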
六. Crawling the Douban Movie Top 250
(This crawls only the first page of the list; see the paging sketch after the code.)
'''
Homepage: https://movie.douban.com/top250
GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
The regex (detail-page url, poster link, title, rating, number of ratings):
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# 1. Send a request to the Douban Top 250 page and get the response
response = requests.get(url, headers=headers)
# print(response.text)

# 2. Extract the data with the regex:
# detail-page url, poster link, title, rating, number of ratings
movie_content_list = re.findall(
    # the regex pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
    # the text to parse
    response.text,
    # the match flags
    re.S)

for movie_content in movie_content_list:
    # Unpack one movie's fields
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Title: {name}, detail page: {detail_url}, poster: {movie_jpg}, rating: {point}, ratings: {num}\n'
    print(data)

    # 3. Save the data: append each movie's info to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
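The Top 250 list is paginated through a start query parameter (start=0 is the first page, start=25 the second, and so on). A sketch of the paging loop, reusing the url and headers above; each page's text would go through the same re.findall:

import requests

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# Ten pages of 25 movies each: start = 0, 25, 50, ..., 225
for start in range(0, 250, 25):
    page = requests.get(url, headers=headers, params={'start': start})
    print(page.url, page.status_code)
    # ...parse page.text with the regex from the code above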
七. Homework
'''
Today's homework:
Crawl the info for all 250 movies in the Douban Top 250:
    https://movie.douban.com/top250
PS: extract each movie's detail-page url, poster link, title,
rating, number of ratings, director, stars, release date, and synopsis
'''