原生爬虫(爬取熊猫直播人气排名)

 1 ''''
 2     This is a module
 3 '''
 4 
 5 import re
 6 
 7 from urllib import request
 8 # 断点调试
 9 
class Spider():
    '''
    Scrape a Panda TV listing page and print its anchors ranked by popularity.

    The page is fetched over HTTP, the name and popularity figure of every
    ``video-info`` block are extracted with regular expressions, and the
    entries are printed in descending order of that figure.
    '''

    # Listing page to scrape.
    url = 'https://www.panda.tv/all?pdt=1.27.psbar-menu.0.1oj9bbkfjbh'
    # "[\w\W]*?" lazily matches any run of characters, newlines included.
    # Raw strings keep the regex escapes from being parsed as string escapes.
    root_pattern = r'<div class="video-info">([\w\W]*?)</div>'
    name_pattern = r'</i>([\w\W]*?)</span>'
    number_pattern = r'<span class="video-number">([\w\W]*?)</span>'
    # Unit suffix meaning "ten thousand" (the CJK character U+4E07), written
    # as an escape so it survives re-encoding of this file.
    # NOTE(review): the original check used an empty string literal, which
    # matches every input; U+4E07 is assumed to be the lost character.
    _ten_thousand = '\u4e07'

    def __fetch_content(self):
        '''
        Download the listing page and return its HTML as text.

        Returns:
            The response body decoded as UTF-8.
        '''
        # The context manager closes the connection even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        '''
        Extract the raw name/number matches of every ``video-info`` block.

        Args:
            htmls: HTML text of the listing page.

        Returns:
            A list of dicts ``{'name': [...], 'number': [...]}`` holding the
            unprocessed ``re.findall`` results for each block.
        '''
        return [
            {
                'name': re.findall(self.name_pattern, block),
                'number': re.findall(self.number_pattern, block),
            }
            for block in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        '''
        Turn raw match lists into clean ``{'name', 'number'}`` records.

        Blocks in which the name or the number was not found are skipped
        instead of raising ``IndexError``.

        Args:
            anchors: Iterable of dicts as produced by ``__analysis``.

        Yields:
            Dicts with the first name match (whitespace stripped) and the
            first number match.
        '''
        for anchor in anchors:
            names = anchor['name']
            numbers = anchor['number']
            if not names or not numbers:
                continue
            yield {'name': names[0].strip(), 'number': numbers[0]}

    def __sort(self, anchors):
        '''
        Return the anchors sorted by popularity, highest first.

        Args:
            anchors: Iterable of refined anchor dicts.

        Returns:
            A new list ordered by descending ``__sort_seed`` value.
        '''
        # sorted() is ascending by default, hence reverse=True.
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        '''
        Compute the numeric sort key of one refined anchor.

        The first decimal number in ``anchor['number']`` is parsed, and it is
        scaled by 10000 only when the text carries the ten-thousand unit.

        Args:
            anchor: Dict with a ``'number'`` string such as ``'9999'``.

        Returns:
            The figure as a float, or ``0.0`` when the text has no digits.
        '''
        text = anchor['number']
        # Accept an optional fractional part so "1.5" is not truncated to 1.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if self._ten_thousand in text:
            number *= 10000
        return number

    def __show(self, anchors):
        '''
        Print one ``rank<N>:<name> <number>`` line per anchor.

        Args:
            anchors: Sequence of refined anchor dicts, already ordered.
        '''
        for rank, anchor in enumerate(anchors, start=1):
            print('rank{}:{} {}'.format(rank, anchor['name'], anchor['number']))

    def go(self):
        '''
        Run the whole pipeline: fetch, parse, clean, sort and print.
        '''
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)

# Run the scraper only when this file is executed as a script, so that merely
# importing the module does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
View Code

猜你喜欢

转载自www.cnblogs.com/KSYoon/p/9656939.html