Python
---小白121的记录笔记
# NOTE(review): this code uses `os`, `re` and `requests`, but no import lines
# are visible in this snippet; they must be imported before it runs.


class main_:
    """Crawler helpers for a paginated course listing.

    The methods fetch pages, build the list of page URLs, extract course
    fields from the fetched markup with regular expressions, and append the
    collected records to a UTF-8 text file.
    """

    def get_source(self, url):
        """Fetch ``url`` with ``requests.get`` and return its result."""
        return requests.get(url)

    def page_num(self, url, total_page):
        """Build the URLs of every page from the current one to ``total_page``.

        Args:
            url: A URL containing ``pageNum=<digits>``; the number found
                there is the first page of the returned range.
            total_page: Last page number to include (inclusive).

        Returns:
            A list of URLs, one per page, in ascending page order. The list
            is empty when ``total_page`` is below the page number in ``url``.

        Raises:
            AttributeError: If ``url`` contains no ``pageNum=<digits>`` part
                (``re.search`` returns ``None``).
        """
        first_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        # `flags` must be passed by keyword: the fourth positional argument
        # of re.sub is `count`, so a positional re.S would cap the number of
        # substitutions at 16 instead of setting a flag.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%s' % page, url, flags=re.S)
            for page in range(first_page, total_page + 1)
        ]

    def get_info(self, html):
        """Extract the course fields from a fetched page.

        Args:
            html: An object whose ``text`` attribute holds the page markup
                as a string (``get_source`` returns such an object).

        Returns:
            A dict with the keys ``title``, ``content``, ``h_m_person`` and
            ``h_long``; newlines and tabs are removed from every value.

        Raises:
            AttributeError: If any of the patterns is not found in the
                markup (``re.search`` returns ``None``).
        """
        # (result key, pattern with one capture group, regex flags).
        # Only `h_m_person` is searched without DOTALL, so its value cannot
        # span several lines.
        fields = (
            ('title',
             r'class="lessonimg" title="(.*?)" alt="',
             re.S),
            ('content',
             r'<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>',
             re.S),
            ('h_m_person',
             r'<em class="learn-number">(.*?)</em>',
             0),
            ('h_long',
             r'<dd class="mar-b8"><i class="time-icon"></i><em>(.*?)</em>',
             re.S),
        )
        text = html.text
        info = {}
        for key, pattern, flags in fields:
            captured = re.search(pattern, text, flags).group(1)
            info[key] = captured.replace('\n', '').replace('\t', '')
        return info

    def save_info(self, all_info, directory='F:\\python测试\\',
                  filename='极客学院课程info.txt'):
        """Append every record in ``all_info`` to a UTF-8 text file.

        Args:
            all_info: Iterable of dicts as produced by ``get_info``.
            directory: Directory that holds the output file. It must already
                exist; it is not created here.
            filename: Name of the output file inside ``directory``.

        Raises:
            OSError: If the file cannot be opened (for example because
                ``directory`` does not exist).
            KeyError: If a record lacks one of the expected keys.
        """
        # Join the path instead of calling os.chdir so that saving does not
        # change the working directory of the whole process.
        target = os.path.join(directory, filename)
        # The context manager closes the file even if a record is malformed.
        with open(target, 'a', encoding='utf-8') as out:
            for each in all_info:
                out.write('题目为:' + each['title'] + '\n')
                out.write('内容为:' + each['content'] + '\n')
                out.write('学习人数:' + each['h_m_person'] + '\n')
                out.write('课程时间:' + each['h_long'] + '\n\n\n\n\n')


if __name__ == '__main__':
    a = main_()  # crawler instance
    # Highest page number to crawl, read from the user.
    page_num = int(input('please input crawl web the page : '))
    classinfo = []
    # NOTE(review): `url` is not defined anywhere in the visible source. It
    # must be assigned before this point and contain `pageNum=<n>`.
    change_page = a.page_num(url, page_num)
    for i in change_page:
        html = a.get_source(i)
        info = a.get_info(html)
        classinfo.append(info)
        print('正在提取:%s ' % i)
    # Everything is written in one go after all pages have been crawled.
    a.save_info(classinfo)