【Part1】获取百度搜索前10页链接
getBaiduUrl.py源码:
输入:一个关键词,如"IOS12"
输出:百度搜索前10页的链接(保存到文件BaiduUrls.txt中)
# coding=UTF-8
"""getBaiduUrl.py

Input : one keyword, e.g. "IOS12".
Output: the result links from the first 10 Baidu search pages,
        appended to BaiduUrls.txt (one URL per line).
"""
from urllib.request import Request, urlopen, quote
from urllib.error import URLError

import chardet
from bs4 import BeautifulSoup as BS


class GetUrls(object):
    """Crawl the first 10 Baidu result pages for a single keyword."""

    pageCount = 1  # 1-based number of the result page currently being crawled
    # 'key' / 'pageNum' are placeholders substituted later.
    search_url = 'http://www.baidu.com/s?wd=key&pn=pageNum'
    # Browser-like headers; Baidu serves a degraded page to the default
    # urllib User-Agent.  (The original value started with a duplicated
    # "User-Agent:" prefix, which is not a valid product token.)
    req_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/56.0.2924.87 Safari/537.36',
    }

    def __init__(self, inputInfo):
        """Store the keyword and substitute it (URL-quoted) into search_url."""
        self.inputInfo = inputInfo
        GetUrls.search_url = GetUrls.search_url.replace('key', quote(self.inputInfo))

    def __detectCode(self, url):
        """Return the lower-cased charset of the page at `url`.

        Tries the Content-Type header first, then lets chardet sniff the
        body, and finally falls back to 'utf-8'.
        """
        # Python 3: info() returns an email.message.Message, which exposes
        # get_content_charset(); the original getparam() is Python-2-only.
        coding = urlopen(url).info().get_content_charset()
        if coding is None:
            body = urlopen(url).read()
            coding = chardet.detect(body)['encoding']
        if coding is None:
            coding = 'utf-8'
        coding = coding.lower()
        print(coding)
        return coding

    def __getTitle(self, url):
        """Fetch `url` and return its <title> text (None on HTTP/URL errors)."""
        coding = self.__detectCode(url)
        try:
            titleReq = Request(url, None, GetUrls.req_header)
            titleRes = urlopen(titleReq)
            html = titleRes.read()
            titleSoup = BS(html.decode(coding, 'ignore'))
            title = titleSoup.title.string
            print(title)
            return title
        except URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)

    def __getInfo(self, redUrl):
        """Append the page title and the URL of `redUrl` to info.txt."""
        # Open the text file with encoding='gbk', errors='ignore' instead of
        # writing title.encode('gbk', 'ignore'): concatenating bytes with
        # '\n' (str) and writing bytes to a text-mode file both raise
        # TypeError on Python 3.
        with open('info.txt', mode='a', encoding='gbk', errors='ignore') as a_file:
            title = self.__getTitle(redUrl)
            if title:
                a_file.write(title + '\n')
                a_file.write(redUrl + '\n\n')

    def __searchUrls(self, url):
        """Fetch one result page, append every result's redirect-resolved
        link to BaiduUrls.txt, then recurse onto the next page."""
        if GetUrls.pageCount > 10:  # stop after the first 10 pages
            return
        # Baidu's 'pn' parameter is a 0-based result offset: page N starts
        # at (N - 1) * 10.
        search_url = url.replace('pageNum', str((GetUrls.pageCount - 1) * 10))
        try:
            # Keep the browser-like headers.  (The original built this
            # Request and then immediately overwrote it with a header-less
            # Request(search_url), silently discarding req_header.)
            req = Request(search_url, None, GetUrls.req_header)
            res = urlopen(req)
            html = res.read()
            soup = BS(html.decode('utf-8', 'ignore'))
            print("★ 百度搜索第 " + str(GetUrls.pageCount) + " 页")
            # 'with' guarantees the file is closed; the original leaked it.
            with open('BaiduUrls.txt', 'a+') as f:
                for result_table in soup.findAll("div", {"class": "result c-container "}):
                    h3 = result_table.find("h3", {"class": "t"})
                    h3_a = h3.find("a")
                    title = ''
                    link2 = ''
                    if h3_a is not None:
                        # Guard before touching h3_a; the original read
                        # h3_a.text before its None check.
                        title = str(h3_a.text)
                        # The href is a Baidu redirect; follow it to obtain
                        # the real target URL.
                        link = str(h3_a.get("href"))
                        link2 = urlopen(link).geturl()
                    print("◥ 标 题:" + title)
                    print(" 原始链接:" + link2)
                    f.write(link2 + '\n')
        except URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)
        GetUrls.pageCount = GetUrls.pageCount + 1
        self.__searchUrls(url)

    def UrlParse(self):
        """Public entry point: crawl pages 1..10 of the configured search."""
        self.__searchUrls(GetUrls.search_url)


if __name__ == '__main__':
    getUrlInfo = GetUrls('IOS12')
    getUrlInfo.UrlParse()
【Part2】打开文件 BaiduUrls.txt 中的链接,获取各页面正文,循环写入 Context.txt 中
Url2Text.py
输入:存入链接的txt文件
输出:这些链接的正文
# coding=utf-8
"""Url2Text.py

Input : BaiduUrls.txt -- one URL per line (produced by Part1).
Output: Context.txt   -- the extracted main text of every page, appended.
"""
from bs4 import BeautifulSoup
import sys
import re
from urllib.request import Request, urlopen, quote
from urllib.error import URLError
from readability.readability import Document
from html2text import html2text

with open('Context.txt', 'a+', encoding='utf8') as f:
    # Part1 writes the link list to BaiduUrls.txt; the original opened
    # 'result.txt', a file that no other part of the pipeline creates.
    with open('BaiduUrls.txt', 'r') as urls:
        for url in urls:
            # Lines read from a file keep their trailing '\n'; Request()
            # rejects URLs containing newline characters.
            url = url.strip()
            if not url:
                continue  # skip blank lines
            try:
                rqt = Request(url)
                response = urlopen(rqt)
                html = response.read()
                # readability extracts the main-article HTML;
                # html2text converts it to plain text.
                article = Document(html).summary()
                text = html2text(article)
                print(text)
                f.writelines('【' + str(url) + '】\n' + text + '\n')
            except URLError:
                continue  # best effort: skip pages that fail to load
【Part3】用 WordCloud 分析得到的正文
# @requires_authorization
"""Part3 -- character-bigram frequency statistics and a word cloud for the
article text collected in Context.txt."""
import urllib.request
import re
import string
from collections import OrderedDict
from os import path

import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
# scipy.misc.imread was removed from SciPy; plt.imread is used instead.
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = path.dirname(__file__)


def cleanInput(text):
    """Remove line breaks, [n] citation markers, digits, CJK punctuation
    and surrounding ASCII punctuation from `text`; collapse spaces."""
    text = re.sub(r'\n+', " ", text)                # remove line breaks
    text = re.sub(r'\[[0-9]*\]', "", text)          # remove bracketed numbers
    text = re.sub(r'[0-9]', "", text)               # remove digits
    text = re.sub(r'[,。.、!:%;”“\[\]]', "", text)   # remove CJK punctuation
    text = re.sub(r' +', " ", text)                 # collapse runs of spaces
    # Keep the result: the original discarded strip()'s return value,
    # making this line a no-op.
    text = text.strip(string.punctuation)           # trim ASCII punctuation
    return text


def getngrams(text, n):
    """Return a dict mapping each character n-gram of the cleaned `text`
    to its number of occurrences."""
    text = cleanInput(text)
    output = dict()
    for i in range(len(text) - n + 1):
        gram = "".join(text[i:i + n])
        output[gram] = output.get(gram, 0) + 1
    return output


def showplt(content, title):
    """Plot a bar chart of every bigram of `content` occurring > 30 times."""
    ngrams = getngrams(content, 2)
    ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
    count = []
    count_label = []
    for k in ngrams:
        print("(%s,%d)" % (k, ngrams[k]))
        if ngrams[k] > 30:
            count.append(ngrams[k])
            count_label.append(k)
    x = np.arange(len(count)) + 1
    plt.figure(1)
    rects = plt.bar(x, count, width=0.5, align="center", yerr=0.001)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign
    plt.title(title)

    def autolabel(rects):
        # Write each bar's value just above it.
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x(), 1.03 * height, '%s' % int(height))

    autolabel(rects)
    plt.xticks(x, count_label, rotation=90)
    plt.show()


def show_wordcloud(content, img):
    """Render a word cloud of `content`, shaped and colored by image `img`."""
    alice_coloring = plt.imread(path.join(d, img))
    # set.add() returns None, so the original effectively passed
    # stopwords=None; mutate STOPWORDS first, then pass the set itself.
    STOPWORDS.add("said")
    wc = WordCloud(background_color="white",  # background color
                   font_path="simhei.ttf",    # a font able to render CJK
                   mask=alice_coloring,       # shape mask image
                   stopwords=STOPWORDS,
                   max_font_size=60,          # largest glyph size
                   random_state=50)
    # generate() tokenizes the text and counts word frequencies internally.
    wc.generate(content)
    # Re-color the cloud using the mask image's own colors.
    image_colors = ImageColorGenerator(alice_coloring)
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    plt.show()


# 'with' guarantees the file handle is closed; the original leaked it.
with open('Context.txt', 'rb') as fp:
    content = fp.read().decode('utf-8')
print(content)
show_wordcloud(content, 'bg.png')