Python网络编程项目
根据豆瓣电影的URL获取电影的ID。
如图,URL(形如 https://movie.douban.com/subject/30306570/ )中的30306570就是电影《囧妈》的ID。
获取每部电影的前10页影评(每页20条),保存在以电影名称命名的TXT文件中。
对影评进行结巴(jieba)分词,并生成词云。
运行如图:
词云(可通过WordCloud的mask参数自定义词云形状):
代码如下:
# -*- coding: utf-8 -*-
from tkinter import *
import urllib.request
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib.pyplot import imread
import jieba
def getHtml(url, timeout=10):
    """Download ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError`` is a subclass).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even when read() or decode()
    # raises; the original left it open.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')
def getComment(url):
    """Fetch one comment page and extract the text of its short reviews.

    Args:
        url: Address of the comment page; it is downloaded via ``getHtml``.

    Returns:
        A list of strings, one per ``span`` element matched with the
        ``'short'`` class filter, each with a trailing newline appended.
        The list is empty when the page contains no such element.
    """
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    # find_all / get_text are the current spellings of the deprecated
    # findAll / getText aliases.
    return [span.get_text() + '\n' for span in soup.find_all('span', 'short')]
def ciyun(x, font_path="C:\\Windows\\Fonts\\simkai.ttf", out_path='result.jpg'):
    """Build, save and display a word cloud from a saved review file.

    Args:
        x: Movie name; the reviews are read from ``"%r.txt" % x`` in the
            current directory.
        font_path: Font file used to render the words. The default is the
            Windows KaiTi font; pass another CJK-capable font on other
            systems (per the original author's note, the library's default
            font cannot display Chinese).
        out_path: Where the rendered image is written.

    Raises:
        FileNotFoundError: If the review file does not exist.
    """
    # The %r formatting (which adds quotes around the name) is kept on
    # purpose: it must produce the same file name the writer side used.
    # The reviews are stored as UTF-8, so decode once here at the I/O
    # boundary and close the handle before the slow rendering starts.
    with open("%r.txt" % (x), "r", encoding="utf-8") as review_file:
        text = review_file.read()

    # Segment the Chinese text with jieba, then join the tokens with spaces
    # so WordCloud can split them again.
    wl = " ".join(jieba.cut(text, cut_all=True))

    # A `mask` image may additionally be passed here to give the cloud a
    # custom shape.
    wc = WordCloud(
        background_color="white",   # canvas colour
        max_words=2000,             # upper bound on words drawn
        stopwords=["的", "这种", "这样", "还是", "就是", "这个"],  # filler words to drop
        font_path=font_path,
        max_font_size=60,           # largest font size used
        random_state=30,            # fixed seed -> reproducible layout/colours
    )
    myword = wc.generate(wl)
    wc.to_file(out_path)

    # Show the generated image without axes.
    plt.imshow(myword)
    plt.axis("off")
    plt.show()
def on_click():
    """Button callback: scrape ten pages of reviews, save them, draw the cloud.

    Reads the movie name from ``sheet_text`` and the movie ID from
    ``xls_text``, writes every short review to ``'%r.txt' % name`` (UTF-8),
    echoes progress to stdout, and finally calls ``ciyun(name)``.
    Nothing is fetched when either field is blank.
    """
    # Strip surrounding whitespace: a stray space would end up inside the
    # request URL and the output file name.
    x = sheet_text.get().strip()
    y = xls_text.get().strip()
    if not x or not y:
        print('请输入电影名称和电影ID')
        return
    print("电影名称:%s 电影ID:%s " %(x,y))

    # The with-block guarantees the file is flushed and closed before
    # ciyun() reads it back; the original never closed it.
    with open('%r.txt' % (x), 'w', encoding='utf-8') as f:
        # Only 10 pages: per the original author, fetching more triggers a
        # captcha.
        for page in range(10):
            url = 'https://movie.douban.com/subject/' + y + '/comments?start=' + str(
                20 * page) + '&limit=20&sort=new_score&status=P'
            print('第%s页的评论:' % (page + 1))
            print(url + '\n')
            for i in getComment(url):
                f.write(i)
                print(i)
    print('爬取完成\n')
    ciyun(x)
# Build the main window: two labelled entry fields (movie ID, movie name)
# and one button that starts the scrape via on_click.
root = Tk()
root.title("爬取影评")
root.geometry('300x300')  # width x height of the window

l1 = Label(root, text="请输入电影ID")
l1.pack()
# Variable backing the ID entry. It is left empty: pre-filling it with a
# space (as before) leaked that space into the request URL.
xls_text = StringVar()
xls = Entry(root, textvariable=xls_text)
xls.pack()

l2 = Label(root, text="请输入电影名称")
l2.pack()
# Variable backing the name entry; left empty for the same reason (the
# name becomes part of the output file name).
sheet_text = StringVar()
sheet = Entry(root, textvariable=sheet_text)
sheet.pack()

Button(root, text="爬取", command=on_click).pack()

# Hand control to Tk; returns when the window is closed.
root.mainloop()