注意事项:
- 生成词云的图片必须是黑白照片
- 豆瓣不登录用户只能访问到影评的前12页的数据
- 豆瓣影评数据只展示24页
- 本方法是上一篇博客的另一种写法
- 爬取豆瓣的全部影评内容目前还不太容易,真正可行的方法我还没见到,希望各位大侠能真正爬取出来并分享。
import requests
from bs4 import BeautifulSoup
import time
import jieba
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import re
def getHtml(url):
    """Fetch one Douban comments page and return its decoded HTML text.

    Returns None when the request fails — typically because Douban only
    serves the first 12 comment pages to anonymous (not-logged-in) users.
    """
    try:
        r = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360EE'},
            cookies={'cookie': '1012'},
            timeout=10)  # don't hang forever on a stalled connection
        r.raise_for_status()
        r.encoding = "utf-8"  # force UTF-8 so Chinese text decodes correctly
        return r.text
    except requests.RequestException:
        # Catch only request failures (the old bare `except:` also hid
        # programming errors such as NameError/KeyboardInterrupt).
        print("到此为止。原因豆瓣不登录用户只能访问到影评的前12页的数据,"
              "登录后的用户可以看到前24页的数据(豆瓣影评数据只展示24页)")
        return None
def getData(html):
    """Extract the short comment texts from one comments page into new.txt.

    Writes the UTF-8 encoded comment bodies to new.txt.  Silently does
    nothing when html is None (failed fetch) or the expected container
    is missing (layout change / blocked request).

    NOTE(review): 'wb+' truncates new.txt on every call, so each page
    overwrites the previous one — switch to 'ab' if all pages should
    accumulate into a single word cloud; confirm intended behavior.
    """
    if html is None:
        return  # getHtml failed; nothing to parse
    soup = BeautifulSoup(html, "html.parser")
    comment_list = soup.find('div', attrs={'class': 'mod-bd'})
    if comment_list is None:
        return  # expected container absent; avoid AttributeError crash
    # `with` guarantees the file is flushed and closed (was left open before).
    with open("new.txt", 'wb+') as f:
        for comment in comment_list.find_all('div', attrs={'class': 'comment-item'}):
            comment_content = comment.find('span', attrs={'class': 'short'}).get_text()
            f.write(comment_content.encode('utf-8'))
def seg_sentence():
    """Filter new.txt down to Chinese characters only, in place.

    Reads the raw scraped comments from new.txt, keeps just the CJK
    characters (U+4E00-U+9FA5) — this also discards whitespace, ASCII
    and punctuation — and writes the result back to new.txt.
    """
    with open("new.txt", 'r', encoding='utf-8') as src:
        raw = src.read()
    # One regex pass replaces the original char-by-char strip loop: every
    # non-Chinese character (including whitespace) is dropped anyway.
    newtxt = ''.join(re.findall(r'[\u4e00-\u9fa5]', raw))
    with open("new.txt", "w", encoding='utf-8') as dst:
        dst.write(newtxt)
def wordcloud(m):
    """Render a word cloud of new.txt shaped by the mask image fbb<m>.png.

    Displays the cloud with matplotlib and saves it as wc<m>.png.
    Requires the black-and-white mask image and the SimHei font file
    to sit next to the script.
    """
    # Mask image: dark areas are filled with words, white areas ignored.
    # `with` releases the image file handle (it was never closed before).
    with Image.open('fbb' + str(m) + '.png', 'r') as image:
        img = np.array(image)
    with open('new.txt', 'r', encoding='utf-8') as src:
        cut_txt = src.read()
    # Segment the Chinese text; drop single-character tokens (mostly noise).
    words = [w for w in jieba.lcut(cut_txt) if len(w) != 1]
    newtxt = ' '.join(words)
    # Renamed from `wordcloud` — the old local shadowed this function's name.
    wc = WordCloud(
        mask=img,  # the mask overrides height/width
        background_color='white',
        max_words=500,  # cap the number of rendered words
        max_font_size=40,
        font_path="simhei.ttf").generate(newtxt)  # SimHei renders Chinese glyphs
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()  # blocks until the window is closed
    wc.to_file('wc' + str(m) + '.png')  # persist the rendered cloud
    # (dropped the dead `plt.axis("off")` that ran after plt.show())
def main():
    """Scrape Douban comment pages and build one word cloud per page.

    Iterates start offsets 0, 20, ..., 280 (up to 15 pages) and stops
    early when a request fails — anonymous users only get the first 12
    pages, so later fetches return None.
    """
    page = 0
    for start in range(0, 300, 20):  # 20 comments per page
        page += 1
        print("正在爬取第" + str(page) + "页的数据")
        time.sleep(1)  # throttle requests to avoid being blocked
        url = ('https://movie.douban.com/subject/26752088/comments?start='
               + str(start) + '&limit=20&sort=new_score&status=P')
        html = getHtml(url)
        if html is None:
            # Fetch failed/blocked: stop instead of crashing in getData(None).
            break
        getData(html)
        seg_sentence()
        wordcloud(page)


if __name__ == "__main__":
    main()