Word Cloud Portrait of Douban Comment Data

'''
# This code was borrowed from another website; I forget exactly which URL it came from.
# It can be run as-is.
'''
from selenium import webdriver
import time
import codecs
import jieba
import jieba.analyse as analyse
from wordcloud import WordCloud
from imageio import imread  # scipy.misc.imread has been removed from recent SciPy releases; imageio.imread is a drop-in replacement here
from os import path
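
# Dependency note (assumptions; the original post does not list them): the script
# relies on the PyPI packages selenium, jieba, wordcloud and imageio, e.g.
#   pip install selenium jieba wordcloud imageio
# webdriver.Firefox() additionally needs geckodriver on the PATH, and the
# find_element_by_* calls below use the Selenium 3.x API, which is no longer
# available in current Selenium 4 releases.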

def get_douban_comments(url):
    # comments_list = [] # list of comments
    login_url = 'https://accounts.douban.com/login?source=movie'
    user_name = 'your_username'  # replace with your Douban username
    password = 'your_password'  # replace with your Douban password
    driver = webdriver.Firefox() # launch Firefox
    driver.get(login_url)
    driver.find_element_by_id('email').clear() # clear the input field
    driver.find_element_by_id('email').send_keys(user_name) # enter the username
    driver.find_element_by_id('password').clear()
    driver.find_element_by_id('password').send_keys(password) # enter the password
    captcha_field = input('Open the browser and type the CAPTCHA here: ') # fill in the CAPTCHA manually
    driver.find_element_by_id('captcha_field').send_keys(captcha_field)
    driver.find_element_by_class_name('btn-submit').click() # click the login button
    time.sleep(5) # wait for the post-login page to load
    driver.get(url) # navigate to the target page
    driver.implicitly_wait(3) # implicit wait of up to 3 seconds
    n = 1 # page counter
    count = 0 # comment counter
    # Note: the next time you build a word cloud, use a new file name or clear the
    # previous file first, because the file is opened in append mode.
    file = codecs.open("pjl_comment.txt",mode='a',encoding='utf-8')


    while True:
        try:
            comments_list = []  # comments on the current page
            results = driver.find_elements_by_class_name('comment')
            print("results:",len(results))
            for result in results:
                # print(result)
                # author = result.find_elements_by_tag_name('a')[1].text # author
                # vote = result.find_element_by_class_name('comment-vote').find_element_by_tag_name('span').text # number of upvotes
                # time0 = result.find_element_by_class_name('comment-info').find_elements_by_tag_name('span')[1].text # timestamp
                comment = result.find_element_by_tag_name('p').text # comment text
                print(comment)
                comments_list.append(comment+u'\n')
                count += 1
                print(u"Found comment #%d" % count)
            file.writelines(comments_list) # save this page before moving on, so the last page is not lost
            print(u'Page %d done!' % n)
            n += 1
            driver.find_element_by_class_name('next').click() # click "next page"
            time.sleep(2)
        except Exception as e: # typically raised when there is no "next" button on the last page
            print(e)
            break
    file.close()
    # with codecs.open('pjl_comment.txt','a',encoding='utf-8') as f:
    #     f.writelines(comments_list)
    # print(u"Found %d pages and %d comments!" % (n, count))


# Get all keywords and count how often each occurs
def get_all_keywords(file_name):
    word_lists = [] # every word, with repetitions
    with codecs.open(file_name,'r',encoding='utf-8') as f:
        lines = f.readlines() # one comment per line
        for line in lines:
            cut_list = list(jieba.cut(line))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists) # remove duplicates
    sort_count = []
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u"%d distinct keywords in total" % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w+u':'+str(word_lists.count(w))+u" times\n")
        print(u"%d---" % k + w+u":"+str(word_lists.count(w))+ u" times")
        k += 1
    with codecs.open('count_word.txt','w',encoding='utf-8') as f:
        f.writelines(sort_count)
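
# A shorter, equivalent frequency count (an illustrative sketch, not part of the
# original script; the function name get_all_keywords_fast is hypothetical).
# The count() calls in get_all_keywords rescan the whole word list once per
# distinct word; collections.Counter does the same tally in a single pass.
def get_all_keywords_fast(file_name):
    from collections import Counter
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        words = [w for line in f for w in jieba.cut(line)]  # tokenize every comment line
    counts = Counter(words)  # word -> number of occurrences
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(u"%s:%d times\n" % (w, c) for w, c in counts.most_common())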

def get_top_keywords(file_name):
    # Extract the top-weighted keywords with TextRank
    with codecs.open(file_name,'r',encoding='utf-8') as f:
        texts = f.read() # read the whole file as one string
        Result = analyse.textrank(texts,topK=20,withWeight=True,withFlag=True)
        n = 1
        for result in Result:
            print(u"%d:" % n)
            for C in result[0]: # result[0] is a (word, POS tag) pair
                print(C,u"  ")
            print(u"weight: "+ str(result[1])) # keyword weight
            n += 1


# Draw the word cloud
def draw_wordcloud():
    with codecs.open('pjl_comment.txt',encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = " ".join(jieba.cut(comment_text)) # join the jieba tokens into one space-separated string
    d = "E:\\pythonStudy_2\\machine-learning" # directory that holds this script's resources
    color_mask = imread(path.join(d, "tmp.png")) # read the background/mask image
    cloud = WordCloud(font_path=path.join(d,'simsun.ttc'),background_color='white',mask=color_mask,max_words=2000,max_font_size=40)
    word_cloud = cloud.generate(cut_text) # generate the word cloud
    word_cloud.to_file("pjl_cloud2.jpg")
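
# A possible variant (sketch only, not in the original script; the function name
# draw_wordcloud_from_textrank and the topK value are illustrative): feed the
# TextRank weights from jieba.analyse straight into
# WordCloud.generate_from_frequencies instead of generating from raw text.
def draw_wordcloud_from_textrank(file_name='pjl_comment.txt'):
    with codecs.open(file_name, encoding='utf-8') as f:
        texts = f.read()
    # withWeight=True returns (word, weight) pairs; dict() turns them into {word: weight}
    freqs = dict(analyse.textrank(texts, topK=200, withWeight=True))
    # same font path assumption as draw_wordcloud above
    cloud = WordCloud(font_path=path.join("E:\\pythonStudy_2\\machine-learning", 'simsun.ttc'),
                      background_color='white', max_words=200)
    cloud.generate_from_frequencies(freqs)
    cloud.to_file("pjl_cloud_textrank.jpg")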



if __name__ == '__main__':

    url = "https://movie.douban.com/subject/26752088/comments?status=P" # 我不是药神
    get_douban_comments(url)

    # file_name = 'pjl_comment.txt'
    # get_top_keywords(file_name)

    # draw_wordcloud()


Reposted from blog.csdn.net/qwertyuiop5rghar/article/details/83690722