JD Comment Scraper and Sentiment Analysis

Following on from the previous article,

we scrape the comments on JD.com products and run sentiment analysis on them.

1. Import the required libraries
# requests: HTTP requests and page fetching
import requests
# time: sleep between requests
import time
# random: generate shuffled random page numbers
import random
# re: extract fields from the page source with regular expressions
import re
import os
import csv
# SnowNLP: sentiment analysis
from snownlp import SnowNLP
# numpy: numerical computation
import numpy as np
# pandas: data wrangling and aggregation
import pandas as pd
# matplotlib: plotting / visualization
import matplotlib.pyplot as plt
# jieba.analyse: Chinese word segmentation and keyword extraction
import jieba.analyse
2. Scrape JD product comments
# Request headers sent with every request
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
            'Accept': '*/*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
            'Referer': 'https://item.jd.com/'
            }

# Cookie values copied from the browser session
cookie = {
    'unpl': 'V2_ZzNtbRUAQxIgChFQchtVUmJRRVxKVhATJ1tHV3scXAViA0FbclRCFX0URlRnGVoUZwcZXkJcRxVFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHgbVARnAhVVQ2dzEkU4dlF%2bGV0EZjMTbUNnAUEpAUBTfx9fSGQBGlxCVkQddDhHZHg%3d',
    '__jdv': '76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_ff16d3f5838f4cf090b7cb02141141b7|1573975311990',
    '__jdu': '239010690',
    'areaId': '6',
    'ipLoc-djd': '6-303-36781-0',
    'PCSYCityID': 'CN_140000_140100_140106',
    'shshshfpb': 'jqH3At%2FfU9SrlFcfQjnfgMw%3D%3D',
    'shshshfpa': 'jqH3At%2FfU9SrlFcfQjnfgMw%3D%3D',
    '__jda': '122270672.239010690.1573975311.1573975311.1573975312.1',
    '__jdc': '122270672',
    '3AB9D23F7A4B3C9B': 'WVUDLUVYFRMFVKWZOXHTGBDYUMV5HMLY2VYDBX3UYJL672OAJ7LWVIW6GONDLMS7TCDWMSZKMYEAZRERTJXKDOILGU',
    'shshshfp': 'd36a43b250efcf39d0b6add590016f0d',
    '__jdb': '122270672.5.239010690|1.1573975312',
    'shshshsID': '26d62b30ea6b57e4d6469f7de971d0bd_5_1573975729111',
    'JSESSIONID': '63E9CE05E36AD715BF1AEEFF241759B5.s1'

}

productIds = [100003150357, 10265477083, 100004751037, 34664250889, 100003052761, 100004917490]
# First part of the comment API URL
url1 = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv2511&productId='
# Second part of the URL
url2 = '&score=0&sortType=5&page='
# Third part of the URL
url3 = '&pageSize=10&isShadowSku=0&rid=0&fold=1'
# Sample 50 unique page numbers from 0-99 in random order
# and scrape those 50 comment pages
ran_num = random.sample(range(100), 50)


def jie_xi(url1, productId, url2, ran_num, url3):
    # Assemble the URL and crawl the pages in random order
    for i in ran_num:
        i = str(i)
        url = (url1+productId+url2+i+url3)
        r = requests.get(url=url, headers=headers, cookies=cookie)
        html = r.content   # .content returns bytes, i.e. the raw binary response
        print("Current page:", url, "status:", r)
        html = str(html, encoding="GBK")  # decode the fetched page (JD serves GBK)
        # file = open("page.txt", "w")  # optionally dump the decoded page to a txt file
        # file.writelines(html)
        # Extract the "content" field (the comment text) with a regular expression
        user_comment = re.findall(r',.*?"content":(.*?),', html)
        # usera = re.findall(r',.*?"creationTime":(.*?),', html)
        # userd = re.findall(r',.*?"referenceName":(.*?),', html)
        write_to_csv(user_comment, productId)
        # write_to_txt(user_comment, productId)
        print(user_comment)
        time.sleep(3)


def write_to_csv(content, id):
    file = 'D:/数据/评论' + id + '.csv'
    # newline='' prevents blank lines between rows
    f = open(file, 'a', encoding='utf-8', newline='')
    writer = csv.writer(f)
    for i in range(len(content)):
        writer.writerow([content[i]])
    f.close()


# Save the comments to a txt file
def write_to_txt(content, id):
    file = 'D:/数据/评论' + id + '.txt'
    f = open(file, 'a', encoding='utf-8')
    for i in range(len(content)):
        # one comment per line
        f.write(content[i] + '\n')
    f.close()


def run():
    for productId in productIds:
        time.sleep(3)
        productId = str(productId)
        jie_xi(url1, productId, url2, ran_num, url3)
run()

The comment data is not in the product page HTML itself, so parsing the page gets us nothing; we have to open the browser developer tools and inspect the page.

Click the Network tab and refresh the page.

Among the requests we find the one that returns the product's comment data.


From that request we can copy the header information for our own requests.

This is the key point: every product has a different id and every page a different page number, so the URL is split into several parts and reassembled for each product and page, as sketched below.
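For one product and one page, the three parts concatenate into the full comment API URL. The following is a minimal sketch with an example productId and page number; it also shows how the JSONP response could be parsed with the json module instead of regular expressions. The 'comments'/'content' key names follow the fields the regex above matches, so treat the exact response structure (and the vv2511 callback name) as assumptions.

# Assemble the comment API URL for one example productId and page
productId = '100004917490'   # example id taken from productIds above
page = '0'                   # example page number
url = url1 + productId + url2 + page + url3
print(url)
# https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv2511
#     &productId=100004917490&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1

# The response is JSONP: fetchJSON_comment98vv2511({...});
# strip the callback wrapper and parse the rest as JSON
import json

r = requests.get(url=url, headers=headers, cookies=cookie)
text = r.content.decode('GBK', errors='ignore')
body = text[text.find('(') + 1: text.rfind(')')]
data = json.loads(body)
# assumed structure: a "comments" list whose items carry a "content" field
comments = [c['content'] for c in data['comments']]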

3. Sentiment analysis
# Load the data; the CSV was written without a header row, so name the single column 'comment'
data = pd.read_csv('D:/数据/评论100004917490.csv', names=['comment'])

# Number of comments
num_comments = data.shape[0]

# Sentiment analysis: SnowNLP returns a score between 0 (negative) and 1 (positive)
def emotion(comment):
    s = SnowNLP(comment)
    return s.sentiments
data["emotion"] = data.comment.apply(emotion)

# Bucket the comments by sentiment score
def process():
    sum_sentiment = 0
    good_count = 0
    just_so_so_count = 0
    bad_count = 0
    for i in range(num_comments):
        if data.emotion.values[i] >= 0.8:
            good_count += 1
        elif data.emotion.values[i] >= 0.4:
            just_so_so_count += 1
        else:
            bad_count += 1
        sum_sentiment += data.emotion.values[i]
    print("---------- " + str(num_comments) + " comments in total ----------")
    print("---------- " + str(good_count) + " comments scored 0.8 or above ----------")
    print("---------- " + str(just_so_so_count) + " comments scored 0.4-0.8 ----------")
    print("---------- " + str(bad_count) + " comments scored below 0.4 ----------")
    print("average sentiment is {}".format(sum_sentiment / num_comments))

process()
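As a quick sanity check of what SnowNLP's sentiments score looks like, you can feed it a single sentence. The two example comments below are made up for illustration:

# Made-up comments to illustrate the 0-1 sentiment scale
print(SnowNLP('物流很快,电脑运行流畅,非常满意').sentiments)   # scores near 1 indicate positive sentiment
print(SnowNLP('屏幕有坏点,客服态度差,很失望').sentiments)     # scores near 0 indicate negative sentiment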


4. Build a word cloud

I have already covered this part in my other articles.

from os import path
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import jieba


def GetWordCloud():
    path_txt = 'D:/数据/评论100004917490.txt'
    path_img = "test.jpg"
    f = open(path_txt, 'r', encoding='utf-8').read()
    background_image = np.array(Image.open(path_img))
    # join() glues the segmented words back together with spaces
    cut_text = " ".join(jieba.cut(f))
    # mask = the background image; when mask is set, any width/height settings are ignored
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", mask=background_image).generate(cut_text)
    # Generate colors from the background image
    image_colors = ImageColorGenerator(background_image)
    # Show the word cloud recolored with the image colors
    plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    # Directory this script lives in
    d = path.dirname(__file__)
    # os.path.join(): join path components into one path
    wordcloud.to_file(path.join(d, "圣女的救济1.png"))
    plt.axis("off")
    plt.show()

GetWordCloud()
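Note that the write_to_txt call is commented out in jie_xi above, so the txt file read here may not exist yet. A minimal sketch for producing it from the CSV that was saved, assuming the single 'comment' column name used in section 3:

# Build the txt input for the word cloud from the saved CSV
comments = pd.read_csv('D:/数据/评论100004917490.csv', names=['comment'])
with open('D:/数据/评论100004917490.txt', 'w', encoding='utf-8') as f:
    for line in comments['comment']:
        f.write(str(line) + '\n')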


5. Compute precision, recall, and F-score
def calculation():
    # Compute precision, recall and F-score
    # Map each sentiment score to a class label (1, 2 or 3)
    def emotion_classification(comment):
        s = SnowNLP(comment)
        if s.sentiments >= 0.8:
            return 1
        elif s.sentiments >= 0.4:
            return 2
        else:
            return 3
    data["classification"] = data.comment.apply(emotion_classification)

    # Split into training and test sets
    from sklearn.model_selection import train_test_split
    # precision = correctly identified items / all identified items
    # recall    = correctly identified items / all items in the test set
    # F-score   = 2 * precision * recall / (precision + recall)
    data_class = data['classification']
    train, test = train_test_split(data_class, test_size=0.2)
    x = train.values
    y = test.values
    # Count how often each label (1, 2, 3) occurs
    count = pd.value_counts(data_class)
    print(count)
    '''
    1    449
    3     56
    2     48
    Name: classification, dtype: int64
    '''
    # Compute the metrics from the sampled training data
    count_sui_ji = pd.value_counts(x)
    accuracy_rate = count_sui_ji.values[0] / count_sui_ji.sum()
    recall = count_sui_ji.values[0] / data_class.shape[0]
    F = (accuracy_rate * recall * 2) / (accuracy_rate + recall)
    print("precision: {:.2f}%".format(accuracy_rate * 100))
    print("recall:    {:.2f}%".format(recall * 100))
    print("F-score:   {:.2f}%".format(F * 100))
calculation()
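For intuition, here is the same F-score arithmetic worked through with illustrative numbers close to the sample counts printed above (these figures are made up for the example, not actual results):

# Illustrative only: suppose 449 of 553 comments carry the majority label,
# and an 80% random sample is used as the training data
precision_example = (449 * 0.8) / (553 * 0.8)   # correctly identified / identified, about 0.81
recall_example = (449 * 0.8) / 553              # correctly identified / total, about 0.65
f_example = 2 * precision_example * recall_example / (precision_example + recall_example)
print(precision_example, recall_example, f_example)   # roughly 0.81, 0.65, 0.72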


6. Visualization
def plot():
    yun_xing_sum = 0
    jia_ge_sum = 0
    nei_cun_sum = 0
    xing_neng_sum = 0
    # Count how many comments mention each aspect:
    # 运行 (running speed), 价格 (price), 内存 (memory), 性能 (performance)
    for i in range(data.shape[0]):
        if '运行' in data.comment.values[i]:
            yun_xing_sum += 1
        if '价格' in data.comment.values[i]:
            jia_ge_sum += 1
        if '内存' in data.comment.values[i]:
            nei_cun_sum += 1
        if '性能' in data.comment.values[i]:
            xing_neng_sum += 1
    print(yun_xing_sum, jia_ge_sum, nei_cun_sum, xing_neng_sum)

    # Count positive mentions: aspect keyword plus a positive word
    # (快 fast, 满意 satisfied, 不错 good)
    yun_xing_z = 0
    jia_ge_z = 0
    nei_cun_z = 0
    xing_neng_z = 0
    for i in range(data.shape[0]):
        if "运行" in data.comment.values[i] and "快" in data.comment.values[i]:
            yun_xing_z += 1
        elif "价格" in data.comment.values[i] and "满意" in data.comment.values[i]:
            jia_ge_z += 1
        elif "内存" in data.comment.values[i] and "不错" in data.comment.values[i]:
            nei_cun_z += 1
        elif "性能" in data.comment.values[i] and "不错" in data.comment.values[i]:
            xing_neng_z += 1
    print(yun_xing_z, jia_ge_z, nei_cun_z, xing_neng_z)

    # Negative (non-positive) mentions = total mentions - positive mentions
    yun_xing_f = yun_xing_sum - yun_xing_z
    jia_ge_f = jia_ge_sum - jia_ge_z
    nei_cun_f = nei_cun_sum - nei_cun_z
    xing_neng_f = xing_neng_sum - xing_neng_z

    # Plot
    plt.style.use('ggplot')    # use a built-in style
    # print(plt.style.available)   # list all available styles
    # The next two lines make Chinese labels render correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.subplot(111)
    x = np.array([1, 2, 3, 4])
    y1 = np.array([yun_xing_z, jia_ge_z, nei_cun_z, xing_neng_z])
    y2 = np.array([yun_xing_f, jia_ge_f, nei_cun_f, xing_neng_f])

    plt.bar(x, y1, width=0.3, label="正面", color='r')        # positive
    plt.bar(x+0.3, y2, width=0.3, label="负面", color='b')    # negative
    plt.title("电脑性能评价结果分析", color='k')
    for a, b in zip(x, y1):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=11, color='k')
    for a, b in zip(x+0.3, y2):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=11, color='k')
    plt.xlabel('评价属性')   # aspect
    plt.ylabel('评价数量')   # number of comments
    plt.xticks(x + 0.15, ["运行", "价格", "内存", "性能"])
    plt.grid(False)
    plt.legend(ncol=2, loc='upper center')
    plt.show()
plot()




Reposted from blog.csdn.net/qq_44205272/article/details/103297735