版权声明:欢迎转载与留言提问 https://blog.csdn.net/qq_25439417/article/details/82592881
用kmeans对有标注少量文本做了个无监督分类,效果一般般。原因可能是
1.停用词不全面
2.kmeans等分类器对于高维度向量分类能力较差,可以考虑降维后再分类
3.部分词频在多个类型的文本中重复高频出现
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 13:23:31 2018
@author: Lenovo
"""
import jieba as jb
import numpy as np
import lightgbm as lgb
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from gensim import corpora,models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.similarities.docsim import Similarity
from sklearn.naive_bayes import MultinomialNB
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection (same signature, drop-in replacement).
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
# Expected CSV columns: content_id,content,subject,sentiment_value,sentiment_word
train_csv = pd.read_csv('train.csv')
test_csv = pd.read_csv('test_public.csv')
print(train_csv)

# Segmented documents accumulate here: all training rows first, then test rows.
corpus = []

# subject -> numeric label mapping used below:
#   0 价格    1 配置    2 操控    3 舒适性    4 油耗
#   5 动力    6 内饰    7 安全性  8 空间      9 外观
from keras.utils import np_utils
# Ordered list of the ten subject categories; a subject's numeric label is
# its position in this list (replaces the original ten-way if-chain).
typelist = ['价格','配置','操控','舒适性','油耗','动力','内饰','安全性','空间','外观']
_label_of = {subject: idx for idx, subject in enumerate(typelist)}

label = list()
for i in range(len(train_csv)):
    # Segment the review text with jieba and store it space-joined for
    # the whitespace-tokenizing vectorizer below.
    c = train_csv.content[i]
    w = ' '.join(jb.cut(c))
    corpus.append(w)
    l = train_csv.subject[i]
    # NOTE(review): like the original if-chain, an unrecognized subject is
    # silently skipped, which would misalign `label` with `corpus` —
    # confirm every row's subject is one of the ten categories.
    if l in _label_of:
        label.append(_label_of[l])
typelist = np.array(typelist)
# Append the segmented test documents after the training ones so a single
# vectorizer vocabulary covers both sets.  (Dead commented-out CountVectorizer
# / TfidfTransformer experiment removed.)
for i in range(len(test_csv)):
    c = test_csv.content[i]
    w = ' '.join(jb.cut(c))
    corpus.append(w)
# Stop words: punctuation (both full-width and ASCII variants) plus a handful
# of high-frequency function words that carry no topic signal for this task.
# A single list literal replaces 31 consecutive .append() calls.
stop_words = [
    ',', '、', '。', '!', ':', ';', '?',      # full-width punctuation
    '.', '!', ',', '?', ';',                      # ASCII punctuation
    '吧', '你', '我', '他', '貌似', '的', '了',
    '弄', '因为', '看', '应该',
    '(',
    '求', '手', '用', '听过', '开始', '到手',
    ')',
]
# TF-IDF over unigrams..trigrams.  The \b\w+\b token pattern keeps
# single-character tokens that the default pattern (two+ chars) would drop;
# min_df/max_df trim very rare and very common n-grams.
vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    stop_words=stop_words,
    min_df=25,
    ngram_range=(1, 3),
    max_df=0.8,
    norm='l2',
)
tfidf = vectorizer.fit_transform(corpus)
from sklearn.cluster import KMeans

# Cluster the TF-IDF matrix directly: KMeans accepts sparse input, and
# .toarray() on a large document-term matrix can exhaust memory for no gain.
clf = KMeans(n_clusters=10)
s = clf.fit(tfidf)

# NOTE(review): k_cla is never used below and lists '油耗' twice — confirm
# the intended cluster-to-subject mapping before relying on it.
k_cla=['油耗','价格','配置','舒适性','油耗','动力','内饰','安全性','空间','外观']

print(s)
print(clf.cluster_centers_)
print(clf.labels_)

# Inspect which ground-truth subjects landed in cluster 6.  Only the first
# len(train_csv) documents have labels; the original loop iterated over the
# full labels_ array (train + test) and would index past the end of
# train_csv.subject whenever a test document fell into cluster 6.
for i in range(len(train_csv)):
    if clf.labels_[i] == 6:
        print(train_csv.subject[i])