文章目录
一、sklearn20类新闻分类
有人可能会碰到20newsgroups数据集加载超时问题,我的解决办法是科学上网
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
# 特征抽取
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
if __name__ == '__main__':
    # Download (on first use) and load all 18k+ posts of the 20 Newsgroups corpus.
    news = fetch_20newsgroups(subset='all')
    # Raw documents and their newsgroup labels.
    X = news.data
    y = news.target
    # Hold out 30% of the posts for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
    # TF-IDF feature extraction: fit on training text only, then reuse the
    # same vocabulary/idf weights to transform the test split.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train)
    # Print the learned vocabulary.
    # NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and
    # removed in 1.2 -- get_feature_names_out() is the supported API.
    print(tfidf.get_feature_names_out())
    X_test = tfidf.transform(X_test)
    # Multinomial naive Bayes with Laplace smoothing (alpha=1.0).
    model = MultinomialNB(alpha=1.0)
    # (Removed the original `print(X_train.toarray())`: densifying the full
    # TF-IDF matrix (~13k docs x 100k+ terms) allocates tens of gigabytes.)
    model.fit(X_train, y_train)
    # Predict on the held-out split.
    y_pred = model.predict(X_test)
    # Mean accuracy on the test split.
    print('准确率=', model.score(X_test, y_test))
准确率= 0.832507958967103
二、垃圾邮件识别
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# 获得词汇列表
def createVocabList(dataSet):
    """Return the sorted list of unique tokens found across all documents.

    dataSet: iterable of documents, each a list of token strings.
    """
    vocab = set()
    for document in dataSet:
        # Accumulate every token seen so far (set union).
        vocab.update(document)
    # Natural (lexicographic) ordering makes vector positions deterministic.
    return sorted(vocab)
# 对邮件内容进行预处理
def textParse(bigString):
    """Tokenize raw email text into lowercase words longer than 2 characters.

    bigString: the full email body as one string.
    Returns a list of lowercase tokens with len > 2 (drops short noise words).

    Fix: split on r'\\W+' (one or more non-word characters). The original
    r'\\W*' can match the empty string, which on Python 3.7+ makes re.split
    break the text between every character, so no token survives the
    length filter and the function returns [].
    """
    import re
    tokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in tokens if len(tok) > 2]
# 词袋模型
def bag0fWords2Vec(vocabList, inputSet):
    """Convert a tokenized document into a bag-of-words count vector.

    vocabList: the full vocabulary (one slot per word, in list order).
    inputSet: the document's tokens.
    Returns a list of ints, same length as vocabList, where position i
    counts occurrences of vocabList[i] in the document.

    Fixes over the original:
    - unknown words are skipped instead of raising ValueError from
      list.index (robust when vectorizing documents the vocabulary
      was not built from);
    - a prebuilt word->index dict replaces the O(len(vocabList))
      list.index lookup per token.
    """
    index = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        pos = index.get(word)
        if pos is not None:
            # Increment the count at the word's vocabulary position.
            returnVec[pos] += 1
    return returnVec
# 读取邮件
def loaddata():
    """Load the email corpus from disk and vectorize it.

    Reads data/email/ham/1..25.txt and data/email/spam/1..25.txt, tokenizes
    each file with textParse, builds the shared vocabulary, and converts
    every document into a bag-of-words count vector.

    Returns (X, classList, vocabList):
    - X: list of count vectors (one per email),
    - classList: labels -- 1 for files under data/email/ham, 0 for files
      under data/email/spam. NOTE(review): this is inverted from the usual
      spam=1 convention (kept as the original code had it; only the overall
      accuracy is used downstream, which the inversion does not affect),
    - vocabList: the sorted vocabulary.

    Fixes over the original: files are opened with a context manager so
    handles are always closed, and errors='ignore' guards against the
    non-UTF-8 bytes present in this corpus, which crash a default-encoding
    read on some files.
    """
    docList = []
    classList = []
    num = 26  # emails are numbered 1.txt .. 25.txt in each folder
    for i in range(1, num):
        # Emails under data/email/ham -> label 1.
        with open('data/email/ham/%d.txt' % i, errors='ignore') as f:
            docList.append(textParse(f.read()))
        classList.append(1)
        # Emails under data/email/spam -> label 0.
        with open('data/email/spam/%d.txt' % i, errors='ignore') as f:
            docList.append(textParse(f.read()))
        classList.append(0)
    # Vocabulary built from ALL documents, so every token is in-vocabulary.
    vocabList = createVocabList(docList)
    X = [bag0fWords2Vec(vocabList, doc) for doc in docList]
    return X, classList, vocabList
if __name__ == '__main__':
    # Load the vectorized email corpus and its labels.
    X, y, vocabList = loaddata()
    # Hold out 30% of the emails for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
    # Multinomial naive Bayes with Laplace smoothing.
    classifier = MultinomialNB(alpha=1.0)
    classifier.fit(X_train, y_train)
    y_hat = classifier.predict(X_test)
    # Fraction of held-out emails classified correctly.
    print('accuracy =', accuracy_score(y_test, y_hat))
accuracy = 1.0