Naive Bayes Classification with R

Example: Filtering spam SMS messages ----

Step 1: Exploring and preparing the data ----


#read the sms data into the sms data frame
sms_raw <- read.csv("F:\\rwork\\Machine Learning with R (2nd Ed.)\\Chapter 04\\sms_spam.csv", stringsAsFactors = FALSE)

#examine the structure of the sms data
str(sms_raw)

#convert spam/ham to factor.
sms_raw$type <- factor( sms_raw$type )

#examine the type variable more carefully
str(sms_raw$type)
table(sms_raw$type)

#build a corpus using the text mining ('tm') package
library('tm')
sms_corpus <- VCorpus(VectorSource(sms_raw$text)) # build the corpus from the SMS message text
head(sms_corpus)

#examine the sms corpus
print(sms_corpus)
inspect(sms_corpus[1:2]) # summary of the first two messages

as.character(sms_corpus[[1]]) # view the full text of the first message
lapply(sms_corpus[1:2], as.character)

#clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower)) # lowercase everything; content_transformer() wraps tolower() so tm_map() can apply it

#show the difference between sms_corpus and corpus_clean
as.character(sms_corpus[[1]])
as.character(sms_corpus_clean[[1]])

# remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)

# remove stop words such as "to", "and", and "but"
# (stopwords() returns tm's built-in English stop word list)
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords())
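
If you are curious what exactly gets removed, a quick optional check of the built-in list looks like this:

# peek at the built-in English stop word list used above
head(stopwords(), 10)
length(stopwords())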

# remove punctuation
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)

# note: removePunctuation() simply deletes punctuation, which can glue words together
removePunctuation("hello...world")

# custom alternative: gsub() replaces any run of punctuation with a space
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) }
replacePunctuation("hello...world")

#illustration of word stemming
library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))#此函数将单词全部变为原形

# apply stemming to the entire corpus with stemDocument
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)

# strip the extra whitespace left behind by the earlier steps
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)

# compare the messages before and after cleaning
lapply(sms_corpus[1:3], as.character)
lapply(sms_corpus_clean[1:3], as.character)

#create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)

#alternative solution: create a document-term sparse matrix directly from the SMS corpus
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))

# sms_dtm and sms_dtm2 differ slightly, mainly because of how stop words are removed

#alternative solution: using custom stop words function ensures identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))
# overriding the default stop word handling with our own function makes the result match the tm_map() pipeline used for sms_dtm

#compare the result
sms_dtm
sms_dtm2
sms_dtm3
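
To see exactly which terms differ between the matrices, the term vectors can be compared directly (a quick optional check, not part of the book's code):

# terms kept by the tm_map pipeline but not by the control-list shortcut, and vice versa
head(setdiff(Terms(sms_dtm), Terms(sms_dtm2)))
head(setdiff(Terms(sms_dtm2), Terms(sms_dtm)))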

#creating training and test datasets
# one portion for training, one for testing; the messages are already in random order, so a sequential split is reasonable
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test  <- sms_dtm[4170:5559, ]

#also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels  <- sms_raw[4170:5559, ]$type
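
If the rows were not already shuffled, a seeded random split would serve the same purpose. A sketch under that assumption (train_index and the seed are hypothetical, not used elsewhere in this post):

# randomly pick 75% of the row indices for training
set.seed(123)  # hypothetical seed, only for reproducibility of this sketch
train_index <- sample(nrow(sms_dtm), floor(0.75 * nrow(sms_dtm)))
# sms_dtm[train_index, ] and sms_dtm[-train_index, ] would then replace the slices above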

#check that the proportion of spam is similar
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels)) # proportion of ham/spam in each split

#word cloud visualization
library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)
# word cloud of the cleaned corpus: only words appearing at least 50 times, with the most frequent words placed in the centre rather than at random

#subset the raw data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham  <- subset(sms_raw, type == "ham")

wordcloud(spam$text, max.words = 40, scale = c(3, 0.5)) # up to the 40 most common words in each group
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

# an aside: removeSparseTerms() drops terms appearing in fewer than 0.1% of training messages (this object is overwritten by the findFreqTerms() approach below)
sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train

#indicator features for frequent words
# find the words appearing at least 5 times in the training data
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)

#create DTMs with only the frequent terms
# keep only the columns for those frequent terms in both the training and test DTMs
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]

#convert counts to a "Yes"/"No" indicator
# any count greater than 0 becomes "Yes"
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
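
A quick check of what the helper returns (illustration only):

convert_counts(c(0, 1, 3))  # "No" "Yes" "Yes"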

#apply() convert_counts() to columns of train/test data
# MARGIN = 2 applies the function column by column, turning every non-zero count into "Yes"
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
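
The result is an ordinary character matrix of "Yes"/"No" values with one column per frequent term; a quick peek (the exact columns depend on your term set):

# inspect a small corner of the converted training matrix
sms_train[1:3, 1:5]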

Step 2: Training a model on the data ----

# train the naive Bayes classifier on the training data
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
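
The fitted naiveBayes object stores the class counts and one conditional table per word feature; inspecting a couple of them is optional but instructive:

# class distribution of the training labels and the first two per-word tables
sms_classifier$apriori
sms_classifier$tables[1:2]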

Step 3: Evaluating model performance ----

sms_test_pred <- predict(sms_classifier, sms_test)
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

In the cross table above, 6 + 34 = 40 of the 1,390 test messages were misclassified.
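
The overall accuracy can also be computed directly from the predictions, which is equivalent to reading it off the table:

# proportion of test messages classified correctly
mean(sms_test_pred == sms_test_labels)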

Step 4: Improving model performance ----

sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

Setting the Laplace estimator (laplace) to 1 improves the model: now 6 + 32 = 38 messages are misclassified.
Going from 40 errors down to 38 looks like a small change, but it is a meaningful improvement given that the model's accuracy was already quite good.
In the end the model correctly classifies more than 97% of the messages as spam or ham.

I am a third-year undergraduate working through this book on my own; if you spot any mistakes, corrections are very welcome!
(Feel free to message me for the dataset.)
