# 用R怎样做词云 (How to make a word cloud in R)

# Split the plotting device into 1 row x 2 columns so the two word clouds
# drawn later in the script appear side by side.
par(mfrow = c(1, 2))

# NOTE(review): hard-coded, machine-specific working directory. Kept so the
# relative file name below still resolves; prefer running the script from the
# data directory (or a project root) instead of calling setwd().
setwd("C:/Users/11565/Desktop")

# Read the raw messages; keep strings as character vectors, not factors.
se_raw <- read.csv("垃圾邮件.csv", header = TRUE, stringsAsFactors = FALSE)

# Fail early with a clear message if the columns this script uses are missing.
stopifnot(all(c("type", "text") %in% names(se_raw)))
str(se_raw)

# Recode the label column as a factor: "ham" -> "ham1", "spam" -> "spam1".
# Any value other than "ham"/"spam" becomes NA.
se_raw$type <- factor(
  se_raw$type,
  levels = c("ham", "spam"),
  labels = c("ham1", "spam1")
)

# Class counts, class proportions, and proportions as percentages (1 decimal).
table(se_raw$type)
prop.table(table(se_raw$type))
round(prop.table(table(se_raw$type)) * 100, digits = 1)
# Text cleaning with the tm package.
# library() stops immediately if tm is not installed. require() only returns
# FALSE, which would silently skip the cleaning steps and make the script fail
# later with a misleading "object not found" error.
library(tm)

# tm works on a corpus, so wrap the text column: one document per message.
se_corpus <- Corpus(VectorSource(se_raw$text))

# Convert every document to lower case. tolower() is a plain character
# function, so it is wrapped in content_transformer(); passing it to tm_map()
# unwrapped can strip the document class in some tm versions.
s1 <- tm_map(se_corpus, content_transformer(tolower))

# Continue cleaning from the lower-cased corpus.
# Remove digits.
corpus_x11 <- tm_map(s1, removeNumbers)

# Remove stop words (tm's default stop-word list).
corpus_x11 <- tm_map(corpus_x11, removeWords, stopwords())

# Remove punctuation.
corpus_x11 <- tm_map(corpus_x11, removePunctuation)
# Compare the first three documents after cleaning and before cleaning.
inspect(corpus_x11[1:3])
inspect(se_corpus[1:3])

# wordcloud() comes from the wordcloud package, which is not loaded anywhere
# in the visible script; load it before the first call.
library(wordcloud)

# First word cloud: drawn directly from the cleaned corpus (text input, not a
# matrix).
wordcloud(corpus_x11)

# Build the term-document matrix (one row per term, one column per document)
# and expand it to an ordinary dense matrix. A sparse term-document matrix
# object cannot be passed to wordcloud() directly.
# The original converted with as.matrix() twice; once is enough.
se_dtm <- as.matrix(TermDocumentMatrix(corpus_x11))
m <- se_dtm

# Row sums give each term's total frequency; sort from most to least frequent.
v <- sort(rowSums(m), decreasing = TRUE)
class(v) # check the type of v (a named numeric vector)

# Terms in the same order as their frequencies.
word <- names(v)

# stringsAsFactors = FALSE keeps `word` as character on R versions before 4.0
# as well.
d <- data.frame(word = word, freq = v, stringsAsFactors = FALSE)

# Second word cloud: first argument is the words, second their frequencies.
wordcloud(d$word, d$freq)

# 猜你喜欢