代码:
library(tm)
library(ggplot2)
# Folders holding each message category, relative to the working directory.
# The trailing slash is kept on every path; spam.path is later joined to file
# names by direct string concatenation.
spam.path     <- "data/spam/"
spam2.path    <- "data/spam_2/"
easyham.path  <- "data/easy_ham/"
easyham2.path <- "data/easy_ham_2/"
hardham.path  <- "data/hard_ham/"
hardham2.path <- "data/hard_ham_2/"
# Extract the body of a single e-mail file.
#
# The body is everything after the first empty line, which separates the
# headers from the message text.
#
# Args:
#   path: Path to one message file.
#
# Returns:
#   A length-one character vector with the body lines joined by "\n".
#   Returns "" when the file has no empty separator line or nothing follows it.
get.msg<-function(path){
  # Declaring the input encoding makes readLines() re-encode the raw bytes
  # into the session encoding, so later calls such as tolower() do not fail
  # with "invalid multibyte string" on non-ASCII messages.
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con, warn = FALSE)

  blank <- which(text == "")
  # No separator, or the separator is the last line: there is no body to
  # return, and building an index sequence here would raise an error.
  if (length(blank) == 0L || blank[1] >= length(text)) {
    return("")
  }

  msg <- text[(blank[1] + 1L):length(text)]
  paste(msg, collapse = "\n")
}
# List the files in the spam folder, leaving out the entry named "cmds"
# (presumably a helper file rather than a message -- confirm against the data).
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]
# Read every message body. vapply() guarantees a character vector named by
# file, including for an empty folder, where sapply() would return a list.
all.spam <- vapply(spam.docs,
                   function(p) get.msg(paste0(spam.path, p)),
                   character(1))
#head(all.spam)
# Build a term-document matrix from a character vector of documents.
#
# Args:
#   doc.vec: Vector of document texts, one element per document.
#
# Returns:
#   The object produced by TermDocumentMatrix() for the corpus built from
#   doc.vec.
get.tdm<-function(doc.vec){
  corpus <- Corpus(VectorSource(doc.vec))
  # Options requested for term extraction: stop-word removal, punctuation and
  # number stripping, and a minimum document frequency of 2.
  # NOTE(review): newer tm releases appear to document `bounds` instead of
  # `minDocFreq` -- confirm this option is honoured by the installed version.
  tdm.options <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(corpus, tdm.options)
}
# Term-document matrix over all spam message bodies.
spam.tdm <- get.tdm(all.spam)
报错:
Error in tolower(txt) : invalid multibyte string 6
解决办法:
在 R 控制台(而非系统终端)里输入:
# Switch every locale category for the current R session.
# NOTE(review): "us" looks like a Windows-style locale name; on Linux/macOS a
# name such as "C" is presumably needed instead -- confirm on the target OS.
Sys.setlocale(category = "LC_ALL", locale = "us")