《机器学习:实用案例解析》第三章 (3)

代码:

library(tm)
library(ggplot2)
# Relative directories holding the e-mail message files. Each path keeps its
# trailing slash because file names are appended to it by plain string
# concatenation.
spam.path     <- "data/spam/"
spam2.path    <- "data/spam_2/"
easyham.path  <- "data/easy_ham/"
easyham2.path <- "data/easy_ham_2/"
hardham.path  <- "data/hard_ham/"
hardham2.path <- "data/hard_ham_2/"

# Read one message file and return its body as a single string.
#
# The body is everything after the first empty line of the file; the body
# lines are joined with "\n".
#
# Args:
#   path:     path of the message file to read.
#   encoding: encoding the file is declared to be in; the text is re-encoded
#             from it while reading. Declaring it keeps non-ASCII bytes from
#             reaching later string functions (e.g. tolower()) as invalid
#             multibyte strings.
#
# Returns:
#   A length-one character vector. It is "" when the file contains no empty
#   line, or when nothing follows the first empty line.
get.msg <- function(path, encoding = "latin1") {
  con <- file(path, open = "rt", encoding = encoding)
  # Release the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)

  text <- readLines(con, warn = FALSE)

  # NA when there is no empty line at all (including an empty file).
  first.blank <- which(text == "")[1]
  if (is.na(first.blank) || first.blank == length(text)) {
    return("")
  }

  msg <- text[seq(first.blank + 1, length(text))]
  paste(msg, collapse = "\n")
}

# List the files in the spam directory, excluding the entry named "cmds".
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]

# Read the body of every listed file. vapply() always yields a character
# vector named by file name, even for an empty directory, where sapply()
# would return an empty list instead.
all.spam <- vapply(spam.docs,
                   function(p) get.msg(paste0(spam.path, p)),
                   character(1))
# head(all.spam)

# Build a term-document matrix from a vector of documents.
#
# Args:
#   doc.vec: vector of document texts; it is wrapped in a VectorSource and
#            turned into a Corpus.
#
# Returns:
#   The object produced by TermDocumentMatrix() for that corpus.
get.tdm <- function(doc.vec) {
  # Options handed to TermDocumentMatrix().
  # NOTE(review): `minDocFreq` may not be a control option recognised by the
  # installed tm version (newer releases document `bounds` instead) - confirm,
  # otherwise this entry could have no effect.
  tdm.options <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
}

# Term-document matrix over all spam message bodies.
spam.tdm <- get.tdm(doc.vec = all.spam)

报错:

 Error in tolower(txt) : invalid multibyte string 6 

解决办法:

在 R 控制台(而不是操作系统的终端)里输入下面的命令。注意 `locale = "us"` 是 Windows 下的写法,在 Linux/macOS 上可尝试改用 `"C"`:

Sys.setlocale(category = "LC_ALL", locale = "us")
发布了5 篇原创文章 · 获赞 1 · 访问量 1185

猜你喜欢

转载自blog.csdn.net/m0_37325106/article/details/79582515