## Problem 1 (Logistic Regression and KNN)
library(ISLR)
attach(Auto)

## Create a binary response: "Above"/"Below" the mean of mpg.
MPG <- rep("Below", length(mpg))
MPG[mpg > mean(mpg)] <- "Above"
Auto <- data.frame(Auto, MPG)

## a. 80% training set
set.seed(1)
train <- sample(nrow(Auto), nrow(Auto) * 0.8)

## Logistic regression. `data = Auto` is supplied explicitly so that
## `subset = train` indexes rows of Auto instead of relying on the
## attached search path.
## NOTE: the factor levels sort as c("Above", "Below"), so the binomial
## glm models P(MPG = "Below") -- remember this when thresholding
## predicted probabilities later.
glm.fit <- glm(as.factor(MPG) ~ weight + displacement + horsepower + acceleration,
               family = "binomial", data = Auto, subset = train)
summary(glm.fit)
## Weight, displacement and horsepower appear statistically significant
## (p-values below the significance level); acceleration does not.

## Odds ratios
exp(coef(glm.fit))
## weight:       1.001645e+00
## displacement: 1.013375e+00
## horsepower:   1.052735e+00
## acceleration: 1.079167e+00
###############################
## b. Evaluate on the held-out 20%.
Auto.test <- Auto[-train, ]
test.truevalue <- MPG[-train]

## Predicted probabilities on the test set. Because the factor levels
## are c("Above", "Below"), type = "response" returns P(MPG = "Below").
glm.probs <- predict(glm.fit, Auto.test, type = "response")

## 0.5 threshold: a probability above 0.5 means "Below" (the modeled
## level). The original code assigned "Above" here, inverting every
## label and reporting ~10% accuracy (exactly 1 minus the true value).
glm.pred <- rep("Above", nrow(Auto.test))
glm.pred[glm.probs > .5] <- "Below"
table(glm.pred, test.truevalue)   ## confusion matrix
mean(glm.pred == test.truevalue)  ## overall prediction accuracy
##           test.truevalue
## glm.pred  Above Below
##   Above      33     6
##   Below       2    38
## accuracy: (33 + 38) / 79 = 0.8987342
#################
## c. 5-fold cross-validation with logistic regression
set.seed(1)
k <- 5
folds <- sample(1:k, nrow(Auto), replace = TRUE)
accuracy <- rep(0, k)
for (i in 1:k) {
  glm.fit1 <- glm(as.factor(MPG) ~ weight + displacement + horsepower + acceleration,
                  family = "binomial", data = Auto[folds != i, ])
  Auto.test <- Auto[folds == i, ]
  ## type = "response" gives P(MPG = "Below"): "Below" is the second
  ## factor level, which binomial glm treats as the "success" class.
  glm.probs1 <- predict(glm.fit1, Auto.test, type = "response")
  ## Threshold at 0.5: probability above 0.5 => "Below". The original
  ## assigned "Above" here, inverting the labels and reporting a mean
  ## accuracy of 0.1191508 (i.e. 1 minus the true accuracy).
  glm.pred1 <- rep("Above", nrow(Auto.test))
  glm.pred1[glm.probs1 > .5] <- "Below"
  test.truevalue <- MPG[folds == i]
  accuracy[i] <- mean(glm.pred1 == test.truevalue)
}
mean(accuracy)
## accuracy: about 0.88 (= 1 - 0.1191508 reported by the inverted code)
## 5-fold cross-validation with 10-nearest-neighbours
library(class)
## Standardize every predictor (mean 0, sd 1) so that no single
## variable dominates the Euclidean distance used by knn().
std.weight       <- scale(weight)
std.displacement <- scale(displacement)
std.horsepower   <- scale(horsepower)
std.acceleration <- scale(acceleration)
Input.standard <- cbind(std.weight, std.displacement,
                        std.horsepower, std.acceleration)
accuracy <- rep(0, 5)
set.seed(1)
folds <- sample(1:5, nrow(Input.standard), replace = TRUE)
k <- 10
for (fold in 1:5) {
  in.test <- folds == fold
  ## Fit and predict in one call; per-fold accuracy against the truth.
  knn.pred <- knn(Input.standard[!in.test, ],
                  Input.standard[in.test, ],
                  MPG[!in.test], k = 10)
  accuracy[fold] <- mean(knn.pred == MPG[in.test])
}
mean(accuracy)
## 10-NN accuracy: 0.9126281
### 5-fold cross-validation with 10-NN gives the higher accuracy,
### so it outperforms 5-fold CV with logistic regression here.
#######################################
# Problem 2 (Classification Tree)
library(tree)
attach(OJ)

## a. 700-observation training split; fit a classification tree for
## Purchase on the training rows only.
set.seed(1)
train <- sample(nrow(OJ), 700)
OJ.train <- OJ[train, ]
OJ.test <- OJ[-train, ]
Purchase.test <- Purchase[-train]
OJ.tree <- tree(Purchase ~ ., OJ, subset = train)
summary(OJ.tree)
plot(OJ.tree)
text(OJ.tree, pretty = 0)
## The fitted tree has 10 terminal nodes.
##########
## b. Ten-fold cross-validation on the training fit to choose the
## optimal tree size, then produce the pruned tree.
cv.model <- cv.tree(OJ.tree, K = 10, FUN = prune.misclass)
cv.model
## Optimal tree size: 10 (smallest $dev).
prune.model <- prune.tree(OJ.tree, best = 10)
plot(prune.model)
text(prune.model, pretty = 0)
## The pruned and unpruned trees are identical: best = 10 matches the
## full tree's terminal-node count, so nothing is removed.
###################
## c. Predict on the test data and show the confusion matrix.
prunetree.pred <- predict(prune.model, OJ.test, type = "class")
table(prunetree.pred, Purchase.test)
##               Purchase.test
## prunetree.pred  CH  MM
##             CH 202  49
##             MM  21  98
mean(prunetree.pred == Purchase.test)  ## 0.8108108
## prediction accuracy: (202 + 98) / (202 + 49 + 21 + 98)
##                    = 300 / 370 = 0.8108108
###################
## d. random forest
library(randomForest)
oj.test=OJ[-train,"Purchase"] ## true value in the testing dataset
set.seed(1)
rf.OJ=randomForest(Purchase~.,data=OJ,subset=train,mtry=4,importance=TRUE) ## mtry = sqrt of 17
yhat.rf = predict(rf.OJ,newdata=OJ[-train,]) ## predicted value
table(oj.test,yhat.rf)
rf.accuracy=mean(oj.test==yhat.rf) ## 0.8054054
rf.accuracy
## yhat.rf
## oj.test CH MM
## CH 201 22
## MM 50 97
## random forest accuracy is 0.8054054 which is lower than tree with 10-fold cross validation.