## Problem 1 (Logistic Regression and KNN)
library(ISLR)
attach(Auto)

## Create a binary response: "Above"/"Below" the mean of mpg.
MPG <- rep("Below", length(mpg))
MPG[mpg > mean(mpg)] <- "Above"
Auto <- data.frame(Auto, MPG)

## a. 80% training set
set.seed(1)
train <- sample(nrow(Auto), nrow(Auto) * 0.8)

## Logistic regression. `data = Auto` is supplied explicitly so that
## `subset = train` indexes rows of Auto instead of relying on the
## attached search path.
## NOTE: the factor levels sort as c("Above", "Below"), so the binomial
## glm models P(MPG = "Below") -- remember this when thresholding
## predicted probabilities later.
glm.fit <- glm(as.factor(MPG) ~ weight + displacement + horsepower + acceleration,
               family = "binomial", data = Auto, subset = train)
summary(glm.fit)
## Weight, displacement and horsepower appear statistically significant
## (p-values below the significance level); acceleration does not.

## Odds ratios
exp(coef(glm.fit))
## weight:       1.001645e+00
## displacement: 1.013375e+00
## horsepower:   1.052735e+00
## acceleration: 1.079167e+00
###############################
## b. Evaluate on the held-out 20%.
Auto.test <- Auto[-train, ]
test.truevalue <- MPG[-train]

## Predicted probabilities on the test set. Because the factor levels
## are c("Above", "Below"), type = "response" returns P(MPG = "Below").
glm.probs <- predict(glm.fit, Auto.test, type = "response")

## 0.5 threshold: a probability above 0.5 means "Below" (the modeled
## level). The original code assigned "Above" here, inverting every
## label and reporting ~10% accuracy (exactly 1 minus the true value).
glm.pred <- rep("Above", nrow(Auto.test))
glm.pred[glm.probs > .5] <- "Below"
table(glm.pred, test.truevalue)   ## confusion matrix
mean(glm.pred == test.truevalue)  ## overall prediction accuracy
##           test.truevalue
## glm.pred  Above Below
##   Above      33     6
##   Below       2    38
## accuracy: (33 + 38) / 79 = 0.8987342
#################
## c. 5-fold cross-validation with logistic regression
set.seed(1)
k <- 5
folds <- sample(1:k, nrow(Auto), replace = TRUE)
accuracy <- rep(0, k)
for (i in 1:k) {
  glm.fit1 <- glm(as.factor(MPG) ~ weight + displacement + horsepower + acceleration,
                  family = "binomial", data = Auto[folds != i, ])
  Auto.test <- Auto[folds == i, ]
  ## type = "response" gives P(MPG = "Below"): "Below" is the second
  ## factor level, which binomial glm treats as the "success" class.
  glm.probs1 <- predict(glm.fit1, Auto.test, type = "response")
  ## Threshold at 0.5: probability above 0.5 => "Below". The original
  ## assigned "Above" here, inverting the labels and reporting a mean
  ## accuracy of 0.1191508 (i.e. 1 minus the true accuracy).
  glm.pred1 <- rep("Above", nrow(Auto.test))
  glm.pred1[glm.probs1 > .5] <- "Below"
  test.truevalue <- MPG[folds == i]
  accuracy[i] <- mean(glm.pred1 == test.truevalue)
}
mean(accuracy)
## accuracy: about 0.88 (= 1 - 0.1191508 reported by the inverted code)
## 5-fold cross-validation with 10-nearest-neighbours
library(class)
## Standardize every predictor (mean 0, sd 1) so that no single
## variable dominates the Euclidean distance used by knn().
std.weight       <- scale(weight)
std.displacement <- scale(displacement)
std.horsepower   <- scale(horsepower)
std.acceleration <- scale(acceleration)
Input.standard <- cbind(std.weight, std.displacement,
                        std.horsepower, std.acceleration)
accuracy <- rep(0, 5)
set.seed(1)
folds <- sample(1:5, nrow(Input.standard), replace = TRUE)
k <- 10
for (fold in 1:5) {
  in.test <- folds == fold
  ## Fit and predict in one call; per-fold accuracy against the truth.
  knn.pred <- knn(Input.standard[!in.test, ],
                  Input.standard[in.test, ],
                  MPG[!in.test], k = 10)
  accuracy[fold] <- mean(knn.pred == MPG[in.test])
}
mean(accuracy)
## 10-NN accuracy: 0.9126281
### 5-fold cross-validation with 10-NN gives the higher accuracy,
### so it outperforms 5-fold CV with logistic regression here.
#######################################
# Problem 2 (Classification Tree)
library(tree)
attach(OJ)

## a. 700-observation training split; fit a classification tree for
## Purchase on the training rows only.
set.seed(1)
train <- sample(nrow(OJ), 700)
OJ.train <- OJ[train, ]
OJ.test <- OJ[-train, ]
Purchase.test <- Purchase[-train]
OJ.tree <- tree(Purchase ~ ., OJ, subset = train)
summary(OJ.tree)
plot(OJ.tree)
text(OJ.tree, pretty = 0)
## The fitted tree has 10 terminal nodes.
##########
## b. Ten-fold cross-validation on the training fit to choose the
## optimal tree size, then produce the pruned tree.
cv.model <- cv.tree(OJ.tree, K = 10, FUN = prune.misclass)
cv.model
## Optimal tree size: 10 (smallest $dev).
prune.model <- prune.tree(OJ.tree, best = 10)
plot(prune.model)
text(prune.model, pretty = 0)
## The pruned and unpruned trees are identical: best = 10 matches the
## full tree's terminal-node count, so nothing is removed.
###################
## c. Predict on the test data and show the confusion matrix.
prunetree.pred <- predict(prune.model, OJ.test, type = "class")
table(prunetree.pred, Purchase.test)
##               Purchase.test
## prunetree.pred  CH  MM
##             CH 202  49
##             MM  21  98
mean(prunetree.pred == Purchase.test)  ## 0.8108108
## prediction accuracy: (202 + 98) / (202 + 49 + 21 + 98)
##                    = 300 / 370 = 0.8108108
###################
## d. random forest
library(randomForest)
oj.test=OJ[-train,"Purchase"] ## true value in the testing dataset
set.seed(1)
rf.OJ=randomForest(Purchase~.,data=OJ,subset=train,mtry=4,importance=TRUE) ## mtry = sqrt of 17
yhat.rf = predict(rf.OJ,newdata=OJ[-train,]) ## predicted value
table(oj.test,yhat.rf)
rf.accuracy=mean(oj.test==yhat.rf) ## 0.8054054
rf.accuracy
## yhat.rf
## oj.test CH MM
## CH 201 22
## MM 50 97
## random forest accuracy is 0.8054054 which is lower than tree with 10-fold cross validation.