- 数据变换
- 数据标准化
#scale 返回矩阵
> cbind(as.data.frame(scale(iris[,1:4],center=TRUE,scale=TRUE)),iris$Species)
Sepal.Length Sepal.Width Petal.Length Petal.Width iris$Species
1 -0.89767388 1.01560199 -1.33575163 -1.3110521482 setosa
2 -1.13920048 -0.13153881 -1.33575163 -1.3110521482 setosa
3 -1.38072709 0.32731751 -1.39239929 -1.3110521482 setosa
4 -1.50149039 0.09788935 -1.27910398 -1.3110521482 setosa
5 -1.01843718 1.24503015 -1.33575163 -1.3110521482 setosa
6 -0.53538397 1.93331463 -1.16580868 -1.0486667950 setosa
7 -1.50149039 0.78617383 -1.33575163 -1.1798594716 setosa
8 -1.01843718 0.78617383 -1.27910398 -1.3110521482 setosa
9 -1.74301699 -0.36096697 -1.33575163 -1.3110521482 setosa
- 主成分分析
princomp(x,cor=FALSE)
summary()用于查看结果(各主成分的标准差、方差贡献率及累计贡献率)
返回对象的 scores 元素保存主成分得分(样本在各主成分上的坐标)
- 独热编码
download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
"./german.data")
data <- read.table("./german.data")
> str(data)
'data.frame': 1000 obs. of 21 variables:
$ V1 : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
$ V2 : int 6 48 12 42 24 36 24 36 12 30 ...
$ V3 : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
$ V4 : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
$ V5 : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
$ V6 : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
$ V7 : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
$ V8 : int 4 2 2 2 3 2 3 2 2 4 ...
$ V9 : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
$ V10: Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
$ V11: int 4 2 3 4 4 4 4 2 4 2 ...
$ V12: Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
$ V13: int 67 22 49 45 53 35 53 35 61 28 ...
$ V14: Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
$ V15: Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
$ V16: int 2 1 1 1 2 1 1 1 1 2 ...
$ V17: Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
$ V18: int 1 1 2 2 2 2 1 1 1 1 ...
$ V19: Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
$ V20: Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
$ V21: int 1 2 1 1 2 1 1 1 1 2 ...
> head(data)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
1 A11 6 A34 A43 1169 A65 A75 4 A93 A101 4 A121 67 A143 A152 2 A173 1 A192 A201 1
2 A12 48 A32 A43 5951 A61 A73 2 A92 A101 2 A121 22 A143 A152 1 A173 1 A191 A201 2
3 A14 12 A34 A46 2096 A61 A74 2 A93 A101 3 A121 49 A143 A152 1 A172 2 A191 A201 1
4 A11 42 A32 A42 7882 A61 A74 2 A93 A103 4 A122 45 A143 A153 1 A173 2 A191 A201 1
5 A11 24 A33 A40 4870 A61 A73 3 A93 A101 4 A124 53 A143 A153 2 A173 2 A191 A201 2
6 A14 36 A32 A46 9055 A65 A73 2 A93 A101 4 A124 35 A143 A153 1 A172 2 A192 A201 1
> modelData <- model.matrix(~V1 + V2 + V5 + V8 + V21, data)#model.matrix函数不改变数值型变量;k个水平的类别型变量会展开为k-1个0/1哑变量列(第一个水平作为基准),因此2个水平的类别型变量只生成1列0/1
> head(modelData)
(Intercept) V1A12 V1A13 V1A14 V2 V5 V8 V21
1 1 0 0 0 6 1169 4 1
2 1 1 0 0 48 5951 2 2
3 1 0 0 1 12 2096 2 1
4 1 0 0 0 42 7882 2 1
5 1 0 0 0 24 4870 3 2
6 1 0 0 1 36 9055 2 1
> NNModel <- neuralnet(V21 ~ V1A12 + V1A13 + V1A14 + V2 + V5 + V8, modelData)
## 变量哑变量处理
> #caret包中有一个dummyVars函数,可用变量虚拟化批处理。dummyVars()函数的使用格式为:
> # dummyVars(formula, data, sep = ".", levelsOnly = FALSE, fullRank = FALSE, ...)
> customers <- data.frame(
+ id=c(10,20,30,40,50),
+ gender=c('male','female','female','male','female'),
+ mood=c('happy','sad','happy','sad','happy'),
+ outcome=c(1,1,0,0,0))
> customers
id gender mood outcome
1 10 male happy 1
2 20 female sad 1
3 30 female happy 0
4 40 male sad 0
5 50 female happy 0
> library(caret)
> # 哑变量处理
> dmy <- dummyVars(" ~ .", data = customers)
> trsf <- data.frame(predict(dmy, newdata = customers))
> print(trsf)
id gender.female gender.male mood.happy mood.sad outcome
1 10 0 1 1 0 1
2 20 1 0 0 1 1
3 30 1 0 1 0 0
4 40 0 1 0 1 0
5 50 1 0 1 0 0
>
- 缺失值处理
> data(sleep,package='VIM')
> str(sleep)
'data.frame': 62 obs. of 10 variables:
$ BodyWgt : num 6654 1 3.38 0.92 2547 ...
$ BrainWgt: num 5712 6.6 44.5 5.7 4603 ...
$ NonD : num NA 6.3 NA NA 2.1 9.1 15.8 5.2 10.9 8.3 ...
$ Dream : num NA 2 NA NA 1.8 0.7 3.9 1 3.6 1.4 ...
$ Sleep : num 3.3 8.3 12.5 16.5 3.9 9.8 19.7 6.2 14.5 9.7 ...
$ Span : num 38.6 4.5 14 NA 69 27 19 30.4 28 50 ...
$ Gest : num 645 42 60 25 624 180 35 392 63 230 ...
$ Pred : int 3 3 1 5 3 4 1 4 1 1 ...
$ Exp : int 5 1 1 2 5 4 1 5 2 1 ...
$ Danger : int 3 3 1 3 4 4 1 4 1 1 ...
> #图形化探究缺失值
> library(mice)
> md.pattern(sleep)
BodyWgt BrainWgt Pred Exp Danger Sleep Span Gest Dream NonD
42 1 1 1 1 1 1 1 1 1 1 0
2 1 1 1 1 1 1 0 1 1 1 1
3 1 1 1 1 1 1 1 0 1 1 1
9 1 1 1 1 1 1 1 1 0 0 2
2 1 1 1 1 1 0 1 1 1 0 2
1 1 1 1 1 1 1 0 0 1 1 2
2 1 1 1 1 1 0 1 1 0 0 3
1 1 1 1 1 1 1 0 1 0 0 3
0 0 0 0 0 4 4 4 12 14 38
>
> sleep[!complete.cases(sleep),]
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.0 NA NA 3.3 38.6 645 3 5 3
3 3.385 44.5 NA NA 12.5 14.0 60 1 1 1
4 0.920 5.7 NA NA 16.5 NA 25 5 2 3
13 0.550 2.4 7.6 2.7 10.3 NA NA 2 1 2
14 187.100 419.0 NA NA 3.1 40.0 365 5 5 5
19 1.410 17.5 4.8 1.3 6.1 34.0 NA 1 2 1
20 60.000 81.0 12.0 6.1 18.1 7.0 NA 1 1 1
21 529.000 680.0 NA 0.3 NA 28.0 400 5 5 5
24 207.000 406.0 NA NA 12.0 39.3 252 1 4 1
26 36.330 119.5 NA NA 13.0 16.2 63 1 1 1
30 100.000 157.0 NA NA 10.8 22.4 100 1 1 1
31 35.000 56.0 NA NA NA 16.3 33 3 5 4
35 0.122 3.0 8.2 2.4 10.6 NA 30 2 1 1
36 1.350 8.1 8.4 2.8 11.2 NA 45 3 1 3
41 250.000 490.0 NA 1.0 NA 23.6 440 5 5 5
47 4.288 39.2 NA NA 12.5 13.7 63 2 2 2
53 14.830 98.2 NA NA 2.6 17.0 150 5 5 5
55 1.400 12.5 NA NA 11.0 12.7 90 2 2 2
56 0.060 1.0 8.1 2.2 10.3 3.5 NA 3 1 2
62 4.050 17.0 NA NA NA 13.0 38 3 1 1
> nrow(sleep[complete.cases(sleep),])
[1] 42
> nrow(sleep[!complete.cases(sleep),])
[1] 20
>
> is.na(sleep$Dream)
[1] TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[25] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[49] FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
[61] FALSE TRUE
> sleep[is.na(sleep$Dream),]
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.0 NA NA 3.3 38.6 645 3 5 3
3 3.385 44.5 NA NA 12.5 14.0 60 1 1 1
4 0.920 5.7 NA NA 16.5 NA 25 5 2 3
14 187.100 419.0 NA NA 3.1 40.0 365 5 5 5
24 207.000 406.0 NA NA 12.0 39.3 252 1 4 1
26 36.330 119.5 NA NA 13.0 16.2 63 1 1 1
30 100.000 157.0 NA NA 10.8 22.4 100 1 1 1
31 35.000 56.0 NA NA NA 16.3 33 3 5 4
47 4.288 39.2 NA NA 12.5 13.7 63 2 2 2
53 14.830 98.2 NA NA 2.6 17.0 150 5 5 5
55 1.400 12.5 NA NA 11.0 12.7 90 2 2 2
62 4.050 17.0 NA NA NA 13.0 38 3 1 1
>
> table(is.na(sleep$Dream))
FALSE TRUE
50 12
> mean(is.na(sleep$Dream))
[1] 0.1935484
> 12/62
[1] 0.1935484
> sum(is.na(sleep$Dream))
[1] 12
> mean(!complete.cases(sleep))
[1] 0.3225806
> 20/62
[1] 0.3225806
>
> library(VIM)
Loading required package: colorspace
Loading required package: data.table
data.table 1.10.4.3
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
VIM is ready to use.
Since version 4.0.0 the GUI is in its own package VIMGUI.
Please use the package to use the new (and old) GUI.
Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
Attaching package: ‘VIM’
The following object is masked from ‘package:DMwR’:
kNN
The following object is masked from ‘package:datasets’:
sleep
> aggr(sleep,prop=TRUE,numbers=TRUE)
>
> #方式一:直接删除含缺失值的行(只用完整观测)
> sleeps <- sleep
> cor(na.omit(sleeps))
BodyWgt BrainWgt NonD Dream Sleep
BodyWgt 1.00000000 0.95584875 -0.3936373 -0.07488845 -0.3428373
BrainWgt 0.95584875 1.00000000 -0.3867947 -0.07427740 -0.3370815
NonD -0.39363729 -0.38679474 1.0000000 0.51824287 0.9676730
Dream -0.07488845 -0.07427740 0.5182429 1.00000000 0.7171864
Sleep -0.34283732 -0.33708151 0.9676730 0.71718643 1.0000000
Span 0.46982146 0.62938940 -0.3722345 -0.26834006 -0.3824462
Gest 0.71434413 0.73353206 -0.6061048 -0.40893177 -0.6144743
Pred 0.09588524 -0.01538017 -0.3526558 -0.39795310 -0.4047155
Exp 0.40563880 0.32318968 -0.5802789 -0.50363338 -0.6213578
Danger 0.25932512 0.15093686 -0.5346247 -0.57194862 -0.6043029
Span Gest Pred Exp Danger
BodyWgt 0.46982146 0.71434413 0.09588524 0.4056388 0.25932512
BrainWgt 0.62938940 0.73353206 -0.01538017 0.3231897 0.15093686
NonD -0.37223446 -0.60610477 -0.35265576 -0.5802789 -0.53462471
Dream -0.26834006 -0.40893177 -0.39795310 -0.5036334 -0.57194862
Sleep -0.38244618 -0.61447431 -0.40471545 -0.6213578 -0.60430286
Span 1.00000000 0.64638866 -0.16973575 0.3157456 0.01468596
Gest 0.64638866 1.00000000 0.09079823 0.5734727 0.30623551
Pred -0.16973575 0.09079823 1.00000000 0.6256876 0.92731729
Exp 0.31574564 0.57347265 0.62568764 1.0000000 0.78980702
Danger 0.01468596 0.30623551 0.92731729 0.7898070 1.00000000
> cor(sleep,use='complete.obs')
BodyWgt BrainWgt NonD Dream Sleep
BodyWgt 1.00000000 0.95584875 -0.3936373 -0.07488845 -0.3428373
BrainWgt 0.95584875 1.00000000 -0.3867947 -0.07427740 -0.3370815
NonD -0.39363729 -0.38679474 1.0000000 0.51824287 0.9676730
Dream -0.07488845 -0.07427740 0.5182429 1.00000000 0.7171864
Sleep -0.34283732 -0.33708151 0.9676730 0.71718643 1.0000000
Span 0.46982146 0.62938940 -0.3722345 -0.26834006 -0.3824462
Gest 0.71434413 0.73353206 -0.6061048 -0.40893177 -0.6144743
Pred 0.09588524 -0.01538017 -0.3526558 -0.39795310 -0.4047155
Exp 0.40563880 0.32318968 -0.5802789 -0.50363338 -0.6213578
Danger 0.25932512 0.15093686 -0.5346247 -0.57194862 -0.6043029
Span Gest Pred Exp Danger
BodyWgt 0.46982146 0.71434413 0.09588524 0.4056388 0.25932512
BrainWgt 0.62938940 0.73353206 -0.01538017 0.3231897 0.15093686
NonD -0.37223446 -0.60610477 -0.35265576 -0.5802789 -0.53462471
Dream -0.26834006 -0.40893177 -0.39795310 -0.5036334 -0.57194862
Sleep -0.38244618 -0.61447431 -0.40471545 -0.6213578 -0.60430286
Span 1.00000000 0.64638866 -0.16973575 0.3157456 0.01468596
Gest 0.64638866 1.00000000 0.09079823 0.5734727 0.30623551
Pred -0.16973575 0.09079823 1.00000000 0.6256876 0.92731729
Exp 0.31574564 0.57347265 0.62568764 1.0000000 0.78980702
Danger 0.01468596 0.30623551 0.92731729 0.7898070 1.00000000
>
> identical(na.omit(sleeps),sleep[complete.cases(sleep),])
[1] FALSE
> #两者的数据内容相同;结果为FALSE是因为na.omit()会给返回值附加"na.action"属性(记录被删除的行),而identical()会连同属性一起比较
>
>
> #方式二:用中心趋势值填补(数值型变量用中位数,类别型变量用众数)
> library(DMwR)
> mapply(median,sleep,na.rm=TRUE)
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred
3.3425 17.2500 8.3500 1.8000 10.4500 15.1000 79.0000 3.0000
Exp Danger
2.0000 2.0000
> centralImputation(sleeps)[which(!complete.cases(sleep)),]
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.0 8.35 1.8 3.30 38.6 645 3 5 3
3 3.385 44.5 8.35 1.8 12.50 14.0 60 1 1 1
4 0.920 5.7 8.35 1.8 16.50 15.1 25 5 2 3
13 0.550 2.4 7.60 2.7 10.30 15.1 79 2 1 2
14 187.100 419.0 8.35 1.8 3.10 40.0 365 5 5 5
19 1.410 17.5 4.80 1.3 6.10 34.0 79 1 2 1
20 60.000 81.0 12.00 6.1 18.10 7.0 79 1 1 1
21 529.000 680.0 8.35 0.3 10.45 28.0 400 5 5 5
24 207.000 406.0 8.35 1.8 12.00 39.3 252 1 4 1
26 36.330 119.5 8.35 1.8 13.00 16.2 63 1 1 1
30 100.000 157.0 8.35 1.8 10.80 22.4 100 1 1 1
31 35.000 56.0 8.35 1.8 10.45 16.3 33 3 5 4
35 0.122 3.0 8.20 2.4 10.60 15.1 30 2 1 1
36 1.350 8.1 8.40 2.8 11.20 15.1 45 3 1 3
41 250.000 490.0 8.35 1.0 10.45 23.6 440 5 5 5
47 4.288 39.2 8.35 1.8 12.50 13.7 63 2 2 2
53 14.830 98.2 8.35 1.8 2.60 17.0 150 5 5 5
55 1.400 12.5 8.35 1.8 11.00 12.7 90 2 2 2
56 0.060 1.0 8.10 2.2 10.30 3.5 79 3 1 2
62 4.050 17.0 8.35 1.8 10.45 13.0 38 3 1 1
> centralImputation(sleeps)[which(is.na(sleep$Dream)==TRUE),'Dream',drop=FALSE]
Dream
1 1.8
3 1.8
4 1.8
14 1.8
24 1.8
26 1.8
30 1.8
31 1.8
47 1.8
53 1.8
55 1.8
62 1.8
>
> #方式三:用k近邻(kNN)填补,根据最相似的若干观测估计缺失值
> library(DMwR)
> knnImputation(sleep)[which(is.na(sleep$Dream)==TRUE),'Dream',drop=FALSE]
Dream
1 1.7484445
3 2.5797456
4 2.1686672
14 0.8080861
24 1.6971290
26 2.6147850
30 2.3707976
31 1.0049226
47 2.5278459
53 0.7370004
55 2.2017654
62 3.4317903