# 1. 创建数据框
# Build the example data frame `leadership`: 5 rows, 10 columns.
# q4 and q5 each contain missing values (NA).
# Note: the variable `date` shares its name with base::date(); calls such as
# date() still resolve to the function.
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
# stringsAsFactors = FALSE keeps date/country/gender as character vectors.
leadership <- data.frame(
  manager, date, country, gender, age, q1, q2, q3, q4, q5,
  stringsAsFactors = FALSE
)
# 2. 创建新变量的三种方法(由已有变量变换得到)
# 2.1 直接用 $ 赋值
# Create derived columns by assigning to `mydata$<name>` directly.
mydata <- data.frame(x1 = c(2, 2, 6, 4), x2 = c(3, 4, 2, 8))
mydata$sumx <- mydata$x1 + mydata$x2
# NOTE(review): despite its name, `minx` holds the row-wise mean of x1 and x2.
mydata$minx <- (mydata$x1 + mydata$x2) / 2
# 2.2 使用 attach()/detach()
# Same derivation with attach(): the columns x1 and x2 become visible by bare
# name, but the results must still be assigned to `mydata$...` explicitly.
# Caution: attach() puts a copy of the data frame on the search path, so
# always pair it with detach(); with() or transform() avoid this pitfall.
attach(mydata)
mydata$sumx <- x1 + x2
# NOTE(review): despite its name, `minx` holds the row-wise mean of x1 and x2.
mydata$minx <- (x1 + x2) / 2
detach(mydata)
# 2.3 使用 transform()
# Recommended approach: transform() evaluates each expression with the
# columns of `mydata` in scope and returns the data frame with the new
# columns `sumx` and `means` added.
mydata <- transform(
  mydata,
  sumx = x1 + x2,
  means = (x1 + x2) / 2
)
# 3. 变量的重编码(创建新的变量)
# An age of 99 is implausibly high and probably a data-entry error, so recode
# it as missing. `leadership$age == 99` is a logical vector (one TRUE/FALSE
# per row) that is used as the assignment mask.
leadership$age[leadership$age == 99] <- NA

# Recode the numeric age into a categorical `agecat` column. Rows whose age
# is NA match none of the conditions and therefore keep agecat = NA.
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 & leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
# 第二种方法:使用 within()
# Recode age into `agecat` using within(): inside the braces the columns of
# `leadership` can be read and assigned by bare name, and the modified data
# frame is returned.
leadership <- within(leadership, {
  agecat <- NA # start with all-missing so unmatched rows stay NA
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})
# 4. 变量的重命名
# 4.1 Rename columns by hand in the interactive data editor. fix() needs an
# interactive session, so it is skipped when the script is run in batch mode.
if (interactive()) {
  fix(leadership)
}
# 4.2 Rename by position: overwrite the name of the second column.
names(leadership)[2] <- "testDate"
# 4.3 使用 plyr 包
library(plyr)
# plyr::rename() takes a named character vector of the form
# c(old_name = "new_name"); columns not listed keep their names.
leadership <- rename(
  leadership,
  c(manager = "managerID", date = "testDate")
)
# 5. 缺失值
# Missing-value predicates. Each one is vectorised and returns a logical
# vector; each needs an argument (calling them with none is an error).
is.na(c(1, NA, NaN))         # NA = "not available"; TRUE for both NA and NaN
is.infinite(c(1, Inf, -Inf)) # TRUE for Inf and -Inf only
is.nan(c(1, NA, 0 / 0))      # NaN = "not a number"; FALSE for plain NA
# 6. 日期值
# as.Date() parses ISO "yyyy-mm-dd" strings without a format argument.
strdates <- as.Date(c("2007-06-22", "2004-02-13"))

# When a format is given it must match the input exactly, separators
# included: these strings use "-", so the format is "%Y-%m-%d" (a "/" format
# would silently yield NA).
a <- c("2007-06-22", "2004-02-13")
date <- as.Date(a, "%Y-%m-%d")

Sys.Date() # today's date as a Date object
date()     # current date and time as a character string

# "%a" formats a date as its abbreviated weekday name (locale-dependent).
format(as.Date("1990-10-27"), format = "%a")

# Interval between two dates, expressed in days.
difftime(strdates[1], strdates[2], units = "days")
# Convert the date column to Date class; the "%m/%d/%y" format reads
# month/day/two-digit-year strings.
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
# 7. 类型转换
# 判断:is.numeric()、is.character()
# 转换:as.numeric()、as.vector()
# 8. 数据排序
# Sort rows by gender (ascending), then by age (descending) within gender.
# The sort keys are taken from the data frame's own columns rather than from
# same-named vectors in the workspace, which can be out of date with respect
# to the data frame (e.g. after values in leadership$age were recoded).
a <- leadership[order(leadership$gender, -leadership$age), ]
# 9. 数据集的合并
# 9.1 向数据框中添加列
# Inner join on the shared key column "ID": only rows whose ID occurs in
# both data frames are kept.
total <- merge(dataframeA, dataframeB, by = "ID")

# Column-bind side by side with no key matching: both data frames need the
# same number of rows, and rows are paired purely by position.
total <- cbind(dataframeA, dataframeB)
# 9.2 向数据框中添加行
# Stack the rows of the two data frames. If their columns differ, first add
# the missing columns (filled with NA) so that both have the same columns.
total <- rbind(dataframeA, dataframeB)
# 10. 数据集取子集
# 10.1 选入变量
# Three ways to keep only a set of columns.
# (1) By position: columns 6 through 10.
newdata <- leadership[, 6:10]
# (2) By name.
newdata <- leadership[c("q1", "q2", "q3", "q4", "q5")]
# (3) By generated names: paste0() builds "q1" ... "q5".
cc <- paste0("q", 1:5)
newdata <- leadership[cc]
# 10.2 剔除变量
# Logical mask over the column names: TRUE where the name is "q3" or "q4",
# FALSE everywhere else.
myvars <- names(leadership) %in% c("q3", "q4")
# Negating the mask keeps every column except q3 and q4.
newdata <- leadership[!myvars]
# 如果已知 q3、q4 分别是第 8、9 个变量:
# Negative indices exclude columns by position: drop columns 8 and 9.
newdata <- leadership[-c(8, 9)]
# Assigning NULL to a column deletes it from the data frame.
newdata$q2 <- NULL
# 10.3 选入观测
# Select rows by position: the first three observations.
newdata <- leadership[1:3, ]

# Select rows by condition: gender "M" and age above 30. which() turns the
# logical mask into row numbers and drops NA results, so a missing age cannot
# produce an all-NA row in the output.
newdata <- leadership[which(leadership$gender == "M" & leadership$age > 30), ]

# Select rows by date range. The column is converted to Date class so that
# the comparisons are chronological rather than string-based.
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
A <- as.Date("2009-01-01")
B <- as.Date("2009-12-31")
# Both bounds are exclusive: dates strictly after A and strictly before B.
newda <- leadership[which(A < leadership$date & leadership$date < B), ]
# 10.4 使用 subset()
# subset() filters rows with a condition and picks columns with `select`;
# column names are written bare inside the call.
# Rows with age >= 35 or age < 24, keeping columns q1 to q4.
newdata <- subset(
  leadership,
  age >= 35 | age < 24,
  select = c(q1, q2, q3, q4)
)
# Rows with gender "M" and age above 25, keeping the run of columns from
# gender through q4.
newdata <- subset(
  leadership,
  gender == "M" & age > 25,
  select = gender:q4
)
# 10.5 随机抽样
# Draw 3 distinct row numbers at random (replace = FALSE means no repeats);
# index the data frame with the result to obtain the sampled rows.
# seq_len() is used instead of 1:nrow() so an empty data frame yields an
# empty index set rather than c(1, 0).
sample(seq_len(nrow(leadership)), 3, replace = FALSE)