(1)请首先剔除数据中实际出发时间(dep_time)缺失的观测值,然后生成一个新的变量dep_interval,用来将数据中的实际出发时间分为上午(6:01-12:00)、下午(12:01-18:00)、晚上(18:01-24:00)和凌晨(0:01-6:00)四组;再按每一年每一月每一天内的每个dep_interval分组(year,month,day,dep_interval),计算每一组的平均到达延误时间、平均到达的机场数量以及到达延误的方差。
# Summarise arrival delay per (year, month, day, dep_interval) for flights
# that have a recorded departure time.
#
# dep_time is assumed to be clock time stored as an HHMM integer
# (e.g. 601 means 6:01) -- TODO confirm against the data source.
# dep_interval codes:
#   0 = early morning (0:01-6:00)    1 = morning (6:01-12:00)
#   2 = afternoon (12:01-18:00)      3 = evening (18:01-24:00)
alter_flights <- flights %>%
  filter(!is.na(dep_time)) %>%
  mutate(
    # Subtract 1 before the integer division so the upper bound of each slot
    # (600, 1200, 1800, 2400) stays in that slot instead of opening the next.
    dep_interval = (dep_time - 1) %/% 600
  ) %>%
  group_by(year, month, day, dep_interval) %>%
  summarise(
    # Mean arrival delay, ignoring flights without a recorded arrival delay
    ave_arr_delay = mean(arr_delay, na.rm = TRUE),
    # Number of distinct destination airports reached within the group
    ave_dest = n_distinct(dest),
    # Sample variance of the arrival delay within the group
    var_arr_delay = var(arr_delay, na.rm = TRUE)
  )
(2)选取平均到达延误在30分钟之内同时平均到达数量不低于50个的小组,并按照平均到达延误由大到小排列,最后输出平均延误最严重的前十个观测值;输出中包括(year,month,day),并只保留名称中含有dep和arr的变量。
# Ten groups with the largest mean arrival delay among those whose mean delay
# is at most 30 minutes and that reach at least 50 distinct destinations.
alter_flights %>%
  filter(ave_arr_delay <= 30, ave_dest >= 50) %>%
  arrange(desc(ave_arr_delay)) %>%
  head(10) %>%
  # Keep the date columns plus every variable whose name contains "dep" or "arr"
  select(year, month, day, contains("dep"), contains("arr"))
Practice referring to non-syntactic names in the following data frame by:
Extracting the variable called 1.
Plotting a scatterplot of 1 vs 2.
Creating a new column called 3 which is 2 divided by 1.
Renaming the columns to one, two and three.
# Tibble whose column names `1` and `2` are non-syntactic, so they have to be
# quoted with backticks (or given as strings) wherever they are referenced.
annoying <- tibble(
`1` = 1:10,
# `2` is computed from the `1` column defined just above, plus random normal noise
`2` = `1` * 2 + rnorm(length(`1`))
)
##(1) Extract the column named 1: three equivalent spellings
annoying$`1`
annoying$"1"
annoying[["1"]]
#annoying[[`1`]] is invalid: inside [[ ]] the backticked `1` is evaluated as an
# object name rather than taken as a column name
##(2) Scatterplot with column 1 on the x axis and column 2 on the y axis
ggplot(annoying, aes(`1`, `2`)) +
geom_point()
##(3) New column 3 = column 2 divided by column 1
# The mutate() result is not assigned, so `annoying` is unchanged by this line
mutate(annoying, `3` = `2` / `1`)
# This assignment is what actually stores column 3 in `annoying`
annoying[["3"]] <- annoying[["2"]] / annoying[["1"]]
##(4) Rename the three columns to syntactic names (new_name = old_name)
annoying <- rename(annoying, one = `1`, two = `2`, three = `3`)
In Australia, most of the default values are valid, except that the date format is “(d)d/mm/yyyy”, meaning that January 2, 2006 is written as 02/01/2006.
Create a new locale object that encapsulates the settings for the types of file Australians read most commonly.
# Locale for Australian files: every default is kept and only the date format
# is set to day/month/four-digit year.
au_locale <- locale(date_format = "%d/%m/%Y")
# Check: 2 January 2006 written the Australian way
"02/01/2006" %>%
  parse_date(locale = au_locale)
#> [1] "2006-01-02"
Compute the rate for table2. You will need to perform four operations:
Extract the number of TB cases per country per year.
Extract the matching population per country per year.
Divide cases by population, and multiply by 10000.
Store back in the appropriate place.
# Rate of cases per 10,000 people for each country and year, returned in the
# same long layout as table2 (country, year, type, count).
t2_cases_per_cap <- table2 %>%
  # One row per country-year, with `cases` and `population` side by side
  pivot_wider(names_from = type, values_from = count) %>%
  # Keep only the four long-format columns, storing the rate in `count`
  transmute(
    country,
    year,
    type = "cases_per_cap",
    count = cases / population * 10000
  )
# Store the rate back next to the original rows
table2 %>%
  bind_rows(t2_cases_per_cap) %>%
  arrange(country, year, type, count)
Widen this table with a new column to uniquely identify each value?
# Example data: "Phillip Woods" has two "age" rows, so (name, key) alone does
# not identify a single value.
people <- tibble(
  name = c(
    "Phillip Woods", "Phillip Woods", "Phillip Woods",
    "Jessica Cordero", "Jessica Cordero"
  ),
  key = c("age", "height", "age", "age", "height"),
  value = c(45, 186, 50, 37, 156)
)
# Number the repeated (name, key) combinations so each value gets a unique
# identifier, then spread the names into columns.
people %>%
  group_by(name, key) %>%
  mutate(obs = seq_len(n())) %>%
  pivot_wider(names_from = name, values_from = value)
Write both_na(), a function that takes two vectors of the same length and returns the number of positions that have an NA in both vectors.
# Count the positions at which both `x` and `y` are NA.
# `x` and `y` are expected to have the same length.
both_na <- function(x, y) {
  missing_in_both <- is.na(x) & is.na(y)
  sum(missing_in_both)
}
# Only the first position is NA in both vectors
both_na(
  x = c(NA, NA, 1, 2),
  y = c(NA, 1, NA, 2)
)
#> [1] 1
Implement a fizzbuzz() function. It takes a single number as input. If the number is divisible by three, it returns “fizz”. If it’s divisible by five it returns “buzz”. If it’s divisible by three and five, it returns “fizzbuzz”. Otherwise, it returns the number. Make sure you first write working code before you create the function.
# Classic fizzbuzz for a single number.
#
# x: a numeric vector of length one.
# Returns "fizzbuzz" when x is divisible by both 3 and 5, "fizz" when only by
# 3, "buzz" when only by 5, and otherwise x converted to a character string
# (so the return type is always character).
fizzbuzz <- function(x) {
  # Input validation: exactly one numeric value
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))

  by_three <- x %% 3 == 0
  by_five <- x %% 5 == 0

  if (by_three && by_five) {
    return("fizzbuzz")
  }
  if (by_three) {
    return("fizz")
  }
  if (by_five) {
    return("buzz")
  }
  as.character(x)
}
#(1)
# Number of rows per species. The column is referenced by its bare name so
# that dplyr's data masking applies and the result column is called `Species`.
iris %>%
  count(Species)
#(2)
# Give every row of iris a rule-based label, New.Species:
#   Petal.Length < 1.5                -> "setosa"
#   Sepal.Length + Sepal.Width > 9.0  -> "versicolor"
#   Petal.Width < 1.75                -> "virginica"
#   otherwise                         -> one of the three species, drawn at random
# The rules are tried in this order; the seed makes the random draws repeatable.
set.seed(100)
New.Species <- vector("character", nrow(iris))
# seq_len(nrow(iris)) instead of a hard-coded 1:150, so the loop follows the data
for (i in seq_len(nrow(iris))) {
  New.Species[[i]] <-
    if (iris$Petal.Length[[i]] < 1.5) {
      "setosa"
    } else if (iris$Sepal.Length[[i]] + iris$Sepal.Width[[i]] > 9.0) {
      "versicolor"
    } else if (iris$Petal.Width[[i]] < 1.75) {
      "virginica"
    } else {
      # No rule matched: draw one species with equal probability
      sample(c("setosa", "virginica", "versicolor"), 1, prob = c(1/3, 1/3, 1/3))
    }
}
# Direct assignment instead of mutate(New.Species = New.Species): inside
# mutate() that name would resolve to an existing column of the same name
# (e.g. when this section is re-run) rather than to the vector built above.
iris[["New.Species"]] <- New.Species
#(3)
# For each true species, the share of rows whose rule-based label
# (New.Species) equals the true label (Species).
iris_p <- iris %>%
  group_by(Species) %>%
  summarise(pin_rate = mean(Species == New.Species))
view(iris_p)
# Sum of the per-species rates
iris_p %>%
  pull(pin_rate) %>%
  sum()
Write a function that turns (e.g.) a vector c("a", "b", "c") into the string "a, b, and c". This function needs to handle four cases:
n == 0: an empty string, e.g. "".
n == 1: the original vector, e.g. "a".
n == 2: return the two elements separated by “and”, e.g. "a and b".
n > 2: return the first n - 1 elements separated by commas, and the last element separated by a comma and “and”, e.g. "a, b, and c".
# Join the elements of `x` into an English-style list such as "a, b, and c".
#
# x:     vector of items to join.
# delim: separator appended to every element except the last when there are
#        more than two elements (default ",").
# Returns "" for an empty `x`, `x` itself for one element, "a and b" for two
# elements (no delimiter), and "a, b, and c" style output otherwise.
str_commasep <- function(x, delim = ",") {
  n <- length(x)
  if (n == 0) {
    return("")
  }
  if (n == 1) {
    return(x)
  }
  if (n == 2) {
    # Two elements are joined by " and " only; `delim` is not used here
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }
  # Attach the delimiter directly to every element but the last (str_c's
  # default sep = "" means no space is inserted before the delimiter)
  leading <- str_c(x[-n], delim)
  # The final element is introduced by "and"
  closing <- str_c("and", x[[n]], sep = " ")
  # Glue all pieces together with single spaces
  str_c(c(leading, closing), collapse = " ")
}
在数据集sim1中生成新的变量new,低于或等于x的中位数记为“low x”, 高于中位数记为“high x”
# Compare a single value `a` with a threshold `b`.
# Returns "high x" when a is strictly greater than b, otherwise "low x".
judge_median <- function(a, b) {
  if (a <= b) {
    "low x"
  } else {
    "high x"
  }
}
# Label each row of sim1 by comparing x with the median of x, calling the
# scalar helper judge_median() once per element. Columns are referenced by
# bare name inside mutate() rather than through sim1$x.
sim1 %>%
  mutate(
    new = map2_chr(x, median(x), judge_median)
  )
# Vectorised alternative: ifelse() produces the same labels without a helper
sim1 %>%
  mutate(
    new = ifelse(x > median(x), "high x", "low x")
  )
# Nesting ifelse() extends the idea to three classes. Template only: it is kept
# as a comment because it needs a data frame with a `popularity` column, which
# is not defined at this point.
# ifelse(df$popularity <= 3, "low", ifelse(df$popularity <= 6, "middle", "high"))
What are the five most common words in sentences?
# Five most frequent words in `sentences`, ignoring case.
sentences %>%
  # Split every sentence into its words and flatten to one character vector
  str_extract_all(boundary("word")) %>%
  unlist() %>%
  str_to_lower() %>%
  tibble(word = .) %>%
  # Tally each word, most frequent first
  count(word, sort = TRUE) %>%
  head(5)
将下列数据分别按三类画在并排的三张子图上,并在每张子图上画出各类点的质心(颜色为蓝色,size = 4)
# Simulated points in three classes. The mean vectors and `z` are recycled
# along the 120 rows, so rows cycle through class "a" (means 2, 1),
# "b" (means 0, 2) and "c" (means 4, 1).
df <- data.frame(
  x = rnorm(120, c(2, 0, 4)),
  y = rnorm(120, c(1, 2, 1)),
  z = letters[1:3]
)
# Centroid of each class = per-class mean of x and y. Using bare column names
# in the formula lets aggregate() name the result columns z, x, y directly,
# so no renaming of auto-generated columns is needed.
centroids <- aggregate(cbind(x, y) ~ z, data = df, FUN = mean)
# One panel per class, side by side, with the class centroid drawn on top of
# the points in blue at size 4, as the exercise requires.
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_point(data = centroids, color = "blue", size = 4) +
  facet_grid(~z)