若干题目

（1）请首先剔除数据中实际出发时间（dep_time）缺失的观测值，然后生成一个新的变量 dep_interval，用来将数据中的实际出发时间分为上午（6:01-12:00）、下午（12:01-18:00）、晚上（18:01-24:00）和凌晨（0:01-6:00）四组；再按每一年每一月每一天内的每个 dep_interval 分组（year, month, day, dep_interval），计算每一组的平均到达延误时间、平均到达的机场数量以及到达延误的方差。

# Summarise arrival delay for every (year, month, day, dep_interval) group of
# flights that have a recorded departure time.
# NOTE(review): assumes `dep_time` is clock time encoded as HHMM in 1..2400
# (e.g. 600 = 6:00), as the exercise text implies -- confirm against the data.
alter_flights <- flights %>%
  # Rows without a dep_time are excluded, as the exercise requires.
  filter(!is.na(dep_time)) %>%
  mutate(
    # Right-closed bins, so a boundary time belongs to the earlier slot:
    # (0, 600] early morning, (600, 1200] morning,
    # (1200, 1800] afternoon, (1800, 2400] evening.
    # Plain `dep_time %/% 600` would push 600/1200/1800/2400 into the next
    # slot and create a fifth group for 2400.
    dep_interval = cut(
      dep_time,
      breaks = c(0, 600, 1200, 1800, 2400),
      labels = c("early_morning", "morning", "afternoon", "evening")
    )
  ) %>%
  group_by(year, month, day, dep_interval) %>%
  summarise(
    # Mean and variance ignore flights whose arr_delay is missing.
    ave_arr_delay = mean(arr_delay, na.rm = TRUE),
    # Number of distinct destination airports reached by the group.
    ave_dest = n_distinct(dest),
    var_arr_delay = var(arr_delay, na.rm = TRUE),
    # Return an ungrouped table so later verbs are not silently group-wise.
    .groups = "drop"
  )

（2）选取平均到达延误在30分钟之内、同时平均到达数量不低于50个的小组，并按照平均到达延误由大到小排列，最后输出平均到达延误最严重的前十个观测值；输出中除（year, month, day）外，只保留名称含有 dep 和 arr 的变量。

# Ten groups with the largest mean arrival delay among those that average at
# most 30 minutes late and reach at least 50 destinations.
alter_flights %>%
  # Drop any grouping left over from summarise() so select() is not forced to
  # keep grouping columns implicitly.
  ungroup() %>%
  filter(ave_arr_delay <= 30, ave_dest >= 50) %>%
  arrange(desc(ave_arr_delay)) %>%
  # Keep the date columns plus every variable whose name contains "dep" or
  # "arr", as the exercise asks; this drops ave_dest from the output.
  select(year, month, day, contains("dep"), contains("arr")) %>%
  head(10)

Practice referring to non-syntactic names in the following data frame by:

Extracting the variable called 1.

Plotting a scatterplot of 1 vs 2.

Creating a new column called 3 which is 2 divided by 1.

Renaming the columns to one, two and three.

# Tibble with two columns whose names, `1` and `2`, are non-syntactic and must
# be quoted with backticks. `2` is twice `1` plus noise drawn with rnorm().
annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)

## (1) Extract the column called 1 -- three equivalent spellings.
# `$` with a backtick-quoted name.
annoying$`1`

# `$` with the name given as a string.
annoying$"1"

# `[[` with the name given as a string.
annoying[["1"]]
# annoying[[`1`]] is invalid: backticks refer to an object named `1`, not to
# the string "1".

## (2) Scatterplot of 1 vs 2; aes() takes the backtick-quoted names.
ggplot(annoying, aes(`1`, `2`)) +
geom_point()

## (3) New column 3 = 2 / 1.
# mutate() returns a new tibble; the result is not assigned here, so
# `annoying` itself is unchanged by this line.
mutate(annoying, `3` = `2` / `1`)

# Same computation via `[[<-`, which does store column 3 in `annoying`.
annoying[["3"]] <- annoying[["2"]] / annoying[["1"]]

## (4) Rename all three columns to syntactic names (new_name = old_name).
annoying <- rename(annoying, one = `1`, two = `2`, three = `3`)

In Australia, most of the default values are valid, except that the date format is “(d)d/mm/yyyy”, meaning that January 2, 2006 is written as 02/01/2006. Create a new locale object that encapsulates the settings for the types of file Australians read most commonly.

# Locale that overrides only the date format: day/month/four-digit year.
au_locale <- locale(date_format = "%d/%m/%Y")
# Check: with the day first, this string is read as 2 January 2006.
parse_date("02/01/2006", locale = au_locale)
#> [1] "2006-01-02"

Compute the rate for table2. You will need to perform four operations:

Extract the number of TB cases per country per year.

Extract the matching population per country per year.

Divide cases by population, and multiply by 10000.

Store back in the appropriate place.

# Build the rate rows: put cases and population side by side, compute cases
# per 10,000 people, and shape the result like table2 (country, year, type,
# count) so it can be stacked underneath it.
t2_cases_per_cap <- table2 %>%
  pivot_wider(names_from = type, values_from = count) %>%
  mutate(
    type = "cases_per_cap",
    count = cases / population * 10000
  ) %>%
  select(country, year, type, count)

# Append the rate rows to the original table and sort so each country/year
# shows its cases, rate and population rows together.
table2 %>%
  bind_rows(t2_cases_per_cap) %>%
  arrange(country, year, type, count)

Widen this table with a new column to uniquely identify each value?

# Long-format measurements; "Phillip Woods" has two "age" rows, so (name, key)
# alone does not identify a value.
people <- tribble(
  ~name,             ~key,     ~value,
  #------------------|---------|------
  "Phillip Woods",   "age",    45,
  "Phillip Woods",   "height", 186,
  "Phillip Woods",   "age",    50,
  "Jessica Cordero", "age",    37,
  "Jessica Cordero", "height", 156
)

# Number the repeated (name, key) measurements so every value is uniquely
# identified by (name, obs, key), then spread the keys into columns.
people %>%
  group_by(name, key) %>%
  mutate(obs = row_number()) %>%
  # The grouping was only needed for row_number().
  ungroup() %>%
  # The keys (age, height) become the new columns, giving one row per
  # (name, obs). Using `name` here would turn the people into columns.
  pivot_wider(names_from = key, values_from = value)

Write both_na(), a function that takes two vectors of the same length and returns the number of positions that have an NA in both vectors.


# Count the positions at which both `x` and `y` are NA.
#
# x, y: vectors, expected to have the same length.
# Returns a single integer count.
both_na <- function(x, y) {
  x_missing <- is.na(x)
  y_missing <- is.na(y)
  # TRUE counts as 1, so summing the element-wise AND gives the count.
  sum(x_missing & y_missing)
}
# Only the first position is NA in both vectors.
both_na(
  c(NA, NA, 1, 2),
  c(NA, 1, NA, 2)
)
#> [1] 1

Implement a fizzbuzz() function. It takes a single number as input. If the number is divisible by three, it returns “fizz”. If it’s divisible by five it returns “buzz”. If it’s divisible by three and five, it returns “fizzbuzz”. Otherwise, it returns the number. Make sure you first write working code before you create the function.

# Classic fizzbuzz for a single number.
#
# x: a numeric vector of length one.
# Returns "fizzbuzz" if x is divisible by both 3 and 5, "fizz" if only by 3,
# "buzz" if only by 5, otherwise x converted to a character string (so the
# return type is always character).
fizzbuzz <- function(x) {
  # Input validation: exactly one value, and it must be numeric.
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))

  by_three <- x %% 3 == 0
  by_five <- x %% 5 == 0

  # Check the combined case first so it is not shadowed by the single ones.
  if (by_three && by_five) {
    return("fizzbuzz")
  }
  if (by_three) {
    return("fizz")
  }
  if (by_five) {
    return("buzz")
  }
  as.character(x)
}

使用 datasets 包中的数据框 iris，该数据框记录了 150 朵鸢尾花的信息。请使用该数据框完成以下题目：

（1）iris$Species 表示每朵鸢尾花的品种，请计算出每种鸢尾花的数量；

（2）某人构造了以下决策树，用于识别鸢尾花的品种：

Petal.Length<1.5? setosa

Sepal.Length+Sepal.Width>9.0? versicolor

Petal.Width<1.75? virginica

均匀随机分配为三个品种之一

注：随机数初始种子设置为100。请用 iris 中的数据对上面的算法进行检验，将算法识别出的品种作为新列添加到 iris 中，将这一列命名为 New.Species；

（3）请比较 Species 和 New.Species，计算出每种鸢尾花被正确识别的比率，以及算法总的正确率。

# (1) Number of flowers of each species.
iris %>%
  count(Species)

# (2) Classify every flower with the hand-built decision tree and store the
# prediction in a new column, New.Species.
set.seed(100) # fixed seed so the random fallback below is reproducible
New.Species <- vector("character", nrow(iris))
for (i in seq_len(nrow(iris))) {
  New.Species[[i]] <-
    if (iris$Petal.Length[[i]] < 1.5) {
      "setosa"
    } else if (iris$Sepal.Length[[i]] + iris$Sepal.Width[[i]] > 9.0) {
      "versicolor"
    } else if (iris$Petal.Width[[i]] < 1.75) {
      "virginica"
    } else {
      # No rule matched: pick one of the three species uniformly at random.
      # Kept as one draw per row, in row order, so the sequence of random
      # numbers consumed after set.seed(100) stays the same.
      sample(c("setosa", "virginica", "versicolor"), 1, prob = c(1/3, 1/3, 1/3))
    }
}
iris <- iris %>%
  mutate(
    New.Species = New.Species
  )

# (3) Share of correctly identified flowers, per species and overall.
iris_p <- iris %>%
  # TRUE where the prediction matches the recorded species.
  mutate(pin = Species == New.Species) %>%
  group_by(Species) %>%
  summarise(pin_rate = mean(pin))
view(iris_p)
# Overall accuracy is the share of ALL flowers classified correctly. Summing
# the per-species rates would give a value that can exceed 1.
mean(iris$Species == iris$New.Species)

Write a function that turns (e.g.) a vector c("a", "b", "c") into the string "a, b, and c". This function needs to handle four cases.

n == 0: an empty string, e.g. "".

n == 1: the original vector, e.g. "a".

n == 2: return the two elements separated by “and”, e.g. "a and b".

n > 2: return the first n - 1 elements separated by commas, and the last element separated by a comma and “and”, e.g. "a, b, and c".

# Join the elements of `x` into an English list such as "a, b, and c".
#
# x:     character vector to join.
# delim: separator appended to every element except the last; it is only used
#        when `x` has more than two elements.
# Returns "" for an empty `x`, `x` itself for one element, "a and b" for two,
# and "a, b, and c" style output for three or more.
str_commasep <- function(x, delim = ",") {
  n <- length(x)

  if (n == 0) {
    return("")
  }
  if (n == 1) {
    return(x)
  }
  if (n == 2) {
    # Two elements are joined by "and" alone, with no delimiter.
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # Three or more: glue the delimiter directly onto each leading element
  # (str_c's default sep is ""), put "and" in front of the final element,
  # then join all pieces with single spaces.
  leading <- str_c(x[seq(n - 1)], delim)
  closing <- str_c("and", x[[n]], sep = " ")
  str_c(c(leading, closing), collapse = " ")
}

在数据集sim1中生成新的变量new，低于或等于x的中位数记为“low x”，高于中位数记为“high x”

# Label values as "high x" when strictly greater than a reference value and
# "low x" otherwise.
#
# a: numeric value(s) to classify.
# b: reference value (e.g. the median) compared against `a`.
# Returns a character vector the same length as the comparison `a > b`.
# Vectorised, so it works both for a single value and for a whole column;
# a missing comparison yields NA instead of an error.
judge_median <- function(a, b) {
  # as.vector() drops names so the result is a plain character vector.
  is_high <- as.vector(a > b)
  ifelse(is_high, "high x", "low x")
}
# Add `new`: "high x" where x is above the median of x, "low x" otherwise.
# The comparison is vectorised over the column itself (no `sim1$x`), so it
# stays correct if the data is filtered or grouped earlier in the pipe.
sim1 %>%
  mutate(
    new = if_else(x > median(x), "high x", "low x")
  )


# Alternative: the vectorised ifelse() performs the same high/low labelling.
# NOTE(review): `a` and `b` are not defined in this snippet -- presumably they
# stand for the x values and their median; confirm before running.
new = ifelse(a>b,"high x","low x")

# Nesting ifelse() extends the same approach to three categories:
# <= 3 is "low", <= 6 is "middle", anything larger is "high".
# NOTE(review): `df$popularity` is not defined in this snippet.
ifelse(df$popularity <= 3 , "low",ifelse(df$popularity <= 6,"middle","high"))

What are the five most common words in sentences?

# Five most frequent words in `sentences`, ignoring case.
sentences %>%
  # Split every sentence at word boundaries; the result is a list of vectors.
  str_extract_all(boundary("word")) %>%
  unlist() %>%
  # Lower-case so "The" and "the" are counted together.
  str_to_lower() %>%
  tibble(word = .) %>%
  # Tally each word and order by descending frequency.
  count(word, sort = TRUE) %>%
  head(5)

将下列数据分别按三类画在并排的三张子图上，并在每张子图上画出各类点的质心（颜色为蓝色，size = 4）

# Simulated points in three classes. The mean vectors and the class labels
# are both recycled with period 3, so row i always pairs a class letter with
# that class's x/y means.
df <- data.frame(
  x = rnorm(120, c(2, 0, 4)),
  y = rnorm(120, c(1, 2, 1)),
  z = letters[1:3]
)

# Centroid (mean x, mean y) of each class. Using the column names in the
# formula yields columns z, x, y directly, so no renaming is needed.
centroids <- aggregate(cbind(x, y) ~ z, data = df, FUN = mean)

# One panel per class, side by side; each panel shows its points plus the
# class centroid as a large blue point, as the exercise requires.
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_point(data = centroids, color = "blue", size = 4) +
  facet_grid(~z)

猜你喜欢