nycflights13
flights
datalibrary(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
set.seed(2017)
fli_small <- flights %>%
sample_n(size = 500) %>%
mutate(season = case_when(
month %in% c(10:12, 1:3) ~ "winter",
month %in% c(4:9) ~ "summer"
)) %>%
mutate(day_hour = case_when(
between(hour, 1, 12) ~ "morning",
between(hour, 13, 24) ~ "not morning"
)) %>%
select(arr_delay, dep_delay, season,
day_hour, origin, carrier) %>%
filter(., complete.cases(.))
arr_delay
, dep_delay
season
("winter"
, "summer"
),day_hour
("morning"
, "not morning"
)origin
("EWR"
, "JFK"
, "LGA"
)carrier
x_bar <- fli_small %>%
summarize(mean(dep_delay)) %>%
pull()
null <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", mu = 10) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean")
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = x_bar, color = "red")
null %>%
summarize(p_value = mean(stat > x_bar) * 2)
p_value |
---|
0.618 |
x_tilde <- fli_small %>%
summarize(median(dep_delay)) %>%
pull()
null <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", med = 0) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "median")
ggplot(null, aes(x = stat)) +
geom_bar() +
geom_vline(xintercept = x_tilde, color = "red")
null %>%
summarize(p_value = mean(stat < x_tilde) * 2)
p_value |
---|
0 |
p_hat <- fli_small %>%
summarize(mean(day_hour == "morning")) %>%
pull()
null <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
hypothesize(null = "point", p = .5) %>%
generate(reps = 1000, type = "simulate") %>%
calculate(stat = "prop")
ggplot(null, aes(x = stat)) +
geom_bar() +
geom_vline(xintercept = p_hat, color = "red")
null %>%
summarize(p_value = mean(stat < p_hat) * 2)
p_value |
---|
0.02 |
Logical variables will be coerced to factors:
null <- fli_small %>%
mutate(day_hour_logical = (day_hour == "morning")) %>%
specify(response = day_hour_logical, success = "TRUE") %>%
hypothesize(null = "point", p = .5) %>%
generate(reps = 1000, type = "simulate") %>%
calculate(stat = "prop")
d_hat <- fli_small %>%
group_by(season) %>%
summarize(prop = mean(day_hour == "morning")) %>%
summarize(diff(prop)) %>%
pull()
null <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in props", order = c("summer", "winter"))
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, color = "red")
null %>%
summarize(p_value = mean(stat < d_hat) * 2)
p_value |
---|
0.564 |
Chisq_hat <- chisq.test(table(fli_small$origin))$stat
null <- fli_small %>%
specify(response = origin) %>%
hypothesize(null = "point",
p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
generate(reps = 1000, type = "simulate") %>%
calculate(stat = "Chisq")
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = Chisq_hat, color = "red")
null %>%
summarize(p_value = mean(stat > Chisq_hat))
p_value |
---|
0.018 |
Chisq_hat <- chisq.test(table(fli_small$day_hour, fli_small$origin))$stat
null <- fli_small %>%
specify(day_hour ~ origin) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = Chisq_hat, color = "red")
null %>%
summarize(p_value = mean(stat > Chisq_hat))
p_value |
---|
0.05 |
d_hat <- fli_small %>%
group_by(season) %>%
summarize(mean_stat = mean(dep_delay)) %>%
summarize(diff(mean_stat)) %>%
pull()
null <- fli_small %>%
specify(dep_delay ~ season) %>% # alt: response = dep_delay,
# explanatory = season
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("summer", "winter"))
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, color = "red")
null %>%
summarize(p_value = mean(stat > d_hat) * 2)
p_value |
---|
1.606 |
d_hat <- fli_small %>%
group_by(season) %>%
summarize(median_stat = median(dep_delay)) %>%
summarize(diff(median_stat)) %>%
pull()
null <- fli_small %>%
specify(dep_delay ~ season) %>% # alt: response = dep_delay,
# explanatory = season
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in medians", order = c("summer", "winter"))
ggplot(null, aes(x = stat)) +
geom_bar() +
geom_vline(xintercept = d_hat, color = "red")
null %>%
summarize(p_value = mean(stat > d_hat) * 2)
p_value |
---|
0.042 |
F_hat <- anova(
aov(formula = arr_delay ~ origin, data = fli_small)
)$`F value`[1]
null <- fli_small %>%
specify(arr_delay ~ origin) %>% # alt: response = arr_delay,
# explanatory = origin
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "F")
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = F_hat, color = "red")
null %>%
summarize(p_value = mean(stat > F_hat))
p_value |
---|
0.301 |
slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>%
broom::tidy() %>%
filter(term == "dep_delay") %>%
select(estimate) %>%
pull()
null <- fli_small %>%
specify(arr_delay ~ dep_delay) %>% # alt: response = arr_delay,
# explanatory = dep_delay
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "slope")
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = slope_hat, color = "red")
null %>%
summarize(p_value = mean(stat > slope_hat) * 2)
p_value |
---|
0 |
x_bar <- fli_small %>%
summarize(mean(arr_delay)) %>%
pull()
boot <- fli_small %>%
specify(response = arr_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean") %>%
pull()
c(lower = x_bar - 2 * sd(boot),
upper = x_bar + 2 * sd(boot))
## lower upper
## -0.6169776 6.7613075
p_hat <- fli_small %>%
summarize(mean(day_hour == "morning")) %>%
pull()
boot <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
pull()
c(lower = p_hat - 2 * sd(boot),
upper = p_hat + 2 * sd(boot))
## lower upper
## 0.4057081 0.4973847
d_hat <- fli_small %>%
group_by(season) %>%
summarize(mean_stat = mean(arr_delay)) %>%
summarize(diff(mean_stat)) %>%
pull()
boot <- fli_small %>%
specify(arr_delay ~ season) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("summer", "winter")) %>%
pull()
c(lower = d_hat - 2 * sd(boot),
upper = d_hat + 2 * sd(boot))
## lower upper
## -9.034099 5.195664
d_hat <- fli_small %>%
group_by(season) %>%
summarize(prop = mean(day_hour == "morning")) %>%
summarize(diff(prop)) %>%
pull()
boot <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in props", order = c("summer", "winter")) %>%
pull()
c(lower = d_hat - 2 * sd(boot),
upper = d_hat + 2 * sd(boot))
## lower upper
## -0.11165849 0.06829114
slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>%
broom::tidy() %>%
filter(term == "dep_delay") %>%
select(estimate) %>%
pull()
boot <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope") %>%
pull()
c(lower = slope_hat - 2 * sd(boot),
upper = slope_hat + 2 * sd(boot))
## lower upper
## 0.9541501 1.0735226