nycflights13
flights
datalibrary(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
set.seed(2017)
fli_small <- flights %>%
na.omit() %>%
sample_n(size = 500) %>%
mutate(season = case_when(
month %in% c(10:12, 1:3) ~ "winter",
month %in% c(4:9) ~ "summer"
)) %>%
mutate(day_hour = case_when(
between(hour, 1, 12) ~ "morning",
between(hour, 13, 24) ~ "not morning"
)) %>%
select(arr_delay, dep_delay, season,
day_hour, origin, carrier)
arr_delay
, dep_delay
season
("winter"
, "summer"
),day_hour
("morning"
, "not morning"
)origin
("EWR"
, "JFK"
, "LGA"
)carrier
Observed stat
( x_bar <- fli_small %>%
specify(response = dep_delay) %>%
calculate(stat = "mean") )
stat |
---|
10.4 |
null_distn <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", mu = 10) %>%
generate(reps = 1000) %>%
calculate(stat = "mean")
## Setting `type = "bootstrap"` in `generate()`.
visualize(null_distn) +
shade_p_value(obs_stat = x_bar, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = x_bar, direction = "two_sided")
p_value |
---|
0.794 |
Observed stat
( t_bar <- fli_small %>%
specify(response = dep_delay) %>%
calculate(stat = "t") )
stat |
---|
6.93 |
null_distn <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", mu = 8) %>%
generate(reps = 1000) %>%
calculate(stat = "t")
## Setting `type = "bootstrap"` in `generate()`.
visualize(null_distn) +
shade_p_value(obs_stat = t_bar, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = t_bar, direction = "two_sided")
p_value |
---|
0 |
Observed stat
( x_tilde <- fli_small %>%
specify(response = dep_delay) %>%
calculate(stat = "median") )
stat |
---|
-2 |
null_distn <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", med = -1) %>%
generate(reps = 1000) %>%
calculate(stat = "median")
## Setting `type = "bootstrap"` in `generate()`.
visualize(null_distn) +
shade_p_value(obs_stat = x_tilde, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = x_tilde, direction = "two_sided")
p_value |
---|
0.15 |
Observed stat
( p_hat <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
calculate(stat = "prop") )
stat |
---|
0.466 |
null_distn <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
hypothesize(null = "point", p = .5) %>%
generate(reps = 1000) %>%
calculate(stat = "prop")
## Setting `type = "simulate"` in `generate()`.
visualize(null_distn) +
shade_p_value(obs_stat = p_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = p_hat, direction = "two_sided")
p_value |
---|
0.11 |
Logical variables will be coerced to factors:
null_distn <- fli_small %>%
mutate(day_hour_logical = (day_hour == "morning")) %>%
specify(response = day_hour_logical, success = "TRUE") %>%
hypothesize(null = "point", p = .5) %>%
generate(reps = 1000) %>%
calculate(stat = "prop")
## Setting `type = "simulate"` in `generate()`.
Not yet implemented.
Observed stat
( d_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "diff in props", order = c("winter", "summer")) )
stat |
---|
-0.0205 |
null_distn <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000) %>%
calculate(stat = "diff in props", order = c("winter", "summer"))
## Setting `type = "permute"` in `generate()`.
visualize(null_distn) +
shade_p_value(obs_stat = d_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = d_hat, direction = "two_sided")
p_value |
---|
0.708 |
Standardized observed stat
( z_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "z", order = c("winter", "summer")) )
stat |
---|
-0.4605 |
null_distn <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000) %>%
calculate(stat = "z", order = c("winter", "summer"))
## Setting `type = "permute"` in `generate()`.
visualize(null_distn) +
shade_p_value(obs_stat = z_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = z_hat, direction = "two_sided")
p_value |
---|
0.684 |
Note the similarities in this plot and the previous one.
Observed stat
Note the need to add in the hypothesized values here to compute the observed statistic.
( Chisq_hat <- fli_small %>%
specify(response = origin) %>%
hypothesize(null = "point",
p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
calculate(stat = "Chisq") )
stat |
---|
10.4 |
null_distn <- fli_small %>%
specify(response = origin) %>%
hypothesize(null = "point",
p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
generate(reps = 1000, type = "simulate") %>%
calculate(stat = "Chisq")
visualize(null_distn) +
shade_p_value(obs_stat = Chisq_hat, direction = "greater")
null_distn %>%
get_p_value(obs_stat = Chisq_hat, direction = "greater")
p_value |
---|
0.005 |
Observed stat
( Chisq_hat <- fli_small %>%
specify(formula = day_hour ~ origin) %>%
calculate(stat = "Chisq") )
stat |
---|
9.027 |
null_distn <- fli_small %>%
specify(day_hour ~ origin) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
visualize(null_distn) +
shade_p_value(obs_stat = Chisq_hat, direction = "greater")
null_distn %>%
get_p_value(obs_stat = Chisq_hat, direction = "greater")
p_value |
---|
0.007 |
Observed stat
( d_hat <- fli_small %>%
specify(dep_delay ~ season) %>%
calculate(stat = "diff in means", order = c("summer", "winter")) )
stat |
---|
2.266 |
null_distn <- fli_small %>%
specify(dep_delay ~ season) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("summer", "winter"))
visualize(null_distn) +
shade_p_value(obs_stat = d_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = d_hat, direction = "two_sided")
p_value |
---|
0.488 |
Standardized observed stat
( t_hat <- fli_small %>%
specify(dep_delay ~ season) %>%
calculate(stat = "t", order = c("summer", "winter")) )
stat |
---|
0.7542 |
null_distn <- fli_small %>%
specify(dep_delay ~ season) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "t", order = c("summer", "winter"))
visualize(null_distn) +
shade_p_value(obs_stat = t_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = t_hat, direction = "two_sided")
p_value |
---|
0.49 |
Note the similarities in this plot and the previous one.
Observed stat
( d_hat <- fli_small %>%
specify(dep_delay ~ season) %>%
calculate(stat = "diff in medians", order = c("summer", "winter")) )
stat |
---|
2 |
null_distn <- fli_small %>%
specify(dep_delay ~ season) %>% # alt: response = dep_delay,
# explanatory = season
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in medians", order = c("summer", "winter"))
visualize(null_distn) +
shade_p_value(obs_stat = d_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = d_hat, direction = "two_sided")
p_value |
---|
0.084 |
Observed stat
( F_hat <- fli_small %>%
specify(arr_delay ~ origin) %>%
calculate(stat = "F") )
stat |
---|
1.084 |
null_distn <- fli_small %>%
specify(arr_delay ~ origin) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "F")
visualize(null_distn) +
shade_p_value(obs_stat = F_hat, direction = "greater")
null_distn %>%
get_p_value(obs_stat = F_hat, direction = "greater")
p_value |
---|
0.353 |
Observed stat
( slope_hat <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
calculate(stat = "slope") )
stat |
---|
1.017 |
null_distn <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "slope")
visualize(null_distn) +
shade_p_value(obs_stat = slope_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = slope_hat, direction = "two_sided")
p_value |
---|
0 |
Observed stat
( correlation_hat <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
calculate(stat = "correlation") )
stat |
---|
0.8943 |
null_distn <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "correlation")
visualize(null_distn) +
shade_p_value(obs_stat = correlation_hat, direction = "two_sided")
null_distn %>%
get_p_value(obs_stat = correlation_hat, direction = "two_sided")
p_value |
---|
0 |
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.
Point estimate
( x_bar <- fli_small %>%
specify(response = arr_delay) %>%
calculate(stat = "mean") )
stat |
---|
4.572 |
boot <- fli_small %>%
specify(response = arr_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean")
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
1.436 | 7.819 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = x_bar) )
lower | upper |
---|---|
1.267 | 7.877 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Point estimate
( t_hat <- fli_small %>%
specify(response = arr_delay) %>%
calculate(stat = "t") )
stat |
---|
2.679 |
boot <- fli_small %>%
specify(response = arr_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "t")
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
0.9338 | 4.362 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
lower | upper |
---|---|
0.9141 | 4.444 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Point estimate
( p_hat <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
calculate(stat = "prop") )
stat |
---|
0.466 |
boot <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop")
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
0.42 | 0.508 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = p_hat) )
lower | upper |
---|---|
0.4218 | 0.5102 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Not yet implemented.
Point estimate
( d_hat <- fli_small %>%
specify(arr_delay ~ season) %>%
calculate(stat = "diff in means", order = c("summer", "winter")) )
stat |
---|
-0.7452 |
boot <- fli_small %>%
specify(arr_delay ~ season) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
-7.167 | 6.079 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
lower | upper |
---|---|
-7.296 | 5.806 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Standardized point estimate
( t_hat <- fli_small %>%
specify(arr_delay ~ season) %>%
calculate(stat = "t", order = c("summer", "winter")) )
stat |
---|
-0.2182 |
boot <- fli_small %>%
specify(arr_delay ~ season) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "t", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
-2.236 | 1.718 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
lower | upper |
---|---|
-2.183 | 1.746 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Point estimate
( d_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "diff in props", order = c("summer", "winter")) )
stat |
---|
0.0205 |
boot <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in props", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
-0.0648 | 0.1083 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
lower | upper |
---|---|
-0.0676 | 0.1087 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Standardized point estimate
( z_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "z", order = c("summer", "winter")) )
stat |
---|
0.4605 |
boot <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "z", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
-1.479 | 2.501 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = z_hat) )
lower | upper |
---|---|
-1.522 | 2.443 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Point estimate
( slope_hat <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
calculate(stat = "slope") )
stat |
---|
1.017 |
boot <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
0.9728 | 1.074 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = slope_hat) )
lower | upper |
---|---|
0.9653 | 1.069 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Point estimate
( correlation_hat <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
calculate(stat = "correlation") )
stat |
---|
0.8943 |
boot <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "correlation")
( percentile_ci <- get_ci(boot) )
2.5% | 97.5% |
---|---|
0.8502 | 0.9218 |
visualize(boot) +
shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se",
point_estimate = correlation_hat) )
lower | upper |
---|---|
0.858 | 0.9306 |
visualize(boot) +
shade_confidence_interval(endpoints = standard_error_ci)
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.