Examples using mtcars data

Chester Ismay and Andrew Bray

2018-01-05

Data preparation

library(infer)
library(dplyr)
# Recode the discrete mtcars columns as factors (direct call, no pipe).
mtcars <- mutate(
  mtcars,
  cyl = factor(cyl),
  vs = factor(vs),
  am = factor(am),
  gear = factor(gear),
  carb = factor(carb)
)
# Fix the RNG seed so the simulated results can be reproduced.
set.seed(2018)

One numerical variable (mean)

# Point null mu = 25 for mpg: 100 bootstrap replicates, mean of each.
mtcars %>%
  specify(mpg ~ NULL) %>% # named-argument alt: response = mpg
  hypothesize(null = "point", mu = 25) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")
## Response: mpg (numeric)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  26.6
##  2         2  25.1
##  3         3  25.2
##  4         4  24.7
##  5         5  24.6
##  6         6  25.8
##  7         7  24.7
##  8         8  25.6
##  9         9  25.0
## 10        10  25.1
## # ... with 90 more rows

One numerical variable (median)

# Point null med = 26 for mpg: 100 bootstrap replicates, median of each.
mtcars %>%
  specify(mpg ~ NULL) %>% # named-argument alt: response = mpg
  hypothesize(null = "point", med = 26) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "median")
## Response: mpg (numeric)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  28.2
##  2         2  27.2
##  3         3  26.2
##  4         4  26.0
##  5         5  26.5
##  6         6  24.5
##  7         7  26.0
##  8         8  28.2
##  9         9  28.2
## 10        10  23.2
## # ... with 90 more rows

One categorical (2 level) variable

# Point null p = .25 for am (success level "1"): 100 simulated
# replicates, proportion of each.
mtcars %>%
  specify(am ~ NULL, success = "1") %>% # named-argument alt: response = am
  hypothesize(null = "point", p = .25) %>%
  generate(reps = 100, type = "simulate") %>%
  calculate(stat = "prop")
## Response: am (factor)
## Null Hypothesis:  point 
## # A tibble: 100 x 2
##    replicate   stat
##    <fct>      <dbl>
##  1 1         0.375 
##  2 2         0.0625
##  3 3         0.125 
##  4 4         0.250 
##  5 5         0.188 
##  6 6         0.406 
##  7 7         0.219 
##  8 8         0.375 
##  9 9         0.344 
## 10 10        0.188 
## # ... with 90 more rows

Two categorical (2 level) variables

# Independence null for am vs. vs (success level "1"): 100 permuted
# replicates, difference in proportions taken in the order "0", "1".
mtcars %>%
  specify(response = am, explanatory = vs, success = "1") %>% # formula alt: am ~ vs
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in props", order = c("0", "1"))
## Response: am (factor)
## Explanatory: vs (factor)
## Null Hypothesis:  independence 
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.421 
##  2         2 -0.167 
##  3         3 -0.421 
##  4         4 -0.0397
##  5         5  0.0873
##  6         6 -0.0397
##  7         7 -0.0397
##  8         8 -0.0397
##  9         9  0.0873
## 10        10 -0.167 
## # ... with 90 more rows

One categorical (>2 level) - GoF

# Point null on the cyl level proportions (.5, .25, .25 for "4", "6", "8"):
# 100 simulated replicates, "Chisq" statistic of each.
mtcars %>%
  specify(response = cyl) %>% # formula alt: cyl ~ NULL
  hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>%
  generate(reps = 100, type = "simulate") %>%
  calculate(stat = "Chisq")
## Response: cyl (factor)
## Null Hypothesis:  point 
## # A tibble: 100 x 2
##    replicate  stat
##    <fct>     <dbl>
##  1 1         6.75 
##  2 2         1.69 
##  3 3         3.19 
##  4 4         1.69 
##  5 5         6.00 
##  6 6         2.69 
##  7 7         4.75 
##  8 8         0.750
##  9 9         0.688
## 10 10        3.69 
## # ... with 90 more rows

Two categorical (>2 level) variables

# Independence null for cyl vs. am: 100 permuted replicates,
# "Chisq" statistic of each.
mtcars %>%
  specify(response = cyl, explanatory = am) %>% # formula alt: cyl ~ am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "Chisq")
## Response: cyl (factor)
## Explanatory: am (factor)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.34 
##  2         2 1.63 
##  3         3 1.63 
##  4         4 2.63 
##  5         5 3.90 
##  6         6 1.74 
##  7         7 0.126
##  8         8 1.74 
##  9         9 1.34 
## 10        10 1.34 
## # ... with 90 more rows

One numerical variable one categorical (2 levels) (diff in means)

# Independence null for mpg vs. am: 100 permuted replicates,
# difference in means taken in the order "0", "1".
mtcars %>%
  specify(response = mpg, explanatory = am) %>% # formula alt: mpg ~ am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in means", order = c("0", "1"))
## Response: mpg (numeric)
## Explanatory: am (factor)
## Null Hypothesis:  independence 
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 -1.10 
##  2         2  0.217
##  3         3 -1.08 
##  4         4 -3.80 
##  5         5  3.08 
##  6         6  0.489
##  7         7  2.34 
##  8         8  4.10 
##  9         9 -1.86 
## 10        10 -0.210
## # ... with 90 more rows

One numerical variable one categorical (2 levels) (diff in medians)

# Independence null for mpg vs. am: 100 permuted replicates,
# difference in medians taken in the order "0", "1".
mtcars %>%
  specify(response = mpg, explanatory = am) %>% # formula alt: mpg ~ am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("0", "1"))
## Response: mpg (numeric)
## Explanatory: am (factor)
## Null Hypothesis:  independence 
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  0.500
##  2         2 -1.10 
##  3         3  5.20 
##  4         4  1.80 
##  5         5  0.500
##  6         6  3.30 
##  7         7 -1.60 
##  8         8 -2.30 
##  9         9  2.90 
## 10        10 -0.500
## # ... with 90 more rows

One numerical one categorical (>2 levels) - ANOVA

# Independence null for mpg vs. cyl: 100 permuted replicates,
# "F" statistic of each.
mtcars %>%
  specify(response = mpg, explanatory = cyl) %>% # formula alt: mpg ~ cyl
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "F")
## Response: mpg (numeric)
## Explanatory: cyl (factor)
## Null Hypothesis:  independence 
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.43 
##  2         2 1.65 
##  3         3 0.318
##  4         4 0.393
##  5         5 1.05 
##  6         6 0.826
##  7         7 1.32 
##  8         8 0.833
##  9         9 0.144
## 10        10 0.365
## # ... with 90 more rows

Two numerical vars - SLR

# Independence null for mpg vs. hp: 100 permuted replicates,
# "slope" statistic of each.
mtcars %>%
  specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = hp
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "slope")
## Response: mpg (numeric)
## Explanatory: hp (numeric)
## Null Hypothesis:  independence 
## # A tibble: 100 x 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.0151 
##  2         2  0.00224
##  3         3 -0.0120 
##  4         4  0.00292
##  5         5  0.0203 
##  6         6 -0.00730
##  7         7 -0.0246 
##  8         8  0.00555
##  9         9  0.0109 
## 10        10  0.0176 
## # ... with 90 more rows

One numerical variable (standard deviation)

Not currently implemented

# The surrounding text marks this case as not currently implemented, and no
# output is shown for it. The pipeline sketches a point null sigma = 5 for
# mpg with 100 bootstrap replicates and the "sd" statistic of each.
# NOTE(review): confirm whether hypothesize() accepts `sigma` in the
# installed infer version before running.
mtcars %>%
  specify(response = mpg) %>% # formula alt: mpg ~ NULL
  hypothesize(null = "point", sigma = 5) %>% 
  generate(reps = 100, type = "bootstrap") %>% 
  calculate(stat = "sd")

Confidence intervals

One numerical (one mean)

# No hypothesize() step: 100 bootstrap replicates of mpg, mean of each.
mtcars %>%
  specify(mpg ~ NULL) %>% # named-argument alt: response = mpg
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")
## Response: mpg (numeric)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  19.6
##  2         2  21.8
##  3         3  18.7
##  4         4  19.2
##  5         5  21.6
##  6         6  19.9
##  7         7  20.7
##  8         8  19.3
##  9         9  21.2
## 10        10  21.3
## # ... with 90 more rows

One numerical (one median)

# No hypothesize() step: 100 bootstrap replicates of mpg, median of each.
mtcars %>%
  specify(mpg ~ NULL) %>% # named-argument alt: response = mpg
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "median")
## Response: mpg (numeric)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  19.2
##  2         2  20.1
##  3         3  21.0
##  4         4  17.8
##  5         5  20.1
##  6         6  19.2
##  7         7  18.4
##  8         8  19.2
##  9         9  19.2
## 10        10  18.0
## # ... with 90 more rows

One numerical (standard deviation)

# No hypothesize() step: 100 bootstrap replicates of mpg, "sd" of each.
mtcars %>%
  specify(mpg ~ NULL) %>% # named-argument alt: response = mpg
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "sd")
## Response: mpg (numeric)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  5.28
##  2         2  6.74
##  3         3  5.29
##  4         4  5.41
##  5         5  5.56
##  6         6  5.65
##  7         7  6.17
##  8         8  6.40
##  9         9  6.31
## 10        10  6.11
## # ... with 90 more rows

One categorical (one proportion)

# No hypothesize() step: 100 bootstrap replicates of am (success level "1"),
# proportion of each.
mtcars %>%
  specify(am ~ NULL, success = "1") %>% # named-argument alt: response = am
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "prop")
## Response: am (factor)
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.375
##  2         2 0.406
##  3         3 0.406
##  4         4 0.312
##  5         5 0.312
##  6         6 0.469
##  7         7 0.438
##  8         8 0.281
##  9         9 0.438
## 10        10 0.500
## # ... with 90 more rows

One numerical variable one categorical (2 levels) (diff in means)

# No hypothesize() step: 100 bootstrap replicates of mpg vs. am,
# difference in means taken in the order "0", "1".
mtcars %>%
  specify(response = mpg, explanatory = am) %>% # formula alt: mpg ~ am
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("0", "1"))
## Response: mpg (numeric)
## Explanatory: am (factor)
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 - 9.38
##  2         2 - 5.11
##  3         3 - 4.88
##  4         4 - 5.39
##  5         5 - 9.19
##  6         6 - 7.20
##  7         7 - 5.34
##  8         8 - 3.20
##  9         9 - 5.95
## 10        10 -11.0 
## # ... with 90 more rows

Two categorical variables (diff in proportions)

# No hypothesize() step: 100 bootstrap replicates of am vs. vs
# (success level "1"), difference in proportions in the order "0", "1".
mtcars %>%
  specify(response = am, explanatory = vs, success = "1") %>% # formula alt: am ~ vs
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("0", "1"))
## Response: am (factor)
## Explanatory: vs (factor)
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 -0.352
##  2         2 -0.150
##  3         3 -0.294
##  4         4 -0.254
##  5         5 -0.438
##  6         6 -0.126
##  7         7 -0.188
##  8         8  0.167
##  9         9 -0.143
## 10        10 -0.500
## # ... with 90 more rows

Two numerical vars - SLR

# No hypothesize() step: 100 bootstrap replicates of mpg vs. hp,
# "slope" statistic of each.
mtcars %>%
  specify(response = mpg, explanatory = hp) %>% # formula alt: mpg ~ hp
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "slope")
## Response: mpg (numeric)
## Explanatory: hp (numeric)
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.0850
##  2         2 -0.0512
##  3         3 -0.0736
##  4         4 -0.0569
##  5         5 -0.0930
##  6         6 -0.0659
##  7         7 -0.0710
##  8         8 -0.0767
##  9         9 -0.0556
## 10        10 -0.0627
## # ... with 90 more rows