Examples using mtcars data

Chester Ismay and Andrew Bray

2018-01-05

Data preparation

library(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
# Recode the discrete mtcars columns (cyl, vs, am, gear, carb) as factors so
# the examples below can treat them as categorical variables.
mtcars <- mutate(
  as.data.frame(mtcars),
  cyl = factor(cyl),
  vs = factor(vs),
  am = factor(am),
  gear = factor(gear),
  carb = factor(carb)
)

One numerical variable (mean)

# Point-null test on the mean of mpg (H0: mu = 25): generate 100 bootstrap
# replicates and compute the mean of mpg in each. The result has one row per
# replicate with columns `replicate` and `stat`.
mtcars %>%
  specify(response = mpg) %>% # formula alt: mpg ~ NULL
  hypothesize(null = "point", mu = 25) %>% 
  generate(reps = 100, type = "bootstrap") %>% 
  calculate(stat = "mean")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  25.5
##  2         2  25.1
##  3         3  22.4
##  4         4  26.4
##  5         5  26.1
##  6         6  23.5
##  7         7  25.1
##  8         8  25.1
##  9         9  23.7
## 10        10  24.7
## # ... with 90 more rows

One numerical variable (median)

# Point-null test on the median of mpg (H0: median = 26): generate 100
# bootstrap replicates and compute the median of mpg in each.
mtcars %>%
  specify(response = mpg) %>% # formula alt: mpg ~ NULL
  hypothesize(null = "point", med = 26) %>% 
  generate(reps = 100, type = "bootstrap") %>% 
  calculate(stat = "median")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  28.2
##  2         2  28.2
##  3         3  27.8
##  4         4  25.5
##  5         5  24.5
##  6         6  26.0
##  7         7  26.0
##  8         8  27.8
##  9         9  26.2
## 10        10  27.8
## # ... with 90 more rows

One numerical variable (standard deviation)

# Point-null test on the standard deviation of mpg (H0: sigma = 5): generate
# 100 bootstrap replicates and compute the standard deviation of mpg in each.
mtcars %>%
  specify(response = mpg) %>% # formula alt: mpg ~ NULL
  hypothesize(null = "point", sigma = 5) %>% 
  generate(reps = 100, type = "bootstrap") %>% 
  calculate(stat = "sd")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  6.66
##  2         2  5.55
##  3         3  6.21
##  4         4  4.08
##  5         5  4.58
##  6         6  6.51
##  7         7  7.60
##  8         8  6.85
##  9         9  5.77
## 10        10  6.96
## # ... with 90 more rows

One categorical (2 level) variable

# Point-null test on a single proportion (H0: p = .25), where level "1" of
# `am` is counted as a success: generate 100 replicates with type = "simulate"
# and compute the proportion of successes in each.
mtcars %>%
  specify(response = am, success = "1") %>% # formula alt: am ~ NULL
  hypothesize(null = "point", p = .25) %>% 
  generate(reps = 100, type = "simulate") %>% 
  calculate(stat = "prop")
## # A tibble: 100 x 2
##    replicate   stat
##    <fctr>     <dbl>
##  1 1         0.188 
##  2 2         0.0625
##  3 3         0.281 
##  4 4         0.344 
##  5 5         0.219 
##  6 6         0.250 
##  7 7         0.312 
##  8 8         0.250 
##  9 9         0.281 
## 10 10        0.156 
## # ... with 90 more rows

Two categorical (2 level) variables

# Independence test for two 2-level factors: proportion of am == "1" compared
# across the levels of vs, using 100 permutation replicates.
# `order` fixes the subtraction order of the vs groups (presumably
# level "0" minus level "1" -- confirm against the infer documentation).
mtcars %>%
  specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in props", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.0873
##  2         2 -0.0397
##  3         3  0.317 
##  4         4 -0.0952
##  5         5  0.230 
##  6         6 -0.0238
##  7         7  0.183 
##  8         8  0.0714
##  9         9 -0.238 
## 10        10  0.0952
## # ... with 90 more rows

One categorical (>2 level) - GoF

# Goodness-of-fit test for cyl against the hypothesized level proportions
# 4 -> .5, 6 -> .25, 8 -> .25: generate 100 replicates with type = "simulate"
# and compute the "Chisq" statistic for each.
mtcars %>%
  specify(cyl ~ NULL) %>% # alt: response = cyl
  hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>%
  generate(reps = 100, type = "simulate") %>%
  calculate(stat = "Chisq")
## # A tibble: 100 x 2
##    replicate  stat
##    <fctr>    <dbl>
##  1 1         2.75 
##  2 2         1.69 
##  3 3         1.00 
##  4 4         4.19 
##  5 5         0.688
##  6 6         1.69 
##  7 7         1.69 
##  8 8         3.69 
##  9 9         2.00 
## 10 10        0.188
## # ... with 90 more rows

Two categorical (>2 level) variables

# Independence test between cyl (response) and am (explanatory): generate
# 100 permutation replicates and compute the "Chisq" statistic for each.
mtcars %>%
  specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "Chisq")
## # A tibble: 100 x 2
##    replicate  stat
##    <fctr>    <dbl>
##  1 1         5.73 
##  2 2         0.513
##  3 3         1.36 
##  4 4         4.16 
##  5 5         1.26 
##  6 6         0.134
##  7 7         0.172
##  8 8         0.164
##  9 9         0.592
## 10 10        0.296
## # ... with 90 more rows

One numerical variable, one categorical (2 levels) (diff in means)

# Independence test comparing mean mpg between the two levels of am, using
# 100 permutation replicates.
# `order` fixes the subtraction order of the am groups (presumably
# level "0" minus level "1" -- confirm against the infer documentation).
mtcars %>%
  specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in means", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  2.17 
##  2         2  0.344
##  3         3  1.67 
##  4         4  0.376
##  5         5 -1.47 
##  6         6 -2.03 
##  7         7  0.615
##  8         8  0.153
##  9         9  1.40 
## 10        10 -0.872
## # ... with 90 more rows

One numerical variable, one categorical (2 levels) (diff in medians)

# Independence test comparing median mpg between the two levels of am, using
# 100 permutation replicates.
# `order` fixes the subtraction order of the am groups (presumably
# level "0" minus level "1" -- confirm against the infer documentation).
mtcars %>%
  specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  0.600
##  2         2  0    
##  3         3 -3.20 
##  4         4 -1.90 
##  5         5 -1.10 
##  6         6  0    
##  7         7 -1.10 
##  8         8 -1.80 
##  9         9 -3.90 
## 10        10 -2.40 
## # ... with 90 more rows

One numerical, one categorical (>2 levels) - ANOVA

# Independence test of mpg across the levels of cyl: generate 100 permutation
# replicates and compute the "F" statistic for each.
mtcars %>%
  specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "F")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.129
##  2         2 2.33 
##  3         3 1.82 
##  4         4 0.628
##  5         5 0.235
##  6         6 0.378
##  7         7 0.431
##  8         8 1.24 
##  9         9 0.988
## 10        10 0.642
## # ... with 90 more rows

Two numerical vars - SLR

# Independence test for the regression of mpg on hp: generate 100 permutation
# replicates and compute the "slope" statistic for each.
mtcars %>%
  specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = hp
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "slope")
## # A tibble: 100 x 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.00473
##  2         2 -0.00982
##  3         3  0.00359
##  4         4  0.00231
##  5         5 -0.00980
##  6         6 -0.0200 
##  7         7  0.0128 
##  8         8  0.00150
##  9         9 -0.0149 
## 10        10 -0.0187 
## # ... with 90 more rows

Confidence intervals

One numerical (one mean)

# Bootstrap distribution of the mean of mpg: 100 bootstrap replicates with no
# hypothesize() step. Only the per-replicate statistics are produced here;
# the interval itself is not computed in this snippet.
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  20.9
##  2         2  19.6
##  3         3  21.4
##  4         4  21.2
##  5         5  17.9
##  6         6  20.9
##  7         7  17.9
##  8         8  20.5
##  9         9  21.5
## 10        10  19.2
## # ... with 90 more rows

One numerical (one median)

# Bootstrap distribution of the median of mpg: 100 bootstrap replicates with
# no hypothesize() step, one median per replicate.
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "median")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  19.7
##  2         2  18.6
##  3         3  21.4
##  4         4  17.7
##  5         5  17.7
##  6         6  20.4
##  7         7  21.0
##  8         8  19.0
##  9         9  17.3
## 10        10  20.4
## # ... with 90 more rows

One numerical (standard deviation)

# Bootstrap distribution of the standard deviation of mpg: 100 bootstrap
# replicates with no hypothesize() step, one sd per replicate.
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "sd")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  5.86
##  2         2  6.20
##  3         3  6.41
##  4         4  5.67
##  5         5  6.12
##  6         6  5.80
##  7         7  6.58
##  8         8  5.02
##  9         9  6.55
## 10        10  5.96
## # ... with 90 more rows

One categorical (one proportion)

# Bootstrap distribution of the proportion of am == "1" (the level declared
# as success): 100 bootstrap replicates with no hypothesize() step.
mtcars %>%
  specify(response = am, success = "1") %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "prop")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.531
##  2         2 0.250
##  3         3 0.375
##  4         4 0.344
##  5         5 0.406
##  6         6 0.594
##  7         7 0.531
##  8         8 0.562
##  9         9 0.531
## 10        10 0.469
## # ... with 90 more rows

One numerical variable, one categorical (2 levels) (diff in means)

# Bootstrap distribution of the difference in mean mpg between the two levels
# of am: 100 bootstrap replicates with no hypothesize() step.
# `order` fixes the subtraction order of the am groups (presumably
# level "0" minus level "1" -- confirm against the infer documentation).
mtcars %>%
  specify(mpg ~ am) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 - 7.36
##  2         2 - 9.03
##  3         3 -10.8 
##  4         4 - 7.36
##  5         5 - 3.03
##  6         6 - 5.74
##  7         7 - 9.80
##  8         8 - 6.70
##  9         9 - 7.71
## 10        10 - 7.26
## # ... with 90 more rows

Two categorical variables (diff in proportions)

# Bootstrap distribution of the difference in the proportion of am == "1"
# between the two levels of vs: 100 bootstrap replicates with no
# hypothesize() step.
# `order` fixes the subtraction order of the vs groups (presumably
# level "0" minus level "1" -- confirm against the infer documentation).
mtcars %>%
  specify(am ~ vs, success = "1") %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.217 
##  2         2 -0.0688
##  3         3  0.203 
##  4         4 -0.125 
##  5         5 -0.151 
##  6         6  0.0405
##  7         7  0.0635
##  8         8 -0.116 
##  9         9 -0.188 
## 10        10 -0.0931
## # ... with 90 more rows

Two numerical vars - SLR

# Bootstrap distribution of the slope from regressing mpg on hp: 100
# bootstrap replicates with no hypothesize() step, one slope per replicate.
mtcars %>%
  specify(mpg ~ hp) %>% 
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "slope")
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.107 
##  2         2 -0.0727
##  3         3 -0.0439
##  4         4 -0.0947
##  5         5 -0.0860
##  6         6 -0.0714
##  7         7 -0.0633
##  8         8 -0.0821
##  9         9 -0.0865
## 10        10 -0.0593
## # ... with 90 more rows