Updated JSS code

2021-05-01

Preliminaries

library(HLMdiag)
library(ggplot2)
library(lme4)
#> Loading required package: Matrix

Exam data

data("Exam", package = "mlmRev")
head(Exam)
#>   school   normexam schgend    schavg      vr     intake   standLRT sex type
#> 1      1  0.2613242   mixed 0.1661752 mid 50% bottom 25%  0.6190592   F  Mxd
#> 2      1  0.1340672   mixed 0.1661752 mid 50%    mid 50%  0.2058022   F  Mxd
#> 3      1 -1.7238820   mixed 0.1661752 mid 50%    top 25% -1.3645760   M  Mxd
#> 4      1  0.9675862   mixed 0.1661752 mid 50%    mid 50%  0.2058022   F  Mxd
#> 5      1  0.5443412   mixed 0.1661752 mid 50%    mid 50%  0.3711052   F  Mxd
#> 6      1  1.7348992   mixed 0.1661752 mid 50% bottom 25%  2.1894372   M  Mxd
#>   student
#> 1     143
#> 2     145
#> 3     142
#> 4     141
#> 5     138
#> 6     155

Residual analysis

# Model fm1 on page 6
(fm1 <- lmer(normexam ~ standLRT + (1 | school), Exam, REML = FALSE))
#> Linear mixed model fit by maximum likelihood  ['lmerMod']
#> Formula: normexam ~ standLRT + (1 | school)
#>    Data: Exam
#>       AIC       BIC    logLik  deviance  df.resid 
#>  9365.243  9390.478 -4678.622  9357.243      4055 
#> Random effects:
#>  Groups   Name        Std.Dev.
#>  school   (Intercept) 0.3035  
#>  Residual             0.7522  
#> Number of obs: 4059, groups:  school, 65
#> Fixed Effects:
#> (Intercept)     standLRT  
#>    0.002391     0.563371

# Extract level-1 residuals
# Residual plot from page 7
resid_fm1 <- hlm_resid(fm1)
head(resid_fm1)
#> # A tibble: 6 x 10
#>      id normexam standLRT school  .resid .fitted .ls.resid .ls.fitted .mar.resid
#>   <dbl>    <dbl>    <dbl> <fct>    <dbl>   <dbl>     <dbl>      <dbl>      <dbl>
#> 1     1    0.261    0.619 1      -0.464    0.725    -0.561      0.822    -0.0898
#> 2     2    0.134    0.206 1      -0.358    0.492    -0.395      0.529     0.0157
#> 3     3   -1.72    -1.36  1      -1.33    -0.393    -1.14      -0.585    -0.958 
#> 4     4    0.968    0.206 1       0.475    0.492     0.438      0.529     0.849 
#> 5     5    0.544    0.371 1      -0.0409   0.585    -0.102      0.647     0.333 
#> 6     6    1.73     2.19  1       0.125    1.61     -0.201      1.94      0.499 
#> # … with 1 more variable: .mar.fitted <dbl>

ggplot(resid_fm1, aes(x = standLRT, y = .ls.resid)) + 
  geom_point() + 
  geom_smooth() +
  labs(y = "LS level-1 residuals")
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Model fm2 on page 7
fm2 <- lmer(normexam ~ standLRT + I(standLRT^2) + I(standLRT^3) + 
              (1 | school), Exam, REML = FALSE)


resid_fm2 <- hlm_resid(fm2)
#> Warning in LSresids.lmerMod(object, level = 1, standardize = standardize): LS residuals might be inaccurate as one or more groups are rank deficient.
#> Use the 'include.ls = FALSE' parameter to get EB residuals only.


# Figure 2 page 8
ggplot(resid_fm2, aes(x = `I(standLRT^2)`, y = .ls.resid)) + 
  geom_hline(yintercept = 0, color = "blue") + 
  geom_point() +
  labs( y = "LS residuals", x = "standLRT2")

ggplot_qqnorm() function is now defunct, Q-Q plots are now much easier to create in ggplot2 directly than when the package was first created.

ggplot(resid_fm2, aes(sample = .ls.resid)) +
  geom_qq() + 
  geom_qq_line() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles")

A better alternative if found in the qqplotr package

library(qqplotr)
#> 
#> Attaching package: 'qqplotr'
#> The following objects are masked from 'package:ggplot2':
#> 
#>     StatQqLine, stat_qq_line
ggplot(resid_fm2, aes(sample = .ls.resid)) +
  stat_qq_band() +
  stat_qq_line() + 
  stat_qq_point() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles")

EB residuals are now called .ranef. residuals

# Model fm3, page 11
fm3 <- lmer(normexam ~ standLRT + I(standLRT^2) + I(standLRT^3) + sex +
              (standLRT | school), Exam, REML = FALSE)

## Extract level-2 EB residuals
resid_fm3 <- hlm_resid(fm3, level = "school")
#> Warning in LSresids.lmerMod(object, level = level, stand = standardize): The model matrix is likely rank deficient. Some LS residuals cannot be calculated.
#> It is recommended to use EB (.ranef) group level residuals for this model.
resid_fm3
#> # A tibble: 65 x 5
#>    school .ranef.intercept .ranef.stand_lrt .ls.intercept .ls.stand_lrt
#>    <chr>             <dbl>            <dbl>         <dbl>         <dbl>
#>  1 1                0.404            0.127          0.533       0.348  
#>  2 2                0.401            0.159         NA          NA      
#>  3 3                0.495            0.0780         0.523       0.00905
#>  4 4                0.0597           0.120          0.265       0.186  
#>  5 5                0.251            0.0711         0.186       0.0503 
#>  6 6                0.448            0.0482        NA          NA      
#>  7 7                0.297           -0.151         NA          NA      
#>  8 8               -0.0788           0.0131        NA          NA      
#>  9 9               -0.157           -0.0660        -0.282      -0.251  
#> 10 10              -0.282           -0.137         -0.172      -0.451  
#> # … with 55 more rows
## Construct school-level data set
library(dplyr)
SchoolExam <- Exam %>%
  group_by(school) %>%
  dplyr::summarize(
    size = length(school),
    schgend = unique(schgend),
    schavg = unique(schavg),
    type = unique(type), 
    schLRT = mean(standLRT)
  )

SchoolExam <- SchoolExam %>% left_join(resid_fm3, by = "school")
SchoolExam
#> # A tibble: 65 x 10
#>    school  size schgend  schavg type   schLRT .ranef.intercept .ranef.stand_lrt
#>    <chr>  <int> <fct>     <dbl> <fct>   <dbl>            <dbl>            <dbl>
#>  1 1         73 mixed    0.166  Mxd    0.166            0.404            0.127 
#>  2 2         55 girls    0.395  Sngl   0.395            0.401            0.159 
#>  3 3         52 mixed    0.514  Mxd    0.514            0.495            0.0780
#>  4 4         79 mixed    0.0918 Mxd    0.0918           0.0597           0.120 
#>  5 5         35 mixed    0.211  Mxd    0.211            0.251            0.0711
#>  6 6         80 girls    0.638  Sngl   0.638            0.448            0.0482
#>  7 7         88 girls   -0.0290 Sngl  -0.0290           0.297           -0.151 
#>  8 8        102 girls   -0.0405 Sngl  -0.0405          -0.0788           0.0131
#>  9 9         34 mixed   -0.494  Mxd   -0.494           -0.157           -0.0660
#> 10 10        50 mixed    0.189  Mxd    0.189           -0.282           -0.137 
#> # … with 55 more rows, and 2 more variables: .ls.intercept <dbl>,
#> #   .ls.stand_lrt <dbl>

## Left panel -- figure 5
ggplot(
  SchoolExam, 
  aes(
    x = reorder(schgend, .ranef.intercept, median), 
    y = .ranef.intercept)
  ) +
  geom_boxplot() +
  labs(x = "school gender", y = "level-2 residual (Intercept)")
  
## Right panel -- figure 5
ggplot(SchoolExam, aes(x = schavg, y = .ranef.intercept)) +
  geom_point() +
  geom_smooth() +
  labs(x = "average intake score", y = "level-2 residual (Intercept)")
## Model fm4, page 12
fm4 <- lmer(normexam ~ standLRT + I(standLRT^2) + I(standLRT^3) +
              sex + schgend + schavg + (standLRT | school), 
            data = Exam, REML = FALSE)

resid_fm4 <- hlm_resid(fm4, level = "school", include.ls = FALSE)

## Figure 6, page 13
ggplot(resid_fm4, aes(sample = .ranef.intercept)) +
  stat_qq_band() +
  stat_qq_line() + 
  stat_qq_point() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles")

ggplot(resid_fm4, aes(sample = .ranef.stand_lrt)) +
  stat_qq_band() +
  stat_qq_line() + 
  stat_qq_point() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles")

Influence analysis

We can now use hlm_influence to pull off all of the case-deletion diagnostics for the fixed effects at the specified level:

# Calculating influence diagnostics for model fm4
influence_fm4 <- hlm_influence(fm4, level = "school")
influence_fm4
#> # A tibble: 65 x 6
#>    school  cooksd mdffits covtrace covratio leverage.overall
#>    <fct>    <dbl>   <dbl>    <dbl>    <dbl>            <dbl>
#>  1 1      0.0216  0.0210    0.141      1.15           0.0217
#>  2 2      0.0144  0.0141    0.137      1.14           0.0267
#>  3 3      0.0384  0.0361    0.140      1.15           0.0257
#>  4 4      0.0165  0.0160    0.133      1.14           0.0201
#>  5 5      0.00889 0.00868   0.0677     1.07           0.0313
#>  6 6      0.0190  0.0171    0.166      1.17           0.0179
#>  7 7      0.0413  0.0394    0.111      1.12           0.0174
#>  8 8      0.00712 0.00668   0.205      1.22           0.0172
#>  9 9      0.00418 0.00408   0.140      1.15           0.0400
#> 10 10     0.00956 0.00935   0.0819     1.08           0.0249
#> # … with 55 more rows

dotplot_diag(influence_fm4$cooksd, cutoff = "internal", name = "cooks.distance")
dotplot_diag(influence_fm4$cooksd, cutoff = "internal", name = "cooks.distance", modify = "dotplot")
#> Warning: Removed 4 rows containing missing values (geom_text_repel).
beta_cdd <- cooks.distance(fm4, level = "school", include.attr = TRUE)
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.
beta_cdd[25,]
#> # A tibble: 1 x 9
#>   cooksd  beta_1   beta_2   beta_3  beta_4   beta_5  beta_6  beta_7 beta_8
#>    <dbl>   <dbl>    <dbl>    <dbl>   <dbl>    <dbl>   <dbl>   <dbl>  <dbl>
#> 1 0.0863 0.00390 -0.00987 -0.00333 0.00321 0.000980 0.00232 -0.0168 0.0221

Diagnostics for variance components

To calculate the relative variance change, we use hlm_influence(), but approx = FALSE must be specified:

hlm_influence(fm4, approx = FALSE)