---
title: "Bayesian Psychometrics for Diagnostic Assessments: A Proof of Concept"
shorttitle: "Bayesian Modeling for DCMs"
subtitle: "Research Report #19-01"
program: "DLM"
date: "November 2019"
knit: "bookdown::render_book"
site: bookdown::bookdown_site
output: ratlas::techreport_pdf
bibliography: ["bib/refs.bib", "bib/packages.bib"]
biblio-style: apa
biblatexoptions:
- sortcites
csl: csl/apa.csl
link-citations: yes
lot: true
lof: true
subparagraph: yes
mainfont: Palatino LT Std
fontsize: 11pt
acknowledgements: >
  `r if (knitr::is_latex_output()) ratlas::inc("front-matter/preface.Rmd")`
---
```{r setup, include=FALSE}
needed_packages <- c("ratlas", "knitr", "english", "kableExtra", "tidyverse",
"rstan", "loo", "tidybayes",
"here", "glue", "fs")
load_packages <- function(x) {
if (!(x %in% installed.packages())) {
install.packages(x, repos = "https://cran.rstudio.com/")
}
suppressPackageStartupMessages(require(x, character.only = TRUE))
}
vapply(needed_packages, load_packages, logical(1))
extrafont::loadfonts(quiet = TRUE)
set_theme(font = "Palatino")
options(knitr.kable.NA = "")
options(knitr.table.format = "latex")
knitr::opts_chunk$set(cache = TRUE)
if (!dir_exists(here("data", "estimated-models"))) {
dir_create(here("data", "estimated-models"))
}
```
```{r functions, include = FALSE}
logit <- function(x) {
log(x / (1 - x))
}
inv_logit <- function(x) {
exp(x) / (1 + exp(x))
}
trunc_sample <- function(func, n, lb = -Inf, ub = Inf, ...) {
full_sample <- func(n = n, ...)
trunc_sample <- full_sample[between(full_sample, lb, ub)]
while(length(trunc_sample) < n) {
full_sample <- func(n = n, ...)
trunc_sample <- c(trunc_sample, full_sample[between(full_sample, lb, ub)])
}
sample(trunc_sample, size = n, replace = FALSE)
}
rep_data <- function(model, obs) {
draws <- model %>%
spread_draws(nu[c], pi[i,c]) %>%
ungroup()
tidy_draws <- full_join(
draws %>%
select(.chain:.draw, nu, c) %>%
distinct() %>%
spread(key = c, value = nu) %>%
rename(nm_nu = `1`, ms_nu = `2`),
draws %>%
select(.chain:.draw, pi, i, c) %>%
distinct() %>%
spread(key = c, value = pi) %>%
rename(nm_pi = `1`, ms_pi = `2`),
by = c(".chain", ".iteration", ".draw")
)
replicated_data <- tidy_draws %>%
group_nest(.chain, .iteration, .draw, .key = "params") %>%
mutate(data_rep = map(params, function(x, obs) {
master_probs <- obs %>%
left_join(x, by = c("item_id" = "i")) %>%
mutate(log_nm = (score * log(nm_pi)) + ((1 - score) * log(1 - nm_pi)),
log_ms = (score * log(ms_pi)) + ((1 - score) * log(1 - ms_pi))) %>%
group_by(stu_id, nm_nu, ms_nu) %>%
summarize(log_nm = sum(log_nm), log_ms = sum(log_ms)) %>%
ungroup() %>%
mutate(prob_nm = nm_nu * exp(log_nm),
prob_ms = ms_nu * exp(log_ms),
master_prob = prob_ms / (prob_nm + prob_ms)) %>%
select(stu_id, master_prob)
rand_strc_mastery <- distinct(obs, stu_id) %>%
mutate(ms_nu = unique(x$ms_nu),
rand = runif(n(), min = 0, max = 1),
master = rand <= ms_nu) %>%
select(stu_id, master)
ppmc_data <- obs %>%
select(-score) %>%
left_join(master_probs, by = "stu_id") %>%
left_join(select(x, i, ms_nu, nm_pi, ms_pi),
by = c("item_id" = "i")) %>%
# Mastery status for PPMC
## Option 1: Random from structural parameter
left_join(rand_strc_mastery, by = "stu_id") %>%
## Option 2: Determined by individual mastery probability
## mutate(master = master_prob > 0.5) %>%
# Correct score calculation
## Option 1:
mutate(prob_correct = case_when(master ~ ms_pi,
TRUE ~ nm_pi)) %>%
## Option 2: Mixture model (does not currently work with Option 1 above)
## mutate(prob_correct = (master_prob * ms_pi) +
## ((1 - master_prob) * nm_pi)) %>%
mutate(rand = runif(n = nrow(.), min = 0, max = 1),
score = case_when(rand <= prob_correct ~ 1L,
TRUE ~ 0L)) %>%
select(stu_id, master, item_id, score)
pvals <- list(
ppmc_data %>%
group_by(item_id) %>%
summarize(ppmc_pval = mean(score)) %>%
ungroup(),
obs %>%
group_by(item_id) %>%
summarize(obs_pval = mean(score)) %>%
ungroup(),
ppmc_data %>%
mutate(master = case_when(master ~ "ms", TRUE ~ "nm")) %>%
group_by(item_id, master) %>%
summarize(pval = mean(score)) %>%
pivot_wider(names_from = master, values_from = pval) %>%
rename_at(vars(ms, nm), ~paste0(., "_ppmc_pval")) %>%
ungroup(),
obs %>%
left_join(master_probs, by = "stu_id") %>%
mutate(master = case_when(master_prob > 0.5 ~ "ms", TRUE ~ "nm")) %>%
select(-master_prob) %>%
group_by(item_id, master) %>%
summarize(pval = mean(score)) %>%
pivot_wider(names_from = master, values_from = pval) %>%
rename_at(vars(ms, nm), ~paste0(., "_obs_pval")) %>%
ungroup()
) %>%
reduce(full_join, by = "item_id")
ppmc_data <- select(ppmc_data, -master)
ret_df <- tibble(
mastery_probs = list(master_probs),
ppmc_data = list(ppmc_data),
pvals = list(pvals)
)
return(ret_df)
}, obs = obs)) %>%
unnest(cols = c(data_rep))
return(replicated_data)
}
```
```{r ggplot2-extras, include = FALSE}
StatBin2 <- ggproto(
"StatBin2",
StatBin,
compute_group = function (data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, origin = NULL, right = NULL,
drop = NULL, width = NULL) {
if (!is.null(breaks)) {
if (!scales$x$is_discrete()) {
breaks <- scales$x$transform(breaks)
}
bins <- ggplot2:::bin_breaks(breaks, closed)
}
else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data$x)
}
bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth,
center = center, boundary = boundary,
closed = closed)
}
else {
bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins,
center = center, boundary = boundary,
closed = closed)
}
res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)
# drop 0-count bins completely before returning the dataframe
res <- res[res$x <= max(res[res$count > 0, "x"]) & res$x >= min(res[res$count > 0, "x"]), ]
res
})
```
# Executive Summary {-}
Diagnostic assessments measure the knowledge, skills, and understandings of students at a smaller and more actionable grain size than traditional scale-score assessments. Results of diagnostic assessments are reported as a mastery profile, indicating which knowledge, skills, and understandings the student has mastered and which ones may need more instruction. These mastery decisions are based on probabilities of mastery derived from diagnostic classification models (DCMs).
This report outlines a Bayesian framework for the estimation and evaluation of DCMs. Specifically, this report describes the following:
* a model definition that allows for various parameter equality constraints within a consistent conceptual framework
* the role of prior distributions in the model building process
* an estimation process utilizing the popular *Stan* programming language
* the assessment of estimation diagnostics, such as the $\widehat{R}$ statistic and effective sample size
* the evaluation of model fit using posterior predictive model checks
* model comparison using cross-validation approximations and model averaging
Findings illustrate the utility of the Bayesian framework for estimating and evaluating DCMs in applied settings. Specifically, the findings demonstrate how a variety of DCMs can be defined within the same conceptual framework. Additionally, using this framework, the evaluation of model fit is more straightforward, and results can be interpreted with intuitive graphics. Throughout, recommendations are made for specific implementation decisions for the estimation process and the assessment of model fit.
# Implications for the Field {-}
DCMs offer many benefits over traditional scale-score reporting methods. For example, DCMs can provide more actionable results through a fine-grained mastery profile [@bradshaw_sr; @clark_sr] and more reliable scores with a shorter test length [@tb_reli; @wang_reli]. However, despite a growing field of literature describing the benefits of DCM-based assessments, these models have not seen widespread use in applied or operational settings [@sessoms_2018]. One reason put forward for this gap between the theory and practice of DCMs is a lack of clarity in the applied research community about how these models should be estimated and evaluated [@ravand_2015; @ravand_2019; @rupp_2018]. This report attempts to bridge the gap between theory and practice by describing a Bayesian framework for estimating DCMs using the *Stan* programming language and evaluating model fit using posterior predictive model checks.
This framework, which is used in an applied setting for the Dynamic Learning Maps^®^ (DLM^®^) alternate assessment, provides a flexible method for defining different types of DCMs. Additionally, the model estimation process and model fit measures apply across these variations in model definition. That is, the same estimation and evaluation procedures can be applied to a wide range of DCMs. Thus, this report provides a practical guide for applied researchers seeking to integrate DCMs into their own work.
\newpage
# Purpose of the Report
Diagnostic classification models (DCMs) are able to provide fine-grained and actionable scores for a set of assessed skills or attributes [@rupp_dcm; @bradshaw_dcm]. However, because this class of models is relatively new to operational use, many psychometric properties require further investigation to support the use of the assessments. One key feature that is not well defined in the literature is how best to assess the model fit of DCMs [@chen_2013; @hu_2016; @rupp_dcm]. Most evaluations of model fit rely solely on measures of relative fit [@sen_2017], which are limited in that these indices are unable to evaluate the fit of the model to the data; rather, they can only make judgments relative to alternative comparison models. The other widely used method for evaluating model fit is limited-information fit indices [e.g., @liu_2016]. In general, these methods consist of univariate, bivariate, and trivariate item-level tests that rely on $\chi^2$ statistics known to be asymptotically incorrect [@maydeu_2006]. The $M_2$ statistic developed by @maydeu_2005 can correct for the distributional assumptions, but it is still based only on limited information (i.e., limited sets of items) and therefore may fail to capture higher-order characteristics of the data.
Due to these concerns, this document investigates a Bayesian framework for the estimation of this class of models. This approach allows model fit to be evaluated with alternative methods, namely posterior predictive model checking.
# Defining the Bayesian Model {#model-def}
The general form of DCMs is shown in equation \@ref(eq:dcm), which models the probability of respondent $j$ providing a given response pattern.
\begin{equation}
P(\text{X}_j = \text{x}_j) = \sum_{c=1}^C\nu_c\prod_{i=1}^{I}\pi_{ic}^{x_{ij}}(1 - \pi_{ic})^{1-x_{ij}}
(\#eq:dcm)
\end{equation}
In equation \@ref(eq:dcm), $\pi_{ic}$ is the probability of a respondent in class $c$ providing a correct response to item $i$, and $x_{ij}$ is the observed response (i.e., 0, 1) of respondent $j$ to item $i$. Thus, $\pi_{ic}^{x_{ij}}(1 - \pi_{ic})^{1-x_{ij}}$ represents the probability of a respondent in class $c$ providing the observed response to item $i$. These probabilities are then multiplied across all items, giving the probability of a respondent in class $c$ providing the observed response pattern. Finally, this probability is multiplied by $\nu_c$, which is the base rate probability that any given respondent belongs to class $c$. Thus, this product represents the probability that a given respondent is in class $c$ and provides the observed response pattern.
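To make equation \@ref(eq:dcm) concrete, the short sketch below computes the marginal probability of one hypothetical response pattern. The values of `pi_ic`, `nu`, and `x` are illustrative only and are not drawn from the models estimated later in this report.
```{r dcm-lik-sketch, eval = FALSE}
# Hypothetical values: three items, two classes (non-master, master)
pi_ic <- matrix(c(0.20, 0.85,   # item 1: P(correct | non-master), P(correct | master)
                  0.15, 0.80,   # item 2
                  0.25, 0.90),  # item 3
                ncol = 2, byrow = TRUE)
nu <- c(0.4, 0.6)               # base rate of membership in each class
x <- c(1, 0, 1)                 # observed responses for respondent j

# Probability of the observed pattern within each class, then marginalize over classes
class_lik <- apply(pi_ic, 2, function(p) prod(p^x * (1 - p)^(1 - x)))
sum(nu * class_lik)
```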
Although DCMs can be estimated with multiple attributes that have more than two latent categories [@bradshaw_dcm], for illustrative purposes, this paper limits the discussion to single-attribute DCMs with a binary latent trait. Thus, for each model, there are two potential mastery profiles for each respondent (i.e., master and non-master). Note, however, that the methods presented in this paper do generalize to models with multiple attributes and nonbinary latent categories.
Different types of DCMs are distinguished by how $\pi_{ic}$ is defined. For example, the log-linear cognitive diagnosis model [LCDM; @lcdm] defines $\pi_{ic}$ in a manner similar to a generalized linear model with a logit link function. Specifically, $\pi_{ic}$ is defined as seen in equation \@ref(eq:meas-lcdm), where $\alpha_c$ is a binary indicator of the mastery status for a respondent in class $c$.
\begin{equation}
\pi_{ic} = P(\text{X}_{ic}=1\ |\ \alpha_c) = \frac{\exp(\lambda_{i,0} + \lambda_{i,1,1}\alpha_c)}{1 + \exp(\lambda_{i,0} + \lambda_{i,1,1}\alpha_c)}
(\#eq:meas-lcdm)
\end{equation}
In the notation introduced by @rupp_dcm, the $\lambda$ subscripts follow the order of item, effect, then attribute. That is, the first subscript identifies the item for the parameter (noted as $i$). The second subscript denotes the type of effect. Because this discussion is limited to single-attribute models, there are only two types of effects: zero identifies an intercept, and one identifies a main effect. In models with multiple attributes, there may be additional effects for two-, three-, or *A*-way interactions. Finally, the last element of the subscript identifies the attribute or attributes. Again, as these are single-attribute models, this element is either nonexistent (for intercept terms, where no attribute is involved) or 1 (for all other effects). It is included here only for consistency with the notation in @rupp_dcm.
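As a brief illustration of equation \@ref(eq:meas-lcdm), the sketch below converts a hypothetical intercept and main effect to response probabilities for non-masters and masters using the `inv_logit()` helper defined above; the parameter values are assumed for illustration only.
```{r lcdm-prob-sketch, eval = FALSE}
lambda_i0  <- -1.5   # log-odds of a correct response for non-masters (alpha_c = 0)
lambda_i11 <-  3.0   # increase in log-odds for masters (alpha_c = 1)

inv_logit(lambda_i0)               # P(X = 1 | non-master), approximately 0.18
inv_logit(lambda_i0 + lambda_i11)  # P(X = 1 | master), approximately 0.82
```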
For additional flexibility, equation \@ref(eq:meas-lcdm) can be modified slightly in order to include both attribute- and item-level effects, similar to multilevel models.
\begin{equation}
\pi_{ic} = P(\text{X}_{ic}=1\ |\ \alpha_c) = \frac{\exp[\lambda_{0} + b_{i,0} + (\lambda_{1,1} + b_{i,1,1})\alpha_c]}{1 + \exp[\lambda_{0} + b_{i,0} + (\lambda_{1,1} + b_{i,1,1})\alpha_c]}
(\#eq:dlm-lcdm)
\end{equation}
Equation \@ref(eq:dlm-lcdm) shows the similarity to multilevel models. In this model, $\lambda_0$ and $\lambda_{1,1}$ represent the attribute-level intercept and main effect, respectively. These are akin to the average intercept and main effect for all items (the fixed effects in the multilevel model literature). In addition to the attribute-level parameters, there are also item-level intercepts ($b_{i,0}$) and main effects ($b_{i,1,1}$). These parameters represent the deviation from the attribute-level effect for each item. Thus, the full intercept for item one would be calculated as $\lambda_0 + b_{1,0}$. This is similar to the estimation of random intercepts and slopes for each item [@stroup_glmm]. The difference between the proposed model and multilevel models is the treatment of the variance of these item-level parameters. In multilevel models, the variance of these effects would be estimated. However, the variance of the item-level parameters can also be fixed to pre-specified values.
If the item-level parameters are constrained to be zero, then all items will have parameters equal to the attribute-level parameter (i.e., all of the $b_{i,0}$ and $b_{i,1,1}$ parameters would be zero). This is mathematically equivalent to what is referred to here as the *fungible* model. Alternatively, the item-level parameters can be allowed to vary freely with no constraints (i.e., a *non-fungible* model). Conceptually, these two models can be thought of as using a zero-variance prior (i.e., $\mathcal{N}(0,\ 0)$) or infinite-variance or flat prior (e.g., $\mathcal{N}(0, \infty)$), respectively. Finally, a non-flat prior can be placed on the item-level parameters, such that the parameters are not constrained to be zero but also not allowed to vary completely freely either.
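The sketch below, using assumed attribute-level effects and item-level deviations, shows how equation \@ref(eq:dlm-lcdm) builds item probabilities from the two levels and how fixing the deviations at zero reduces to the fungible model.
```{r item-dev-sketch, eval = FALSE}
lambda_0  <- -1.5                 # attribute-level intercept
lambda_11 <-  3.0                 # attribute-level main effect
b_i0      <- c(-0.4, 0.0,  0.3)   # item-level intercept deviations (three items)
b_i11     <- c( 0.2, 0.0, -0.5)   # item-level main effect deviations

# Item-specific probabilities for non-masters and masters
inv_logit(lambda_0 + b_i0)
inv_logit((lambda_0 + b_i0) + (lambda_11 + b_i11))

# Fungible model: all deviations fixed at zero, so every item shares the
# attribute-level probabilities
inv_logit(lambda_0)
inv_logit(lambda_0 + lambda_11)
```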
## Prior Specification for Attribute-Level Effects {#attr-priors}
In equation \@ref(eq:dlm-lcdm), there are two attribute-level effects that require prior specifications. The first attribute-level effect is $\lambda_{0}$, which represents the average intercept across all items. Thus, this parameter also represents the log-odds (due to the logit link function) of a non-master providing a correct response to an average item. For this parameter, a $\mathcal{N}(\mu = 0,\ \sigma=2)$ distribution was used as the prior. This prior distribution was chosen because 99% of the distribution encompasses the plausible values for this parameter. Specifically, the middle 99% of the distribution consists of the log-odds range -5.15 to 5.15, which covers nearly all of the probability scale when other parameters are equal to zero, as seen in Figure \@ref(fig:log-odds).
```{r log-odds, fig.cap = "Log-odds to probability conversion."}
tibble(x = seq(-5, 5, by = 0.01)) %>%
mutate(y = 1 / (1 + exp(-x))) %>%
ggplot(aes(x = x, y = y)) +
geom_line() +
scale_x_continuous(breaks = seq(-8, 8, by = 2)) +
labs(x = "Log-odds", y = "Probability") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
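The coverage claim above can be checked directly; the quantile calculation below is a small verification sketch rather than part of the estimation code.
```{r intercept-prior-check, eval = FALSE}
# Middle 99% of the N(0, 2) prior for lambda_0 ...
bounds <- qnorm(c(0.005, 0.995), mean = 0, sd = 2)
bounds             # approximately -5.15 and 5.15
inv_logit(bounds)  # ... which covers nearly the full probability scale
```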
The main effect parameters in the LCDM are constrained to be positive, thus ensuring monotonicity in the model [e.g., masters always have a higher probability of providing a correct response; @lcdm]. Thus, the attribute-level main effect, $\lambda_{1,1}$, uses a lognormal prior: $\text{Lognormal}(\mu = 0, \sigma = 1)$. Similar to the attribute-level intercept, this distribution was chosen because 99% of the distribution covers the range of plausible values. Specifically, the lower 99% of this distribution covers the log-odds range of 0 to 10.24. An upper limit of approximately 10 was desired, as a main effect of 10 would allow for an estimated probability of providing a correct response near 1.0 in the extreme case where the intercept was -5 (the lower tail of the attribute-level intercept prior distribution).
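Similarly, the stated upper limit for the attribute-level main effect prior can be verified with a one-line quantile check (again, a sketch rather than estimation code).
```{r maineffect-prior-check, eval = FALSE}
# 99th percentile of the Lognormal(0, 1) prior for lambda_{1,1}
qlnorm(0.99, meanlog = 0, sdlog = 1)   # approximately 10.24
```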
The distributions for these parameters are visualized in Figure \@ref(fig:attr-prior-dist).
```{r attr-prior-dist, fig.cap = "Prior distributions for attribute-level effects."}
bind_rows(
tibble(.variable = "lambda[0]", x = seq(-10, 10, by = 0.01)) %>%
mutate(y = dnorm(x, mean = 0, sd = 2)),
tibble(.variable = "lambda[list(1,1)]", x = seq(0, 10, by = 0.01)) %>%
mutate(y = dlnorm(x, meanlog = 0, sdlog = 1))
) %>%
ggplot(aes(x = x, y = y)) +
facet_wrap(~ .variable, nrow = 1, scales = "free", labeller = label_parsed) +
geom_line() +
labs(x = "Parameter Value", y = "Density") +
theme(strip.text = element_text(size = 20)) -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
## Prior Specification for Item-Level Effects {#item-priors}
The prior distributions for the item-level effects, $b_{i,0}$ and $b_{i,1,1}$, are determined by the type of model that is being estimated. For this proof of concept, three models are considered: fungible, non-fungible, and partial equivalency.
In the fungible model, it is assumed that all items measuring the attribute have the same item parameters. That is, the item-level effects are equivalent to the attribute-level effect. Thus, the item-level deviations from the attribute-level effects are all equal to 0. Conceptually, this means using a $\mathcal{N}(\mu=0,\ \sigma=0)$ prior for all $b_{i,0}$ and $b_{i,1,1}$ terms. In practice, to increase computational efficiency, these terms are left out of the model, and only the attribute-level effects are estimated.
In contrast, the non-fungible model assumes that the item parameters are independent of one another. In other words, the parameters for one item do not dictate the parameters of other items. Conceptually, this means that the item-level deviations from the attribute-level effects are unconstrained, and thus an infinite uniform prior, $\mathcal{U}(-\infty,\ +\infty)$, would be used for all $b_{i,0}$ and $b_{i,1,1}$ terms. In practice, it is more efficient to directly estimate individual parameters for each item rather than attribute-level effects with unconstrained item-level deviations. Therefore, this model more closely resembles a true LCDM in equation \@ref(eq:meas-lcdm), with the $\lambda_{i,0}$ and $\lambda_{i,1,1}$ parameters using the prior distributions described for the [attribute-level priors](#attr-priors).
The partial equivalency model represents a compromise between the fungible and non-fungible models. In this model, item-level parameters are not entirely independent but are also not constrained to be equivalent. Instead, the item-level parameters are assumed to come from some distribution of deviations. The smaller the variance of the distribution, the more fungible the items are. Conversely, a large variance would correspond to less fungibility. Conceptually and in practice, this model is similar to multilevel models. The item-level deviations use a hierarchical normal prior, $\mathcal{N}(\mu=0,\ \sigma)$, where $\sigma$ is an estimated parameter in the model. The $\sigma$ parameter uses a half-Student's *t*-distribution with $df = 3$ (Figure \@ref(fig:sigma-prior)). This prior ensures that the variance is always positive and also allows for larger variances than a normal distribution would. However, the variances are also constrained to reasonable values (i.e., less than approximately 5).
```{r sigma-prior, fig.cap = "Prior distribution for hierarchical variance prior."}
ggplot(data = tibble(x = c(0, 6)), aes(x = x)) +
stat_function(fun = dt, n = 500, args = list(df = 3)) +
geom_segment(x = 0, y = 0, xend = 0, yend = dt(0, df = 3)) +
labs(x = "Parameter Value", y = "Density") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
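To make the hierarchical structure of the partial equivalency prior concrete, the sketch below first draws a standard deviation from a half-Student's *t* distribution with three degrees of freedom and then draws item-level deviations from a normal distribution with that standard deviation. The number of items and the seed are arbitrary choices for illustration.
```{r hier-prior-sketch, eval = FALSE}
set.seed(1234)
sigma_b0 <- abs(rt(1, df = 3))                  # half-t(3) draw for the hierarchical SD
b_i0     <- rnorm(10, mean = 0, sd = sigma_b0)  # item-level intercept deviations

# Smaller values of sigma_b0 pull the deviations toward zero (more fungible items);
# larger values allow the items to differ more (less fungible).
sigma_b0
b_i0
```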
## Prior Specification for Class-Level Parameters {#strc-priors}
The last parameter that requires a prior is the structural parameter in equation \@ref(eq:dcm), $\nu_c$. This parameter defines the base rate of inclusion for each class. As such, $\nu$ is constrained so that all elements sum to one (i.e., there are no non-class respondents). Because this discussion is limited to models with a single binary attribute, there are only two classes and therefore two elements of $\nu$. No assumptions are made about the base rate of mastery for attributes; therefore, a uniform Dirichlet prior, $\text{Dir}(1)$, was used for the prior distribution. As there are only two classes, this is equivalent to using a uniform Beta distribution, $\text{Beta}(\alpha=1,\ \beta=1)$, for $\nu_1$ and then calculating $\nu_2$ as $\nu_2 = 1 - \nu_1$.
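Because there are only two classes, the uniform Dirichlet prior can be written in terms of a uniform Beta distribution, as in the short illustrative sketch below.
```{r strc-prior-sketch, eval = FALSE}
# Draws from a Dir(1) prior via the equivalent Beta(1, 1) representation
nu_1 <- rbeta(5, shape1 = 1, shape2 = 1)
cbind(nu_1, nu_2 = 1 - nu_1)   # each row sums to one
```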
# The Bayesian Framework in Practice
In order to demonstrate the utility and benefits of using the Bayesian model definition and estimation process, a single simulated data set was generated. This data set was then used to walk through each step of the Bayesian model fit process, from model estimation to model evaluations and comparisons. All analyses were performed in *R* version `r getRversion()` [@R-base].
## Measures
To demonstrate the Bayesian framework in practice, the Dynamic Learning Maps^®^ (DLM^®^) Alternate Assessment System is used as an example of a diagnostic assessment where this framework is applicable. DLM assessments in English language arts (ELA), mathematics, and science are administered in 19 states to students with the most significant cognitive disabilities. As an example, the through-course assessment model, which features instructionally embedded assessments during the year, is used as a template.
In the instructionally embedded model, students cover the entire testing blueprint during each of two testing windows. The first testing window occurs during the fall, from September through December. The second window is open during the spring from February through May. During each window, students take one or more testlets, each consisting of three to nine items, for each alternate content standard (called an Essential Element [EE]) required for blueprint coverage. To ensure that each EE is accessible to all students, each EE is associated with multiple skills that represent the EE at varying levels of depth, breadth, and complexity (called linkage levels). There are five linkage levels for each EE in ELA and mathematics and three linkage levels for each EE in science. Due to the intended flexibility of the instructionally embedded testing model, students may or may not test on the same EE and linkage level multiple times within a testing window or across testing windows. Thus, the number of responses that can be used to estimate student mastery of a linkage level varies by student. For more details on the assignment of testlets, see Chapter 4 of @dlm_tech_1415_im.
For modeling and scoring the DLM assessments, the linkage level is the unit of analysis. That is, a latent class analysis [LCA; @bartholomew_lca] with two classes is estimated for each linkage level [see Chapter 5 of @dlm_tech_1516_im]. The latent class model currently employed for operational use represents an unconstrained version of the models defined in Section \@ref(model-def) [@lcdm; @rupp_dcm]. Specifically, whereas the main effects of equation \@ref(eq:dlm-lcdm) are constrained to be positive to ensure monotonicity in the model (Section \@ref(attr-priors)), the operational latent class model places no such constraint. When using the unconstrained latent class model, post hoc analysis is therefore needed to ensure the mastery classes are properly defined (i.e., the labels of master and non-master are applied to the correct classes).
Regardless of the choice between the LCA or DCM for estimation, the resulting score is the probability that the student has mastered the linkage level. This probability is often dichotomized into a mastery categorization [@bradshaw_2019]. For example, the DLM assessments use a mastery threshold of 0.8 [see Chapter 5 of @dlm_tech_1516_im]. That is, students with a mastery probability of 0.8 or higher are classified as masters, and students with a mastery probability of less than 0.8 are classified as non-masters. Thus, the scores used for reporting are a profile of mastery classification decisions for each linkage level. For further details on the scoring model for DLM assessments, see Chapter 5 of @dlm_tech_1516_im.
## Simulated Data
```{r example-data, include = FALSE, cache = TRUE}
set.seed(9416)
num_stu <- 1700
att_mastery <- 0.6
rt_pct <- 0.1
# simulate items
items <- tibble(testlet_id = 101:104,
window = rep(c("IE", "SP"), each = 2)) %>%
mutate(num_item = sample(3:5, n(), replace = TRUE)) %>%
uncount(weights = num_item, .id = "testlet_item_id") %>%
rowid_to_column(var = "item_id") %>%
mutate(attr_intercept = runif(1, -2.25, -1.00),
attr_maineffect = runif(1, 1.00, 4.50),
int_mean = case_when(testlet_id == 103L ~ 0.5, TRUE ~ -0.5),
mef_mean = case_when(testlet_id == 103L ~ -0.5, TRUE ~ 0.5),
item_intercept = map_dbl(int_mean, ~rnorm(1, mean = .x, sd = 0.85)),
item_maineffect = map2_dbl(mef_mean, attr_maineffect,
~trunc_sample(rnorm, n = 1, lb = -1 * .y,
mean = .x, sd = 0.85)),
intercept = attr_intercept + item_intercept,
maineffect = attr_maineffect + item_maineffect,
nm_prob = map_dbl(intercept, inv_logit),
ms_prob = map_dbl(intercept + maineffect, inv_logit)) %>%
select(-int_mean, -mef_mean) %>%
write_csv(here("data", "item-parameters.csv"))
all_response <- crossing(fall = c("101", "102", "101,102"),
sp = c("103", "104", "103,104")) %>%
mutate(testlet_id = glue("{fall},{sp}"),
testlet_id = as.character(testlet_id)) %>%
select(-fall, -sp) %>%
mutate(testlets = str_count(testlet_id, ",") + 1,
weight = case_when(testlets == 2 ~ ((1 - rt_pct)^2) * 0.25,
testlets == 3 ~ ((1 - rt_pct) * rt_pct) * 0.5,
testlets == 4 ~ (rt_pct * rt_pct))) %>%
filter(weight > 0) %>%
select(-testlets) %>%
sample_n(size = num_stu, replace = TRUE, weight = weight) %>%
select(-weight) %>%
rowid_to_column(var = "stu_id") %>%
mutate(mastery = sample(c(0L, 1L), n(), replace = TRUE,
prob = c(1 - att_mastery, att_mastery))) %>%
separate_rows(testlet_id, convert = TRUE) %>%
left_join(
items %>%
select(testlet_id, item_id, nm_prob, ms_prob),
by = "testlet_id"
) %>%
mutate(prob_correct = (mastery * ms_prob) + ((1 - mastery) * nm_prob),
rand = runif(n(), 0, 1),
score = case_when(rand <= prob_correct ~ 1L, TRUE ~ 0L)) %>%
write_rds(here("data", "all_response.rds"))
mastery <- distinct(all_response, stu_id, mastery) %>%
write_csv(here("data", "student-parameters.csv"))
# Format data for Stan
response_matrix <- all_response %>%
select(stu_id, item_id, score) %>%
arrange(stu_id, item_id)
ragged_array <- response_matrix %>%
rowid_to_column() %>%
group_by(stu_id) %>%
summarize(start = min(rowid), num = n())
stan_data = list(
I = nrow(items),
J = num_stu,
N = nrow(response_matrix),
ii = response_matrix$item_id,
jj = response_matrix$stu_id,
y = response_matrix$score,
s = ragged_array$start,
l = ragged_array$num
)
```
To illustrate the Bayesian methods for estimating and evaluating diagnostic models, a single data set was generated. Simulated data was chosen for two reasons. First, because the data is simulated, the expected results of the analysis are known. Thus, the results can be compared to the *a priori* expectations to confirm that the methods work as expected. Second, by using simulated data, it is possible to ensure that some models fit the example data and others do not. This means that when examining model fit, there will be examples of fitting and non-fitting models that can be compared. Although this is useful for illustrating the methods, it is important to remember that the data was generated to serve this purpose.
When simulating the example data set, the data was structured similarly to the DLM assessments. In this way, the structure of the simulated data matched what could reasonably be expected from an operational assessment scaled with a DCM. Specifically, items were grouped together into testlets, and testlets were assigned to either the fall or spring testing window. By assigning testlets to the testing windows, it was possible to simulate data with students testing on combinations of testlets consistent with observed data. In other words, the amount and structure of missing data (from testlets not assigned to a student) was comparable across the simulated and observed data. Additionally, following the DLM test design, all items were assumed to follow a simple Q-matrix structure, where all items measure a single attribute [@dlm_tech_1415_im]. Item parameters were simulated according to the partial equivalency model defined in equation \@ref(eq:dlm-lcdm). Thus, the partial equivalency and non-fungible models are expected to show adequate model fit, as these are the true model and a less-constrained model, respectively. Conversely, the fungible model should show poor fit, as the fungible model is more constrained than the partial equivalency model.^[The partial equivalency model was chosen in order to illustrate differences between fitting and non-fitting models and thus should not imply that this model best represents DLM data. See @dlm_tech_1516_im for more information on the operational model used for DLM assessments.]
The attribute-level intercept, $\lambda_0$, was drawn from a $\mathcal{U}(-2.25, -1.00)$ distribution, and the attribute-level main effect, $\lambda_{1,1}$, from a $\mathcal{U}(1.00, 4.50)$. The item-level deviations $b_{i,0}$ and $b_{i,1,1}$ were drawn from a $\mathcal{N}(\mu=0,\ \sigma = 1.0)$ distribution. This resulted in total item intercepts and main effects consistent with those reported for other measures that have been scaled with the LCDM [e.g., @dtmr; @hdcm; @ecpe]. The true parameter values for each testlet and item that were used to simulate the data can be seen in Table \@ref(tab:true-item-param).
```{r true-item-param}
items %>%
select(-testlet_item_id, -nm_prob, -ms_prob) %>%
mutate(window = case_when(window == "IE" ~ "Fall",
window == "SP" ~ "Spring")) %>%
mutate_if(is.double, ~sprintf("%0.2f", .)) %>%
select(window, testlet_id, item_id, everything()) %>%
kable(align = c("c", "c", "c", rep("r", 4), "c", "c"), booktabs = TRUE,
linesep = "", escape = FALSE, caption = "True Item Parameters",
col.names = c("Window", "Testlet", "Item",
"$\\pmb{\\lambda_0}$",
"$\\pmb{\\lambda_{1,1}}$", "$\\pmb{b_{i,0}}$",
"$\\pmb{b_{i,1,1}}$", "$\\pmb{\\lambda_0 + b_{i,0}}$",
"$\\pmb{\\lambda_{1,1} + b_{i,1,1}}$")) %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%
row_spec(0, bold = TRUE, align = "c") %>%
collapse_rows(columns = 1:2, latex_hline = "custom",
custom_latex_hline = 2, valign = "middle")
```
To mimic the DLM test structure, students were randomly assigned a combination of the simulated testlets. Following the test administration design for the instructionally embedded DLM testing model [for details see Chapter 4 of @dlm_tech_1415_im], students were assigned testlets from both the instructionally embedded and spring pools. During spring assessments, students were randomly assigned only one testlet. For instructionally embedded assessments, students had a `r str_extract(indefinite((1 - rt_pct) * 100), "\\w+")` `r (1 - rt_pct) * 100`% chance of taking only one testlet and `r str_extract(indefinite(rt_pct * 100), "\\w+")` `r rt_pct * 100`% chance of taking both testlets. This is consistent with the reported usage of the instructionally embedded assessment window [@ie_usage]. The resulting probabilities for each possible combination of assigned testlets can be seen in Table \@ref(tab:testlet-prob), along with the total number of students actually simulated to have that combination. In total, `r prettyNum(num_stu, big.mark = ",")` students were simulated, which is consistent with the total number of students that test on a single attribute in a given year from states participating in the instructionally embedded assessment model [see Chapter 7 of @dlm_tech_1415_im].
```{r testlet-prob}
crossing(fall = c("101", "102", "101,102"), sp = c("103", "104", "103,104")) %>%
mutate(testlet_id = glue("{fall},{sp}"),
testlet_id = as.character(testlet_id)) %>%
select(-fall, -sp) %>%
mutate(testlets = str_count(testlet_id, ",") + 1,
weight = case_when(testlets == 2 ~ ((1 - rt_pct)^2) * 0.25,
testlets == 3 ~ ((1 - rt_pct) * rt_pct) * 0.5,
testlets == 4 ~ (rt_pct * rt_pct))) %>%
filter(weight > 0) %>%
arrange(testlets, testlet_id) %>%
select(-testlets) %>%
left_join(
all_response %>%
select(stu_id, testlet_id) %>%
group_by(stu_id) %>%
summarize(testlet_id = paste(sort(unique(testlet_id)), collapse = ",")) %>%
count(testlet_id),
by = "testlet_id"
) %>%
mutate(testlet_id = str_replace_all(testlet_id, ",", ", "),
weight = sprintf("%0.3f", weight),
n = prettyNum(n, big.mark = ",")) %>%
kable(align = c("c", "c", "r"), booktabs = TRUE, linesep = "", escape = FALSE,
col.names = c("Testlet Combination", "Probability", "\\textit{n}"),
caption = "Number of Simulated Students Assigned to Each Testlet Combination") %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%
row_spec(0, bold = TRUE, align = "c")
```
## Model Estimation {#estimate}
```{r estimate-models, dependson = "example-data", cache = TRUE, include = FALSE}
chains <- 4
iter <- 2000
warmup <- 1000
set.seed(1992)
fung_init <- map(seq_len(chains), function(x, num) {
list(
mean_intercept = runif(1, -2.25, -1.00),
mean_maineffect = runif(1, 1.00, 4.50)
)
})
pteq_init <- map(seq_len(chains), function(x, num) {
list(
mean_intercept = runif(1, -2.25, -1.00),
mean_maineffect = runif(1, 1.00, 4.50),
intercept_dev = runif(num, -0.5, 0.5),
maineffect_dev = runif(num, -0.5, 0.5)
)
}, num = nrow(items))
nfng_init <- map(seq_len(chains), function(x, num) {
list(
intercept = runif(num, -2.25, -1.00),
maineffect = runif(num, 1.00, 4.50)
)
}, num = nrow(items))
if (file_exists(here("data", "estimated-models", "fung.rds"))) {
fung <- read_rds(here("data", "estimated-models", "fung.rds"))
} else {
fung <- stan(here("Stan", "lca_fungible.stan"), data = stan_data,
init = fung_init, chains = chains, iter = iter, warmup = warmup,
cores = chains, refresh = 0, seed = 924,
control = list(adapt_delta = 0.99, max_treedepth = 15))
write_rds(fung, here("data", "estimated-models", "fung.rds"), compress = "gz")
}
if (file_exists(here("data", "estimated-models", "pteq.rds"))) {
pteq <- read_rds(here("data", "estimated-models", "pteq.rds"))
} else {
pteq <- stan(here("Stan", "lca_parteqest.stan"), data = stan_data,
init = pteq_init, chains = chains, iter = iter, warmup = warmup,
cores = chains, refresh = 0, seed = 924,
control = list(adapt_delta = 0.99, max_treedepth = 15))
write_rds(pteq, here("data", "estimated-models", "pteq.rds"), compress = "gz")
}
if (file_exists(here("data", "estimated-models", "nfng.rds"))) {
nfng <- read_rds(here("data", "estimated-models", "nfng.rds"))
} else {
nfng <- stan(here("Stan", "lca_nonfungible.stan"), data = stan_data,
init = nfng_init, chains = chains, iter = iter, warmup = warmup,
cores = chains, refresh = 0, seed = 924,
control = list(adapt_delta = 0.99, max_treedepth = 15))
write_rds(nfng, here("data", "estimated-models", "nfng.rds"), compress = "gz")
}
```
```{r ppmc-samples, dependson = "estimate-models", include = FALSE}
# ppmc
if (file_exists(here("data", "estimated-models", "fung_ppmc.rds"))) {
fung_ppmc <- read_rds(here("data", "estimated-models", "fung_ppmc.rds"))
} else {
fung_ppmc <- rep_data(fung, obs = response_matrix) %>%
write_rds(here("data", "estimated-models", "fung_ppmc.rds"))
}
if (file_exists(here("data", "estimated-models", "pteq_ppmc.rds"))) {
pteq_ppmc <- read_rds(here("data", "estimated-models", "pteq_ppmc.rds"))
} else {
pteq_ppmc <- rep_data(pteq, obs = response_matrix) %>%
write_rds(here("data", "estimated-models", "pteq_ppmc.rds"))
}
if (file_exists(here("data", "estimated-models", "nfng_ppmc.rds"))) {
nfng_ppmc <- read_rds(here("data", "estimated-models", "nfng_ppmc.rds"))
} else {
nfng_ppmc <- rep_data(nfng, obs = response_matrix) %>%
write_rds(here("data", "estimated-models", "nfng_ppmc.rds"))
}
# compare
if (all(file_exists(here("data", "estimated-models",
glue("{c('fung_loo', 'fung_waic')}.rds"))))) {
fung_loo <- read_rds(here("data", "estimated-models", "fung_loo.rds"))
fung_waic <- read_rds(here("data", "estimated-models", "fung_waic.rds"))
} else {
fung_log_lik <- extract_log_lik(fung)
fung_loo <- loo(fung) %>%
write_rds(here("data", "estimated-models", "fung_loo.rds"))
fung_waic <- waic(fung_log_lik) %>%
write_rds(here("data", "estimated-models", "fung_waic.rds"))
}
if (all(file_exists(here("data", "estimated-models",
glue("{c('pteq_loo', 'pteq_waic')}.rds"))))) {
pteq_loo <- read_rds(here("data", "estimated-models", "pteq_loo.rds"))
pteq_waic <- read_rds(here("data", "estimated-models", "pteq_waic.rds"))
} else {
pteq_log_lik <- extract_log_lik(pteq)
pteq_loo <- loo(pteq) %>%
write_rds(here("data", "estimated-models", "pteq_loo.rds"))
pteq_waic <- waic(pteq_log_lik) %>%
write_rds(here("data", "estimated-models", "pteq_waic.rds"))
}
if (all(file_exists(here("data", "estimated-models",
glue("{c('nfng_loo', 'nfng_waic')}.rds"))))) {
nfng_loo <- read_rds(here("data", "estimated-models", "nfng_loo.rds"))
nfng_waic <- read_rds(here("data", "estimated-models", "nfng_waic.rds"))
} else {
nfng_log_lik <- extract_log_lik(nfng)
nfng_loo <- loo(nfng) %>%
write_rds(here("data", "estimated-models", "nfng_loo.rds"))
nfng_waic <- waic(nfng_log_lik) %>%
write_rds(here("data", "estimated-models", "nfng_waic.rds"))
}
```
The models are estimated in *R* version `r getRversion()` [@R-base] using the **rstan** package interface [@R-rstan] to *Stan* [@stan], which utilizes Markov chain Monte Carlo (MCMC) and the Hamiltonian Monte Carlo (HMC) algorithm to efficiently transition between draws of the posterior distribution [@hmc; @hmc_intro]. Specifically, *Stan* utilizes the No-U-Turn sampler [NUTS; @nuts] to dynamically choose a step size and leap trajectory for the HMC algorithm in order to ensure efficient estimation [@hmc_step]. A complete description of HMC with NUTS can be found in @nuts. For a less technical introduction to MCMC and HMC, see @sr_mcmc.
The *Stan* code for all models can be found in the [online repository for this report](https://github.com/atlas-aai/bayes-concept). The models were estimated with `r words(chains)` chains, each with `r prettyNum(iter, big.mark = ",")` iterations. The first `r prettyNum(warmup, big.mark = ",")` iterations of each chain were discarded for warm-up, leaving a total of `r prettyNum((iter - warmup) * chains, big.mark = ",")` retained iterations that made up the posterior distributions. There were also several settings specific to NUTS [@nuts] used by *Stan*. First, the adaptive threshold was set to 0.99 to avoid divergent transitions [@betancourt_diverge]. Second, the maximum tree depth, which determines how far the algorithm can go before making a U-turn [@betancourt_rstan], was set to 15. These are both more conservative than the values suggested by the @stan_user. The implications of these settings are discussed in the following sections, along with diagnostics to assess their impact.
After estimating the model but before the parameters can be analyzed and inferences can be made, the model is checked to ensure the estimation process completed in an appropriate manner. This diagnostic information is critical to MCMC estimation, as without proper estimation, no valid inferences can be made. Checks include evaluating convergence, efficiency of the sampler, and parameter recovery. Each check is described in detail below using the estimated partial equivalency model as an example, as this was the true data-generating model.
### Convergence {#converge}
A check of convergence evaluates whether the MCMC chain successfully found the high density area of the posterior distribution and stayed there. When multiple chains are estimated, this can be checked by verifying that each chain is drawing estimates from the same parameter space. For a single chain, this is checked by verifying that the parameter is sampled from roughly the same area at the beginning of the chain (after warm-up) as it is at the end of the chain. This is commonly assessed through trace plots. An example of a trace plot is shown in Figure \@ref(fig:exm-trace).
(ref:exm-trace-cap) Trace plot for the attribute-level intercept $\lambda_0$.
```{r exm-trace, dependson = "estimate-models", fig.cap = "(ref:exm-trace-cap)"}
gather_draws(pteq, mean_intercept) %>%
ggplot(aes(x = .iteration, y = .value, color = factor(.chain))) +
geom_line() +
labs(x = "Iteration", y = expression(lambda[0]), color = "Chain") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
Figure \@ref(fig:exm-trace) shows the trace plot for the attribute-level intercept, $\lambda_0$, and looks the way a trace plot is expected to look. The draws appear to be coming from a stable distribution (i.e., the plot is relatively horizontal with no large upward or downward swings), and all `r words(chains)` chains are mixing well (as evidenced by the overlap of the `r words(chains)` colors). However, visual inspection alone offers no empirical criterion for how poor a trace plot must be to conclude that convergence was not met. Additionally, when there are many parameters, it is impractical to look at each individual trace plot.
To address these shortcomings of evaluating trace plots directly, the $\widehat{R}$ statistic can be used to evaluate convergence [@rhat; @new_rhat]. The $\widehat{R}$ statistic is also known as the potential scale reduction [@bda3] and is a measure of how much variance there is between chains relative to the amount of variation within chains. @rhat_cut suggest that in order to conclude that the model has successfully converged, all $\widehat{R}$ values should be less than 1.1. These results can be summarized, as in Figure \@ref(fig:rhat), to demonstrate the $\widehat{R}$ values for the estimated parameters. In the estimated partial equivalency model, all values are below 1.1, indicating that the model converged.
(ref:rhat-cap) $\widehat{R}$ values for the estimated parameters in the partial equivalency model. Dotted line represents the suggested cutoff by @rhat_cut.
(ref:rhat-scap) $\widehat{R}$ values for the estimated parameters in the partial equivalency model.
```{r rhat, fig.cap = "(ref:rhat-cap)", fig.scap = "(ref:rhat-scap)"}
parms <- c("mean_intercept", "mean_maineffect", "intercept_dev",
"maineffect_dev", "intercept_sd", "maineffect_sd", "nu")
labls <- c(expression(lambda[0]), expression(lambda[list(1,1)]),
expression(italic(b)[list(i,0)]), expression(italic(b)[list(i,1,1)]),
expression(sigma[italic(b)[list(i,0)]]),
expression(sigma[italic(b)[list(i,1,1)]]), expression(nu[c]))
sims <- as.array(pteq)
apply(sims, MARGIN = 3, FUN = Rhat) %>%
enframe(name = "parameter", value = "Rhat") %>%
mutate(parameter = str_replace_all(parameter, "\\[.*\\]", "")) %>%
filter(parameter %in% c("nu", "mean_intercept", "mean_maineffect",
"intercept_dev", "maineffect_dev", "intercept_sd",
"maineffect_sd")) %>%
mutate(parameter = factor(parameter, levels = parms)) %>%
ggplot(aes(x = parameter, y = Rhat, color = parameter)) +
geom_jitter(size = 3, height = 0, width = 0.2, show.legend = FALSE) +
geom_hline(yintercept = 1.1, linetype = "dashed") +
scale_x_discrete(labels = labls) +
labs(x = NULL, y = expression(widehat(R))) -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
### Efficiency
A second check of the MCMC estimation is the efficiency of the sampler, which verifies that the algorithm adequately sampled the full posterior distribution. There are several ways this can be examined. The first is by examining the effective sample size. This diagnostic takes into account the autocorrelation (or anticorrelation) within chains to determine the effective number of independent draws from the posterior. If the chain is slow moving, the draws will be highly autocorrelated, and the effective sample size will be well below the total number of retained iterations [@auto_corr]. Conversely, if the chain is moving quickly, it is possible for the draws to be better than independent, or anticorrelated [@anti_corr]. In this scenario, the effective sample size is actually larger than the total number of retained iterations.
There are two types of effective sample size that can be used to evaluate the efficiency of the location and scale of the posterior distributions. The sampling efficiency of the location (e.g., mean or median) can be assessed with the bulk effective sample size. Similarly, the scale can be assessed through tail effective sample size. This can be useful for diagnosing problems with mixing due to posterior samples having different scales across chains [@new_rhat]. For both measures, the @stan_best_practice recommend an effective sample size greater than or equal to the number of chains multiplied by 100.
The effective sample size for all parameters in the model can be summarized, as in Figure \@ref(fig:eff-size). Because the model was estimated with `r words(chains)` chains, the effective sample size should be above `r prettyNum(chains * 100, big.mark = ",")`. Figure \@ref(fig:eff-size) shows that all parameters in the estimated partial equivalency model have both a bulk and tail effective sample size above this threshold.
(ref:eff-size-cap) Effective sample size for each estimated parameter. Dotted line represents the suggested cutoff by @stan_best_practice. ESS = effective sample size.
(ref:eff-size-scap) Effective sample size for each estimated parameter.
```{r eff-size, fig.cap = "(ref:eff-size-cap)", fig.scap = "(ref:eff-size-scap)"}
bulk_ess <- apply(sims, MARGIN = 3, FUN = ess_bulk)
tail_ess <- apply(sims, MARGIN = 3, FUN = ess_tail)
ess <- list(
summary(pteq)$summary %>%
as_tibble(rownames = "parameter") %>%
select(parameter, n_eff),
enframe(apply(sims, MARGIN = 3, FUN = ess_bulk), "parameter", "bulk_ess"),
enframe(apply(sims, MARGIN = 3, FUN = ess_tail), "parameter", "tail_ess")
)
reduce(ess, full_join, by = "parameter") %>%
mutate(parameter = str_replace_all(parameter, "\\[.*\\]", "")) %>%
filter(parameter %in% c("nu", "mean_intercept", "mean_maineffect",
"intercept_dev", "maineffect_dev", "intercept_sd",
"maineffect_sd")) %>%
mutate(parameter = factor(parameter, levels = parms)) %>%
gather(key = "measure", value = "value", -parameter) %>%
mutate(measure = case_when(measure == "n_eff" ~ "ESS",
measure == "bulk_ess" ~ "Bulk ESS",
measure == "tail_ess" ~ "Tail ESS")) %>%
filter(measure != "ESS") %>%
ggplot(aes(x = parameter, y = value, color = parameter)) +
facet_wrap(~ measure, nrow = 1) +
geom_jitter(size = 3, height = 0, width = 0.2, show.legend = FALSE) +
geom_hline(yintercept = chains * 100, linetype = "dashed") +
expand_limits(y = c(0, (iter - warmup) * chains)) +
scale_x_discrete(labels = labls) +
scale_y_continuous(labels = scales::comma_format()) +
labs(x = NULL, y = "Effective Sample Size") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
There are also measures of efficiency that are exclusive to NUTS [@nuts]. For example, the Bayesian fraction of missing information gives an estimate of how well the sampler adapted and explored the posterior distribution. The Bayesian fraction of missing information generally ranges from zero to one, with zero and one representing poor and excellent estimation, respectively. It is calculated for each chain overall, rather than for each individual parameter [@bfmi].
The Bayesian fraction of missing information values for this example are shown in Table \@ref(tab:efficiency) and indicate that the sampler was able to adequately explore the posterior distributions. Additionally, Table \@ref(tab:efficiency) shows the mean acceptance rate for each chain. As expected, these values are very close to the 0.99 adaptive threshold that was specified during the [model estimation](#estimate). As mentioned previously, a target acceptance rate this high is needed to prevent divergent transitions.
The concern with setting the target acceptance rate this high is that for parameters with wider posteriors, the sampler will not be able to move fast enough. When using NUTS, at each iteration the sampler looks for a place to "U-turn" in a series of possible branches. If the sampler is terminating before the maximum possible tree depth (which was specified to be 15), then the algorithm is able to find good values for the next iteration of the chain despite the small steps enforced by the high target acceptance rate. Bumping up against the maximum allowed tree depth, or going beyond it, indicates that the step size is too small [@stan_user; @stan_warn]. Because the maximum tree depth values in Table \@ref(tab:efficiency) are all below the maximum specified, and the Bayesian fraction of missing information values are all close to one, there is strong evidence that in this model, the sampler was able to adequately sample the posteriors.
```{r efficiency}
sampler_params <- get_sampler_params(pteq, inc_warmup = FALSE)
upars <- suppressMessages(stan(here("Stan", "lca_parteqest.stan"),
data = stan_data, chains = 0)) %>%
get_num_upars()
E <- as.matrix(sapply(sampler_params, FUN = function(x) x[, "energy__"]))
EBFMI <- upars / apply(E, 2, var)
mean_accept <- sapply(sampler_params, function(x) mean(x[, "accept_stat__"]))
max_treedepth <- sapply(sampler_params, function(x) max(x[, "treedepth__"]))
tibble(chain = glue("{seq_len(chains)}"),
bfmi = EBFMI,
mean_accept = mean_accept,
max_treedepth = as.integer(max_treedepth)) %>%
mutate_if(is.double, ~ sprintf("%0.3f", .)) %>%
kable(align = "c", booktabs = TRUE, linesep = "",
caption = "Diagnostic Statistics for the No-U-Turn Sampler",
col.names = c("Chain", "BFMI", "Mean Acceptance Rate",
"Max Tree Depth")) %>%
kable_styling(latex_options = "HOLD_position", full_width = TRUE) %>%
row_spec(0, bold = TRUE, align = "c") %>%
  footnote(general = "BFMI = Bayesian fraction of missing information.",
footnote_as_chunk = TRUE)
```
### Parameter Recovery
In addition to having diagnostics to ensure that the model is estimated properly, it is also important to establish that the model as defined in the *Stan* code is able to accurately recover the true parameter values. Otherwise, a model may estimate well but be misspecified, leading to incorrect parameter estimates. Figure \@ref(fig:item-recover) shows the true (from Table \@ref(tab:true-item-param)) versus estimated item parameter values, indicating successful parameter recovery for the partial equivalency model with the simulated data.
```{r item-recover, fig.cap = "Parameter recovery from the example partial equivalency model with simulated data."}
recovery_sum <- summary(pteq)$summary %>%
as_tibble(rownames = "parameter") %>%
filter(str_detect(parameter, "(intercept|maineffect)\\[")) %>%
select(parameter, est = mean) %>%
separate(parameter, into = c("parameter", "item_id", NA), convert = TRUE) %>%
left_join(
items %>%
select(item_id, intercept, maineffect) %>%
gather(key = "parameter", value = "true", -item_id),
by = c("parameter", "item_id")
) %>%
mutate(
parameter = factor(parameter, levels = c("intercept", "maineffect"),
labels = c("lambda[0] + italic(b)[list(i,0)]",
"lambda[1,1] + italic(b)[list(i,1,1)]"))
)
facet_limits <- recovery_sum %>%
group_by(parameter) %>%
summarize(min = min(est, true), max = max(est, true)) %>%
pivot_longer(cols = c(min, max), names_to = "type", values_to = "est") %>%
mutate(true = est)
ggplot(recovery_sum, aes(x = true, y = est)) +
facet_wrap(~ parameter, nrow = 1, scales = "free",
labeller = label_parsed) +
geom_point(size = 3) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
geom_blank(data = facet_limits) +
labs(x = "True Value", y = "Estimated Value") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
```{r class-recover-calc}
# Threshold on the posterior median probability of mastery for classification
thresh <- 0.5
class_recovery <- mastery %>%
rename(true = mastery) %>%
left_join(
map_dfr(pteq_ppmc$mastery_probs, function(x) return(x), .id = ".draw") %>%
group_by(stu_id) %>%
summarize(prob = median(master_prob)) %>%
mutate(est = case_when(prob >= thresh ~ 1L, TRUE ~ 0L)) %>%
select(stu_id, est),
by = "stu_id"
) %>%
count(true, est)
correct_rate <- class_recovery %>%
mutate(correct = case_when(true == est ~ n, TRUE ~ 0L)) %>%
summarize(correct = sum(correct) / sum(n)) %>%
pull(correct)
```
It is also possible to examine the accuracy of the respondent classifications as a master or non-master. For this analysis, respondents were classified as masters if the median of the posterior distribution for the probability of mastery was greater than or equal to `r sprintf("%0.1f", thresh)`. This threshold places respondents in their most likely class; however, any threshold can be used in practice to facilitate stakeholder understanding of scores [@bradshaw_2019]. Respondent classification results for the partial equivalency model are summarized in Table \@ref(tab:class-recover). In total, `r sprintf("%0.0f", correct_rate * 100)`% of the simulated students were correctly classified as masters or non-masters. This is not surprising, given that the data were simulated from the partial equivalency model and all of the items are fairly discriminating (Table \@ref(tab:true-item-param)).
```{r class-recover}
class_recovery %>%
mutate(n = prettyNum(n, big.mark = ",")) %>%
kable(align = c("c", "c", "r"), booktabs = TRUE, linesep = "", escape = FALSE,
caption = "Respondent Classification Accuracy",
col.names = c("True Mastery", "Estimated Mastery", "\\textit{n}")) %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%
row_spec(0, bold = TRUE, align = "c")
```
## Evaluating Model Fit
```{r clear-mem, include = FALSE}
rm(fung, pteq, nfng, stan_data); gc()
```
Model fit can be assessed in both an absolute and a relative sense. Absolute fit evaluates whether the estimated model adequately reflects the observed data and is a prerequisite for the evaluation of relative fit. Relative fit compares two or more models that all show adequate absolute model fit [@chen_2013; @sen_2017]. In this document, methods are presented for assessing absolute and relative fit through posterior predictive model checking and information criteria, respectively. To demonstrate how these methods work in practice, both the fungible and non-fungible models were estimated on the same data used to estimate the partial equivalency model in the [previous section](#estimate). Thus, there are a total of three models for which to calculate posterior predictive checks and compare relative fit. As described previously, the partial equivalency model was the true data-generating model.
### Absolute Fit {#abs-fit}
Posterior predictive model checks are used to assess the absolute fit of a specific model to the observed data. Posterior predictive checks involve simulating replications of the data using draws from the posterior distributions and then comparing the replicated data sets back to the observed data [@bda3]. As explained in the [model estimation section](#estimate), a total of `r prettyNum((iter - warmup) * chains, big.mark = ",")` iterations were retained from the MCMC estimation. Thus, `r prettyNum((iter - warmup) * chains, big.mark = ",")` replicated data sets can be simulated, one for each iteration, using the values of the parameters at that iteration. The process for simulating a replicated data set for a single iteration is as follows:
1. Randomly assign the first respondent to the master or non-master class, with probability equal to the current value of the respondent's probability of attribute mastery.
2. For the first item the respondent takes, simulate a response using the current values of the item parameters and the mastery status that was simulated in step 1.
3. Repeat step 2 for all items the respondent tested on.
4. Repeat steps 1--3 for all respondents.
This process is repeated for each iteration in the chain. Because the replicated data sets are simulated from the current values of the parameters, these data sets represent what the data would be expected to look like *if the specified model were true*. Therefore, summaries of these data sets can then be used to look for systematic differences in the characteristics of the observed data and the replicated data sets, often through visualizations [@gelman_hill].
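As a concrete illustration of these steps, the sketch below simulates one replicated data set from a single posterior draw. The objects `draw_mastery` (one mastery probability per respondent) and `draw_items` (one intercept and main effect per item) are hypothetical stand-ins for the parameter values at a given iteration, so the chunk is not evaluated here; the replicated data sets summarized in the remainder of this section were generated from the retained draws of each estimated model.
```{r ppmc-rep-sketch, eval = FALSE}
# Minimal sketch: simulate one replicated data set from a single posterior
# draw. `draw_mastery` (stu_id, master_prob) and `draw_items` (item_id,
# intercept, maineffect) are hypothetical objects holding the parameter
# values at one iteration of the chain.
simulate_rep <- function(draw_mastery, draw_items, response_matrix) {
  response_matrix %>%
    select(stu_id, item_id) %>%
    left_join(draw_mastery, by = "stu_id") %>%
    left_join(draw_items, by = "item_id") %>%
    # Step 1: assign each respondent to the master (1) or non-master (0)
    # class with probability equal to their current probability of mastery
    group_by(stu_id) %>%
    mutate(class = rbinom(1, size = 1, prob = unique(master_prob))) %>%
    ungroup() %>%
    # Steps 2-4: simulate a response to every administered item using the
    # current item parameters and the simulated mastery status
    mutate(prob = plogis(intercept + maineffect * class),
           score = rbinom(n(), size = 1, prob = prob)) %>%
    select(stu_id, item_id, score)
}
```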
#### Model-Level Fit
```{r score-dist-calc, cache = TRUE, include = FALSE}
# Count the number of respondents at each raw score point in each replicated data set
calc_score_dist <- function(x) {
map_dfr(x$ppmc_data, function(x) {
x %>%
group_by(stu_id) %>%
summarize(raw_score = sum(score)) %>%
count(raw_score)
}, .id = ".draw")
}
score_dist <- list(fung_ppmc, pteq_ppmc, nfng_ppmc) %>%
set_names("Fungible", "Partial Equivalency", "Non-fungible") %>%
map(calc_score_dist) %>%
bind_rows(.id = "model") %>%
complete(model, .draw, raw_score, fill = list(n = 0)) %>%
mutate(model = factor(model, levels = c("Fungible", "Partial Equivalency",
"Non-fungible")))
score_summary <- score_dist %>%
group_by(model, raw_score) %>%
summarize(mean = mean(n),
median = median(n),
lb = quantile(n, probs = 0.025),
ub = quantile(n, probs = 0.975)) %>%
left_join(
response_matrix %>%
group_by(stu_id) %>%
summarize(raw_score = sum(score)) %>%
count(raw_score),
by = "raw_score"
) %>%
rename(obs = n) %>%
replace_na(list(obs = 0))
fung_text <- score_summary %>%
filter(model == "Fungible") %>%
mutate(under = obs > ub,
over = obs < lb) %>%
filter(over | under)
```
At the model level, posterior predictive checks can be calculated for the raw score distribution. This is accomplished by counting the number of respondents at each raw score point in each of the `r prettyNum((iter - warmup) * chains, big.mark = ",")` replicated data sets. This yields, for each raw score point, a distribution of the number of respondents that would be expected if the model were true, which can then be compared to the observed data. Figure \@ref(fig:score-dist) shows these expected distributions along with the number of observed students at each raw score in the simulated data.
```{r score-dist, fig.cap = "Posterior predictive model check for the raw score distribution."}
ggplot() +
facet_wrap(~ model, ncol = 1) +
geom_jitter(data = group_by(score_dist, model, raw_score) %>% sample_n(500),
aes(x = raw_score, y = n),
alpha = 0.2, height = 0, width = 0.3) +
geom_line(data = score_summary,
aes(x = raw_score, y = lb, color = "95% Credible Interval"),
linetype = "dashed", show.legend = FALSE) +
geom_line(data = score_summary,
aes(x = raw_score, y = ub, color = "95% Credible Interval"),
linetype = "dashed", show.legend = FALSE) +
geom_line(data = score_summary,
aes(x = raw_score, y = obs, color = "Observed"),
linetype = "solid") +
geom_point(data = score_summary,
aes(x = raw_score, y = obs, color = "Observed"),
size = 4, show.legend = FALSE) +
scale_x_continuous(breaks = seq(0, 20, 1)) +
scale_y_continuous(labels = scales::comma_format()) +
labs(x = "Raw Score", y = "Students", color = NULL) -> plot
plot %>%
ggsave2(fig_path(".png"), height = 8) %>%
ggsave2(fig_path(".pdf"), height = 8)
include_graphics(fig_path(".pdf"))
```
Figure \@ref(fig:score-dist) shows a bimodal distribution, which is a result of the mixture of raw score distributions for masters and non-masters. Additionally, very few students are expected to have a high raw score because relatively few students test on more than two testlets (Table \@ref(tab:testlet-prob)). Finally, the expected distribution for the fungible model shows some deviations from the observed scores. Specifically, the fungible model overestimates the number of students with a raw score of `r pull(filter(fung_text, over), raw_score) %>% english() %>% combine_words()` and underestimates the number of students with a raw score of `r pull(filter(fung_text, under), raw_score) %>% english() %>% combine_words()`.
Similar to the examination of trace plots [above](#converge) (Figure \@ref(fig:exm-trace)), this visualization alone is insufficient for determining if the amount of misfit in the distribution is significant. Rather, @beguin_2001 suggest a $\chi^2$ discrepancy measure can be calculated according to equation \@ref(eq:chisq).
\begin{equation}
\chi_{obs}^2=\sum_{s=0}^S\frac{[n_s-E(n_s)]^2}{E(n_s)}
(\#eq:chisq)
\end{equation}
In equation \@ref(eq:chisq), $s$ indexes the score points, $n_s$ is the observed number of respondents at score point $s$, and $E(n_s)$ is the expected number of respondents at score point $s$, calculated as the average count across all of the replicated data sets. As with the $\chi^2$ tests used to assess model fit when models are estimated with the expectation-maximization algorithm, the $\chi_{obs}^2$ statistic does not follow a theoretical $\chi^2$ distribution. With posterior predictive model checks, however, no distributional assumptions are required, because the reference distribution can be generated directly from the replicated data sets, similar to a parametric bootstrap. Using the same definition of $E(n_s)$ as above, a $\chi_{rep}^2$ value can be computed for each of the replicated data sets. The `r prettyNum((iter - warmup) * chains, big.mark = ",")` $\chi_{rep}^2$ values then make up the reference distribution against which $\chi_{obs}^2$ is compared. A posterior predictive *p*-value (*ppp*) can then be calculated as shown in equation \@ref(eq:ppp).
\begin{equation}
ppp=P(\chi_{rep}^2\geq\chi_{obs}^2\ |\ n_s)
(\#eq:ppp)
\end{equation}
Equation \@ref(eq:ppp) says that the posterior predictive *p*-value is the proportion of replicated data sets whose $\chi_{rep}^2$ value is greater than or equal to the $\chi_{obs}^2$ value from the observed data. Posterior predictive *p*-values close to zero indicate poor model fit (a cutoff of .05 could be used, for example), whereas values very close to one may indicate overfitting. The $\chi_{obs}^2$ values, summaries of the $\chi_{rep}^2$ distributions, and posterior predictive *p*-values for the fungible, partial equivalency, and non-fungible models are shown in Table \@ref(tab:chisq-stats) and visualized in Figure \@ref(fig:chisq-dist). As expected, given the distributions in Figure \@ref(fig:score-dist), the fungible model shows poor model fit, with a posterior predictive *p*-value of less than .05. In contrast, both the partial equivalency and non-fungible models show acceptable fit to the simulated data.
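In code, this amounts to a single comparison: with a vector of replicated discrepancy values and the observed value (the hypothetical objects `chisq_rep` and `chisq_obs` below), the posterior predictive *p*-value is the proportion of replicated values at or above the observed value. The same calculation is carried out for each model in the following chunk.
```{r ppp-sketch, eval = FALSE}
# Posterior predictive p-value: proportion of replicated chi-square values
# greater than or equal to the observed chi-square value. `chisq_rep` and
# `chisq_obs` are hypothetical stand-ins for one model's values.
ppp <- mean(chisq_rep >= chisq_obs)
```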
```{r chisq-calc, include = FALSE}
# Chi-square discrepancy for each replicated data set
ppmc_chisq <- score_dist %>%
left_join(select(score_summary, model, raw_score, exp = mean),
by = c("model", "raw_score")) %>%
mutate(exp = na_if(exp, 0)) %>%
replace_na(list(exp = 0.000001)) %>%
mutate(piece = ((n - exp) ^ 2) / exp) %>%
group_by(model, .draw) %>%
summarize(chisq = sum(piece))
# Chi-square discrepancy for the observed data and posterior predictive p-values
obs_chisq <- response_matrix %>%
group_by(stu_id) %>%
summarize(raw_score = sum(score)) %>%
count(raw_score) %>%
full_join(select(score_summary, model, raw_score, exp = mean),
by = "raw_score") %>%
mutate(exp = na_if(exp, 0)) %>%
replace_na(list(exp = 0.000001, n = 0)) %>%
mutate(piece = ((n - exp) ^ 2) / exp) %>%
group_by(model) %>%
summarize(obs_chisq = sum(piece)) %>%
left_join(ppmc_chisq, by = "model") %>%
group_by(model) %>%
summarize(ppp = sprintf("%0.3f", mean(chisq >= obs_chisq)),
obs_chisq = unique(obs_chisq),
rep_mean = mean(chisq),
rep_5 = quantile(chisq, probs = 0.05),
rep_95 = quantile(chisq, probs = 0.95)) %>%
mutate(sign = case_when(ppp == "1.000" ~ ">",
ppp == "0.000" ~ "<",
TRUE ~ "="),
ppp = case_when(ppp == "1.000" ~ "0.999",
ppp == "0.000" ~ "0.001",
TRUE ~ ppp),
lab = paste0(expression(italic(ppp)), '~"', sign, '"~', ppp))
```
(ref:chisq-stats-cap) $\chi_{obs}^2$ Values and Summaries of $\chi_{rep}^2$ Distributions
(ref:chisq-stat-foot) *ppp* = posterior predictive *p*-value.
```{r chisq-stats}
obs_chisq %>%
select(model, obs_chisq, rep_mean, rep_5, rep_95, ppp) %>%
mutate_if(is.double, ~ sprintf("%0.2f", .)) %>%
kable(align = c("l", "r", rep("c", 4)), booktabs = TRUE, linesep = "",
escape = FALSE, caption = "(ref:chisq-stats-cap)",
col.names = c("Model", "$\\pmb{\\chi_{obs}^2}$",
"$\\pmb{\\chi_{rep}^2}$ Mean",
"$\\pmb{\\chi_{rep}^2}$ 5\\%",
"$\\pmb{\\chi_{rep}^2}$ 95\\%",
"\\textit{ppp}")) %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%