library("SBC")
library("brms")
library("cmdstanr")
library("ggplot2")
library("colorspace")
library("cowplot")
library("future")
Intro
Relevant parts of the code given here is inspired by the resources of the great SBC package (Modrák et al., 2023) published at https://hyunjimoon.github.io/SBC/index.html.
Also, coding of posterior p-values is inspired by the split predictive check (Li & Huggins, 2024) R code published at https://github.com/TARPS-group/split-predictive-checks.
Software
We use the statistical software environment R (R Core Team, 2024), and R add-on packages colorspace (Stauffer et al., 2009), SBC (Modrák et al., 2023), brms (Bürkner, 2017, 2018, 2021), cmdstanr (Gabry et al., 2024), ggplot2 (Wickham, 2016), and future (Bengtsson, 2021).
This document is produced using Quarto (Allaire et al., 2024).
Organize R Session
Load packages:
Settings:
rm(list = ls())
options(brms.backend = "cmdstanr")
# Using parallel processing
plan(multisession)
options(SBC.min_chunk_size = 5)
# Setup caching of results
<- "./_SBC_cache"
cache_dir if(!dir.exists(cache_dir)) {
dir.create(cache_dir)
}
Data preparation function for graphical uniformity checks:
## from https://github.com/TeemuSailynoja/simultaneous-confidence-bands/tree/main
<- function (x, variables = NULL, K = NULL,
dat_ecdf_diff gamma = NULL, prob = 0.95,
combine_variables = NULL,
ecdf_alpha = NULL, ...,
parameters = NULL) {
if (!is.null(parameters)) {
warning("The `parameters` argument is deprecated use `variables` instead.")
if (is.null(variables)) {
<- parameters
variables
}
}<- data_for_ecdf_plots(x, variables = variables,
ecdf_data prob = prob, K = K, gamma = gamma, combine_variables = combine_variables,
ecdf_alpha = ecdf_alpha, ...)
if (ecdf_data$N < 50 && is.null(K)) {
message("With less than 50 simulations, we recommend using plot_ecdf as it has better fidelity.\n",
"Disable this message by explicitly setting the K parameter. ",
"You can use the strings \"max\" (high fidelity) or \"min\" (nicer plot) or choose a specific integer.")
}<- ecdf_data$N
N <- ecdf_data$K
K <- ecdf_data$z
z <- dplyr::mutate(ecdf_data$ecdf_df, z_diff = ecdf -
ecdf_df type = "sample ECDF")
z, <- dplyr::mutate(ecdf_data$limits_df, ymax = upper -
limits_df_trans ymin = lower - uniform_val, type = "theoretical CDF")
uniform_val, return(list('ecdf_df' = ecdf_df,
'limits_df_trans' = limits_df_trans))
}
Data Generator
The data can be generated using the following code – note that we need to be careful to match the parameter names as brms
uses them.
<- function(sim_df,
f_generate_y
b_Intercept,
b_x, ## sd_g__Intercept or r_g
grouping_term,
sigma) {if (is.matrix(grouping_term)) {
<- grouping_term
r_g else {
} <- length(levels(sim_df$g))
G <- matrix(rnorm(n = G, mean = 0, sd = grouping_term),
r_g nrow = G, ncol = 1,
dimnames = list(levels(sim_df$g), "Intercept"))
}$'mu' <- exp(b_Intercept +
sim_df$'x' * b_x +
sim_df$'g'])
r_g[sim_df$'sigma' <- sigma
sim_df$'y' <- rnorm(n = nrow(sim_df),
sim_dfmean = sim_df$'mu',
sd = sim_df$'sigma')
return(sim_df)
}<- function(N, G) {
f_generate_one_sim <- list('b_Intercept' = 0,
prior_mu 'b_x' = 1,
'sd_g__Intercept' = .5,
'sigma' = 1)
<- list('b_Intercept' = .1,
prior_sd 'b_x' = .1,
'sd_g__Intercept' = .1,
'sigma' = .1)
<- rnorm(n = 1,
b_Intercept mean = prior_mu$b_Intercept,
sd = prior_sd$b_Intercept)
<- rnorm(n = 1,
b_x mean = prior_mu$b_x,
sd = prior_sd$b_x)
<- rnorm(n = 1,
sd_g__Intercept mean = prior_mu$sd_g__Intercept,
sd = prior_sd$sd_g__Intercept)
<- abs(sd_g__Intercept)
sd_g__Intercept <- rnorm(n = 1,
sigma mean = prior_mu$sigma,
sd = prior_sd$sigma)
<- abs(sigma)
sigma <- rep(c(5, 25, 45), c(6, 8, 6))
indx if (sum(indx) != 500) {
stop("'N' must be 500 currently!")
}<- paste0("Group ", formatC(1:G, width = 2, flag = "0"))
g <- sample(g)
g <- rep(g, indx)
g <- as.factor(g)
g <- matrix(rnorm(n = G, mean = 0, sd = sd_g__Intercept),
r_g nrow = G, ncol = 1,
dimnames = list(levels(g), "Intercept"))
<- data.frame('x' = 2 * runif(N),
sim_df_skeleton 'g' = g)
return(list('variables' = list('b_Intercept' = b_Intercept,
'b_x' = b_x,
'sd_g__Intercept' = sd_g__Intercept,
'r_g' = r_g,
'sigma' = sigma),
'generated' = f_generate_y(sim_df = sim_df_skeleton,
b_Intercept,
b_x, grouping_term = r_g,
sigma)))
}<- SBC_generator_function(f_generate_one_sim,
f_generate_n_sims N = 500,
G = 20)
set.seed(0)
<- generate_datasets(f_generate_n_sims, 100)
datasets head(datasets$generated[[1]])
x g mu sigma y
1 0.6933670 Group 11 2.010588 1.127243 1.257388
2 0.6675499 Group 11 1.960995 1.127243 3.026448
3 0.9527025 Group 11 2.583906 1.127243 3.072794
4 1.7843967 Group 11 5.776880 1.127243 6.909938
5 1.7286789 Group 11 5.473749 1.127243 5.033990
6 0.7799791 Group 14 3.143628 1.127243 3.567889
Descriptive Plot
<- 1
r <- as.data.frame(expand.grid(x = seq(0, 2, by = .05), g = 1:20))
dat <- as.numeric(datasets$variables[r, ])
b $y <- exp(b[1] + dat$x * b[2] + b[dat$g + 3])
dat$g <- paste0("Group ", formatC(dat$g, width = 2, flag = "0"))
dat$k <- dat$l <- "Single group"
dat<- rbind(dat,
dat data.frame(x = seq(0, 2, by = .05),
y = exp(b[1] + dat$x * b[2]),
g = "Group 21",
k = "(a)",
l = "Average"),
data.frame(x = seq(0, 2, by = .05),
y = exp(b[1] + dat$x * b[2] + .5 * b[3]^2),
g = "Group 22",
k = "(b)",
l = "Average"))
<- subset(dat, l == "Average")
sdat $g <- NULL
sdat<- colorspace::sequential_hcl(n = 20, pal = "Grays")[10]
paint ggplot(data = datasets$generated[[r]], aes(x = x, y = y)) +
geom_line(data = subset(dat, l != "Average"), color = paint) +
geom_point(, color = paint) +
geom_line(data = sdat, aes(group = k, linetype = k), color = 1) +
scale_linetype_manual(values = c(2, 1)) +
facet_wrap(~ g) +
labs(linetype = "Variante:") +
theme(legend.position = "top")
Backend
<-
priors prior(normal(0,.1), class = "b", coef = "Intercept") +
prior(normal(1,.1), class = "b", coef = "x") +
prior(normal(.5,.1), class = "sd") +
prior(normal(1,.1), class = "sigma")
<- "brms_log_link_linear_mixed_model.stan"
stan_file_filename <- SBC_backend_brms(bf(y ~ 0 + Intercept + x + (1 | g)),
f_backend family = gaussian(link = "log"),
prior = priors,
chains = 1,
template_data = datasets$generated[[1]],
out_stan_file = file.path(cache_dir,
stan_file_filename))
<- derived_quantities(
log_lik_log_link_lmm log_lik = sum(dnorm(y,
mean = exp(b_Intercept + x * b_x + r_g[g]),
sd = sigma,
log = TRUE)))
<- compute_SBC(datasets,
results
f_backend, dquants = log_lik_log_link_lmm,
cache_mode = "results",
cache_location = file.path(cache_dir,
"results_log_link_linear_mixed_model"))
<- results$result$stats[, c("sim_id", "variable", "rank")]
tmp <- dat_ecdf_diff(tmp,
tmp variables = c("b_Intercept", "b_x", "sd_g__Intercept", "sigma",
"log_lik"),
max_rank = 99)
$variable <- factor(tmp$variable,
tmplevels = c("b_Intercept", "b_x", "sd_g__Intercept", "sigma",
"log_lik"))
<- c(colorspace::sequential_hcl(n = 20, pal = "Grays")[10], "black")
col_color <- c(colorspace::sequential_hcl(n = 20, pal = "Grays")[10], "black")
col_fill names(col_color) <- names(col_fill) <-
c("theoretical CDF", "sample ECDF")
<- c('theoretical CDF' = expression("Theoretical CDF"),
l 'sample ECDF' = expression("Sample ECDF"))
ggplot(tmp$ecdf_df, aes(color = type, fill = type)) +
geom_ribbon(data = tmp$limits_df_trans,
aes(x = x, ymax = ymax, ymin = ymin), alpha = .33,
linewidth = 1) +
geom_step(aes(x = z, y = z_diff, group = variable)) +
scale_color_manual(name = "", values = col_color, labels = l) +
scale_fill_manual(name = "", values = col_fill, labels = l) +
scale_alpha_identity() +
xlab(NULL) +
ylab(NULL) +
facet_wrap(~ variable) +
theme(strip.text = element_text(margin = margin(b = 2, t = 2))) +
theme(legend.position = "top")
Calculate QOI-Check
Preparations:
<- function(fit, nd) {
f_predict_brms_gaussian predict(fit, newdata = nd,
summary = F,
allow_new_levels = T,
sample_new_levels = "gaussian")
}<- function(fit, nd) {
f_predict_brms_uncrtnty predict(fit, newdata = nd,
summary = F,
allow_new_levels = T,
sample_new_levels = "uncertainty")
}set.seed(123456789)
<- p_stat_mean_ref_gr_uncrtnty_a <-
p_stat_mean_ref_gr_gaussian_a <- p_stat_mean_ref_gr_uncrtnty_b <-
p_stat_mean_ref_gr_gaussian_b <- p_stat_mean_sim_df_uncrtnty_a <-
p_stat_mean_sim_df_gaussian_a <- p_stat_mean_sim_df_uncrtnty_b <-
p_stat_mean_sim_df_gaussian_b ## [October 22, 2024]
<- p_stat_mean_ref_gr_uncrtnty_c <-
p_stat_mean_ref_gr_gaussian_c <- p_stat_mean_ref_gr_uncrtnty_d <-
p_stat_mean_ref_gr_gaussian_d <- p_stat_mean_sim_df_uncrtnty_c <-
p_stat_mean_sim_df_gaussian_c <- p_stat_mean_sim_df_uncrtnty_d <-
p_stat_mean_sim_df_gaussian_d ##
<- p_stat_mean_ref_gr_uncrtnty_e <-
p_stat_mean_ref_gr_gaussian_e <- p_stat_mean_ref_gr_uncrtnty_f <-
p_stat_mean_ref_gr_gaussian_f <- p_stat_mean_sim_df_uncrtnty_e <-
p_stat_mean_sim_df_gaussian_e <- p_stat_mean_sim_df_uncrtnty_f <-
p_stat_mean_sim_df_gaussian_f ##
<- p_stat_mean_eq_a_b <- p_stat_mean_eq_a_c <-
p_stat_mean_eq_a_a <- p_stat_mean_eq_a_e <- p_stat_mean_eq_a_f <-
p_stat_mean_eq_a_d ##
<- p_stat_mean_eq_b_b <- p_stat_mean_eq_b_c <-
p_stat_mean_eq_b_a <- p_stat_mean_eq_b_e <- p_stat_mean_eq_b_f <-
p_stat_mean_eq_b_d ##
rep(NA, length(results$fits))
Loop through the simulation runs and calculate all QOI variants for each run:
for (r in 1:length(results$fits)) {
## Prior in repetition r:
<- subset(results$stats, sim_id == r)
prior_r ## Methods a and b:
<- exp(subset(prior_r, variable == "b_Intercept")$simulated_value +
equation_a 1 * subset(prior_r, variable == "b_x")$simulated_value)
<- exp(subset(prior_r, variable == "b_Intercept")$simulated_value +
equation_b 1 * subset(prior_r, variable == "b_x")$simulated_value +
5 * (subset(prior_r,
.== "sd_g__Intercept")$simulated_value^2))
variable ## Reference grid data 'ref_gr':
<- paste0("g", formatC(101:300, width = 3, flag = "0"))
g <- data.frame('x' = 1,
ref_gr 'g' = as.factor(g))
## Duplicate observation data 'sim_df':
<- results$fits[[r]]$data[, c('x', 'g')]
sim_df $x <- 1 ## November 15, 2024
sim_df<- 100 + as.numeric(sim_df$g) ## 'break' original levels
g <- paste0("g", formatC(g, width = 3, flag = "0"))
g $g <- as.factor(g)
sim_df## Prior predictions (methods c and d):
<- f_generate_y(sim_df = ref_gr,
y_prior_pred b_Intercept = subset(prior_r,
== "b_Intercept")$simulated_value,
variable b_x = subset(prior_r,
== "b_x")$simulated_value,
variable grouping_term = subset(prior_r,
== "sd_g__Intercept")$simulated_value,
variable sigma = subset(prior_r,
== "sigma")$simulated_value)
variable $y_prior_pred <- y_prior_pred$y
ref_grrm(y_prior_pred)
<- f_generate_y(sim_df = sim_df,
y_prior_pred b_Intercept = subset(prior_r,
== "b_Intercept")$simulated_value,
variable b_x = subset(prior_r,
== "b_x")$simulated_value,
variable grouping_term = subset(prior_r,
== "sd_g__Intercept")$simulated_value,
variable sigma = subset(prior_r,
== "sigma")$simulated_value)
variable $y_prior_pred <- y_prior_pred$y
sim_dfrm(y_prior_pred)
<- mean(ref_gr$y_prior_pred)
reference_c <- mean(sim_df$y_prior_pred)
reference_d <- datasets$variables[r, ]
b <- as.numeric(b)
b ## Prior 'uncertainty' for ref_gr:
<- sample(b[grep(x = colnames(datasets$variables),
gamma pattern = "r_g[Group", fixed = T)],
size = 200, replace = T)
<- matrix(gamma, nrow = 200, ncol = 1,
gamma dimnames = list(levels(ref_gr$g), "Intercept"))
$y_prior_pred_uncrtnty <- f_generate_y(sim_df = ref_gr,
ref_grb_Intercept =
which(colnames(datasets$variables) == "b_Intercept")],
b[b_x =
which(colnames(datasets$variables) == "b_x")],
b[grouping_term = gamma,
sigma =
which(colnames(datasets$variables) == "sigma")])$y
b[## Prior 'uncertainty' for sim_df:
<- sample(b[grep(x = colnames(datasets$variables),
gamma pattern = "r_g[Group", fixed = T)],
size = 20, replace = T)
<- matrix(gamma, nrow = 20, ncol = 1,
gamma dimnames = list(levels(sim_df$g), "Intercept"))
$y_prior_pred_uncrtnty <- f_generate_y(sim_df = sim_df,
sim_dfb_Intercept =
which(colnames(datasets$variables) == "b_Intercept")],
b[b_x =
which(colnames(datasets$variables) == "b_x")],
b[grouping_term = gamma,
sigma =
which(colnames(datasets$variables) == "sigma")])$y
b[<- mean(ref_gr$y_prior_pred_uncrtnty)
reference_e <- mean(sim_df$y_prior_pred_uncrtnty)
reference_f <- as.matrix(results$fits[[r]]) ## Posterior sample
B <- exp(B[, "b_Intercept"] + 1 * B[, "b_x"])
posterior_equation_a <- exp(B[, "b_Intercept"] + 1 * B[, "b_x"] +
posterior_equation_b 5 * B[, "sd_g__Intercept"]^2)
.<- f_predict_brms_gaussian(results$fits[[r]], ref_gr)
Y_rep_ref_gr_g <- f_predict_brms_uncrtnty(results$fits[[r]], ref_gr)
Y_rep_ref_gr_u <- f_predict_brms_gaussian(results$fits[[r]], sim_df)
Y_rep_sim_df_g <- f_predict_brms_uncrtnty(results$fits[[r]], sim_df)
Y_rep_sim_df_u <- stat_mean_ref_gr_gaussian_a <-
stat_mean_ref_gr_uncrtnty_a <- stat_mean_ref_gr_gaussian_b <-
stat_mean_ref_gr_uncrtnty_b <- stat_mean_sim_df_gaussian_a <-
stat_mean_sim_df_uncrtnty_a <- stat_mean_sim_df_gaussian_b <-
stat_mean_sim_df_uncrtnty_b ##
<- stat_mean_ref_gr_gaussian_c <-
stat_mean_ref_gr_uncrtnty_c <- stat_mean_ref_gr_gaussian_d <-
stat_mean_ref_gr_uncrtnty_d <- stat_mean_sim_df_gaussian_c <-
stat_mean_sim_df_uncrtnty_c <- stat_mean_sim_df_gaussian_d <-
stat_mean_sim_df_uncrtnty_d ##
<- stat_mean_ref_gr_gaussian_e <-
stat_mean_ref_gr_uncrtnty_e <- stat_mean_ref_gr_gaussian_f <-
stat_mean_ref_gr_uncrtnty_f <- stat_mean_sim_df_gaussian_e <-
stat_mean_sim_df_uncrtnty_e <- stat_mean_sim_df_gaussian_f <-
stat_mean_sim_df_uncrtnty_f ##
<- eq_a_b <- eq_a_c <- eq_a_d <- eq_a_e <- eq_a_f <-
eq_a_a <- eq_b_b <- eq_b_c <- eq_b_d <- eq_b_e <- eq_b_f <-
eq_b_a rep(NA, nrow(Y_rep_ref_gr_g))
for (s in 1:nrow(Y_rep_ref_gr_g)) {
$y_rep_gaussian <- Y_rep_ref_gr_g[s, ]
ref_gr$y_rep_uncrtnty <- Y_rep_ref_gr_u[s, ]
ref_gr$y_rep_gaussian <- Y_rep_sim_df_g[s, ]
sim_df$y_rep_uncrtnty <- Y_rep_sim_df_u[s, ]
sim_df## Calculate stats:
<- equation_a < mean(ref_gr$y_rep_gaussian)
stat_mean_ref_gr_gaussian_a[s] <- equation_a < mean(ref_gr$y_rep_uncrtnty)
stat_mean_ref_gr_uncrtnty_a[s] <- equation_b < mean(ref_gr$y_rep_gaussian)
stat_mean_ref_gr_gaussian_b[s] <- equation_b < mean(ref_gr$y_rep_uncrtnty)
stat_mean_ref_gr_uncrtnty_b[s] <- equation_a < mean(sim_df$y_rep_gaussian)
stat_mean_sim_df_gaussian_a[s] <- equation_a < mean(sim_df$y_rep_uncrtnty)
stat_mean_sim_df_uncrtnty_a[s] <- equation_b < mean(sim_df$y_rep_gaussian)
stat_mean_sim_df_gaussian_b[s] <- equation_b < mean(sim_df$y_rep_uncrtnty)
stat_mean_sim_df_uncrtnty_b[s] ##
<- reference_c < mean(ref_gr$y_rep_gaussian)
stat_mean_ref_gr_gaussian_c[s] <- reference_c < mean(ref_gr$y_rep_uncrtnty)
stat_mean_ref_gr_uncrtnty_c[s] <- reference_d < mean(ref_gr$y_rep_gaussian)
stat_mean_ref_gr_gaussian_d[s] <- reference_d < mean(ref_gr$y_rep_uncrtnty)
stat_mean_ref_gr_uncrtnty_d[s] <- reference_c < mean(sim_df$y_rep_gaussian)
stat_mean_sim_df_gaussian_c[s] <- reference_c < mean(sim_df$y_rep_uncrtnty)
stat_mean_sim_df_uncrtnty_c[s] <- reference_d < mean(sim_df$y_rep_gaussian)
stat_mean_sim_df_gaussian_d[s] <- reference_d < mean(sim_df$y_rep_uncrtnty)
stat_mean_sim_df_uncrtnty_d[s] ##
<- reference_e < mean(ref_gr$y_rep_gaussian)
stat_mean_ref_gr_gaussian_e[s] <- reference_e < mean(ref_gr$y_rep_uncrtnty)
stat_mean_ref_gr_uncrtnty_e[s] <- reference_f < mean(ref_gr$y_rep_gaussian)
stat_mean_ref_gr_gaussian_f[s] <- reference_f < mean(ref_gr$y_rep_uncrtnty)
stat_mean_ref_gr_uncrtnty_f[s] <- reference_e < mean(sim_df$y_rep_gaussian)
stat_mean_sim_df_gaussian_e[s] <- reference_e < mean(sim_df$y_rep_uncrtnty)
stat_mean_sim_df_uncrtnty_e[s] <- reference_f < mean(sim_df$y_rep_gaussian)
stat_mean_sim_df_gaussian_f[s] <- reference_f < mean(sim_df$y_rep_uncrtnty)
stat_mean_sim_df_uncrtnty_f[s] ##
<- equation_a < posterior_equation_a[s]
eq_a_a[s] <- equation_b < posterior_equation_a[s]
eq_a_b[s] <- reference_c < posterior_equation_a[s]
eq_a_c[s] <- reference_d < posterior_equation_a[s]
eq_a_d[s] <- reference_e < posterior_equation_a[s]
eq_a_e[s] <- reference_f < posterior_equation_a[s]
eq_a_f[s] ##
<- equation_a < posterior_equation_b[s]
eq_b_a[s] <- equation_b < posterior_equation_b[s]
eq_b_b[s] <- reference_c < posterior_equation_b[s]
eq_b_c[s] <- reference_d < posterior_equation_b[s]
eq_b_d[s] <- reference_e < posterior_equation_b[s]
eq_b_e[s] <- reference_f < posterior_equation_b[s]
eq_b_f[s]
}## Calculate posterior predictive p-values:
<- mean(stat_mean_ref_gr_gaussian_a)
p_stat_mean_ref_gr_gaussian_a[r] <- mean(stat_mean_ref_gr_gaussian_b)
p_stat_mean_ref_gr_gaussian_b[r] <- mean(stat_mean_ref_gr_gaussian_c)
p_stat_mean_ref_gr_gaussian_c[r] <- mean(stat_mean_ref_gr_gaussian_d)
p_stat_mean_ref_gr_gaussian_d[r] <- mean(stat_mean_ref_gr_gaussian_e)
p_stat_mean_ref_gr_gaussian_e[r] <- mean(stat_mean_ref_gr_gaussian_f)
p_stat_mean_ref_gr_gaussian_f[r] ##
<- mean(stat_mean_ref_gr_uncrtnty_a)
p_stat_mean_ref_gr_uncrtnty_a[r] <- mean(stat_mean_ref_gr_uncrtnty_b)
p_stat_mean_ref_gr_uncrtnty_b[r] <- mean(stat_mean_ref_gr_uncrtnty_c)
p_stat_mean_ref_gr_uncrtnty_c[r] <- mean(stat_mean_ref_gr_uncrtnty_d)
p_stat_mean_ref_gr_uncrtnty_d[r] <- mean(stat_mean_ref_gr_uncrtnty_e)
p_stat_mean_ref_gr_uncrtnty_e[r] <- mean(stat_mean_ref_gr_uncrtnty_f)
p_stat_mean_ref_gr_uncrtnty_f[r] ##
<- mean(stat_mean_sim_df_gaussian_a)
p_stat_mean_sim_df_gaussian_a[r] <- mean(stat_mean_sim_df_gaussian_b)
p_stat_mean_sim_df_gaussian_b[r] <- mean(stat_mean_sim_df_gaussian_c)
p_stat_mean_sim_df_gaussian_c[r] <- mean(stat_mean_sim_df_gaussian_d)
p_stat_mean_sim_df_gaussian_d[r] <- mean(stat_mean_sim_df_gaussian_e)
p_stat_mean_sim_df_gaussian_e[r] <- mean(stat_mean_sim_df_gaussian_f)
p_stat_mean_sim_df_gaussian_f[r] ##
<- mean(stat_mean_sim_df_uncrtnty_a)
p_stat_mean_sim_df_uncrtnty_a[r] <- mean(stat_mean_sim_df_uncrtnty_b)
p_stat_mean_sim_df_uncrtnty_b[r] <- mean(stat_mean_sim_df_uncrtnty_c)
p_stat_mean_sim_df_uncrtnty_c[r] <- mean(stat_mean_sim_df_uncrtnty_d)
p_stat_mean_sim_df_uncrtnty_d[r] <- mean(stat_mean_sim_df_uncrtnty_e)
p_stat_mean_sim_df_uncrtnty_e[r] <- mean(stat_mean_sim_df_uncrtnty_f)
p_stat_mean_sim_df_uncrtnty_f[r] ##
<- mean(eq_a_a)
p_stat_mean_eq_a_a[r] <- mean(eq_a_b)
p_stat_mean_eq_a_b[r] <- mean(eq_a_c)
p_stat_mean_eq_a_c[r] <- mean(eq_a_d)
p_stat_mean_eq_a_d[r] <- mean(eq_a_e)
p_stat_mean_eq_a_e[r] <- mean(eq_a_f)
p_stat_mean_eq_a_f[r] ##
<- mean(eq_b_a)
p_stat_mean_eq_b_a[r] <- mean(eq_b_b)
p_stat_mean_eq_b_b[r] <- mean(eq_b_c)
p_stat_mean_eq_b_c[r] <- mean(eq_b_d)
p_stat_mean_eq_b_d[r] <- mean(eq_b_e)
p_stat_mean_eq_b_e[r] <- mean(eq_b_f)
p_stat_mean_eq_b_f[r] if (r %% 50 < .5) {cat(".\n")} else {cat(".")} ## Progress 'bar'
}
Organize results
<- rbind(data.frame('brms_sample_new_levels' = "gaussian",
results_holdout_ppc 'use_prior' = "exp(b0+b1)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_gaussian_a),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' = "exp(b0+b1)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_uncrtnty_a),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' = "exp(b0+b1+.5*sigma_gamma^2)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_gaussian_b),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' = "exp(b0+b1+.5*sigma_gamma^2)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_uncrtnty_b),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' = "exp(b0+b1)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_gaussian_a),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' = "exp(b0+b1)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_uncrtnty_a),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' = "exp(b0+b1+.5*sigma_gamma^2)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_gaussian_b),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' = "exp(b0+b1+.5*sigma_gamma^2)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_uncrtnty_b),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on ref. grid",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_gaussian_c),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on ref. grid",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_uncrtnty_c),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on sim. data",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_gaussian_d),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on sim. data",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_uncrtnty_d),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on ref. grid",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_gaussian_c),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on ref. grid",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_uncrtnty_c),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on sim. data",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_gaussian_d),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on sim. data",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_uncrtnty_d),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on ref. grid (uncertainty)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_gaussian_e),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on ref. grid (uncertainty)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_uncrtnty_e),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on sim. data (uncertainty)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_gaussian_f),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on sim. data (uncertainty)",
'posterior_predict_population' = "ref. grid",
'value' = p_stat_mean_ref_gr_uncrtnty_f),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on ref. grid (uncertainty)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_gaussian_e),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on ref. grid (uncertainty)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_uncrtnty_e),
data.frame('brms_sample_new_levels' = "gaussian",
'use_prior' =
"mean of prior pred. on sim. data (uncertainty)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_gaussian_f),
data.frame('brms_sample_new_levels' = "uncertainty",
'use_prior' =
"mean of prior pred. on sim. data (uncertainty)",
'posterior_predict_population' = "sim. data",
'value' = p_stat_mean_sim_df_uncrtnty_f),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1)]",
'use_prior' = "exp(b0+b1)",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_a_a),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1)]",
'use_prior' = "exp(b0+b1+.5*sigma_gamma^2)",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_a_b),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1+.5*sigma_gamma^2)]",
'use_prior' = "exp(b0+b1)",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_b_a),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1+.5*sigma_gamma^2)]",
'use_prior' = "exp(b0+b1+.5*sigma_gamma^2)",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_b_b),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1)]",
'use_prior' = "prior_pred_ref_gr_gaussian",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_a_c),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1+.5*sigma_gamma^2)]",
'use_prior' = "prior_pred_ref_gr_gaussian",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_b_c),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1)]",
'use_prior' = "prior_pred_sim_df_gaussian",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_a_d),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1+.5*sigma_gamma^2)]",
'use_prior' = "prior_pred_sim_df_gaussian",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_b_d),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1)]",
'use_prior' = "prior_pred_ref_gr_uncrtnty",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_a_e),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1+.5*sigma_gamma^2)]",
'use_prior' = "prior_pred_ref_gr_uncrtnty",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_b_e),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1)]",
'use_prior' = "prior_pred_sim_df_uncrtnty",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_a_f),
data.frame('brms_sample_new_levels' =
"none [exp(b0+b1+.5*sigma_gamma^2)]",
'use_prior' = "prior_pred_sim_df_uncrtnty",
'posterior_predict_population' = "none",
'value' = p_stat_mean_eq_b_f))
Visualise QOI-Check
<- results_holdout_ppc
tmp $variable <- paste0("Mean of prediction on ",
tmp$posterior_predict_population, " with\n",
tmp$brms_sample_new_levels, " sampling for gamma.\n",
tmp"Prior application: ", tmp$use_prior)
$variable <- gsub(x = tmp$variable, pattern = "gaussian",
tmpreplacement = "Gaussian", fixed = T)
$rank <- tmp$value * 1000
tmp$max_rank <- 1000
tmp$sim_id <- rep(1:100, 6 * 6)
tmp<- dat_ecdf_diff(x = tmp)
tmp $ecdf_df$x1 <- sapply(X = strsplit(x = tmp$ecdf_df$variable, split = '\n',
tmpfixed = T), FUN = function(x){x[1]})
$ecdf_df$x2 <- sapply(X = strsplit(x = tmp$ecdf_df$variable, split = '\n',
tmpfixed = T), FUN = function(x){x[2]})
$ecdf_df$x3 <- sapply(X = strsplit(x = tmp$ecdf_df$variable, split = '\n',
tmpfixed = T), FUN = function(x){x[3]})
$ecdf_df$x3 <- gsub(x = tmp$ecdf_df$x3, pattern = "Prior application: ",
tmpreplacement = "", fixed = T)
$ecdf_df$x3f <- as.factor(tmp$ecdf_df$x3)
tmp$ecdf_df$x4f <- as.factor(tmp$ecdf_df$x3)
tmplevels(tmp$ecdf_df$x3f) <- c(expression("exp(" * beta[0] * "+" * beta[1] * ")"),
expression("exp(" * beta[0] * "+" * beta[1] *
"+0.5" * sigma[gamma]^2 * ")"),
expression(paste("Mean pre. ref. grid")),
expression(paste("Mean pre. ref. grid")),
expression(paste("Mean pre. sim. data")),
expression(paste("Mean pre. sim. data")),
expression(paste("Mean pre. ref. grid")),
expression(paste("Mean pre. ref. grid")),
expression(paste("Mean pre. sim. data")),
expression(paste("Mean pre. sim. data")))
levels(tmp$ecdf_df$x4f) <- c("",
"",
"Gaussian sampling",
"uncertainty sampling",
"Gaussian sampling",
"uncertainty sampling",
"Gaussian sampling",
"uncertainty sampling",
"Gaussian sampling",
"uncertainty sampling")
$ecdf_df$x1a <- "Posterior application:"
tmp$ecdf_df$x3a <- "Prior application:"
tmp$ecdf_df$x1[tmp$ecdf_df$x1 == "Mean of prediction on none with"] <- " "
tmp$ecdf_df$x1f <- as.factor(tmp$ecdf_df$x1)
tmplevels(tmp$ecdf_df$x1f) <- c(" ",
"Mean pre. ref. grid",
"Mean pre. sim. data")
$ecdf_df$x2[tmp$ecdf_df$x2 == "none [exp(b0+b1)] sampling for gamma."] <-
tmp"exp(b0+b1)"
$ecdf_df$x2[tmp$ecdf_df$x2 ==
tmp"none [exp(b0+b1+.5*sigma_gamma^2)] sampling for gamma."] <-
"exp(b0+b1+.5*sigma_gamma^2)"
$ecdf_df$x2f <- as.factor(tmp$ecdf_df$x2)
tmplevels(tmp$ecdf_df$x2f) <- c(expression("exp(" * beta[0] * "+" * beta[1] * ")"),
expression("exp(" * beta[0] * "+" * beta[1] *
"+0.5" * sigma[gamma]^2 * ")"),
expression(paste("Gaussian sampling")),
expression(paste("uncertainty sampling")))
<- col_fill <-
col_color c(colorspace::sequential_hcl(n = 20, pal = "Grays")[10], "black")
names(col_color) <- names(col_fill) <-
c("theoretical CDF", "sample ECDF")
<- c('theoretical CDF' = expression("Theoretical CDF"),
l 'sample ECDF' = expression("Sample ECDF"))
ggplot(tmp$ecdf_df, aes(color = type, fill = type)) +
geom_ribbon(data = tmp$limits_df_trans,
aes(x = x, ymax = ymax, ymin = ymin), alpha = .33,
linewidth = 1) +
geom_step(aes(x = z, y = z_diff, group = variable)) +
scale_color_manual(name = "", values = col_color, labels = l) +
scale_fill_manual(name = "", values = col_fill, labels = l) +
scale_alpha_identity() +
xlab(NULL) +
ylab(NULL) +
facet_grid(cols = vars(x1a, x1f, x2f),
rows = vars(x3a, x3f, x4f),
labeller = labeller(x2f = label_parsed,
x3f = label_parsed)) +
theme(strip.text = element_text(margin = margin(b = 0, t = 0))) +
theme(legend.position = "top")