library("mgcv")
library("SBC")
library("brms")
library("cmdstanr")
library("ggplot2")
library("colorspace")
library("cowplot")
library("future")Intro
Relevant parts of the code given here are inspired by the resources of the excellent SBC package (Modrák et al., 2023), published at https://hyunjimoon.github.io/SBC/index.html.
The coding of the posterior p-values is inspired by the R code for the split predictive check (Li & Huggins, 2024), published at https://github.com/TARPS-group/split-predictive-checks.
Software
We use the statistical software environment R (R Core Team, 2024) and the R add-on packages SBC (Modrák et al., 2023), brms (Bürkner, 2017, 2018, 2021), mgcv (Wood, 2011), cmdstanr (Gabry et al., 2024), ggplot2 (Wickham, 2016), colorspace (Stauffer et al., 2009), cowplot, and future (Bengtsson, 2021).
This document is produced using Quarto (Allaire et al., 2024).
Organize R Session
Load packages:
library("mgcv")
library("SBC")
library("brms")
library("cmdstanr")
library("ggplot2")
library("colorspace")
library("cowplot")
library("future")
Settings:
rm(list = ls())
options(brms.backend = "cmdstanr")
# Use parallel processing for the SBC fits
plan(multisession)
options(SBC.min_chunk_size = 5)
# Set up caching of results
cache_dir <- "./SBC_cache"
if (!dir.exists(cache_dir)) {
  dir.create(cache_dir)
}
Helper functions for predictions from mgcv smooths:
# convenient wrapper around mgcv::PredictMat;
# rm_attr() is an internal helper from brms (brms:::rm_attr)
PredictMat <- function(object, data, ...) {
  data <- rm_attr(data, "terms")
  out <- mgcv::PredictMat(object, data = data, ...)
  if (length(dim(out)) < 2L) {
    # fixes brms issue #494
    out <- matrix(out, nrow = 1)
  }
  out
}
# convenient wrapper around mgcv::smoothCon;
# is_like_factor() is an internal helper from brms (brms:::is_like_factor)
smoothCon <- function(object, data, ...) {
  data <- rm_attr(data, "terms")
  vars <- setdiff(c(object$term, object$by), "NA")
  for (v in vars) {
    if (is_like_factor(data[[v]])) {
      # allow factor-like variables (brms issue #562)
      data[[v]] <- as.factor(data[[v]])
    } else if (inherits(data[[v]], "difftime")) {
      # mgcv cannot handle 'difftime' variables
      data[[v]] <- as.numeric(data[[v]])
    }
  }
  mgcv::smoothCon(object, data = data, ...)
}
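These two wrappers are adapted from brms's internal code, and this document does not define rm_attr() and is_like_factor() itself. As a minimal, self-contained sketch (under the assumption that brms's unexported helpers may be aliased from its namespace), they could be exercised as follows:
# assumption: pull the unexported helpers from brms's namespace
rm_attr <- brms:::rm_attr
is_like_factor <- brms:::is_like_factor
# build the bivariate smooth and a prediction matrix for one new point
dat <- data.frame(x = runif(50), z = runif(50))
sm <- smoothCon(s(x, z, k = 10), data = dat,
                absorb.cons = TRUE, modCon = 3, diagonal.penalty = TRUE)
Xp <- PredictMat(sm[[1]], data = data.frame(x = .5, z = .5))
dim(Xp)  # 1 row, one column per basis function (the single-row case triggers the #494 fix)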
# Aid prediction from smooths re-parameterized via mgcv::smooth2random (type = 2)
# code adapted from the examples in ?mgcv::smooth2random
# @param sm output of mgcv::smoothCon
# @param re output of mgcv::smooth2random
# @param data new data supplied for prediction
# @return A list of the same structure as returned by mgcv::smooth2random
s2rPred <- function(sm, re, data) {
  # prediction matrix for new data
  X <- mgcv::PredictMat(sm, data)
  # transform to RE parameterization
  if (!is.null(re$trans.U)) {
    X <- X %*% re$trans.U
  }
  if (is.null(re$trans.D)) {
    # regression spline without penalization
    out <- list(Xf = X)
  } else {
    X <- t(t(X) * re$trans.D)
    # re-order columns according to random effect re-ordering
    X[, re$rind] <- X[, re$pen.ind != 0]
    # re-order penalization index in the same way
    pen.ind <- re$pen.ind
    pen.ind[re$rind] <- pen.ind[pen.ind > 0]
    # start returning the object
    Xf <- X[, which(re$pen.ind == 0), drop = FALSE]
    out <- list(rand = list(), Xf = Xf)
    for (i in seq_along(re$rand)) {
      # loop over random effect matrices
      out$rand[[i]] <- X[, which(pen.ind == i), drop = FALSE]
      attr(out$rand[[i]], "s.label") <- attr(re$rand[[i]], "s.label")
    }
    names(out$rand) <- names(re$rand)
  }
  out
}
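A minimal, self-contained usage sketch of s2rPred(), mirroring the reference-grid code in the QOI-Check section further below: build the smooth on the original data, convert it to its random-effects parameterization, then map new covariate values onto the same basis.
dat <- data.frame(x = runif(100), z = runif(100))
nd <- data.frame(x = .5, z = seq(.1, .9, by = .2))
sm <- mgcv::smoothCon(object = s(x, z, k = 10), data = dat,
                      absorb.cons = TRUE, modCon = 3, diagonal.penalty = TRUE)
re <- mgcv::smooth2random(object = sm[[1]], vnames = c('x', 'z'), type = 2)
pred <- s2rPred(sm = sm[[1]], re = re, data = nd)
str(pred$Xf)       # fixed (unpenalized) basis columns for the new data
str(pred$rand$Xr)  # penalized basis columns in random-effects parameterization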
Data preparation function for graphical uniformity checks:
## from https://github.com/TeemuSailynoja/simultaneous-confidence-bands/tree/main
dat_ecdf_diff <- function(x, variables = NULL, K = NULL,
                          gamma = NULL, prob = 0.95,
                          combine_variables = NULL,
                          ecdf_alpha = NULL, ...,
                          parameters = NULL) {
  if (!is.null(parameters)) {
    warning("The `parameters` argument is deprecated, use `variables` instead.")
    if (is.null(variables)) {
      variables <- parameters
    }
  }
  ecdf_data <- data_for_ecdf_plots(x, variables = variables,
                                   prob = prob, K = K, gamma = gamma,
                                   combine_variables = combine_variables,
                                   ecdf_alpha = ecdf_alpha, ...)
  if (ecdf_data$N < 50 && is.null(K)) {
    message("With less than 50 simulations, we recommend using plot_ecdf as it has better fidelity.\n",
            "Disable this message by explicitly setting the K parameter. ",
            "You can use the strings \"max\" (high fidelity) or \"min\" (nicer plot) or choose a specific integer.")
  }
  N <- ecdf_data$N
  K <- ecdf_data$K
  z <- ecdf_data$z
  ecdf_df <- dplyr::mutate(ecdf_data$ecdf_df,
                           z_diff = ecdf - z,
                           type = "sample ECDF")
  limits_df_trans <- dplyr::mutate(ecdf_data$limits_df,
                                   ymax = upper - uniform_val,
                                   ymin = lower - uniform_val,
                                   type = "theoretical CDF")
  return(list('ecdf_df' = ecdf_df,
              'limits_df_trans' = limits_df_trans))
}
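This helper returns the same data that SBC's ECDF-difference plots are built from, so it can be fed into a custom ggplot. A minimal usage sketch, anticipating the ranks data frame in SBC's format (columns rank, max_rank, variable, sim_id) constructed near the end of this document:
tmp <- dat_ecdf_diff(ranks, variables = levels(ranks$variable), K = 40)
str(tmp$ecdf_df)          # sample ECDF minus uniform CDF, per variable
str(tmp$limits_df_trans)  # simultaneous confidence band around zero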
Data Generator
f_generate_y <- function(sim_df, eta, sigma) {
  # simulate the Gaussian outcome y given linear predictor eta and residual SD sigma
  sim_df$'mu_y' <- eta
  sim_df$'sigma_y' <- rep(sigma, nrow(sim_df))
  sim_df$'y' <- rnorm(n = nrow(sim_df),
                      mean = sim_df$'mu_y',
                      sd = sim_df$'sigma_y')
  return(sim_df)
}
f_generate_xz <- function(N, b_Intercept, phi) {
  # simulate a Beta covariate via the mean-precision parameterization:
  # mean mu = plogis(b_Intercept), shape1 = mu * phi, shape2 = (1 - mu) * phi
  mu <- plogis(b_Intercept)
  return(rbeta(n = N, shape1 = mu * phi, shape2 = (1 - mu) * phi))
}
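As a quick sanity check of this parameterization (a sketch, not part of the original pipeline): draws for z should have mean close to plogis(-1.1), roughly 0.25, and variance mu * (1 - mu) / (1 + phi).
set.seed(1)
z_check <- f_generate_xz(N = 1e5, b_Intercept = -1.1, phi = 3)
mean(z_check)  # approximately plogis(-1.1) = 0.2497
var(z_check)   # approximately mu * (1 - mu) / (1 + phi)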
f_generate_one_sim <- function(N) {
  # prior means and standard deviations matching the brms priors below
  prior_mu <- list('b_x_Intercept' = 0,
                   'b_z_Intercept' = -1.1,
                   'b_y_Intercept' = .5,
                   'bs_y_sxz_1' = 1,
                   'bs_y_sxz_2' = -1,
                   'sds_y_sxz_1' = 1,
                   'phi_x' = 3,
                   'phi_z' = 3,
                   'sigma_y' = .1)
  prior_sd <- list('b_x_Intercept' = .001,
                   'b_z_Intercept' = .001,
                   'b_y_Intercept' = .1,
                   'bs_y_sxz_1' = .1,
                   'bs_y_sxz_2' = .1,
                   'sds_y_sxz_1' = .1,
                   'phi_x' = .001,
                   'phi_z' = .001,
                   'sigma_y' = .01)
  b_y_Intercept <- rnorm(n = 1,
                         mean = prior_mu$b_y_Intercept,
                         sd = prior_sd$b_y_Intercept)
  bs_y_sxz_1 <- rnorm(n = 1,
                      mean = prior_mu$bs_y_sxz_1,
                      sd = prior_sd$bs_y_sxz_1)
  bs_y_sxz_2 <- rnorm(n = 1,
                      mean = prior_mu$bs_y_sxz_2,
                      sd = prior_sd$bs_y_sxz_2)
  # take absolute values for parameters with a lower bound of zero (half-normal)
  sds_y_sxz_1 <- abs(rnorm(n = 1,
                           mean = prior_mu$sds_y_sxz_1,
                           sd = prior_sd$sds_y_sxz_1))
  sigma_y <- rnorm(n = 1,
                   mean = prior_mu$sigma_y,
                   sd = prior_sd$sigma_y)
  b_x_Intercept <- rnorm(n = 1,
                         mean = prior_mu$b_x_Intercept,
                         sd = prior_sd$b_x_Intercept)
  phi_x <- abs(rnorm(n = 1,
                     mean = prior_mu$phi_x,
                     sd = prior_sd$phi_x))
  b_z_Intercept <- rnorm(n = 1,
                         mean = prior_mu$b_z_Intercept,
                         sd = prior_sd$b_z_Intercept)
  phi_z <- abs(rnorm(n = 1,
                     mean = prior_mu$phi_z,
                     sd = prior_sd$phi_z))
  x <- f_generate_xz(N, b_x_Intercept, phi_x)
  z <- f_generate_xz(N, b_z_Intercept, phi_z)
  dat_xz <- data.frame('x' = x, 'z' = z)
  # set up the bivariate smooth basis as brms does internally
  sm <- mgcv::smoothCon(object = s(x, z, k = 10), data = dat_xz,
                        absorb.cons = TRUE, modCon = 3, diagonal.penalty = TRUE)
  sm2r <- mgcv::smooth2random(object = sm[[1]], vnames = c('x', 'z'), type = 2)
  Xf <- sm2r[['Xf']]
  Xr <- sm2r[['rand']][['Xr']]
  # spline coefficients follow s ~ N(0, sds^2), so the covariance uses the squared SD
  s_y_sxz <- t(MASS::mvrnorm(n = 1, mu = rep(0, ncol(Xr)),
                             Sigma = sds_y_sxz_1^2 * diag(ncol(Xr))))
  variables <- list('b_y_Intercept' = b_y_Intercept,
                    'bs_y_sxz_1' = bs_y_sxz_1,
                    'bs_y_sxz_2' = bs_y_sxz_2,
                    'sds_y_sxz_1' = sds_y_sxz_1,
                    's_y_sxz[1]' = s_y_sxz[1, 1],
                    's_y_sxz[2]' = s_y_sxz[1, 2],
                    's_y_sxz[3]' = s_y_sxz[1, 3],
                    's_y_sxz[4]' = s_y_sxz[1, 4],
                    's_y_sxz[5]' = s_y_sxz[1, 5],
                    's_y_sxz[6]' = s_y_sxz[1, 6],
                    's_y_sxz[7]' = s_y_sxz[1, 7],
                    'sigma_y' = sigma_y,
                    'b_x_Intercept' = b_x_Intercept,
                    'b_z_Intercept' = b_z_Intercept,
                    'phi_x' = phi_x,
                    'phi_z' = phi_z)
  etaf <- Xf %*% c(bs_y_sxz_1, bs_y_sxz_2)
  etar <- Xr %*% t(s_y_sxz)
  eta <- b_y_Intercept + etaf + etar
  return(list('variables' = variables,
              'generated' = f_generate_y(sim_df = dat_xz,
                                         eta = eta,
                                         sigma = sigma_y)))
}
f_generate_n_sims <- SBC_generator_function(f_generate_one_sim,
                                            N = 1000)
set.seed(123)
datasets <- generate_datasets(f_generate_n_sims, 20)
head(datasets$generated[[1]])
          x           z      mu_y   sigma_y          y
1 0.6428125 0.471219239 0.3496019 0.1012929 0.33045945
2 0.1542820 0.158118671 0.1796588 0.1012929 0.10296001
3 0.3846568 0.116883626 0.1478711 0.1012929 0.22810111
4 0.3718601 0.228222288 0.2942474 0.1012929 0.43050453
5 0.4644184 0.003815211 0.1276784 0.1012929 0.05732729
6 0.6761364 0.082005730 1.0524761 0.1012929 1.00740764
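Each generated dataset is paired with the prior draws used to create it; these are the reference values that the SBC ranks are computed against. For instance (output not shown):
# true parameter values underlying the first simulated dataset
as.data.frame(datasets$variables)[1, c('b_y_Intercept', 'sds_y_sxz_1', 'sigma_y')]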
Backend
priors <- prior(normal(.5, .1), class = "Intercept", resp = "y") +
  prior(normal(1, .1), class = "b", coef = "sxz_1", resp = "y") +
  prior(normal(-1, .1), class = "b", coef = "sxz_2", resp = "y") +
  prior(normal(1, .1), class = "sds", resp = "y") +
  prior(normal(.1, .01), class = "sigma", resp = "y") +
  prior(normal(0, .001), class = "Intercept", resp = "x") +
  prior(normal(-1.1, .001), class = "Intercept", resp = "z") +
  prior(normal(3, .001), class = "phi", resp = "x") +
  prior(normal(3, .001), class = "phi", resp = "z")
frmla <- bf(x ~ 1, family = Beta(link = "logit")) +
  bf(z ~ 1, family = Beta(link = "logit")) +
  bf(y ~ s(x, z, k = 10), family = gaussian())
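The coefficient names used in the priors above (e.g., sxz_1) follow brms's naming scheme for spline terms; one way to double-check them against the model formula is brms's get_prior():
# inspect the parameter classes and coefficient names available for prior specification
get_prior(frmla, data = datasets$generated[[1]])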
f_backend <- SBC_backend_brms(frmla,
                              prior = priors,
                              chains = 1,
                              iter = 2000,
                              template_data = datasets$generated[[1]],
                              out_stan_file = file.path(cache_dir,
                                                        "case_study_II.stan"))
set.seed(0)
results <- compute_SBC(datasets = datasets,
                       backend = f_backend,
                       cache_mode = "results",
                       cache_location = file.path(cache_dir,
                                                  "SBC_results_case_study_II"))
Visualize bivariate smooth at first four simulation runs
nd <- expand.grid('x' = seq(0, 1, by = .02), 'z' = seq(0, 1, by = .02))
p_list <- vector("list", 4)
for (r in 1:4) {
m <- results$fits[[r]]
fit <- fitted(object = m, newdata = nd)
nd$fit_y <- fit[, 'Estimate', 'y']
p_list[[r]] <- ggplot(data = nd, aes(x = x, y = z)) +
geom_raster(aes(fill = fit_y)) +
scale_fill_continuous_sequential(pal = "Grays", l1 = 0, l2 = 70) +
geom_point(data = m$data, aes(color = y), alpha = .8) +
scale_color_continuous_sequential(breaks = scales::breaks_extended(n = 3),
pal = "Grays", l1 = 0, l2 = 70, rev = FALSE) +
# colorspace::scale_color_continuous_sequential(pal = "Rocket") +
guides(color = guide_legend(nrow = 1)) +
labs(color = "y:", fill = "E(Y | x, z):",
subtitle = paste0("Simulation run ", r, ":")) +
theme(legend.position = 'top') +
theme(legend.title = element_text(vjust = .8))
}
cowplot::plot_grid(plotlist = p_list, ncol = 2)QOI-Check
QOI-Check
f_postprocessing <- function(m, b, focus_on = 'x', at = .8,
                             integration_grid = seq(.025, .975, by = .05)) {
  # compute E(y | focal covariate = at), marginalizing the other covariate
  # over 'integration_grid' with weights from its fitted Beta density
  # ('b' carries the true prior draws and is currently unused)
  if (!(focus_on %in% c('x', 'z'))) {
    stop("'focus_on' must be either 'x' or 'z'.")
  }
  nd <- data.frame('variable1' = at)
  nd <- as.data.frame(expand.grid('variable1' = nd$variable1,
                                  'variable2' = integration_grid))
  # the focal covariate is held fixed at 'at'; the other one varies over the grid
  names(nd) <- if (focus_on == 'x') c('x', 'z') else c('z', 'x')
  fit <- fitted(m, newdata = nd, dpar = "mu", summary = F)
  if (focus_on == 'x') {
    phi <- fitted(m, newdata = nd, resp = "z", dpar = "phi", summary = F)
    shape1 <- fit[, , "z"] * phi
    shape2 <- (1 - fit[, , "z"]) * phi
    d <- shape1 * 0
    for (i in 1:nrow(nd)) {
      d[, i] <- dbeta(x = nd$z[i], shape1 = shape1[, i], shape2 = shape2[, i])
    }
  }
  if (focus_on == 'z') {
    phi <- fitted(m, newdata = nd, resp = "x", dpar = "phi", summary = F)
    shape1 <- fit[, , "x"] * phi
    shape2 <- (1 - fit[, , "x"]) * phi
    d <- shape1 * 0
    for (i in 1:nrow(nd)) {
      d[, i] <- dbeta(x = nd$x[i], shape1 = shape1[, i], shape2 = shape2[, i])
    }
  }
  # normalize the density weights so that each posterior draw's weights sum to 1
  d <- d / apply(d, MAR = 1, FUN = sum)
  tmp <- abs(apply(d, MAR = 1, FUN = sum) - 1)
  if (any(tmp > 1e-10)) {
    stop("Weights do not sum to 1!")
  }
  return(list('Weighted' = apply(d * fit[, , 'y'], MAR = 1, FUN = sum),
              'Unweighted' = apply(fit[, , 'y'], MAR = 1, FUN = mean)))
}
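The 'Weighted' QOI is E(y | x = at) = sum_i w_i * E(y | x = at, z_i), with weights w_i proportional to the posterior Beta density of z at the grid points; the 'Unweighted' variant averages the grid points uniformly. A toy numeric sketch of the weighting step for a single posterior draw (all numbers hypothetical):
E_y <- c(0.2, 0.3, 0.5, 0.7, 0.8)        # E(y | x = at, z_i) at five grid points
w <- dbeta(c(.1, .3, .5, .7, .9), 1, 3)  # density of z at the grid points
w <- w / sum(w)                          # normalize to sum to 1
sum(w * E_y)                             # weighted QOI
mean(E_y)                                # unweighted counterpart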
ranks_rgpp <- NULL
for (j in 1:20) {
  cat(paste0("j = ", j, ":\n"))
  cat("  f_postprocessing():\n")
  cat("  ")
  res <- all_weighted <- NULL
  x_grid <- seq(.025, .975, by = .05)
  for (i in 1:length(x_grid)) {
    x <- x_grid[i]
    tmp <- f_postprocessing(results$fits[[j]],
                            as.data.frame(datasets$variables)[j, ],
                            focus_on = 'x', x,
                            seq(.0005, .9995, by = .001))
    all_weighted <- cbind(all_weighted, tmp[['Weighted']])
    for (method in c('Weighted', 'Unweighted')) {
      res <- rbind(res,
                   data.frame('x' = x,
                              'y' = mean(tmp[[method]]),
                              'method' = method,
                              'stat' = 'Mean'))
      res <- rbind(res,
                   data.frame('x' = x,
                              'y' = median(tmp[[method]]),
                              'method' = method,
                              'stat' = 'Median'))
      res <- rbind(res,
                   data.frame('x' = x,
                              'y' = quantile(tmp[[method]], probs = .025),
                              'method' = method,
                              'stat' = '2.5% quantile'))
      res <- rbind(res,
                   data.frame('x' = x,
                              'y' = quantile(tmp[[method]], probs = .975),
                              'method' = method,
                              'stat' = '97.5% quantile'))
    }
    cat(".")
  }
  res0 <- res
  ## Compare to GAM s() + s() + ti()
  ## https://cran.r-project.org/web/packages/mgcv/mgcv.pdf
  ## p. 79:
  ## "Sometimes it is interesting to specify smooth models with a main
  ## effects + interaction structure such as
  ## E(yi) = f1(xi) + f2(zi) + f3(xi, zi)
  ## [...]
  ## for example.
  ## Such models should be set up using ti terms in the model formula.
  ## For example:
  ## y ~ ti(x) + ti(z) + ti(x,z),
  ## [...]
  ## The ti terms produce interactions with the component main effects
  ## excluded appropriately. (There is in fact no need to use ti terms
  ## for the main effects here, s terms could also be used.)"
  m_gam <- gam(y ~ s(x) + s(z) + ti(x, z), data = datasets$generated[[j]])
  nd <- data.frame(x = x_grid, z = x_grid)
  tmp <- predict(m_gam, type = "terms", newdata = nd)
  res <- rbind(res,
               data.frame(x = x_grid,
                          y = tmp[, 's(x)'] + coef(m_gam)[1],
                          method = "mgcv::gam: s(x) [model: y ~ s(x) + s(z) + ti(x, z)]",
                          stat = "Effect"))
  ## ##########################################
  ## Reference grid prior predictive check   ##
  ## ##########################################
  cat("\n")
  cat("  reference grid prior prediction:\n")
  cat("  ")
  x_grid <- seq(.025, .975, by = .05)
  ref_grid_prior_mean_y_x <- rep(NA, length(x_grid))
  for (i in 1:length(x_grid)) {
    b <- as.data.frame(datasets$variables)[j, ]
    # draw new z values from the true prior draws of this simulation run
    z <- f_generate_xz(N = 10000,
                       b_Intercept = as.numeric(b['b_z_Intercept']),
                       phi = as.numeric(b['phi_z']))
    dat_xz <- data.frame('x' = x_grid[i], 'z' = z)
    sm <- mgcv::smoothCon(object = s(x, z, k = 10), data = results$fits[[j]]$data,
                          absorb.cons = TRUE, modCon = 3, diagonal.penalty = TRUE)
    sm2r <- mgcv::smooth2random(object = sm[[1]], vnames = c('x', 'z'), type = 2)
    sm2r_pre <- s2rPred(sm = sm[[1]], re = sm2r, data = dat_xz)
    Xf <- sm2r_pre[['Xf']]
    Xr <- sm2r_pre[['rand']][['Xr']]
    etaf <- Xf %*% as.numeric(b[grep(names(b), pattern = "bs_", fixed = T)])
    etar <- Xr %*% as.numeric(b[grep(names(b), pattern = "s_y_sxz[", fixed = T)])
    eta <- as.numeric(b['b_y_Intercept']) + etaf + etar
    dat_xz <- f_generate_y(dat_xz, as.numeric(eta), as.numeric(b["sigma_y"]))
    ref_grid_prior_mean_y_x[i] <- mean(dat_xz$y)
    cat(".")
  }
  cat("\n")
  res <- rbind(res,
               data.frame(x = x_grid,
                          y = ref_grid_prior_mean_y_x,
                          method = "Reference grid prior prediction (N = 10000)",
                          stat = "Mean"))
  res$stat <- factor(res$stat, levels = c("Mean", "Median", "2.5% quantile",
                                          "97.5% quantile", "Effect"))
  if (j == 1) {
    p <- ggplot(data = subset(res, stat != "Median"), aes(x = x, y = y)) +
      geom_line(aes(group = paste0(stat, method),
                    linetype = stat)) +
      theme(legend.position = "top", legend.box = "vertical",
            legend.margin = margin()) +
      guides(color = "none") +
      scale_linetype_manual(values = c(1, 3, 3, 2)) +
      facet_wrap(~ method) +
      labs(color = "Method:", linetype = "Statistic:")
  }
  # rank of the prior-predictive reference value among the posterior QOI draws
  rgpp_all_weighted <- rbind(subset(res,
                                    method == "Reference grid prior prediction (N = 10000)")$y,
                             all_weighted)
  ranks_rgpp <- cbind(ranks_rgpp,
                      apply(rgpp_all_weighted, MAR = 2,
                            FUN = function(x) { mean(x[-1] > x[1]) }))
}
ranks_melt <- reshape2::melt(ranks_rgpp)
ranks <- data.frame('rank' = ranks_melt$value,
                    'max_rank' = 1,
                    'variable' = ranks_melt$Var1,
                    'sim_id' = ranks_melt$Var2)
# rescale to SBC's integer rank format: 1000 posterior draws per fit
# (1 chain, iter = 2000, i.e., 1000 post-warmup draws)
ranks$rank <- ranks$rank * 1000
# jitter to break ties, as in SBC
ranks$rank <- ranks$rank - 1 + runif(nrow(ranks))
ranks$max_rank <- ranks$max_rank * 1000
ranks$variable <- factor(paste("x:", x_grid[ranks$variable]),
                         levels = paste("x:", x_grid))
tmp <- dat_ecdf_diff(ranks, variables = levels(ranks$variable), K = 40)
col_color <- c(colorspace::sequential_hcl(n = 20, pal = "Grays")[10], "black")
col_fill <- c(colorspace::sequential_hcl(n = 20, pal = "Grays")[10], "black")
names(col_color) <- names(col_fill) <- c("theoretical CDF", "sample ECDF")
l <- c('theoretical CDF' = expression("Theoretical CDF"),
       'sample ECDF' = expression("Sample ECDF"))
ggplot(tmp$ecdf_df, aes(color = type, fill = type)) +
  geom_ribbon(data = tmp$limits_df_trans,
              aes(x = x, ymax = ymax, ymin = ymin), alpha = .33,
              linewidth = 1) +
  geom_step(aes(x = z, y = z_diff, group = variable)) +
  scale_color_manual(name = "", values = col_color, labels = l) +
  scale_fill_manual(name = "", values = col_fill, labels = l) +
  scale_alpha_identity() +
  xlab(NULL) +
  ylab(NULL) +
  facet_wrap(~ variable) +
  theme(strip.text = element_text(margin = margin(b = 0, t = 0)),
        legend.position = "top")

