Source code for sctrial.stats.bayes

from __future__ import annotations

from collections.abc import Sequence

import numpy as np
import pandas as pd
from anndata import AnnData

from ..design import TrialDesign
from ._utils import apply_fdr
from .did import (
    AggregateFunc,
    AggregateMode,
    _add_feature_columns,
    _aggregate_for_did,
    _prepare_did_obs,
)

__all__ = ["did_table_bayes", "prior_predictive_check"]


def _build_did_model(
    unit_codes: np.ndarray,
    n_units: int,
    time_vals: np.ndarray,
    arm_vals: np.ndarray,
    y: np.ndarray,
    prior_scale: float = 1.0,
    sigma_scale: float = 1.0,
    covariate_matrix: np.ndarray | None = None,
):
    """Build the Bayesian DiD PyMC model and return it as a context manager.

    This is a shared helper used by both ``did_table_bayes`` and
    ``prior_predictive_check`` so that the prior specification stays in sync.

    Parameters
    ----------
    covariate_matrix
        Optional (n_obs, n_covariates) array of covariate values to include
        as additional fixed effects.
    """
    try:
        import pymc as pm
    except ImportError as exc:
        raise ImportError(
            "pymc is required for Bayesian DiD. Install with: pip install sctrial[bayes]"
        ) from exc

    interaction = time_vals * arm_vals
    model = pm.Model()
    with model:
        sigma = pm.HalfNormal("sigma", sigma_scale)
        alpha = pm.Normal("alpha", 0.0, prior_scale, shape=n_units)
        beta_time = pm.Normal("beta_time", 0.0, prior_scale)
        beta_arm = pm.Normal("beta_arm", 0.0, prior_scale)
        beta_did = pm.Normal("beta_did", 0.0, prior_scale)
        mu = (
            alpha[unit_codes] + beta_time * time_vals + beta_arm * arm_vals + beta_did * interaction
        )
        # Add covariate fixed effects if provided
        if covariate_matrix is not None and covariate_matrix.shape[1] > 0:
            n_cov = covariate_matrix.shape[1]
            beta_cov = pm.Normal("beta_cov", 0.0, prior_scale, shape=n_cov)
            mu = mu + pm.math.dot(covariate_matrix, beta_cov)
        pm.Normal("y", mu=mu, sigma=sigma, observed=y)
    return model


[docs] def did_table_bayes( adata: AnnData, features: Sequence[str], design: TrialDesign, visits: tuple[str, str], *, exclude_crossovers: bool = True, celltype: str | None = None, aggregate: AggregateMode = "participant_visit", layer: str | None = None, standardize: bool = True, agg: AggregateFunc = "mean", covariates: list[str] | None = None, prior_scale: float = 1.0, sigma_scale: float = 1.0, draws: int = 1000, tune: int = 1000, chains: int = 4, target_accept: float = 0.9, max_treedepth: int = 12, seed: int = 42, ) -> pd.DataFrame: """Bayesian DiD with participant random intercepts. Fits a simple hierarchical model: y_ijt = alpha_i + beta_time * time + beta_arm * arm + beta_did * time*arm + X_cov * beta_cov + eps Prior specification ------------------- - alpha_i ~ Normal(0, prior_scale) (participant random intercepts) - beta_time ~ Normal(0, prior_scale) - beta_arm ~ Normal(0, prior_scale) - beta_did ~ Normal(0, prior_scale) - sigma ~ HalfNormal(sigma_scale) These weakly-informative priors are designed to stabilize estimation for small samples while remaining agnostic about effect direction. Use ``prior_scale`` and ``sigma_scale`` to widen or tighten them. Parameters ---------- adata AnnData object containing expression data. features Feature names (genes or ``obs`` columns) to test. design A ``TrialDesign`` specifying participant, visit, and arm columns. visits Tuple of ``(baseline, followup)`` visit labels. exclude_crossovers Whether to exclude crossover participants. celltype If provided, subset to this cell type before analysis. aggregate Aggregation mode (``"participant_visit"`` or ``"cell"``). layer Expression layer for gene features (``None`` uses ``adata.X``). standardize Whether to z-score outcomes before model fitting. agg Aggregation function (``"mean"``, ``"median"``, or ``"pct_pos"``). covariates Optional covariate columns to include as fixed effects. prior_scale Scale (standard deviation) for all Normal priors on coefficients and random intercepts. Increase to make priors more diffuse. sigma_scale Scale for the HalfNormal prior on the observation noise ``sigma``. draws Number of posterior draws (after tuning). tune Number of tuning (warm-up) iterations per chain. chains Number of MCMC chains. target_accept Target acceptance rate for the NUTS sampler. max_treedepth Maximum tree depth for the NUTS sampler. seed Random seed for reproducibility. Returns ------- pd.DataFrame Columns: - feature: Feature name - beta_DiD: Posterior mean of the DiD effect - ci_low: 2.5th percentile of posterior - ci_high: 97.5th percentile of posterior - p_bayes: Two-sided posterior tail probability (not a frequentist p-value) - n_units: Number of paired units used - FDR_bayes: BH-adjusted p_bayes """ try: import pymc as pm except ImportError as exc: # pragma: no cover raise ImportError("pymc is required for did_table_bayes") from exc ad, obs = _prepare_did_obs(adata, design, visits, celltype, exclude_crossovers) cols = [design.participant_col, design.visit_col, design.arm_col, "visit_num", "arm_bin"] if design.celltype_col and design.celltype_col in obs.columns: cols.append(design.celltype_col) if covariates: for c in covariates: if c not in obs.columns: raise KeyError(f"Covariate '{c}' not found in adata.obs") cols.append(c) df = obs[cols].copy() df, final_features = _add_feature_columns(df, ad, features, layer) df_use, unit, time, arm_bin = _aggregate_for_did( df, final_features, design, visits, aggregate, agg, covariates, ) cov_cols = covariates or [] rows = [] for feat in final_features: select_cols = [unit, time, arm_bin, feat] + cov_cols df_feat = df_use[select_cols].dropna() if df_feat.empty: rows.append({"feature": feat, "beta_DiD": np.nan, "n_units": 0}) continue # participant index for random intercepts (remove unused categories) unit_cat = df_feat[unit].astype("category").cat.remove_unused_categories() unit_codes = unit_cat.cat.codes.to_numpy() n_units = int(unit_cat.cat.categories.size) time_vals = df_feat[time].to_numpy() arm_vals = df_feat[arm_bin].to_numpy() y = df_feat[feat].astype(float).to_numpy() if standardize: y_std = y.std(ddof=1) if not np.isfinite(y_std) or y_std < 1e-12: rows.append({"feature": feat, "beta_DiD": np.nan, "n_units": n_units}) continue y = (y - y.mean()) / y_std # Build covariate matrix if covariates are specified. # Categorical columns are automatically dummy-encoded (drop_first=True # to avoid multicollinearity with the intercept). cov_matrix = None if cov_cols: cov_df = df_feat[cov_cols].copy() cov_df = pd.get_dummies(cov_df, drop_first=True, dtype=float) cov_matrix = cov_df.to_numpy(dtype=float) model = _build_did_model( unit_codes, n_units, time_vals, arm_vals, y, prior_scale=prior_scale, sigma_scale=sigma_scale, covariate_matrix=cov_matrix, ) with model: idata = pm.sample( draws=draws, tune=tune, chains=chains, target_accept=target_accept, nuts={"max_treedepth": max_treedepth}, random_seed=seed, progressbar=False, ) post = idata.posterior["beta_did"].to_numpy().ravel() beta_mean = float(post.mean()) ci_low, ci_high = np.quantile(post, [0.025, 0.975]) n_post = len(post) p_bayes = max(2 * min((post > 0).mean(), (post < 0).mean()), 1.0 / n_post) rows.append( { "feature": feat, "beta_DiD": beta_mean, "ci_low": float(ci_low), "ci_high": float(ci_high), "p_bayes": float(p_bayes), "n_units": n_units, } ) res = pd.DataFrame(rows).sort_values("p_bayes", na_position="last") res = apply_fdr(res, p_col="p_bayes", fdr_col="FDR_bayes") return res.reset_index(drop=True)
# --------------------------------------------------------------------------- # Prior predictive checking # ---------------------------------------------------------------------------
[docs] def prior_predictive_check( adata: AnnData, features: list[str], design: TrialDesign, visits: tuple[str, str], *, prior_scale: float = 1.0, sigma_scale: float = 1.0, n_samples: int = 500, seed: int = 42, ) -> dict: """Run prior predictive check for the Bayesian DiD model. Samples from the prior predictive distribution to verify that the chosen priors produce plausible outcome values *before* fitting the model. This is useful for calibrating ``prior_scale`` and ``sigma_scale``. Parameters ---------- adata AnnData object containing expression data. features Feature names (genes or ``obs`` columns) to check. design A ``TrialDesign`` specifying participant, visit, and arm columns. visits Tuple of ``(baseline, followup)`` visit labels. prior_scale Scale (standard deviation) for all Normal priors on coefficients and random intercepts. sigma_scale Scale for the HalfNormal prior on the observation noise ``sigma``. n_samples Number of prior predictive samples to draw. seed Random seed for reproducibility. Returns ------- dict Keys: - ``prior_predictive`` : np.ndarray -- prior predictive samples for the outcome variable (shape depends on number of observations and ``n_samples``). - ``observed_range`` : tuple[float, float] -- ``(min, max)`` of the observed data across all requested features. - ``prior_covers_data`` : bool -- whether the 95 % prior predictive interval covers the full observed data range. """ try: import pymc as pm except ImportError as exc: # pragma: no cover raise ImportError("pymc is required for prior_predictive_check") from exc ad, obs = _prepare_did_obs(adata, design, visits, celltype=None, exclude_crossovers=True) cols = [design.participant_col, design.visit_col, design.arm_col, "visit_num", "arm_bin"] if design.celltype_col and design.celltype_col in obs.columns: cols.append(design.celltype_col) df = obs[cols].copy() df, final_features = _add_feature_columns(df, ad, features, layer=None) df_use, unit, time, arm_bin = _aggregate_for_did( df, final_features, design, visits, "participant_visit", "mean", None, ) # Collect observed values across all requested features for the range all_observed: list[np.ndarray] = [] # We build the model from the first usable feature (they share the same # design matrix) and draw prior predictive samples once. model = None for feat in final_features: df_feat = df_use[[unit, time, arm_bin, feat]].dropna() if df_feat.empty: continue y = df_feat[feat].astype(float).to_numpy() all_observed.append(y) if model is None: unit_cat = df_feat[unit].astype("category").cat.remove_unused_categories() unit_codes = unit_cat.cat.codes.to_numpy() n_units = int(unit_cat.cat.categories.size) time_vals = df_feat[time].to_numpy() arm_vals = df_feat[arm_bin].to_numpy() model = _build_did_model( unit_codes, n_units, time_vals, arm_vals, y, prior_scale=prior_scale, sigma_scale=sigma_scale, ) if model is None: raise ValueError("No usable features found for prior predictive check.") with model: idata = pm.sample_prior_predictive(samples=n_samples, random_seed=seed) prior_pred = idata.prior_predictive["y"].to_numpy().ravel() obs_all = np.concatenate(all_observed) obs_min, obs_max = float(obs_all.min()), float(obs_all.max()) pp_low, pp_high = np.quantile(prior_pred, [0.025, 0.975]) prior_covers_data = bool(pp_low <= obs_min and pp_high >= obs_max) return { "prior_predictive": prior_pred, "observed_range": (obs_min, obs_max), "prior_covers_data": prior_covers_data, }