Source code for rerandomstats.model_comparison

# ╔══════════════════════════════════════════════════════════════════╗
# ║  reRandomStats — model_comparison                                ║
# ║  « comparing fitted coefficients + correcting test batteries »   ║
# ╠══════════════════════════════════════════════════════════════════╣
# ║  Four tools for the after-the-fit stage of any analysis where    ║
# ║  multiple estimators are compared or multiple tests are pooled:  ║
# ║                                                                  ║
# ║    wald_two_sample_beta   — two-sample Wald z-test on the        ║
# ║                              difference between two independent  ║
# ║                              estimated coefficients              ║
# ║    likelihood_ratio_test  — LRT of a full vs reduced statsmodels ║
# ║                              fit (nested models)                 ║
# ║    correct_pvalues        — single-method correction of a dict   ║
# ║                              of pre-computed p-values; thin      ║
# ║                              wrapper around statsmodels.stats.   ║
# ║                              multipletests — the SHARED          ║
# ║                              ALGORITHMIC CORE for all multi-     ║
# ║                              comparison correction in the        ║
# ║                              package                             ║
# ║    benjamini_hochberg     — dual-method (BH + Bonferroni) report ║
# ║                              built on correct_pvalues; the       ║
# ║                              convenience API for the typical     ║
# ║                              "show both corrections side-by-     ║
# ║                              side" use case                      ║
# ║                                                                  ║
# ║  Design rule: every multi-comparison correction in reRandomStats ║
# ║  ultimately routes through statsmodels.stats.multipletests, so   ║
# ║  the actual correction math has exactly one implementation.      ║
# ║  Different I/O wrappers (dict-in/dict-out here, DataFrame-in/    ║
# ║  DataFrame-out in MultiGroupTest) are fine; the algorithm is     ║
# ║  not duplicated.                                                 ║
# ║                                                                  ║
# ║  Generalised from the ThermoStrife pipeline (Geurten 2026,       ║
# ║  ThermoStrife v0.1.1, DOI 10.5281/zenodo.20371612).              ║
# ╚══════════════════════════════════════════════════════════════════╝
"""Comparing fitted coefficients and correcting heterogeneous test batteries.

The four functions in this module operate on already-fitted models or
already-computed p-values — they do NOT take raw data.  For raw-data
workflows that combine data ingestion, pairwise testing, and
correction into one call, use :class:`rerandomstats.MultiGroupTest`.

This separation matters when the test battery is heterogeneous (e.g.
a case-crossover conditional logit, a Poisson IRR, a Wald two-sample-β
contrast, and a likelihood-ratio test, all in the same auxiliary
battery) and the corresponding p-values cannot all be produced by a
single pairwise-comparisons pipeline.  In that case the caller runs
each test individually, collects the p-values into a dict, and passes
the dict to :func:`correct_pvalues` (for a single method) or
:func:`benjamini_hochberg` (for the BH + Bonferroni dual-method
report) for correction.

**Algorithmic-source guarantee.** Both :func:`correct_pvalues` and
:func:`benjamini_hochberg` delegate the underlying correction
arithmetic to ``statsmodels.stats.multipletests``; the same call is
used internally by :class:`rerandomstats.MultiGroupTest`.  There is
no parallel implementation of BH / Bonferroni / Holm / Sidak / FDR-BY
in this package — every code path that adjusts p-values converges
on statsmodels' canonical implementation.
"""

from __future__ import annotations

import math
from typing import Any, Literal

import numpy as np
import statsmodels.stats.multitest as sm_multitest
from scipy import stats

# ┌────────────────────────────────────────────────────────────┐
# │ Wald two-sample test on two estimated coefficients         │
# └────────────────────────────────────────────────────────────┘



[docs]
def wald_two_sample_beta(
    beta_a: float,
    se_a: float,
    beta_b: float,
    se_b: float,
    *,
    alternative: Literal["two-sided", "greater", "less"] = "two-sided",
    name_a: str = "a",
    name_b: str = "b",
) -> dict:
    """Wald z-test on the difference between two estimated coefficients.

    Tests ``H0: β_a = β_b`` (or directional variants) using

        z = (β_a − β_b) / sqrt(SE(β_a)² + SE(β_b)²)

    This is the right test when you have two independently-fitted
    estimates of related coefficients — e.g. a case-crossover OR
    fitted on a hot-host subsample vs the same estimator fitted on a
    cool-host subsample, or a Poisson IRR for an aggression-set
    outcome vs the same estimator on a non-aggression outcome.  The
    formula assumes the two estimates are independent (no overlap in
    the data they were fit on); for **nested** models on the **same**
    data, use :func:`likelihood_ratio_test` instead.

    Args:
        beta_a:      Estimated coefficient from subsample / model A.
        se_a:        Standard error of ``beta_a``.
        beta_b:      Estimated coefficient from subsample / model B.
        se_b:        Standard error of ``beta_b``.
        alternative: ``"two-sided"`` (default) tests ``β_a ≠ β_b``.
                     ``"greater"`` tests ``β_a > β_b``.
                     ``"less"`` tests ``β_a < β_b``.
        name_a:      Optional label for the A side, surfaced in the
                     output dict.  Defaults to ``"a"``.
        name_b:      Optional label for the B side.  Defaults to ``"b"``.

    Returns:
        Dict with the input estimates, the difference and its SE, the
        Wald z statistic, the requested-tail p-value, and a 95 %
        confidence interval on the difference.

    Raises:
        ValueError: if ``alternative`` is not one of the three accepted
                    strings.
    """
    if alternative not in {"two-sided", "greater", "less"}:
        raise ValueError(
            f"alternative must be 'two-sided', 'greater', or 'less'; got {alternative!r}"
        )

    diff = float(beta_a - beta_b)
    se_diff = float(math.sqrt(se_a**2 + se_b**2))
    z = diff / se_diff if se_diff > 0 else float("nan")

    if math.isnan(z):
        p = float("nan")
    elif alternative == "two-sided":
        p = float(2.0 * (1.0 - stats.norm.cdf(abs(z))))
    elif alternative == "greater":
        p = float(1.0 - stats.norm.cdf(z))
    else:  # "less"
        p = float(stats.norm.cdf(z))

    return {
        "name_a": name_a,
        "beta_a": float(beta_a),
        "se_a": float(se_a),
        "name_b": name_b,
        "beta_b": float(beta_b),
        "se_b": float(se_b),
        "diff": diff,
        "se_diff": se_diff,
        "z_statistic": float(z),
        "alternative": alternative,
        "pvalue": p,
        "ci95_low_diff": diff - 1.96 * se_diff,
        "ci95_high_diff": diff + 1.96 * se_diff,
    }



# ┌────────────────────────────────────────────────────────────┐
# │ Likelihood-ratio test on nested models                     │
# └────────────────────────────────────────────────────────────┘



[docs]
def likelihood_ratio_test(
    model_full: Any,
    model_reduced: Any,
) -> dict:
    """Likelihood-ratio test of a full vs reduced nested fit.

    Computes::

        LR = 2 · (log L_full − log L_reduced)
        p  = 1 − F_chi²(LR; df_full − df_reduced)

    Both arguments may be any objects exposing ``.llf`` (log-likelihood)
    and ``.df_model`` (number of estimated parameters) attributes —
    the convention used by ``statsmodels`` result objects, which
    covers ``ConditionalLogit``, ``Poisson``, ``Logit``, ``OLS``, and
    most others.

    Use this for **nested** model comparisons on the **same** data:
    e.g. testing whether adding an interaction term to a
    case-crossover model significantly improves fit, or whether a
    tournament-edition × anomaly interaction is needed beyond the
    main-effect model.  For comparing two estimates of the SAME
    coefficient fitted on DIFFERENT data, use
    :func:`wald_two_sample_beta` instead.

    Args:
        model_full:    Fitted result object for the larger (nesting) model.
        model_reduced: Fitted result object for the smaller (nested) model.

    Returns:
        Dict with the LR statistic, the degrees-of-freedom difference,
        the chi² p-value, and both log-likelihoods.

    Raises:
        ValueError: if ``model_full`` does not have strictly more
                    parameters than ``model_reduced``.
    """
    llf_full = float(model_full.llf)
    llf_red = float(model_reduced.llf)
    df_diff = int(model_full.df_model - model_reduced.df_model)
    if df_diff <= 0:
        raise ValueError(
            f"model_full must have more parameters than model_reduced; "
            f"got df_full={model_full.df_model}, df_reduced={model_reduced.df_model}"
        )

    lr_stat = 2.0 * (llf_full - llf_red)
    p = float(1.0 - stats.chi2.cdf(lr_stat, df_diff))

    return {
        "lr_statistic": float(lr_stat),
        "df": df_diff,
        "pvalue": p,
        "log_likelihood_full": llf_full,
        "log_likelihood_reduced": llf_red,
    }



# ┌────────────────────────────────────────────────────────────┐
# │ Single-method correction  « shared algorithmic core »      │
# └────────────────────────────────────────────────────────────┘



[docs]
def correct_pvalues(
    pvalues: dict[str, float],
    *,
    alpha: float = 0.05,
    method: str = "fdr_bh",
) -> dict:
    """Adjust a heterogeneous battery of p-values via one correction method.

    Thin wrapper around ``statsmodels.stats.multitest.multipletests``.
    This is the **shared algorithmic core** for every multi-comparison
    correction in reRandomStats: both :func:`benjamini_hochberg` (the
    dual-method convenience helper) and the
    :class:`rerandomstats.MultiGroupTest` pipeline ultimately route
    through the same statsmodels call.  No parallel implementation of
    the underlying math exists in this package.

    Use this directly when you want a single correction method (e.g.
    just BH FDR or just Holm) and need a tidy per-test result dict
    rather than DataFrame columns.

    Args:
        pvalues: Dict mapping test name → raw p-value.  Keys are
                 carried verbatim through to the output ``results``
                 dict.  Insertion order is preserved.
        alpha:   Family-wise significance level.  Default 0.05.
        method:  Any method accepted by
                 ``statsmodels.stats.multitest.multipletests``: e.g.
                 ``"fdr_bh"`` (Benjamini–Hochberg step-up, the
                 default), ``"fdr_by"`` (Benjamini–Yekutieli),
                 ``"bonferroni"``, ``"holm"``, ``"hommel"``,
                 ``"sidak"``, ``"holm-sidak"``, ``"simes-hochberg"``,
                 ``"hommel"``, ``"fdr_tsbh"``, ``"fdr_tsbky"``.

    Returns:
        Dict with top-level keys ``family_size``, ``alpha``,
        ``method``, ``n_rejected``, and ``results``.  ``results`` is
        a per-test dict carrying ``raw_p``, ``rank`` (1 = smallest
        raw p-value), ``adjusted_p``, and ``reject`` (bool).  Empty
        input returns ``{"family_size": 0, "alpha": alpha, "method":
        method, "n_rejected": 0, "results": {}}``.
    """
    items = list(pvalues.items())
    k = len(items)
    if k == 0:
        return {
            "family_size": 0, "alpha": alpha, "method": method,
            "n_rejected": 0, "results": {},
        }

    names = [n for n, _ in items]
    raw_ps = np.array([p for _, p in items], dtype=float)

    # The single algorithmic source for the correction math.
    reject, adjusted, _, _ = sm_multitest.multipletests(
        raw_ps, alpha=alpha, method=method,
    )

    # Rank in raw-p ascending order (1 = smallest).  Used for
    # downstream callers that want to enumerate tests in increasing
    # p-order without re-sorting.
    sort_idx = np.argsort(raw_ps, kind="stable")
    ranks = np.empty(k, dtype=int)
    ranks[sort_idx] = np.arange(1, k + 1)

    results = {}
    for i, name in enumerate(names):
        results[name] = {
            "raw_p": float(raw_ps[i]),
            "rank": int(ranks[i]),
            "adjusted_p": float(adjusted[i]),
            "reject": bool(reject[i]),
        }
    return {
        "family_size": k,
        "alpha": alpha,
        "method": method,
        "n_rejected": int(reject.sum()),
        "results": results,
    }



# ┌────────────────────────────────────────────────────────────┐
# │ Dual-method (BH + Bonferroni) battery report               │
# └────────────────────────────────────────────────────────────┘



[docs]
def benjamini_hochberg(
    pvalues: dict[str, float],
    alpha: float = 0.05,
) -> dict:
    """Dual-method BH + Bonferroni report for a heterogeneous battery.

    Convenience wrapper around :func:`correct_pvalues`: runs the
    correction twice (``method="fdr_bh"`` and ``method="bonferroni"``)
    and merges the per-test results so reports can show BH q-values
    AND Bonferroni-adjusted p-values side-by-side.  The BH FDR is
    typically the primary (less conservative for correlated batteries)
    with Bonferroni as a conservative reference.

    Both corrections are computed via the same
    ``statsmodels.stats.multitest.multipletests`` call (one per
    method), so the underlying math has exactly one implementation.

    Args:
        pvalues: Dict mapping test name → raw p-value.
        alpha:   Target false-discovery rate.  Default 0.05.

    Returns:
        Dict with per-test ``raw_p``, ``rank``, ``bh_adjusted_p``,
        ``bh_threshold`` (the per-rank BH cut ``(rank/k)·alpha``),
        ``bh_reject``, ``bonferroni_adjusted_p``, and
        ``bonferroni_reject``.  Top-level keys also expose
        ``family_size``, ``alpha``, ``bonferroni_threshold`` (the
        per-test alpha/k cut), ``bh_cutoff_rank`` (largest rank that
        passes BH), and ``n_bh_rejected`` / ``n_bonferroni_rejected``
        counts.  Empty input returns
        ``{"family_size": 0, "alpha": alpha, "results": {}}``.
    """
    bh = correct_pvalues(pvalues, alpha=alpha, method="fdr_bh")
    if bh["family_size"] == 0:
        return {"family_size": 0, "alpha": alpha, "results": {}}

    bf = correct_pvalues(pvalues, alpha=alpha, method="bonferroni")
    k = bh["family_size"]
    bonf_threshold = alpha / k

    results: dict[str, dict] = {}
    for name in bh["results"]:
        bh_r = bh["results"][name]
        bf_r = bf["results"][name]
        rank = bh_r["rank"]
        results[name] = {
            "raw_p": bh_r["raw_p"],
            "rank": rank,
            "bh_adjusted_p": bh_r["adjusted_p"],
            "bh_threshold": float(rank / k * alpha),
            "bh_reject": bh_r["reject"],
            "bonferroni_adjusted_p": bf_r["adjusted_p"],
            "bonferroni_reject": bf_r["reject"],
        }
    n_bh_rejected = bh["n_rejected"]
    return {
        "family_size": k,
        "alpha": alpha,
        "bonferroni_threshold": float(bonf_threshold),
        "bh_cutoff_rank": n_bh_rejected,
        "n_bh_rejected": n_bh_rejected,
        "n_bonferroni_rejected": bf["n_rejected"],
        "results": results,
    }



# ┌────────────────────────────────────────────────────────────┐
# │ Array-in / array-out BH helper                             │
# │ « drop-in replacement for project-specific NumPy BH »      │
# └────────────────────────────────────────────────────────────┘



[docs]
def correct_pvalues_array(
    pvalues: np.ndarray,
    *,
    alpha: float = 0.05,
    method: str = "fdr_bh",
) -> np.ndarray:
    """Adjust a NumPy array of p-values, preserving NaN entries.

    Array-in / array-out wrapper around the same shared
    ``statsmodels.stats.multitest.multipletests`` call that
    :func:`correct_pvalues` uses.  Designed as a one-line drop-in
    replacement for project-specific BH/Bonferroni helpers that take
    and return arrays — e.g. ``digimuh.stats_core.benjamini_hochberg``
    — so refactors converging on reRandomStats are a single import
    change.

    NaN entries in the input are preserved at the same positions in
    the output and are excluded from the correction (the family size
    used by statsmodels is the count of finite p-values, matching the
    DigiMuh convention).

    Args:
        pvalues: 1-D array of raw p-values.  May contain NaN.
        alpha:   Family-wise significance level.  Default 0.05.
        method:  Any method accepted by
                 ``statsmodels.stats.multitest.multipletests``.
                 Default ``"fdr_bh"``.

    Returns:
        Array of adjusted p-values, same shape and dtype as input,
        with NaN entries preserved.  Empty input returns an empty
        array.
    """
    p = np.asarray(pvalues, dtype=float)
    n = p.size
    if n == 0:
        return p

    notnan = ~np.isnan(p)
    p_valid = p[notnan]
    if p_valid.size == 0:
        return p.copy()

    _reject, adjusted, _, _ = sm_multitest.multipletests(
        p_valid, alpha=alpha, method=method,
    )

    out = np.full(n, np.nan)
    out[notnan] = adjusted
    return out



__all__ = [
    "wald_two_sample_beta",
    "likelihood_ratio_test",
    "correct_pvalues",
    "correct_pvalues_array",
    "benjamini_hochberg",
]