# Source code for rerandomstats.fisher_resampling

"""
┌──────────────────────────────────────────────────────────────────────┐
│      fisher_resampling.py « Fisher's Resampling Test »               │
│                                                                      │
│  Implements Sir Ronald Fisher's re-randomisation test for            │
│  comparing two independent samples.  The observed test statistic    │
│  (mean, median, or sum difference) is ranked against a null         │
│  distribution built by combinatorial or random reshuffling.         │
│                                                                      │
│  Author : Bart R.H. Geurten                                         │
│  Licence: MIT                                                        │
└──────────────────────────────────────────────────────────────────────┘
"""

from __future__ import annotations

from typing import List, Literal, Sequence, Union

import numpy as np

from rerandomstats.resample_n_of_k import GetNofK


class FisherResamplingTest:
    """Two-sample resampling test in the tradition of R.A. Fisher.

    The observed difference between two groups (by mean, median, or sum)
    is ranked against a null distribution constructed by reshuffling
    group labels.

    Args:
        data_a: First sample.
        data_b: Second sample.
        func: Test-statistic identifier — ``'meanDiff'``, ``'medianDiff'``,
            or ``'sumDiff'``.
        combination_n: ``'all'`` for exhaustive permutation or an integer
            for random resampling draws.  Passed unchanged to
            :class:`GetNofK`.

    Attributes:
        p_value: Two-sided p-value after :meth:`main` has been called.
        original_test_result: Observed test statistic.
        shuffled_results: Sorted null distribution of the statistic.
        index_of_original_in_shuffled: Rank (possibly fractional) of the
            observed statistic inside ``shuffled_results``.
        index_normalized: That rank folded into a one-sided tail fraction.
        n_of_k: The :class:`GetNofK` helper that supplies reshuffled splits.
        resample_n: Number of reshuffled splits reported by ``n_of_k``.

    Example:
        >>> test = FisherResamplingTest([1, 2, 3], [7, 8, 9], 'meanDiff', 10000)
        >>> p = test.main()
        >>> 0.0 < p <= 1.0
        True
    """

    # Maps every supported ``func`` identifier to the reduction that is
    # applied to each sample before the two results are subtracted.
    _REDUCERS = {
        "medianDiff": np.median,
        "meanDiff": np.mean,
        "sumDiff": np.sum,
    }

    def __init__(
        self,
        data_a: Sequence[float],
        data_b: Sequence[float],
        func: Literal["meanDiff", "medianDiff", "sumDiff"],
        combination_n: Union[int, str] = 10_000,
    ) -> None:
        self.data_a = data_a
        self.data_b = data_b
        self.func = func
        self.combination_n = combination_n
        self.p_value: float | None = None
        self.original_test_result: float | None = None
        self.shuffled_results: List[float] = []
        # Declared here so the attributes exist before main() has run.
        self.index_of_original_in_shuffled: float | None = None
        self.index_normalized: float | None = None
        self.n_of_k: GetNofK | None = None
        self.resample_n: int = 0

    # ── shuffled-index generation ────────────────────────────────────

    def get_shuffled_indices(self) -> None:
        """Build the combinatorial index sets via :class:`GetNofK`."""
        self.n_of_k = GetNofK(self.data_a, self.data_b, self.combination_n)
        self.n_of_k.main()
        # Use the count stored on the helper rather than the requested
        # ``combination_n``, which may be the string ``'all'``.
        self.resample_n = self.n_of_k.combination_n

    # ── main entry point ─────────────────────────────────────────────

    def main(self) -> float:
        """Execute the full resampling pipeline and return the p-value.

        Steps:
            1. Generate shuffled index sets.
            2. Compute the observed test statistic.
            3. Build the null distribution via :meth:`bootstrap_resampling`.
            4. Rank the observed statistic and derive a two-sided p-value.

        Returns:
            Two-sided p-value in the interval ``(0, 1]``.

        Raises:
            ValueError: If the null distribution turns out to be empty or
                :attr:`func` is not recognised.
        """
        self.get_shuffled_indices()
        self.original_test_result = self.calculate_test(self.data_a, self.data_b)
        self.shuffled_results = sorted(self.bootstrap_resampling())
        self.index_of_original_in_shuffled = self._get_index_of_closest_value(
            self.shuffled_results, self.original_test_result
        )
        self.index_normalized = self._tail_fraction(
            self.index_of_original_in_shuffled, self.resample_n
        )
        self.p_value = self._two_sided_p_value(self.index_normalized)
        return self.p_value

    # ── rank → p-value helpers ───────────────────────────────────────

    @staticmethod
    def _tail_fraction(rank: float, n_resamples: int) -> float:
        """Fold a rank in the sorted null distribution into a tail fraction.

        Args:
            rank: Zero-based (possibly fractional) index of the observed
                statistic in the sorted null distribution.
            n_resamples: Size of the null distribution.

        Returns:
            ``rank / n_resamples`` mirrored around 0.5, with an exact zero
            replaced by ``1 / n_resamples``.

        Raises:
            ValueError: If ``n_resamples`` is not positive.
        """
        if n_resamples <= 0:
            raise ValueError(
                "FisherResamplingTest: the null distribution is empty"
            )
        fraction = rank / n_resamples
        if fraction > 0.5:
            # Upper half: measure the distance to the top end instead.
            fraction = abs(fraction - 1)
        if fraction == 0.0:
            # A resampling p-value must never be exactly zero; use the
            # smallest resolvable step instead.
            fraction = 1.0 / n_resamples
        return fraction

    @staticmethod
    def _two_sided_p_value(tail_fraction: float) -> float:
        """Double a one-sided tail fraction, capped at 1.0.

        The cap matters when only a single resampled split exists: the
        tail fraction is then 1.0 and plain doubling would yield 2.0.
        """
        return min(1.0, tail_fraction * 2)

    # ── closest-value ranking ────────────────────────────────────────

    @staticmethod
    def _get_index_of_closest_value(
        values_list: Sequence[float], value_to_match: float
    ) -> float:
        """Return the median index of the closest value(s) in a sorted list.

        When multiple values share the minimum absolute difference (e.g.
        when both samples are identical), the *median* of their indices is
        returned to avoid a first-index bias.

        Args:
            values_list: Numeric sequence; it is sorted again internally,
                so the returned index always refers to ascending order.
            value_to_match: Target value.

        Returns:
            Median index of the closest matching value(s).

        Raises:
            ValueError: If ``values_list`` is empty.
        """
        arr = np.sort(np.asarray(values_list, dtype=float))
        if arr.size == 0:
            raise ValueError(
                "FisherResamplingTest: cannot rank a value against an "
                "empty null distribution"
            )
        abs_diff = np.abs(arr - value_to_match)
        # Exact equality is safe: the minimum is taken from this very array.
        min_indices = np.where(abs_diff == np.min(abs_diff))[0]
        return float(np.median(min_indices))

    # ── null-distribution construction ───────────────────────────────

    def bootstrap_resampling(self) -> List[float]:
        """Compute the test statistic for every reshuffled split.

        Returns:
            List of resampled test-statistic values, one per split index
            in ``range(self.resample_n)``.

        Raises:
            RuntimeError: If :meth:`get_shuffled_indices` has not been
                called yet.
        """
        # Explicit check instead of ``assert`` so that it survives ``-O``.
        if self.n_of_k is None:
            raise RuntimeError(
                "FisherResamplingTest.bootstrap_resampling: call "
                "get_shuffled_indices() first"
            )
        get_split = self.n_of_k.get_shuffled_set
        return [
            self.calculate_test(shuf_a, shuf_b)
            for shuf_a, shuf_b in map(get_split, range(self.resample_n))
        ]

    # ── test-statistic dispatch ──────────────────────────────────────

    def calculate_test(
        self, data_a: Sequence[float], data_b: Sequence[float]
    ) -> float:
        """Dispatch to the chosen test-statistic function.

        Args:
            data_a: First sample.
            data_b: Second sample.

        Returns:
            Scalar test statistic (reduction of ``data_a`` minus reduction
            of ``data_b``).

        Raises:
            ValueError: If :attr:`func` is not recognised.
        """
        try:
            reduce = self._REDUCERS[self.func]
        except (KeyError, TypeError):
            # TypeError covers unhashable identifiers, which are equally
            # "unknown" from the caller's point of view.
            raise ValueError(
                f"FisherResamplingTest.calculate_test: "
                f"unknown statistic '{self.func}'"
            ) from None
        return float(reduce(data_a) - reduce(data_b))