"""
┌──────────────────────────────────────────────────────────────────────┐
│ fisher_resampling.py « Fisher's Resampling Test » │
│ │
│ Implements Sir Ronald Fisher's re-randomisation test for │
│ comparing two independent samples. The observed test statistic │
│ (mean, median, or sum difference) is ranked against a null │
│ distribution built by combinatorial or random reshuffling. │
│ │
│ Author : Bart R.H. Geurten │
│ Licence: MIT │
└──────────────────────────────────────────────────────────────────────┘
"""
from __future__ import annotations
from typing import List, Literal, Sequence, Union
import numpy as np
from rerandomstats.resample_n_of_k import GetNofK
[docs]
class FisherResamplingTest:
"""Two-sample resampling test in the tradition of R.A. Fisher.
The observed difference between two groups (by mean, median, or
sum) is ranked against a null distribution constructed by
reshuffling group labels.
Args:
data_a: First sample.
data_b: Second sample.
func: Test-statistic identifier — ``'meanDiff'``,
``'medianDiff'``, or ``'sumDiff'``.
combination_n: ``'all'`` for exhaustive permutation or an
integer for random resampling draws.
Attributes:
p_value: Two-sided p-value after :meth:`main` has been called.
original_test_result: Observed test statistic.
shuffled_results: Sorted null distribution of the statistic.
Example:
>>> test = FisherResamplingTest([1, 2, 3], [7, 8, 9], 'meanDiff', 10000)
>>> p = test.main()
>>> p < 0.05
True
"""
def __init__(
self,
data_a: Sequence[float],
data_b: Sequence[float],
func: Literal["meanDiff", "medianDiff", "sumDiff"],
combination_n: Union[int, str] = 10_000,
) -> None:
self.data_a = data_a
self.data_b = data_b
self.func = func
self.combination_n = combination_n
self.p_value: float | None = None
self.original_test_result: float | None = None
self.shuffled_results: List[float] = []
self.n_of_k: GetNofK | None = None
self.resample_n: int = 0
# ── shuffled-index generation ────────────────────────────────────
[docs]
def get_shuffled_indices(self) -> None:
"""Build the combinatorial index sets via :class:`GetNofK`."""
self.n_of_k = GetNofK(self.data_a, self.data_b, self.combination_n)
self.n_of_k.main()
self.resample_n = self.n_of_k.combination_n
# ── main entry point ─────────────────────────────────────────────
[docs]
def main(self) -> float:
"""Execute the full resampling pipeline and return the p-value.
Steps:
1. Generate shuffled index sets.
2. Compute the observed test statistic.
3. Build the null distribution via bootstrap resampling.
4. Rank the observed statistic and derive a two-sided
p-value.
Returns:
Two-sided p-value.
"""
self.get_shuffled_indices()
self.original_test_result = self.calculate_test(self.data_a, self.data_b)
self.shuffled_results = sorted(self.bootstrap_resampling())
self.index_of_original_in_shuffled = self._get_index_of_closest_value(
self.shuffled_results, self.original_test_result
)
self.index_normalized = self.index_of_original_in_shuffled / self.resample_n
if self.index_normalized > 0.5:
self.index_normalized = abs(self.index_normalized - 1)
if self.index_normalized == 0.0:
self.index_normalized = 1.0 / self.resample_n
self.p_value = self.index_normalized * 2
return self.p_value
# ── closest-value ranking ────────────────────────────────────────
@staticmethod
def _get_index_of_closest_value(
values_list: Sequence[float], value_to_match: float
) -> float:
"""Return the median index of the closest value(s) in a sorted list.
When multiple values share the minimum absolute difference
(e.g. when both samples are identical), the *median* of their
indices is returned to avoid a first-index bias.
Args:
values_list: Sorted numeric sequence.
value_to_match: Target value.
Returns:
Median index of the closest matching value(s).
"""
arr = np.sort(np.asarray(values_list, dtype=float))
abs_diff = np.abs(arr - value_to_match)
min_diff = np.min(abs_diff)
min_indices = np.where(abs_diff == min_diff)[0]
return float(np.median(min_indices))
# ── null-distribution construction ───────────────────────────────
[docs]
def bootstrap_resampling(self) -> List[float]:
"""Compute the test statistic for every reshuffled split.
Returns:
List of resampled test-statistic values.
"""
assert self.n_of_k is not None
results: List[float] = []
for i in range(self.resample_n):
shuf_a, shuf_b = self.n_of_k.get_shuffled_set(i)
results.append(self.calculate_test(shuf_a, shuf_b))
return results
# ── test-statistic dispatch ──────────────────────────────────────
[docs]
def calculate_test(
self, data_a: Sequence[float], data_b: Sequence[float]
) -> float:
"""Dispatch to the chosen test-statistic function.
Args:
data_a: First sample.
data_b: Second sample.
Returns:
Scalar test statistic.
Raises:
ValueError: If :attr:`func` is not recognised.
"""
if self.func == "medianDiff":
return float(np.median(data_a) - np.median(data_b))
elif self.func == "meanDiff":
return float(np.mean(data_a) - np.mean(data_b))
elif self.func == "sumDiff":
return float(np.sum(data_a) - np.sum(data_b))
else:
raise ValueError(
f"FisherResamplingTest.calculate_test: "
f"unknown statistic '{self.func}'"
)