"""
┌──────────────────────────────────────────────────────────────────────┐
│ resample_n_of_k.py « Combinatorial Index Sampler » │
│ │
│ Generates index tuples for Fisher-style resampling. Supports │
│ exhaustive enumeration of all C(N, k) combinations as well as │
│ random sampling with or without uniqueness constraints. │
│ │
│ Author : Bart R.H. Geurten │
│ Licence: MIT │
└──────────────────────────────────────────────────────────────────────┘
"""
from __future__ import annotations
import random
from itertools import combinations
from typing import List, Literal, Sequence, Tuple, Union
[docs]
class GetNofK:
"""Generate index-pair tuples for resampling two data sets.
Given two data sets **A** and **B** the class combines them, then
produces pairs of index tuples ``(indices_A, indices_B)`` where
``indices_A`` has the same length as the shorter set and
``indices_B`` is its complement.
Args:
data_set_a: First data set.
data_set_b: Second data set.
combination_n: ``'all'`` for exhaustive enumeration or an
integer specifying the number of random draws.
mode: ``'combinations'`` for exhaustive, ``'resampling'``
for random (non-unique), ``'resample_unique'`` for
random with uniqueness guarantee.
max_len_possible_4_perms: If the shorter set exceeds this
length, exhaustive mode falls back to resampling.
resampling_n: Default number of random draws when
*combination_n* triggers resampling mode.
Attributes:
combined_data: Concatenation of both data sets.
data_indices: List of ``(indices_A, indices_B)`` tuples
produced by :meth:`main`.
combination_n: Actual number of combinations generated.
"""
def __init__(
self,
data_set_a: Sequence,
data_set_b: Sequence,
combination_n: Union[int, str],
mode: Literal["combinations", "resampling", "resample_unique"] = "resampling",
max_len_possible_4_perms: int = 10,
resampling_n: int = 100_000,
) -> None:
self.data_set_a: List = list(data_set_a)
self.data_set_b: List = list(data_set_b)
self.combination_n = combination_n
self.max_len_possible_4_perms = max_len_possible_4_perms
self.resampling_n = resampling_n
self.mode = mode
self.combined_data: List = self.data_set_a + self.data_set_b
self.combined_len: int = self._get_combined_len()
self.short_len: int = self._get_smaller_set_n()
# ── resolve mode from combination_n ──────────────────────────
if self.combination_n == "all":
self.mode = "combinations"
else:
self.mode = "resampling"
self.resampling_n = int(combination_n)
if self.mode == "combinations" and self.max_len_possible_4_perms < self.short_len:
self.mode = "resampling"
self.combination_n = self.resampling_n
self.combinations: List[Tuple[int, ...]] = []
self.data_indices: List[Tuple[Tuple[int, ...], Tuple[int, ...]]] = []
# ── internal helpers ─────────────────────────────────────────────
def _get_combined_len(self) -> int:
"""Return the total length of both data sets."""
self.len_a = len(self.data_set_a)
self.len_b = len(self.data_set_b)
return self.len_a + self.len_b
def _get_smaller_set_n(self) -> int:
"""Return the length of the smaller data set."""
return min(self.len_a, self.len_b)
# ── combination generators ───────────────────────────────────────
[docs]
def get_all_combinations(self) -> List[Tuple[int, ...]]:
"""Return all C(N, k) unique index tuples.
Returns:
Exhaustive list of index tuples of length *short_len*
drawn from ``range(combined_len)``.
"""
return list(combinations(range(self.combined_len), self.short_len))
[docs]
def get_unique_random_combinations(self) -> List[Tuple[int, ...]]:
"""Return *resampling_n* unique random index tuples.
If the target count cannot be reached after
``10 × resampling_n`` attempts the method returns whatever
unique tuples it managed to collect.
Returns:
List of unique random index tuples.
"""
combis: set[Tuple[int, ...]] = set()
all_indices = list(range(self.combined_len))
tries = 0
desperation = False
while len(combis) < self.resampling_n and not desperation:
combis.add(tuple(sorted(random.sample(all_indices, self.short_len))))
tries += 1
if tries > self.resampling_n * 10:
desperation = True
print(
f"GetNofK: could not produce more than {len(combis)} "
f"unique combinations in {tries} tries — using those."
)
return [tuple(x) for x in combis]
[docs]
def get_random_combinations(self) -> List[Tuple[int, ...]]:
"""Return *resampling_n* random index tuples (may contain duplicates).
Returns:
List of random index tuples.
"""
all_indices = list(range(self.combined_len))
return [
tuple(sorted(random.sample(all_indices, self.short_len)))
for _ in range(self.resampling_n)
]
# ── complement construction ──────────────────────────────────────
[docs]
def complement_indices(self) -> None:
"""Build ``(indices_A, complement)`` pairs from *combinations*.
Populates :attr:`data_indices` so that each entry contains
the sampled indices and their complement within the combined
data.
"""
all_idx = set(range(self.combined_len))
self.data_indices = [
(idx_a, tuple(all_idx - set(idx_a)))
for idx_a in self.combinations
]
# ── main entry point ─────────────────────────────────────────────
[docs]
def main(self) -> None:
"""Generate combinations and their complements.
Dispatches to the appropriate combination generator based on
:attr:`mode`, then builds complement index pairs.
"""
if self.mode == "combinations":
self.combinations = self.get_all_combinations()
elif self.mode == "resampling":
self.combinations = self.get_random_combinations()
elif self.mode == "resample_unique":
self.combinations = self.get_unique_random_combinations()
else:
raise ValueError(
f"GetNofK.main: unknown mode '{self.mode}'"
)
self.complement_indices()
self.combination_n = len(self.combinations)
# ── data retrieval ───────────────────────────────────────────────
[docs]
def get_shuffled_set(self, combi_i: int) -> Tuple[List, List]:
"""Retrieve the *combi_i*-th shuffled split of the data.
Args:
combi_i: Index into :attr:`data_indices`.
Returns:
A tuple ``(shuffled_A, shuffled_B)`` of data subsets.
"""
idx_a, idx_b = self.data_indices[combi_i]
shuffle_a = [self.combined_data[i] for i in idx_a]
shuffle_b = [self.combined_data[i] for i in idx_b]
return shuffle_a, shuffle_b