# Source code for rerandomstats.data_io

"""
┌──────────────────────────────────────────────────────────────────────┐
│            data_io.py « CSV Data Ingestion Utilities »                │
│                                                                      │
│  Reads wide-format CSV files (including German-locale semicolon-    │
│  delimited variants), converts them to NumPy matrices, and          │
│  reshapes wide tables into long (value, id) format suitable for     │
│  statistical analysis.                                              │
│                                                                      │
│  Author : Bart R.H. Geurten                                         │
│  Licence: MIT                                                        │
└──────────────────────────────────────────────────────────────────────┘
"""

from __future__ import annotations

import csv
from pathlib import Path
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np


class DataIO:
    """CSV reader and wide→long table converter.

    Handles both standard comma-separated and German-locale
    semicolon-separated CSV files.

    Args:
        german_csv: If ``True``, treat semicolons as field delimiters and
            commas inside a cell as decimal separators (the format
            produced by German-locale Excel exports).

    Attributes:
        raw_data: Rows as read from the CSV by :meth:`read_csv` (one list
            of cell strings per row), or ``None`` before the first read.
        german_csv: The flag passed to the constructor.

    Example:
        >>> dio = DataIO()
        >>> ids, vals = dio.wide_table_csv_to_long_table('data.csv')  # doctest: +SKIP
    """

    def __init__(self, german_csv: bool = False) -> None:
        self.raw_data: Optional[List] = None
        self.german_csv = german_csv

    # ── internal helpers ─────────────────────────────────────────────
    def _row_cells(self, row):
        """Return the cells of one row as a sequence.

        List rows (as produced by :meth:`read_csv`) are returned
        unchanged.  For backward compatibility, a row given as a single
        delimiter-joined string is split when *german_csv* is set; both
        ``;`` and ``,`` are accepted as separators in that legacy form.
        """
        if self.german_csv and isinstance(row, str):
            return row.replace(";", ",").split(",")
        return row

    @staticmethod
    def _cell_to_float(cell, decimal_comma: bool) -> float:
        """Convert one cell to ``float``; empty cells become ``np.nan``.

        ``None`` and empty / whitespace-only strings count as empty.
        When *decimal_comma* is true, a comma inside a string cell is
        read as the decimal separator.
        """
        if cell is None:
            return np.nan
        if isinstance(cell, str):
            text = cell.strip()
            if not text:
                return np.nan
            if decimal_comma:
                text = text.replace(",", ".")
            return float(text)
        return float(cell)

    # ── CSV reading ──────────────────────────────────────────────────
    def read_csv(self, file_path: Union[str, Path]) -> None:
        """Read a CSV file into :attr:`raw_data`.

        The file is decoded as UTF-8 (a leading BOM is dropped) and
        parsed with ``;`` as delimiter when *german_csv* is set,
        otherwise with ``,``.

        Args:
            file_path: Path to the CSV file.
        """
        delimiter = ";" if self.german_csv else ","
        # newline="" lets the csv module handle line endings itself, as
        # its documentation requires for correct quoted-field parsing.
        with open(file_path, "r", encoding="utf-8-sig", newline="") as fh:
            self.raw_data = list(csv.reader(fh, delimiter=delimiter))

    # ── matrix construction ──────────────────────────────────────────
    def make_square_np_matrix(self, values: List) -> np.ndarray:
        """Build a rectangular NumPy float matrix from row data.

        The matrix is as wide as the longest row; missing and empty
        cells are filled with ``np.nan``.

        Args:
            values: List of rows.  Each row is a list of cell strings;
                when *german_csv* is set a row may also be a single
                delimiter-joined string.

        Returns:
            2-D :class:`numpy.ndarray` of shape ``(nrows, ncols)``;
            ``(0, 0)`` when *values* is empty.

        Raises:
            ValueError: If a non-empty cell cannot be parsed as a float.
        """
        rows = [self._row_cells(row) for row in values]
        nrows = len(rows)
        # Size by the widest row so ragged input cannot overflow.
        ncols = max((len(row) for row in rows), default=0)

        matrix = np.full((nrows, ncols), np.nan)
        for i, row in enumerate(rows):
            for j, cell in enumerate(row):
                matrix[i, j] = self._cell_to_float(cell, self.german_csv)
        return matrix

    # ── header extraction ────────────────────────────────────────────
    def split_csv_headers(self) -> List[str]:
        """Return the column headers from the first CSV row.

        Returns:
            List of header strings; empty if the file had no rows.

        Raises:
            AssertionError: If :meth:`read_csv` has not been called.
        """
        if self.raw_data is None:
            # Raised explicitly so the check survives ``python -O``; the
            # exception type and message match the former ``assert``.
            raise AssertionError("Call read_csv() first.")
        if not self.raw_data:
            return []
        return self._row_cells(self.raw_data[0])

    # ── wide → long conversion ───────────────────────────────────────
    def wide_table_to_value_id_list(
        self,
        values: np.ndarray,
        col_header: Sequence[str],
    ) -> Tuple[List[str], List[float]]:
        """Convert a wide matrix to parallel id / value lists.

        Non-NaN cells are unpacked row by row (left to right within a
        row), tagging each value with the header of its column.

        Args:
            values: 2-D NumPy array.
            col_header: Column names (length must match
                ``values.shape[1]``).

        Returns:
            Tuple ``(id_list, value_list)``.
        """
        id_list: List[str] = []
        value_list: List[float] = []
        nrows, ncols = values.shape[0], values.shape[1]
        for i in range(nrows):
            for j in range(ncols):
                cell = values[i, j]
                if np.isnan(cell):
                    continue
                value_list.append(cell)
                id_list.append(col_header[j])
        return id_list, value_list

    def wide_table_csv_to_long_table(
        self, file_path: Union[str, Path]
    ) -> Tuple[List[str], List[float]]:
        """Read a wide-format CSV and return long-format id/value lists.

        Convenience wrapper that chains :meth:`read_csv`,
        :meth:`split_csv_headers`, :meth:`make_square_np_matrix`, and
        :meth:`wide_table_to_value_id_list`.  The first row is taken as
        the header; every following row is data.

        Args:
            file_path: Path to the wide-format CSV.

        Returns:
            Tuple ``(id_list, value_list)``.
        """
        self.read_csv(file_path)
        # split_csv_headers() also verifies that raw_data was populated.
        headers = self.split_csv_headers()
        data = self.make_square_np_matrix(self.raw_data[1:])
        return self.wide_table_to_value_id_list(data, headers)

    # ── subsetting ───────────────────────────────────────────────────
    @staticmethod
    def get_subset_of_data(
        id_list: Sequence[str],
        value_list: Sequence[float],
        id_subset: Sequence[str],
    ) -> Tuple[List[str], List[float]]:
        """Filter parallel id/value lists to a subset of ids.

        Args:
            id_list: Full list of identifiers.
            value_list: Full list of values.
            id_subset: Identifiers to keep.

        Returns:
            Filtered ``(id_list, value_list)`` tuple, in input order.

        Example:
            >>> DataIO.get_subset_of_data(
            ...     ['a', 'b', 'c'], [1, 2, 3], ['a', 'c']
            ... )
            (['a', 'c'], [1, 3])
        """
        kept = [
            (ident, val)
            for ident, val in zip(id_list, value_list)
            if ident in id_subset
        ]
        subset_ids: List[str] = [ident for ident, _ in kept]
        subset_vals: List[float] = [val for _, val in kept]
        return subset_ids, subset_vals