# Source code for igloo_weta.ingest

#!/usr/bin/env python3
"""
 ┌─────────────────────────────────────────────────────────────────────┐
 │  INGEST                                « loading the raw signal »   │
 └─────────────────────────────────────────────────────────────────────┘

Data loading, validation, and cleaning for field temperature recordings,
photogrammetric burrow geometry, and wētā morphometrics.

All loaders return pandas DataFrames with consistent column names.
Paths default to the ``data/`` directory adjacent to the package root.

Example::

    from igloo_weta.ingest import load_all
    ds = load_all("/path/to/data")
    print(ds.hourly_24h.columns)
"""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd


# ┌─────────────────────────────────────────────────────────────────────┐
# │  DATA BUNDLE                          « one object to rule them »   │
# └─────────────────────────────────────────────────────────────────────┘
@dataclass
class DataBundle:
    """Bundle of every dataset used by the analysis, one DataFrame per field.

    Instances are normally produced by :func:`load_all`; the field order
    below is also the positional order of the generated ``__init__``.

    Attributes:
        hourly_24h: Mean 24-h diurnal cycle per rock (hourly bins).
        incubator: Full-duration hourly aggregates from incubator control.
        overall: Per-rock summary statistics across all days.
        daily: Day-by-day averages per rock.
        rock_phys: Photogrammetric cavity geometry (foam cast measurements).
        weta_morph: Wētā morphometrics and species assignments.
    """

    # Field temperature recordings.
    hourly_24h: pd.DataFrame
    incubator: pd.DataFrame
    overall: pd.DataFrame
    daily: pd.DataFrame

    # Burrow geometry and animal measurements.
    rock_phys: pd.DataFrame
    weta_morph: pd.DataFrame
# ┌─────────────────────────────────────────────────────────────────────┐
# │  PATH RESOLUTION                            « finding the goods »   │
# └─────────────────────────────────────────────────────────────────────┘


def _default_data_dir() -> Path:
    """Return ``<package_root>/../data`` as the default data directory.

    ``<package_root>`` is the directory that contains this module; the
    returned path is absolute because ``__file__`` is resolved first.
    The directory is not checked for existence here.
    """
    package_root = Path(__file__).resolve().parent
    return package_root.parent / "data"


def _resolve(data_dir: Optional[str], filename: str) -> Path:
    """Build and validate a file path inside the data directory.

    Args:
        data_dir: Override data directory. ``None`` (or any other falsy
            value, e.g. ``""``) selects :func:`_default_data_dir`.
        filename: Filename to look up inside the data directory.

    Returns:
        Resolved absolute ``Path`` to the existing file. A relative
        ``data_dir`` is resolved against the current working directory.

    Raises:
        FileNotFoundError: If the path does not point at an existing file.
    """
    base = Path(data_dir) if data_dir else _default_data_dir()
    # Resolve up front so the return value honours the documented
    # "resolved absolute" contract even for a relative ``data_dir``, and
    # so the error message names the exact location that was probed.
    candidate = (base / filename).resolve()
    if not candidate.is_file():
        raise FileNotFoundError(f"Expected data file not found: {candidate}")
    return candidate


# ┌─────────────────────────────────────────────────────────────────────┐
# │  LOADERS                               « reading the bit stream »   │
# └─────────────────────────────────────────────────────────────────────┘
def load_hourly_24h(data_dir: Optional[str] = None) -> pd.DataFrame:
    """Read ``24h_hourly_averages.csv``: the 24-hour hourly means per rock.

    Args:
        data_dir: Directory holding the data files. ``None`` falls back to
            the package default.

    Returns:
        The CSV contents as a DataFrame, with columns including ``Hour``,
        ``rock``, ``inside_mean``, ``outside_mean``, ``diff_mean``, etc.

    Raises:
        FileNotFoundError: If the CSV is missing from the data directory.
    """
    return pd.read_csv(_resolve(data_dir, "24h_hourly_averages.csv"))
def load_incubator(data_dir: Optional[str] = None) -> pd.DataFrame:
    """Read the incubator passive-control hourly time series.

    The data come from ``full_duration_hourly_aggregates.csv``.

    Args:
        data_dir: Directory holding the data files. ``None`` falls back to
            the package default.

    Returns:
        DataFrame of inside/outside temperature means, SEMs, and
        confidence intervals. The file is read with pandas defaults, so
        no column is promoted to the index; ``elapsed_hour`` (presumably a
        column of the CSV — verify against the file) stays an ordinary
        column.

    Raises:
        FileNotFoundError: If the CSV is missing from the data directory.
    """
    csv_path = _resolve(data_dir, "full_duration_hourly_aggregates.csv")
    frame = pd.read_csv(csv_path)
    return frame
def load_overall(data_dir: Optional[str] = None) -> pd.DataFrame:
    """Read ``full_duration_overall_stats.csv``: per-rock overall summaries.

    Args:
        data_dir: Directory holding the data files. ``None`` falls back to
            the package default.

    Returns:
        DataFrame with one row per rock: mean temperatures, confidence
        intervals, humidity.

    Raises:
        FileNotFoundError: If the CSV is missing from the data directory.
    """
    return pd.read_csv(_resolve(data_dir, "full_duration_overall_stats.csv"))
def load_daily(data_dir: Optional[str] = None) -> pd.DataFrame:
    """Read ``total_duration_averages.csv``: day-by-day averages per rock.

    Args:
        data_dir: Directory holding the data files. ``None`` falls back to
            the package default.

    Returns:
        DataFrame with ``day``, ``rock``, and temperature columns.

    Raises:
        FileNotFoundError: If the CSV is missing from the data directory.
    """
    source = _resolve(data_dir, "total_duration_averages.csv")
    return pd.read_csv(source)
def load_rock_physics(data_dir: Optional[str] = None) -> pd.DataFrame:
    """Read the foam-cast burrow geometry from ``Rock_data.xlsx``.

    Only the worksheet named ``Sheet1`` is read. The ``Total Volume`` and
    ``Total Surface area`` columns describe the **air cavity** (the foam
    imprint), not the stone itself; stone shell properties are derived
    later in :mod:`igloo_weta.physics`.

    Args:
        data_dir: Directory holding the data files. ``None`` falls back to
            the package default.

    Returns:
        DataFrame with ``Rock number``, ``Total Volume (cm3)``,
        ``Total Surface area (cm2)``, and chunk data.

    Raises:
        FileNotFoundError: If the workbook is missing from the data
            directory.
    """
    return pd.read_excel(
        _resolve(data_dir, "Rock_data.xlsx"),
        sheet_name="Sheet1",
    )
# Lower-cased ``Weta number`` prefixes paired with the species they denote.
_SPECIES_BY_PREFIX = (
    ("hm", "H. maori"),
    ("hthora", "H. thoracica"),
    ("hcrass", "H. crassidens"),
)


def _species_from_id(weta_id: object) -> str:
    """Map a single ``Weta number`` value to its species, else ``"unknown"``."""
    # ``str()`` copes with NaN / numeric cells; ``strip()`` tolerates stray
    # whitespace around hand-typed spreadsheet IDs.
    key = str(weta_id).strip().lower()
    for prefix, species in _SPECIES_BY_PREFIX:
        if key.startswith(prefix):
            return species
    return "unknown"


def load_weta_morphometrics(
    data_dir: Optional[str] = None,
) -> pd.DataFrame:
    """Load wētā body measurements and assign species from ID prefixes.

    Reads worksheet ``Sheet1`` of ``Weta_thermoregulation_datasheet.xlsx``.
    Species are inferred case-insensitively from the ``Weta number``
    prefix, ignoring surrounding whitespace: ``HM`` → *H. maori*,
    ``Hthora`` → *H. thoracica*, ``Hcrass`` → *H. crassidens*. Anything
    else (including blank cells) is labelled ``"unknown"``.

    Args:
        data_dir: Path to data directory. ``None`` uses the default.

    Returns:
        The worksheet as a DataFrame with an added ``species`` column.

    Raises:
        FileNotFoundError: If the workbook is missing from the data
            directory.
        KeyError: If the sheet has no ``Weta number`` column.
    """
    workbook = _resolve(data_dir, "Weta_thermoregulation_datasheet.xlsx")
    df = pd.read_excel(workbook, sheet_name="Sheet1")
    # Only one column drives the classification, so map over that column
    # rather than materialising every row with ``DataFrame.apply(axis=1)``.
    df["species"] = df["Weta number"].map(_species_from_id)
    return df
def summarise_species(morph_df: pd.DataFrame) -> dict:
    """Summarise body weight for each of the three known species.

    For every species in turn, the non-missing ``Weight (g)`` values of the
    rows whose ``species`` equals that name are collected. A species with
    no such values triggers a warning and is left out of the result.

    Args:
        morph_df: Output of :func:`load_weta_morphometrics`; must provide
            ``species`` and ``Weight (g)`` columns.

    Returns:
        Dict keyed by species name, each value a dict with ``n``, ``mean``,
        ``std``, ``min``, ``max``, ``median`` (plain floats, ``n`` an int)
        and ``weights`` (a copy of the underlying values array).
    """
    species_labels = morph_df["species"]
    float_stats = ("mean", "std", "min", "max", "median")

    summary = {}
    for name in ("H. maori", "H. thoracica", "H. crassidens"):
        weights = morph_df.loc[species_labels == name, "Weight (g)"].dropna()
        if weights.empty:
            warnings.warn(f"No weight data for {name}")
            continue

        entry = {"n": len(weights)}
        for stat in float_stats:
            entry[stat] = float(getattr(weights, stat)())
        # Copy so callers can mutate the array without touching morph_df.
        entry["weights"] = weights.values.copy()
        summary[name] = entry
    return summary
# ┌─────────────────────────────────────────────────────────────────────┐
# │  BUNDLE LOADER                            « the whole enchilada »   │
# └─────────────────────────────────────────────────────────────────────┘
def load_all(data_dir: Optional[str] = None) -> DataBundle:
    """Run every loader against one data directory and bundle the results.

    The loaders are invoked in the order of the :class:`DataBundle`
    fields, so the first missing file is the one reported.

    Args:
        data_dir: Override for the data directory path. ``None`` lets each
            loader use the package default.

    Returns:
        Populated :class:`DataBundle` ready for analysis.
    """
    loaders = {
        "hourly_24h": load_hourly_24h,
        "incubator": load_incubator,
        "overall": load_overall,
        "daily": load_daily,
        "rock_phys": load_rock_physics,
        "weta_morph": load_weta_morphometrics,
    }
    # Dicts preserve insertion order, so files are read in field order.
    frames = {field: loader(data_dir) for field, loader in loaders.items()}
    return DataBundle(**frames)