# Source code for stag.constants

# ╔══════════════════════════════════════════════════════════════════╗
# ║  STAG — constants                                                ║
# ║  « one source of truth for palettes, paths, and figure rules »   ║
# ╠══════════════════════════════════════════════════════════════════╣
# ║  Central configuration for all STAG analyses and figures.       ║
# ║  Import this module instead of hardcoding hex values, paths,    ║
# ║  feature names, or behavioural-category labels.                 ║
# ║                                                                  ║
# ║  Wong (2011) colourblind-safe palette with semantic mappings    ║
# ║  to the eight prototypical movements (PM0–PM7) identified in    ║
# ║  the k = 8 representative clustering run.                       ║
# ╚══════════════════════════════════════════════════════════════════╝
"""Central configuration for STAG analyses and figures."""

from __future__ import annotations

import csv
from pathlib import Path
from typing import TYPE_CHECKING

import matplotlib.pyplot as plt

from stag.local_paths import get_path_obj as _local

if TYPE_CHECKING:
    from matplotlib.figure import Figure
    from pandas import DataFrame


# ┌────────────────────────────────────────────────────────────┐
# │ Sensor / sampling constants  « hardware-fixed values »     │
# └────────────────────────────────────────────────────────────┘

FPS: int = 50
"""Sampling rate of the accelerometer, in Hz."""

GPS_FPS: float = 0.5
"""Sampling rate of the GPS, in Hz (0.5 Hz means one fix every 2 s)."""

VIDEO_FPS: int = 30
"""Sampling rate of the ground-truth video, in Hz."""

ACCELEROMETER_RANGE_G: int = 16
"""Dynamic range of the accelerometer in multiples of Earth gravity (±16 g)."""

FEATURE_LABELS: tuple[str, ...] = tuple(
    f"{site}_{axis}"
    for site in ("Head", "Ear")  # all three head axes come before the ear axes
    for axis in "XYZ"
)
"""The six accelerometer feature names, in clustering-input-matrix order:
``Head_X, Head_Y, Head_Z, Ear_X, Ear_Y, Ear_Z``."""


# ┌────────────────────────────────────────────────────────────┐
# │ Wong (2011) palette  « colourblind-safe base colours »     │
# └────────────────────────────────────────────────────────────┘

WONG: dict[str, str] = dict(
    black="#000000",
    orange="#E69F00",
    sky_blue="#56B4E9",
    bluish_green="#009E73",
    yellow="#F0E442",
    blue="#0072B2",
    vermilion="#D55E00",
    reddish_purple="#CC79A7",
)
"""Colour name → hex code for the colourblind-safe palette of Wong
(2011, Nature Methods 8: 441)."""


# ┌────────────────────────────────────────────────────────────┐
# │ Prototypical movements  « k = 8 representative run »       │
# └────────────────────────────────────────────────────────────┘

PM_NAMES: dict[int, str] = dict(
    enumerate(
        (
            "quiescent",            # PM0
            "resting",              # PM1
            "ear_flick_out",        # PM2
            "resting_active",       # PM3
            "ear_flick_back",       # PM4
            "ear_flick_composite",  # PM5
            "grazing_stepping",     # PM6
            "stationary_grazing",   # PM7
        )
    )
)
"""Canonical snake_case label for each PM index (0–7)."""

PM_DISPLAY_NAMES: dict[int, str] = dict(
    enumerate(
        [
            "Quiescent",              # PM0
            "Resting",                # PM1
            "Ear flick (out)",        # PM2
            "Resting, active",        # PM3
            "Ear flick (back)",       # PM4
            "Ear flick (composite)",  # PM5
            "Stepping, grazing",      # PM6
            "Stationary, grazing",    # PM7
        ]
    )
)
"""Long human-readable name for each PM; the three ear-flick subtypes
stay distinguishable.

Intended for axes that give every cluster its own row or column
(centroid heatmaps, confusion matrices, transition matrices).
"""

PM_DISPLAY_NAMES_SHORT: dict[int, str] = dict(
    enumerate(
        [
            "Quiescent",            # PM0
            "Resting",              # PM1
            "Ear flick",            # PM2 (subtype dropped)
            "Resting, active",      # PM3
            "Ear flick",            # PM4 (subtype dropped)
            "Ear flick",            # PM5 (subtype dropped)
            "Stepping, grazing",    # PM6
            "Stationary, grazing",  # PM7
        ]
    )
)
"""Short human-readable name for each PM; PM2, PM4 and PM5 all map to
the single label "Ear flick".

Intended for legends and figures in which the three ear-flick
prototypes share one colour (see :data:`PM_COLOURS`) and are meant to
read as a single behaviour.
"""

PM_CATEGORY: dict[int, str] = {
    pm: family
    # Families are listed in the order their PMs should appear in the mapping.
    for family, members in (
        ("inactive", (0, 1, 3)),
        ("grazing", (6, 7)),
        ("ear_flick", (2, 4, 5)),
    )
    for pm in members
}
"""Behavioural-category label for each PM index; the eight PMs fall
into three families (inactive, grazing, ear_flick)."""

PM_COLOURS: dict[int, str] = dict(
    enumerate(
        [
            "#66ddcc",  # PM0 · Quiescent
            "#1d8475",  # PM1 · Resting
            "#e0ce61",  # PM2 · Ear flick (shared swatch)
            "#96d3ed",  # PM3 · Resting, active
            "#e0ce61",  # PM4 · Ear flick (shared swatch)
            "#e0ce61",  # PM5 · Ear flick (shared swatch)
            "#86771a",  # PM6 · Stepping, grazing
            "#8497b0",  # PM7 · Stationary, grazing
        ]
    )
)
"""Hex colour for each PM index, as used in every manuscript figure.

The three ear-flick subtypes (PM2, PM4, PM5) deliberately share
``#e0ce61`` so that they read as one visual category.  This mapping is
the authoritative figure palette and takes precedence over any
Wong-derived category palette.
"""

PM_CATEGORY_COLOURS: dict[str, str] = {
    "inactive":  "#3F7FB5",   # blue
    "grazing":   "#4E8C3A",   # green
    "ear_flick": "#D96828",   # orange
}
"""Per-behavioural-family colour swatch — lab canonical palette.

These three colours are the agreed-upon family swatches for every
figure that groups the eight PMs into their behavioural families
(Inactive = PM 0 + PM 1 + PM 3, Grazing = PM 6 + PM 7, Ear flick =
PM 2 + PM 4 + PM 5).  They are deliberately a separate palette: none
of these hex values appears in :data:`PM_COLOURS` (the per-PM figure
palette) or in :data:`WONG`.  The family-level chart asks a different
question (three families, not eight prototypes) and the lab uses a
distinct palette to make that distinction visually obvious in
posters, slides, and the manuscript figures.

Use :data:`PM_COLOURS` for per-PM plotting; use this dict for
category-level legends and category-grouped bar charts where one
swatch per family is wanted.
"""


# ┌────────────────────────────────────────────────────────────┐
# │ Figure defaults  « SVG + PNG + CSV triple output »         │
# └────────────────────────────────────────────────────────────┘

FIGURE_DPI: int = 200
"""Resolution used for raster (PNG) exports."""

FIGURE_SIZE_SINGLE: tuple[float, float] = (3.5, 2.8)
"""Figure size in inches for a single-column Elsevier figure."""

FIGURE_SIZE_DOUBLE: tuple[float, float] = (7.2, 4.5)
"""Figure size in inches for a two-column (full-width) Elsevier figure."""

FONT_FAMILY: str = "DejaVu Sans"
"""Sans-serif font used by default in figures."""

# rcParams that :func:`apply_figure_defaults` pushes into matplotlib.
_FIGURE_RC: dict[str, object] = {
    # "none" keeps SVG text as text, so it stays editable in Inkscape.
    "svg.fonttype": "none",
    "savefig.dpi": FIGURE_DPI,
    "savefig.bbox": "tight",
    "font.family": FONT_FAMILY,
    # Switched off: top/right spines and the background grid.
    **dict.fromkeys(("axes.spines.top", "axes.spines.right", "axes.grid"), False),
}


def apply_figure_defaults() -> None:
    """Load the STAG figure settings (``_FIGURE_RC``) into matplotlib's ``rcParams``."""
    active_rc = plt.rcParams
    active_rc.update(_FIGURE_RC)
# ┌────────────────────────────────────────────────────────────┐
# │ Path templates  « default output layout »                  │
# └────────────────────────────────────────────────────────────┘

RESULTS_DIR_DEFAULT: Path = Path("results")
"""Default top-level output directory (relative to working dir)."""

# Sub-directory names for figure and table outputs.
# NOTE(review): presumably joined under the results directory by callers —
# the join is not visible in this module; confirm.
FIGURES_SUBDIR: str = "figures"
TABLES_SUBDIR: str = "tables"


# ┌────────────────────────────────────────────────────────────┐
# │ Canonical data paths  « DINZ deer-2024 archive »           │
# └────────────────────────────────────────────────────────────┘
#
# See [[Data Files — Source of Truth]] in the DINZ Obsidian folder
# for the full file map (what is canonical, derived, legacy, or
# corrupt).  The paths below are the ones analysis scripts default
# to; pass --meta-dir / --data-file on the CLI to override.

HCS_SOURCE_DIR: Path = _local(
    "hcs_source",
    default="<hcs_source not configured - see local_paths.template.json>",
)
"""Read-only network archive of the 2024 deer dataset (3.0 TB).

Too slow for direct analysis — mirror the curated subset locally.
Resolved by :mod:`stag.local_paths`: ``STAG_HCS_DIR`` env var, then
``local_paths.json`` ``hcs_source`` field, then the placeholder
default (which will crash any downstream read with a clear path).
"""

LOCAL_DATA_DIR: Path = _local(
    "data_root",
    default="<data_root not configured - see local_paths.template.json>",
)
"""Local working copy on the NVMe data drive.

Tier-1 footprint ≈ 43 GB.  All path constants below are anchored
here.  Resolved by :mod:`stag.local_paths`: ``STAG_DATA_DIR`` env var,
then ``local_paths.json`` ``data_root`` field, then placeholder
default.
"""

RAW_CLUSTERING_INPUT: Path = LOCAL_DATA_DIR / "clust_data_raw_20240412.npy"
"""Raw 8-column input the SLURM clustering actually read.

Shape ``(204_554_618, 8)`` float64.  Columns 0–5 are the six
accelerometer axes; columns 6–7 are GPS-derived speed and tortuosity
(excluded from clustering).
"""

MAXABS_CLUSTERING_INPUT: Path = LOCAL_DATA_DIR / "clust_data_maxabs_6col.npy"
"""Six-column MaxAbs-scaled feature matrix derived from
:data:`RAW_CLUSTERING_INPUT` by ``scripts/preprocess_clustering_data.py``.

Shape ``(204_554_618, 6)`` float64.  Each column is divided by its
absolute maximum, mapping the data to [-1, 1] per column — reproducing
the 2024 SLURM pipeline's normalisation exactly (MaxAbsScaler + col-5
±7.99 clip).  This is the file every internal- and external-validation
analysis consumes.
"""

MAXABS_SCALER_CSV: Path = LOCAL_DATA_DIR / "clust_data_maxabs_6col.maxabs.csv"
"""Per-column max-abs divisors used to produce
:data:`MAXABS_CLUSTERING_INPUT`.

Written alongside the .npy by the preprocess script — needed to invert
the scaling back to physical units for centroid interpretation.
"""

DEER_DB: Path = LOCAL_DATA_DIR / "deer_data_gps.db"
"""SQLite copy of the canonical deer-2024 DB (~58 GB) — six tables:
``accelerometer_data`` (~222 M rows), ``cluster_labels`` (~204.5 M rows,
FK ``acc_id`` → ``accelerometer_data.data_id``), ``trajectory_data``
(GPS, ~207 M rows after upsampling), ``deer_info`` (26 animals),
``video_observation_reference`` (926 clips), ``video_availability``
(2.8 M rows).

All required composite/FK indexes are already in place.  See
``scripts/cache_label_timeline.py`` for the canonical join that aligns
DB rows with the saved ``labels.npy``.
"""

LABEL_TIMELINE_DEER_IDS: Path = LOCAL_DATA_DIR / "label_timeline_deer_ids.npy"
"""Per-sample ``deer_id`` aligned with the saved k=8 ``labels.npy``
(shape ``(204_554_618,)`` int8).

Built once by ``scripts/cache_label_timeline.py`` from the DEER_DB join
``cluster_labels.acc_id`` → ``accelerometer_data.data_id``.
"""

LABEL_TIMELINE_TIMESTAMPS: Path = LOCAL_DATA_DIR / "label_timeline_timestamps.npy"
"""Per-sample wall-clock timestamp aligned with the saved k=8
``labels.npy`` (shape ``(204_554_618,)`` int64 nanoseconds since the
Unix epoch, NZ local time as stored in the DB).

Built once by ``scripts/cache_label_timeline.py``.
"""

CLUSTER_RESULTS_DIR: Path = LOCAL_DATA_DIR / "cluster_results" / "deer6raw"
"""Root of the per-fit metadata + centroids + labels tree produced by
the Aoraki SLURM sweep.

Per-fit JSONs and centroid arrays are present in full; labels are
present for the 24 representative runs only.
"""

CANONICAL_K8_LABELS: Path = (
    CLUSTER_RESULTS_DIR
    / "delSize_0"
    / "k_8"
    / "labels"
    / "deer6raw_labels_k8_delSize0_partA.npy"
)
"""Manuscript-aligned k=8 labels.npy (Partition A).

Of the 50 fits saved at ``delSize_0/k_8``, 17 converged to the basin
whose centroids match :data:`centroid_label_info` exactly and whose
cluster IDs are in manuscript PM order (0 = Quiescent, 1 = Resting, …
7 = Stationary grazing).  None of those 17 had their labels mirrored
from the original sweep, so this file is regenerated locally by
nearest-manuscript-centroid assignment on
:data:`MAXABS_CLUSTERING_INPUT` — bit-equivalent to the converged
k-means assignment for that basin, prevalences match the manuscript
table to within 0.013 % per PM.

Use this constant — not a glob of the labels directory — wherever
downstream code wants "the k=8 labels".
"""
def save_figure(
    fig: "Figure",
    stem: str,
    output_dir: Path,
    data: "DataFrame | None" = None,
) -> None:
    """Write a figure to disk as SVG and PNG, plus a CSV of ``data`` when given.

    Args:
        fig: The matplotlib figure to export.
        stem: Base filename without extension; ``.svg``, ``.png`` and
            ``.csv`` are appended to it.
        output_dir: Destination directory; it and any missing parents
            are created.
        data: Optional dataframe written as ``<stem>.csv`` (without the
            index) next to the two image files.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Re-apply the STAG rcParams before either savefig call.
    apply_figure_defaults()

    fig.savefig(target_dir / f"{stem}.svg")
    fig.savefig(target_dir / f"{stem}.png", dpi=FIGURE_DPI)

    if data is None:
        return
    data.to_csv(target_dir / f"{stem}.csv", index=False, quoting=csv.QUOTE_MINIMAL)