# ╔══════════════════════════════════════════════════════════════════╗
# ║ STAG — constants ║
# ║ « one source of truth for palettes, paths, and figure rules » ║
# ╠══════════════════════════════════════════════════════════════════╣
# ║ Central configuration for all STAG analyses and figures. ║
# ║ Import this module instead of hardcoding hex values, paths, ║
# ║ feature names, or behavioural-category labels. ║
# ║ ║
# ║ Wong (2011) colourblind-safe palette with semantic mappings ║
# ║ to the eight prototypical movements (PM0–PM7) identified in ║
# ║ the k = 8 representative clustering run. ║
# ╚══════════════════════════════════════════════════════════════════╝
"""Central configuration for STAG analyses and figures."""
from __future__ import annotations
import csv
from pathlib import Path
from typing import TYPE_CHECKING
import matplotlib.pyplot as plt
from stag.local_paths import get_path_obj as _local
if TYPE_CHECKING:
from matplotlib.figure import Figure
from pandas import DataFrame
# ┌────────────────────────────────────────────────────────────┐
# │ Sensor / sampling constants « hardware-fixed values » │
# └────────────────────────────────────────────────────────────┘
FPS: int = 50
"""Sampling rate of the accelerometer, in Hz."""

GPS_FPS: float = 0.5
"""Sampling rate of the GPS, in Hz — 0.5 Hz is one fix every 2 s."""

VIDEO_FPS: int = 30
"""Sampling rate of the ground-truth video, in Hz."""

ACCELEROMETER_RANGE_G: int = 16
"""Dynamic range of the accelerometer, in multiples of Earth gravity (±16 g)."""

FEATURE_LABELS: tuple[str, ...] = (
    # head-mounted axes
    "Head_X",
    "Head_Y",
    "Head_Z",
    # ear-mounted axes
    "Ear_X",
    "Ear_Y",
    "Ear_Z",
)
"""Names of the six accelerometer features, in clustering-input-matrix column order."""
# ┌────────────────────────────────────────────────────────────┐
# │ Wong (2011) palette « colourblind-safe base colours » │
# └────────────────────────────────────────────────────────────┘
WONG: dict[str, str] = {
    "black": "#000000",
    "orange": "#E69F00",
    "sky_blue": "#56B4E9",
    "bluish_green": "#009E73",
    "yellow": "#F0E442",
    "blue": "#0072B2",
    "vermilion": "#D55E00",
    "reddish_purple": "#CC79A7",
}
"""Colourblind-safe base palette of Wong (2011, Nature Methods 8: 441).

Maps a snake_case colour name to its hex code.
"""
# ┌────────────────────────────────────────────────────────────┐
# │ Prototypical movements « k = 8 representative run » │
# └────────────────────────────────────────────────────────────┘
PM_NAMES: dict[int, str] = {
    0: "quiescent",
    1: "resting",
    2: "ear_flick_out",
    3: "resting_active",
    4: "ear_flick_back",
    5: "ear_flick_composite",
    6: "grazing_stepping",
    7: "stationary_grazing",
}
"""Snake_case label for each canonical PM index (0–7)."""
PM_DISPLAY_NAMES: dict[int, str] = {
    0: "Quiescent",
    1: "Resting",
    2: "Ear flick (out)",
    3: "Resting, active",
    4: "Ear flick (back)",
    5: "Ear flick (composite)",
    6: "Stepping, grazing",
    7: "Stationary, grazing",
}
"""Full display name for each PM, with the ear-flick subtype spelled out.

Intended for axes that give every cluster its own row or column:
centroid heatmaps, confusion matrices, transition matrices.
"""
PM_DISPLAY_NAMES_SHORT: dict[int, str] = {
    0: "Quiescent",
    1: "Resting",
    2: "Ear flick",  # subtype dropped
    3: "Resting, active",
    4: "Ear flick",  # subtype dropped
    5: "Ear flick",  # subtype dropped
    6: "Stepping, grazing",
    7: "Stationary, grazing",
}
"""Abbreviated display name for each PM.

The three ear-flick subtypes (PM2, PM4, PM5) all map to the single
label ``Ear flick``. Intended for legends and figures in which those
three prototypes share one colour (see :data:`PM_COLOURS`) and are read
as a single behaviour.
"""
PM_CATEGORY: dict[int, str] = {
    pm: family
    # Families are listed in the order their members must appear in the
    # mapping, so iteration order is 0, 1, 3, 6, 7, 2, 4, 5.
    for family, members in (
        ("inactive", (0, 1, 3)),
        ("grazing", (6, 7)),
        ("ear_flick", (2, 4, 5)),
    )
    for pm in members
}
"""Behavioural-category label for each PM index.

The eight PMs fall into three families: ``inactive``, ``grazing`` and
``ear_flick``.
"""
PM_COLOURS: dict[int, str] = {
    0: "#66ddcc",  # PM0 — Quiescent
    1: "#1d8475",  # PM1 — Resting
    2: "#e0ce61",  # PM2 — Ear flick (colour shared with PM4, PM5)
    3: "#96d3ed",  # PM3 — Resting, active
    4: "#e0ce61",  # PM4 — Ear flick (colour shared with PM2, PM5)
    5: "#e0ce61",  # PM5 — Ear flick (colour shared with PM2, PM4)
    6: "#86771a",  # PM6 — Stepping, grazing
    7: "#8497b0",  # PM7 — Stationary, grazing
}
"""Colour assigned to each PM in every manuscript figure.

PM2, PM4 and PM5 deliberately share ``#e0ce61`` so that the three
ear-flick subtypes read as one visual category. This is the
authoritative figure palette; it takes precedence over any Wong-derived
category palette.
"""
PM_CATEGORY_COLOURS: dict[str, str] = {
    "inactive": "#3F7FB5",  # blue
    "grazing": "#4E8C3A",  # green
    "ear_flick": "#D96828",  # orange
}
"""Per-behavioural-family colour swatch — lab canonical palette.

Keys are the family labels that appear as values in :data:`PM_CATEGORY`.
These three colours are the agreed-upon family swatches for every
figure that groups the eight PMs into their behavioural families
(Inactive = PM 0 + PM 1 + PM 3, Grazing = PM 6 + PM 7, Ear flick =
PM 2 + PM 4 + PM 5).

They are deliberately *not* derived from :data:`PM_COLOURS`, the
authoritative per-PM figure palette: the family-level chart asks a
different question (three families, not eight prototypes) and the lab
uses a distinct palette to make that distinction visually obvious in
posters, slides, and the manuscript figures.

Use :data:`PM_COLOURS` for per-PM plotting; use this dict for
category-level legends and category-grouped bar charts where one
swatch per family is wanted.
"""
# ┌────────────────────────────────────────────────────────────┐
# │ Figure defaults « SVG + PNG + CSV triple output » │
# └────────────────────────────────────────────────────────────┘
FIGURE_DPI: int = 200
"""Resolution used for raster (PNG) export."""

FIGURE_SIZE_SINGLE: tuple[float, float] = (3.5, 2.8)
"""Figure size in inches for a single-column Elsevier figure."""

FIGURE_SIZE_DOUBLE: tuple[float, float] = (7.2, 4.5)
"""Figure size in inches for a two-column (full-width) Elsevier figure."""

FONT_FAMILY: str = "DejaVu Sans"
"""Sans-serif font used by default in figures."""

# rcParams that :func:`apply_figure_defaults` applies to Matplotlib.
_FIGURE_RC: dict[str, object] = {
    # export settings
    "svg.fonttype": "none",  # keeps SVG text editable in Inkscape
    "savefig.dpi": FIGURE_DPI,
    "savefig.bbox": "tight",
    # typography
    "font.family": FONT_FAMILY,
    # axes styling
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.grid": False,
}
# ┌────────────────────────────────────────────────────────────┐
# │ Path templates « default output layout » │
# └────────────────────────────────────────────────────────────┘
RESULTS_DIR_DEFAULT: Path = Path("results")
"""Top-level output directory used by default (relative to the working dir)."""

FIGURES_SUBDIR: str = "figures"
"""Name of the subdirectory for figure outputs."""

TABLES_SUBDIR: str = "tables"
"""Name of the subdirectory for table outputs."""
# ┌────────────────────────────────────────────────────────────┐
# │ Canonical data paths « DINZ deer-2024 archive » │
# └────────────────────────────────────────────────────────────┘
#
# See [[Data Files — Source of Truth]] in the DINZ Obsidian folder
# for the full file map (what is canonical, derived, legacy, or
# corrupt). The paths below are the ones analysis scripts default
# to; pass --meta-dir / --data-file on the CLI to override.
HCS_SOURCE_DIR: Path = _local(
    "hcs_source",
    default="<hcs_source not configured - see local_paths.template.json>",
)
"""Network archive holding the 2024 deer dataset (3.0 TB); read-only.

Direct analysis against it is too slow, so mirror the curated subset
locally. :mod:`stag.local_paths` resolves the value in this order: the
``STAG_HCS_DIR`` env var, the ``hcs_source`` field of
``local_paths.json``, and finally the placeholder default — which makes
any downstream read crash with the unconfigured path clearly visible.
"""

LOCAL_DATA_DIR: Path = _local(
    "data_root",
    default="<data_root not configured - see local_paths.template.json>",
)
"""Working copy of the data on the local NVMe data drive.

Tier-1 footprint ≈ 43 GB. The path constants defined after this one are
anchored here. :mod:`stag.local_paths` resolves the value in this
order: the ``STAG_DATA_DIR`` env var, the ``data_root`` field of
``local_paths.json``, then the placeholder default.
"""
RAW_CLUSTERING_INPUT: Path = LOCAL_DATA_DIR / "clust_data_raw_20240412.npy"
"""Raw eight-column array that the SLURM clustering actually read.

float64, shape ``(204_554_618, 8)``. Columns 0–5 hold the six
accelerometer axes; columns 6–7 hold GPS-derived speed and tortuosity,
which were excluded from clustering.
"""

MAXABS_CLUSTERING_INPUT: Path = LOCAL_DATA_DIR / "clust_data_maxabs_6col.npy"
"""MaxAbs-scaled six-column feature matrix.

This is the file every internal- and external-validation analysis
consumes. float64, shape ``(204_554_618, 6)``, derived from
:data:`RAW_CLUSTERING_INPUT` by ``scripts/preprocess_clustering_data.py``.
Every column is divided by its absolute maximum, so each column lies in
[-1, 1]; this reproduces the normalisation of the 2024 SLURM pipeline
exactly (MaxAbsScaler + col-5 ±7.99 clip).
"""

MAXABS_SCALER_CSV: Path = (
    LOCAL_DATA_DIR / "clust_data_maxabs_6col.maxabs.csv"
)
"""Per-column max-abs divisors that produced :data:`MAXABS_CLUSTERING_INPUT`.

The preprocess script writes this file alongside the .npy. It is needed
to invert the scaling and get back to physical units when interpreting
centroids.
"""

# NOTE(review): the ~58 GB quoted below exceeds the ≈ 43 GB tier-1
# footprint quoted for LOCAL_DATA_DIR — presumably the DB is not counted
# as tier-1; confirm and reconcile the two figures.
DEER_DB: Path = LOCAL_DATA_DIR / "deer_data_gps.db"
"""SQLite copy of the canonical deer-2024 DB (~58 GB).

Six tables:

* ``accelerometer_data`` — ~222 M rows
* ``cluster_labels`` — ~204.5 M rows, FK ``acc_id`` →
  ``accelerometer_data.data_id``
* ``trajectory_data`` — GPS, ~207 M rows after upsampling
* ``deer_info`` — 26 animals
* ``video_observation_reference`` — 926 clips
* ``video_availability`` — 2.8 M rows

All required composite/FK indexes are already in place. The canonical
join that aligns DB rows with the saved ``labels.npy`` lives in
``scripts/cache_label_timeline.py``.
"""

LABEL_TIMELINE_DEER_IDS: Path = (
    LOCAL_DATA_DIR / "label_timeline_deer_ids.npy"
)
"""``deer_id`` of every sample, aligned with the saved k=8 ``labels.npy``.

int8, shape ``(204_554_618,)``. ``scripts/cache_label_timeline.py``
builds it once from the DEER_DB join ``cluster_labels.acc_id`` →
``accelerometer_data.data_id``.
"""

LABEL_TIMELINE_TIMESTAMPS: Path = (
    LOCAL_DATA_DIR / "label_timeline_timestamps.npy"
)
"""Wall-clock timestamp of every sample, aligned with the saved k=8 ``labels.npy``.

int64, shape ``(204_554_618,)``: nanoseconds since the Unix epoch, in NZ
local time as stored in the DB. ``scripts/cache_label_timeline.py``
builds it once.
"""
CLUSTER_RESULTS_DIR: Path = LOCAL_DATA_DIR / "cluster_results" / "deer6raw"
"""Root of the per-fit metadata + centroids + labels tree.

The tree was produced by the Aoraki SLURM sweep. Per-fit JSONs and
centroid arrays are present in full; labels exist only for the 24
representative runs.
"""

CANONICAL_K8_LABELS: Path = (
    CLUSTER_RESULTS_DIR
    / "delSize_0"
    / "k_8"
    / "labels"
    / "deer6raw_labels_k8_delSize0_partA.npy"
)
"""The manuscript-aligned k=8 ``labels.npy`` (Partition A).

``delSize_0/k_8`` holds 50 saved fits. 17 of them converged to the basin
whose centroids match :data:`centroid_label_info` exactly and whose
cluster IDs are in manuscript PM order (0 = Quiescent, 1 = Resting, …
7 = Stationary grazing). None of those 17 had their labels mirrored from
the original sweep, so this file is regenerated locally by
nearest-manuscript-centroid assignment on
:data:`MAXABS_CLUSTERING_INPUT`. That is bit-equivalent to the converged
k-means assignment for the basin, and per-PM prevalences match the
manuscript table to within 0.013 %.

Wherever downstream code wants "the k=8 labels", use this constant
rather than a glob of the labels directory.
"""