# ╔══════════════════════════════════════════════════════════════════╗
# ║ arista — db.schema ║
# ║ « SQLite CREATE statements + build_schema(conn) » ║
# ╠══════════════════════════════════════════════════════════════════╣
# ║ Single source of truth for the arista.db structure. ║
# ║ ║
# ║ Five dimensions (researchers, strains, cell_types, ║
# ║ stimulus_protocols, source_files) plus the half-dimension ║
# ║ animals, the recordings fact table, the samples long fact, and ║
# ║ the per-step stimulus_responses table. ║
# ║ ║
# ║ Mirrors [[Database Schema]] in the project notebook; the SQL ║
# ║ below is the canonical form. Foreign keys are enforced. ║
# ╚══════════════════════════════════════════════════════════════════╝
"""SQLite schema for the arista calcium-imaging corpus."""
from __future__ import annotations
import sqlite3
# ─────────────────────────────────────────────────────────────────
# Layout
# ─────────────────────────────────────────────────────────────────
# Schema build runs in two phases:
#
# 1. _TABLES_AND_VIEWS_SQL — tables + view in one executescript
# (cheap regardless of corpus size).
# 2. _INDEX_DDL — list of individual CREATE INDEX
# statements run one at a time so build_schema() can drive a
# tqdm progress bar. CREATE INDEX on a populated table can take
# seconds to minutes at scale and going silent during that wait
# makes the run look hung.
#
# SCHEMA_SQL combines both into the historical all-in-one string so
# external callers / docs / manual sqlite3 ".read" usage keep working.
_TABLES_AND_VIEWS_SQL: str = """
PRAGMA foreign_keys = ON;
-- ─────────────────────────────────────────────────────────────────
-- Dimension tables
-- ─────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS researchers (
researcher_id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
role TEXT,
period TEXT
);
CREATE TABLE IF NOT EXISTS strains (
strain_id INTEGER PRIMARY KEY,
strain_name TEXT NOT NULL UNIQUE,
description TEXT
);
CREATE TABLE IF NOT EXISTS cell_types (
cell_type_id INTEGER PRIMARY KEY,
code TEXT NOT NULL UNIQUE,
name TEXT NOT NULL,
description TEXT
);
CREATE TABLE IF NOT EXISTS stimulus_protocols (
stimulus_id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
family TEXT NOT NULL
CHECK (family IN ('thermal_step',
'thermal_adapt',
'mechanical')),
description TEXT,
target_sequence_json TEXT
);
CREATE TABLE IF NOT EXISTS source_files (
file_id INTEGER PRIMARY KEY,
path TEXT NOT NULL UNIQUE,
kind TEXT NOT NULL
CHECK (kind IN ('fiji_csv',
'fiji_txt',
'sensor_mat',
'sensor_txt',
'processed_csv',
'processed_txt',
'raw_movie',
'raw_movie_tar')),
archive_path TEXT,
sha256 TEXT,
size_bytes INTEGER,
ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
);
-- ─────────────────────────────────────────────────────────────────
-- Half-dimension / fact tables
-- ─────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS animals (
animal_id INTEGER PRIMARY KEY,
researcher_id INTEGER NOT NULL REFERENCES researchers(researcher_id),
strain_id INTEGER NOT NULL REFERENCES strains(strain_id),
recording_date TEXT NOT NULL,
sex TEXT NOT NULL CHECK (sex IN ('m','f','u')),
animal_number INTEGER NOT NULL,
arista_suffix TEXT,
notes TEXT,
UNIQUE (researcher_id, recording_date, sex, animal_number, arista_suffix)
);
CREATE TABLE IF NOT EXISTS recordings (
recording_id INTEGER PRIMARY KEY,
animal_id INTEGER NOT NULL REFERENCES animals(animal_id),
cell_type_id INTEGER NOT NULL REFERENCES cell_types(cell_type_id),
cell_number INTEGER NOT NULL,
stimulus_id INTEGER NOT NULL REFERENCES stimulus_protocols(stimulus_id),
hemisphere TEXT CHECK (hemisphere IN ('l','r') OR hemisphere IS NULL),
fps REAL NOT NULL DEFAULT 10.0,
n_samples INTEGER,
duration_s REAL,
drift_correction TEXT CHECK (drift_correction IN
('linear','poly','exp','none','unknown'))
DEFAULT 'unknown',
temperature_source TEXT CHECK (temperature_source IN
('original','median_template','interpolated','none'))
DEFAULT 'original',
qc_flag TEXT,
response_file_id INTEGER REFERENCES source_files(file_id),
sensor_file_id INTEGER REFERENCES source_files(file_id),
processed_file_id INTEGER REFERENCES source_files(file_id),
raw_movie_file_id INTEGER REFERENCES source_files(file_id),
notes TEXT,
-- hemisphere is part of the natural key: an animal can yield CC01
-- in the left arista AND CC01 in the right arista, which are
-- genuinely different cells under the same fly. NULL hemispheres
-- (pooled / unspecified) compare non-equal in SQLite, so multiple
-- pooled-hemisphere recordings with otherwise matching keys are
-- still permitted.
UNIQUE (animal_id, cell_type_id, cell_number, stimulus_id, hemisphere)
);
CREATE TABLE IF NOT EXISTS samples (
recording_id INTEGER NOT NULL REFERENCES recordings(recording_id),
frame INTEGER NOT NULL,
time_s REAL NOT NULL,
sensor_t_c REAL,
target_t_c REAL,
drive_t_c REAL,
dfbf REAL NOT NULL,
dfbf_drift_corrected REAL,
PRIMARY KEY (recording_id, frame)
) WITHOUT ROWID;
CREATE TABLE IF NOT EXISTS stimulus_responses (
response_id INTEGER PRIMARY KEY,
recording_id INTEGER NOT NULL REFERENCES recordings(recording_id),
step_index INTEGER NOT NULL,
target_temp_c REAL NOT NULL,
delta_target_c REAL,
observed_temp_median REAL,
dfbf_response_median REAL NOT NULL,
n_frames_in_window INTEGER,
UNIQUE (recording_id, step_index)
);
-- One exponential-decay fit per HotAdapt / ColdAdapt recording. The
-- table is intentionally narrow (one row per recording) with the
-- recording_id as PRIMARY KEY so re-running the processor with
-- ``OR REPLACE`` semantics is a single upsert. Recordings whose
-- stimulus is anything other than thermal_adapt never appear here.
CREATE TABLE IF NOT EXISTS adaptation_fits (
recording_id INTEGER PRIMARY KEY
REFERENCES recordings(recording_id),
tau_s REAL NOT NULL,
amplitude REAL NOT NULL,
asymptote REAL NOT NULL,
r_squared REAL,
fit_window_start_s REAL NOT NULL,
fit_window_end_s REAL NOT NULL,
n_points INTEGER NOT NULL
);
-- ─────────────────────────────────────────────────────────────────
-- Convenience views « pre-joined recording metadata »
-- ─────────────────────────────────────────────────────────────────
--
-- recordings.animal_id is the only direct FK to the animal-side
-- dimensions; researcher / strain / etc. are reached transitively
-- through animals. That's the correct normalised design but it makes
-- ad-hoc SQL noisy ("which recordings are by Alex?" should not need
-- a four-table JOIN by hand). v_recordings flattens the chain so
-- every recording row exposes its researcher_name, strain_name,
-- cell-type code and stimulus name in one shot.
CREATE VIEW IF NOT EXISTS v_recordings AS
SELECT
r.recording_id,
r.animal_id,
a.researcher_id,
res.name AS researcher_name,
a.strain_id,
s.strain_name,
a.recording_date,
a.sex,
a.animal_number,
a.arista_suffix,
r.cell_type_id,
ct.code AS cell_type,
r.cell_number,
r.hemisphere,
r.stimulus_id,
sp.name AS stimulus_name,
sp.family AS stimulus_family,
r.fps,
r.n_samples,
r.duration_s,
r.drift_correction,
r.temperature_source,
r.qc_flag,
r.response_file_id,
r.sensor_file_id,
r.processed_file_id,
r.raw_movie_file_id,
r.notes
FROM recordings r
JOIN animals a ON a.animal_id = r.animal_id
JOIN researchers res ON res.researcher_id = a.researcher_id
JOIN strains s ON s.strain_id = a.strain_id
JOIN cell_types ct ON ct.cell_type_id = r.cell_type_id
JOIN stimulus_protocols sp ON sp.stimulus_id = r.stimulus_id;
"""
# ─────────────────────────────────────────────────────────────────
# Index DDL « one statement per item so tqdm can report progress »
# ─────────────────────────────────────────────────────────────────
# Each tuple is ``(short_label, sql)``. The label is what shows in the
# tqdm bar's postfix ("idx_rec_animal"), the sql is the statement to
# execute. CREATE INDEX IF NOT EXISTS makes every entry idempotent.
_INDEX_DDL: list[tuple[str, str]] = [
# ── FK-acceleration indexes (Phase 1) ─────────────────────────
("idx_rec_animal",
"CREATE INDEX IF NOT EXISTS idx_rec_animal ON recordings (animal_id)"),
("idx_rec_stim_cell",
"CREATE INDEX IF NOT EXISTS idx_rec_stim_cell "
"ON recordings (stimulus_id, cell_type_id)"),
("idx_rec_qc",
"CREATE INDEX IF NOT EXISTS idx_rec_qc "
"ON recordings (qc_flag) WHERE qc_flag IS NOT NULL"),
("idx_animal_strain",
"CREATE INDEX IF NOT EXISTS idx_animal_strain ON animals (strain_id)"),
("idx_animal_researcher_date",
"CREATE INDEX IF NOT EXISTS idx_animal_researcher_date "
"ON animals (researcher_id, recording_date)"),
("idx_resp_rec",
"CREATE INDEX IF NOT EXISTS idx_resp_rec ON stimulus_responses (recording_id)"),
("idx_source_kind",
"CREATE INDEX IF NOT EXISTS idx_source_kind ON source_files (kind)"),
# ── Analysis-time single-column filter indexes (Phase 5) ──────
# idx_rec_stim_cell only helps queries filtering on BOTH columns;
# analysis routinely filters on one alone. Single-column indexes
# below give the planner a leaf-level scan rather than a table scan.
("idx_rec_cell_type", # "all CC cells", "all HC cells"
"CREATE INDEX IF NOT EXISTS idx_rec_cell_type ON recordings (cell_type_id)"),
("idx_rec_stimulus", # "all ascAmp recordings"
"CREATE INDEX IF NOT EXISTS idx_rec_stimulus ON recordings (stimulus_id)"),
("idx_rec_hemisphere", # "left vs right arista"
"CREATE INDEX IF NOT EXISTS idx_rec_hemisphere "
"ON recordings (hemisphere) WHERE hemisphere IS NOT NULL"),
("idx_rec_drift_correction", # "all poly-corrected" / "unknown"
"CREATE INDEX IF NOT EXISTS idx_rec_drift_correction "
"ON recordings (drift_correction)"),
("idx_animal_sex", # "all female flies"
"CREATE INDEX IF NOT EXISTS idx_animal_sex ON animals (sex)"),
# ── stimulus_responses indexes (Phase 6 will populate this table)
("idx_resp_step_index", # "step 0 medians across corpus"
"CREATE INDEX IF NOT EXISTS idx_resp_step_index "
"ON stimulus_responses (step_index)"),
("idx_resp_target_temp", # "responses at +6 °C"
"CREATE INDEX IF NOT EXISTS idx_resp_target_temp "
"ON stimulus_responses (target_temp_c)"),
]
# All-in-one string preserved for callers that want to .read it via
# the sqlite3 CLI or quote it in docs. tqdm progress is unavailable
# from this form — use build_schema(conn, progress=...) for that.
SCHEMA_SQL: str = (
_TABLES_AND_VIEWS_SQL.rstrip()
+ "\n\n-- ─── Indexes ───\n"
+ "\n".join(sql + ";" for _, sql in _INDEX_DDL)
+ "\n"
)
[docs]
def build_schema(
conn: sqlite3.Connection,
*,
progress: bool | None = None,
) -> None:
"""Apply the full schema to a database connection.
Idempotent: every ``CREATE`` uses ``IF NOT EXISTS`` so re-running
against an already-populated database is a no-op. Foreign-key
enforcement is enabled via ``PRAGMA``.
Tables and the ``v_recordings`` view are applied in one
``executescript`` call (cheap regardless of corpus size). Indexes
are applied one at a time so a tqdm progress bar can report which
index is currently being built — at scale a single ``CREATE
INDEX`` on a populated table can take seconds to minutes and
going silent during that wait makes the run look hung.
Args:
conn: An open SQLite connection (typically to ``arista.db`` or
``:memory:`` for tests).
progress: Controls the index-creation progress bar.
``None`` (default) auto-detects: shows when stdout is a
TTY, hides otherwise (the typical pytest setup, so test
fixtures don't need to pass anything).
``True`` forces the bar on; ``False`` forces it off.
"""
conn.executescript(_TABLES_AND_VIEWS_SQL)
_build_indexes(conn, progress=progress)
conn.commit()
def _build_indexes(
conn: sqlite3.Connection,
*,
progress: bool | None = None,
) -> None:
"""Create every index in :data:`_INDEX_DDL` one at a time.
Lazy ``tqdm`` import so the schema module stays cheap to load for
callers that only need the SQL string (docs / one-off SQLite CLI).
"""
from tqdm.auto import tqdm
disable = None if progress is None else not progress
bar = tqdm(_INDEX_DDL, desc="Building indexes", unit="idx", disable=disable)
for label, sql in bar:
bar.set_postfix_str(label, refresh=False)
conn.execute(sql)