Source code for wings.analysis.ingest_delta_p

#!/usr/bin/env python3
"""
W.I.N.G.S. — Δp sweep data ingestion.

Reads individual ABM CSV outputs from the delta-p sweep and combines
them into a single CSV for plotting.

Filename convention:
    {PHENOTYPE}_frac{NNN}_rep{R}.csv
    e.g. CI_frac050_rep3.csv  → phenotype=CI, fraction=0.50, rep=3

Each per-run CSV has columns: Population Size, Infection Rate, ... (a Day column is optional; when absent it is derived from the row index)

Output CSV adds: Phenotype, Infected Fraction, Replicate ID

Usage:
    python ingest_delta_p.py \
        --input-dir "$WINGS_DATA_ROOT/abm_delta_p" \
        --output data/combined_delta_p.csv
"""

import argparse
import os
import re
import sys
from pathlib import Path

import pandas as pd

# Filename pattern: PHENOTYPE_fracNNN_repR.csv
#   pheno - one or more uppercase ASCII letters and/or underscores
#   frac  - exactly three digits (parse_filename divides by 100: 050 -> 0.50)
#   rep   - one or more digits
# The pattern is anchored at both ends, so any other prefix, suffix or
# extension causes the filename to be rejected.
FNAME_RE = re.compile(
    r'^(?P<pheno>[A-Z_]+)_frac(?P<frac>\d{3})_rep(?P<rep>\d+)\.csv$'
)



[docs]
def parse_filename(fname):
    """Split a sweep filename into phenotype, initial fraction and replicate.

    The name must follow ``{PHENOTYPE}_frac{NNN}_rep{R}.csv``; for example
    ``CI_frac050_rep3.csv`` yields ``('CI', 0.5, 3)``.

    Args:
        fname (str): Filename to parse (basename only, no directory part).

    Returns:
        tuple or None: ``(phenotype, fraction, replicate)`` where
        ``phenotype`` is a ``str``, ``fraction`` a ``float`` and
        ``replicate`` an ``int``; ``None`` when the name does not match
        the expected pattern.
    """
    match = FNAME_RE.match(fname)
    if match is None:
        return None

    fields = match.groupdict()
    # The three-digit field encodes hundredths: "050" -> 0.50
    fraction = int(fields['frac']) / 100.0
    replicate = int(fields['rep'])
    return fields['pheno'], fraction, replicate




[docs]
def _load_run(fpath, pheno, frac, rep):
    """Load one per-run CSV and tag it with its sweep metadata.

    Args:
        fpath (pathlib.Path): Path of the per-run CSV file.
        pheno (str): Phenotype label parsed from the filename.
        frac (float): Initial infected fraction parsed from the filename.
        rep (int): Replicate ID parsed from the filename.

    Returns:
        pandas.DataFrame or None: The tagged frame, or ``None`` when the
        file cannot be read, is empty, or has no ``Infection Rate`` column.
        A ``[skip]`` line naming the file is printed in each ``None`` case.
    """
    try:
        df = pd.read_csv(fpath)
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the
        # ingestion of the whole sweep.
        print(f"  [skip] {fpath.name}: {e}")
        return None

    if df.empty or 'Infection Rate' not in df.columns:
        print(f"  [skip] {fpath.name}: empty or missing 'Infection Rate' column")
        return None

    # Files without a time column get one derived from the row index.
    if 'Day' not in df.columns:
        df.insert(0, 'Day', range(len(df)))

    df['Phenotype'] = pheno
    df['Infected Fraction'] = frac
    df['Replicate ID'] = rep
    return df


def main(argv=None):
    """CLI entry point for Δp sweep data ingestion.

    Scans the input directory for per-run CSVs whose names match the
    sweep filename convention, adds a ``Day`` column derived from the row
    index to any file that lacks one, tags every row with ``Phenotype``,
    ``Infected Fraction`` and ``Replicate ID``, and writes the combined
    dataset to the output path. A short summary is printed at the end.

    Args:
        argv (list[str] or None): Command-line arguments, excluding the
            program name. ``None`` (the default) makes ``argparse`` read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the input directory does not exist
            or no valid CSV file is found; also raised by ``argparse`` on
            invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="W.I.N.G.S. — Δp sweep data ingestion"
    )
    parser.add_argument(
        '--input-dir', required=True,
        help='Directory containing per-run CSV files'
    )
    parser.add_argument(
        '--output', required=True,
        help='Output combined CSV path'
    )
    args = parser.parse_args(argv)

    input_dir = Path(args.input_dir)
    if not input_dir.is_dir():
        print(f"Error: {input_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Sorted so that the row order of the combined CSV is reproducible.
    csv_files = sorted(input_dir.glob('*.csv'))
    print(f"Found {len(csv_files)} CSV files in {input_dir}")

    frames = []
    skipped = 0

    for fpath in csv_files:
        parsed = parse_filename(fpath.name)
        if parsed is None:
            # Not a sweep output (name does not follow the convention).
            skipped += 1
            continue

        df = _load_run(fpath, *parsed)
        if df is None:
            skipped += 1
            continue
        frames.append(df)

    if not frames:
        print("Error: no valid CSV files found!", file=sys.stderr)
        sys.exit(1)

    combined = pd.concat(frames, ignore_index=True)

    # Ensure output directory exists ('' means the current directory).
    os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
    combined.to_csv(args.output, index=False)

    n_pheno = combined['Phenotype'].nunique()
    n_frac = combined['Infected Fraction'].nunique()
    # Median number of distinct replicates per (phenotype, fraction) condition.
    n_reps = (
        combined.groupby(['Phenotype', 'Infected Fraction'])['Replicate ID']
        .nunique()
        .median()
    )

    print(f"\n  Combined: {len(combined):,} rows")
    print(f"  Phenotypes:  {n_pheno} ({', '.join(sorted(combined['Phenotype'].unique()))})")
    print(f"  Fractions:   {n_frac}")
    print(f"  Reps/cond:   ~{n_reps:.0f}")
    print(f"  Skipped:     {skipped}")
    print(f"  Saved to:    {args.output}")



# Run the ingestion CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()