# Source code for stag.analysis.nan_handler

# ╔══════════════════════════════════════════════════════════════════╗
# ║  STAG — analysis.nan_handler                                     ║
# ║  « interpolate or drop NaN gaps »                                ║
# ╠══════════════════════════════════════════════════════════════════╣
# ║  Utility for repairing the feature matrix prior to               ║
# ║  clustering.  NaN runs are filled by linear interpolation.       ║
# ╚══════════════════════════════════════════════════════════════════╝
"""NaN detection and linear interpolation for sensor data."""

import numpy as np
import pandas as pd



[docs]
def load_data(filename):
    """Read a data matrix from ``filename``.

    The file is first handed to ``np.load`` and its result is returned
    unchanged.  If ``np.load`` raises ``ValueError``, the file is parsed as
    header-less CSV with pandas instead and converted to a NumPy array.
    Any other exception propagates to the caller.
    """
    try:
        loaded = np.load(filename)
    except ValueError:
        # np.load rejected the file -- fall back to header-less CSV parsing.
        frame = pd.read_csv(filename, header=None)
        loaded = frame.to_numpy()
    return loaded




[docs]
def find_nan_sequences(arr):
    """Locate every run of consecutive NaN values in each column of ``arr``.

    Args:
        arr: 2-D numeric array; columns are scanned independently.

    Returns:
        A list of ``(col_idx, start, end)`` tuples of plain ints, where
        ``start`` and ``end`` are the inclusive row indices of one NaN run.
        Tuples are ordered by column, then by ``start``.  Runs that touch
        the first or the last row are reported as well.

    Raises:
        IndexError: if ``arr`` has fewer than two dimensions.
    """
    nan_sequences = []
    for col_idx in range(arr.shape[1]):
        is_nan = np.isnan(arr[:, col_idx])
        if not is_nan.any():
            continue
        # Pad with False on both sides so a run touching either edge of the
        # column still produces both a rising and a falling transition.
        padded = np.concatenate(([False], is_nan, [False]))
        # Transitions alternate: run start, then one-past-the-end of the run.
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        for start, stop in zip(edges[::2], edges[1::2]):
            nan_sequences.append((col_idx, int(start), int(stop) - 1))
    return nan_sequences




[docs]
def interpolate_nan_sequences(arr, nan_sequences):
    """Fill NaN runs in ``arr`` in place by linear interpolation.

    Each run is replaced by values evenly spaced strictly between the value
    just before the run and the value just after it, so the filled cells lie
    on the straight line joining the two neighbours.

    Args:
        arr: 2-D array, modified in place.
        nan_sequences: iterable of ``(col_idx, start, end)`` tuples with
            inclusive row indices, one per run to fill.

    Returns:
        None.  Runs that touch the first or last row have a neighbour on one
        side only and are left untouched (still NaN).
    """
    last_row = arr.shape[0] - 1
    for col_idx, start, end in nan_sequences:
        if start <= 0 or end >= last_row:
            # No anchor value on one side: nothing to interpolate between.
            continue
        col = arr[:, col_idx]  # view into arr, so the assignment below writes through
        gap_len = end - start + 1
        # linspace returns both anchors as its first and last points; ask for
        # two extra points and drop them so only interior values are written.
        ramp = np.linspace(col[start - 1], col[end + 1], gap_len + 2)
        col[start:end + 1] = ramp[1:-1]



if __name__ == "__main__":
    import argparse

    # Command-line entry point: load a matrix, fill its NaN runs, save it.
    cli = argparse.ArgumentParser(
        description="Interpolate NaN runs in a raw accelerometer .npy array.",
    )
    for arg_name, arg_help in (
        ("infile", "Input .npy (or CSV) feature matrix."),
        ("outfile", "Output .npy with NaN runs interpolated."),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    matrix = load_data(options.infile)
    gaps = find_nan_sequences(matrix)
    # The interpolation writes into ``matrix`` directly and returns nothing.
    interpolate_nan_sequences(matrix, gaps)
    np.save(options.outfile, matrix)
    print("NaN sequences interpolated.")