crowncode-backend / app /training /compute_feature_stats.py
Rthur2003's picture
feat: update excluded columns in feature statistics computation to prevent data leakage
edc551d
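"""Compute per-feature population statistics from training CSV.

Produces feature_stats_v1.json with mean, std, min, max, median, q25, q75
and count for each feature column, plus a "_by_class" section holding the
mean, std and median of each feature for the "ai" and "human" classes.
Used by XAI service for z-score computation.

Usage:
    python -m app.training.compute_feature_stats [CSV_PATH] [OUTPUT_JSON]

Example:
    python -m app.training.compute_feature_stats DataSet/features.csv

When omitted, CSV_PATH defaults to ../DataSet/features.csv and OUTPUT_JSON
defaults to models/feature_stats_v1.json.
"""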
from __future__ import annotations
import csv
import json
import sys
from pathlib import Path
import numpy as np
def _finite_or_none(raw: str | None) -> float | None:
    """Parse one CSV cell as a float; return None if it is unusable.

    A cell is unusable when it cannot be converted by ``float`` or when the
    converted value is NaN or infinite.
    """
    try:
        value = float(raw)
    except (ValueError, TypeError):
        # ValueError: non-numeric text; TypeError: cell is None (short row).
        return None
    return value if np.isfinite(value) else None


def _describe(values: list[float]) -> dict[str, float]:
    """Return the full per-feature summary for a list of finite values.

    An empty list yields neutral placeholders (mean 0, std 1, count 0) so the
    feature still has an entry in the output.
    """
    if not values:
        return {
            "mean": 0.0, "std": 1.0,
            "min": 0.0, "max": 1.0,
            "median": 0.0, "q25": 0.0, "q75": 0.0,
            "count": 0,
        }
    arr = np.array(values)
    return {
        "mean": float(np.mean(arr)),
        # A zero std is replaced by 1.0 so it can be used as a divisor.
        "std": float(np.std(arr)) or 1.0,
        "min": float(np.min(arr)),
        "max": float(np.max(arr)),
        "median": float(np.median(arr)),
        "q25": float(np.percentile(arr, 25)),
        "q75": float(np.percentile(arr, 75)),
        "count": len(values),
    }


def _describe_class(values: list[float]) -> dict[str, float]:
    """Return the reduced (mean, std, median) summary for a non-empty list."""
    arr = np.array(values)
    return {
        "mean": float(np.mean(arr)),
        "std": float(np.std(arr)) or 1.0,
        "median": float(np.median(arr)),
    }


def compute_stats(csv_path: str | Path, output_path: str | Path) -> None:
    """Compute per-feature statistics from a CSV file and write them as JSON.

    Every header column except ``file_path``, ``label_int``, ``duration_sec``
    and ``sample_rate`` is treated as a feature. Cells that are not parseable
    as a finite float are ignored. The output JSON maps each feature name to
    mean/std/min/max/median/q25/q75/count, and additionally contains a
    ``"_by_class"`` key with mean/std/median per feature for rows whose
    ``label_int`` is ``"1"`` (stored under ``"ai"``) or ``"0"`` (stored under
    ``"human"``). Rows with any other label only contribute to the overall
    statistics.

    Args:
        csv_path: Path of the input CSV file (must have a header row).
        output_path: Path of the JSON file to write; missing parent
            directories are created.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
    """
    csv_path = Path(csv_path)
    output_path = Path(output_path)

    # newline="" lets the csv module do its own line-ending handling (as its
    # documentation requires); utf-8-sig drops a leading BOM, if any, so the
    # first header name is not silently prefixed with "\ufeff".
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        fieldnames = reader.fieldnames or []

    excluded = {"file_path", "label_int", "duration_sec", "sample_rate"}
    feature_cols = [c for c in fieldnames if c not in excluded]

    # Resolve each row's class once instead of once per feature column.
    class_of_label = {"1": "ai", "0": "human"}
    row_classes = [class_of_label.get(row.get("label_int")) for row in rows]

    stats: dict[str, dict[str, float]] = {}
    by_class: dict[str, dict[str, dict[str, float]]] = {"ai": {}, "human": {}}

    for col in feature_cols:
        # Parse the column a single time; None marks an unusable cell.
        parsed = [_finite_or_none(row[col]) for row in rows]
        stats[col] = _describe([v for v in parsed if v is not None])

        per_class: dict[str, list[float]] = {"ai": [], "human": []}
        for cls_label, value in zip(row_classes, parsed):
            if cls_label is not None and value is not None:
                per_class[cls_label].append(value)

        # A class with no usable value for this feature gets no entry.
        for cls_label, cls_values in per_class.items():
            if cls_values:
                by_class[cls_label][col] = _describe_class(cls_values)

    output = {**stats, "_by_class": by_class}

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"Wrote {output_path} ({len(stats)} features, {len(rows)} samples)")
if __name__ == "__main__":
    # Both positional arguments are optional: the first is the input CSV,
    # the second the output JSON. Missing ones fall back to the defaults.
    cli_args = sys.argv[1:]
    csv_in = cli_args[0] if cli_args else "../DataSet/features.csv"
    out = cli_args[1] if len(cli_args) >= 2 else "models/feature_stats_v1.json"
    compute_stats(csv_in, out)