Spaces:
Sleeping
Sleeping
"""Compute per-feature population statistics from training CSV.

Produces feature_stats_v1.json with mean, std, min, max, median, q25, q75
and count for each feature column, plus a "_by_class" section holding
mean, std and median per feature for the "ai" and "human" classes.
Used by XAI service for z-score computation.

Usage:
    python -m app.training.compute_feature_stats DataSet/features.csv
"""
| from __future__ import annotations | |
| import csv | |
| import json | |
| import sys | |
| from pathlib import Path | |
| import numpy as np | |
def _parse_finite(raw: object) -> float | None:
    """Convert a raw CSV cell to a finite float.

    Returns ``None`` when the cell is missing (``None``), cannot be parsed as
    a number, or parses to NaN / infinity, so the caller can skip it.
    """
    try:
        value = float(raw)  # type: ignore[arg-type]
    except (ValueError, TypeError):
        return None
    return value if np.isfinite(value) else None


def _safe_std(arr: np.ndarray) -> float:
    """Return the population std of *arr*, or 1.0 when there is no spread.

    A constant column must fall back to 1.0 so that dividing by the std stays
    harmless. Checking ``min == max`` is needed in addition to ``std == 0``:
    rounding in the mean can give a constant column (e.g. repeated 0.1) a
    tiny non-zero std, which would otherwise slip past the fallback.
    """
    if arr.min() == arr.max():
        return 1.0
    return float(np.std(arr)) or 1.0


def _summarize(values: list[float]) -> dict[str, float]:
    """Build the full summary dict for one feature column.

    An empty *values* list yields a neutral placeholder (mean 0, std 1,
    range 0..1, count 0) so every feature column still gets an entry.
    """
    if not values:
        return {
            "mean": 0.0, "std": 1.0,
            "min": 0.0, "max": 1.0,
            "median": 0.0, "q25": 0.0, "q75": 0.0,
            "count": 0,
        }
    arr = np.array(values)
    return {
        "mean": float(np.mean(arr)),
        "std": _safe_std(arr),
        "min": float(np.min(arr)),
        "max": float(np.max(arr)),
        "median": float(np.median(arr)),
        "q25": float(np.percentile(arr, 25)),
        "q75": float(np.percentile(arr, 75)),
        "count": len(values),
    }


def compute_stats(csv_path: str | Path, output_path: str | Path) -> None:
    """Compute per-feature statistics from a CSV file and write them as JSON.

    Every header column except ``file_path``, ``label_int``, ``duration_sec``
    and ``sample_rate`` is treated as a feature. Cells that are missing,
    non-numeric, NaN or infinite are skipped for that column only.

    The JSON object maps each feature name to its summary (mean, std, min,
    max, median, q25, q75, count). A ``"_by_class"`` key additionally holds
    mean / std / median per feature for rows whose ``label_int`` cell is
    exactly ``"1"`` (stored under ``"ai"``) or ``"0"`` (stored under
    ``"human"``); a feature with no usable values in a class is omitted there.

    Args:
        csv_path: Input CSV with a header row.
        output_path: Destination JSON file; parent directories are created.
    """
    csv_path = Path(csv_path)
    output_path = Path(output_path)

    # newline="" is what the csv module expects for file objects; it lets the
    # reader handle line endings (including ones inside quoted fields) itself.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        fieldnames = reader.fieldnames or []

    excluded = {"file_path", "label_int", "duration_sec", "sample_rate"}
    feature_cols = [c for c in fieldnames if c not in excluded]

    # Class name in the output -> exact text expected in the label_int column.
    class_codes = (("ai", "1"), ("human", "0"))
    labels = [row.get("label_int") for row in rows]

    stats: dict[str, dict[str, float]] = {}
    by_class: dict[str, dict[str, dict[str, float]]] = {
        cls_label: {} for cls_label, _ in class_codes
    }

    for col in feature_cols:
        # Parse the column once; the overall and per-class summaries reuse it.
        parsed = [_parse_finite(row[col]) for row in rows]
        stats[col] = _summarize([v for v in parsed if v is not None])

        for cls_label, cls_code in class_codes:
            cls_values = [
                v
                for v, label in zip(parsed, labels)
                if v is not None and label == cls_code
            ]
            if not cls_values:
                continue
            arr = np.array(cls_values)
            by_class[cls_label][col] = {
                "mean": float(np.mean(arr)),
                "std": _safe_std(arr),
                "median": float(np.median(arr)),
            }

    output = {**stats, "_by_class": by_class}
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"Wrote {output_path} ({len(stats)} features, {len(rows)} samples)")
if __name__ == "__main__":
    # Optional positional arguments: input CSV first, output JSON second.
    # Whichever is not supplied falls back to its default path.
    cli_args = sys.argv[1:]
    csv_in = cli_args[0] if cli_args else "../DataSet/features.csv"
    out = cli_args[1] if len(cli_args) >= 2 else "models/feature_stats_v1.json"
    compute_stats(csv_in, out)