Spaces:
Sleeping
Sleeping
feat: add feature statistics computation from training CSV for enhanced analysis
Browse files
app/training/compute_feature_stats.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compute per-feature population statistics from training CSV.
|
| 2 |
+
|
| 3 |
+
Produces feature_stats_v1.json with mean, std, min, max, median, q25, q75
|
| 4 |
+
for each feature column. Used by XAI service for z-score computation.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python -m app.training.compute_feature_stats DataSet/features.csv [models/feature_stats_v1.json]
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import csv
|
| 13 |
+
import json
|
| 14 |
+
import sys
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _finite_column_values(rows: list[dict[str, str]], col: str) -> list[float]:
    """Collect the finite numeric values of column *col* across *rows*.

    Cells that are missing, non-numeric, NaN or infinite are skipped.
    """
    values: list[float] = []
    for row in rows:
        try:
            value = float(row[col])
        except (ValueError, TypeError):
            # Empty/non-numeric text raises ValueError; a short row makes
            # DictReader fill the cell with None, which raises TypeError.
            continue
        if np.isfinite(value):
            values.append(value)
    return values


def compute_stats(csv_path: str | Path, output_path: str | Path) -> None:
    """Compute per-feature statistics from a CSV and write them as JSON.

    Every header column except ``file_path`` and ``label_int`` is treated as
    a feature. For each feature the output holds ``mean``, ``std``, ``min``,
    ``max``, ``median``, ``q25``, ``q75`` and ``count``, computed over the
    finite numeric cells only. A feature with no usable cell gets neutral
    placeholder values (mean 0, std 1, count 0). A ``std`` of exactly 0 is
    replaced by 1.0 so that consumers can divide by it.

    The extra top-level key ``_by_class`` holds ``mean``/``std``/``median``
    per feature for rows whose ``label_int`` cell is exactly ``"1"``
    (stored under ``"ai"``) or ``"0"`` (stored under ``"human"``). A feature
    is omitted from a class when that class has no usable values for it.

    Args:
        csv_path: Path of the input CSV file (with a header row).
        output_path: Path of the JSON file to write; missing parent
            directories are created.

    Raises:
        OSError: If the CSV cannot be read or the JSON cannot be written.
    """
    csv_path = Path(csv_path)
    output_path = Path(output_path)

    # newline="" is what the csv module requires so quoted fields containing
    # line breaks are parsed correctly. "utf-8-sig" reads plain UTF-8 as
    # before but also drops a leading BOM, which would otherwise be glued to
    # the first header name and defeat the exclusion set below.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        fieldnames = reader.fieldnames or []

    excluded = {"file_path", "label_int"}
    feature_cols = [c for c in fieldnames if c not in excluded]

    stats: dict[str, dict[str, float]] = {}
    for col in feature_cols:
        values = _finite_column_values(rows, col)
        if not values:
            # Neutral placeholders keep the schema uniform for consumers.
            stats[col] = {
                "mean": 0.0, "std": 1.0,
                "min": 0.0, "max": 1.0,
                "median": 0.0, "q25": 0.0, "q75": 0.0,
                "count": 0,
            }
            continue
        arr = np.array(values)
        stats[col] = {
            "mean": float(np.mean(arr)),
            "std": float(np.std(arr)) or 1.0,
            "min": float(np.min(arr)),
            "max": float(np.max(arr)),
            "median": float(np.median(arr)),
            "q25": float(np.percentile(arr, 25)),
            "q75": float(np.percentile(arr, 75)),
            "count": len(values),
        }

    # Also split by class for AI vs Human distributions. Rows are partitioned
    # once here instead of being re-filtered for every feature column.
    # The label is matched as an exact string ("1" / "0").
    by_class: dict[str, dict[str, dict[str, float]]] = {"ai": {}, "human": {}}
    for cls_label, cls_int in (("ai", "1"), ("human", "0")):
        cls_rows = [row for row in rows if row.get("label_int") == cls_int]
        for col in feature_cols:
            values = _finite_column_values(cls_rows, col)
            if not values:
                continue
            arr = np.array(values)
            by_class[cls_label][col] = {
                "mean": float(np.mean(arr)),
                "std": float(np.std(arr)) or 1.0,
                "median": float(np.median(arr)),
            }

    output = {**stats, "_by_class": by_class}
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"Wrote {output_path} ({len(stats)} features, {len(rows)} samples)")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
    # CLI entry point: argv[1] is the input CSV, argv[2] the output JSON;
    # both fall back to project-relative defaults when omitted.
    #
    # These names must not be `csv`: an assignment here rebinds the
    # module-level name, replacing the imported `csv` module with a string,
    # so the `csv.DictReader` call inside compute_stats() would fail with
    # AttributeError.
    csv_arg = sys.argv[1] if len(sys.argv) > 1 else "../DataSet/features.csv"
    out_arg = sys.argv[2] if len(sys.argv) > 2 else "models/feature_stats_v1.json"
    compute_stats(csv_arg, out_arg)
|