Rthur2003 committed on
Commit
575dde4
·
1 Parent(s): ac52af9

feat: add feature statistics computation from training CSV for enhanced analysis

Browse files
app/training/compute_feature_stats.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compute per-feature population statistics from training CSV.
2
+
3
+ Produces feature_stats_v1.json with mean, std, min, max, median, q25, q75 and count
4
+ for each feature column, plus per-class (AI/human) mean, std and median under "_by_class". Used by XAI service for z-score computation.
5
+
6
+ Usage:
7
+ python -m app.training.compute_feature_stats DataSet/features.csv
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import csv
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ import numpy as np
18
+
19
+
20
def _finite_float(raw: object) -> float | None:
    """Return *raw* parsed as a finite float, or ``None`` if it is unusable.

    Missing cells (``None``), blank or non-numeric text, NaN and +/-inf all
    yield ``None`` so callers can simply skip them.
    """
    try:
        value = float(raw)
    except (ValueError, TypeError):
        return None
    return value if np.isfinite(value) else None


def _normalize_label(raw: object) -> str | None:
    """Canonicalize a ``label_int`` cell so ``"1"``, ``" 1 "`` and ``"1.0"`` agree.

    Integral numeric text is reduced to its plain integer spelling; anything
    else is returned stripped and otherwise untouched. Non-string cells
    (e.g. a missing column) yield ``None``.
    """
    if not isinstance(raw, str):
        return None
    text = raw.strip()
    try:
        number = float(text)
    except ValueError:
        return text
    return str(int(number)) if number.is_integer() else text


def _overall_summary(values: list[float]) -> dict[str, float]:
    """Build the full per-feature summary; neutral placeholders when empty."""
    if not values:
        return {
            "mean": 0.0, "std": 1.0,
            "min": 0.0, "max": 1.0,
            "median": 0.0, "q25": 0.0, "q75": 0.0,
            "count": 0,
        }
    arr = np.asarray(values, dtype=float)
    return {
        "mean": float(np.mean(arr)),
        # A zero std would make downstream division blow up; fall back to 1.0.
        "std": float(np.std(arr)) or 1.0,
        "min": float(np.min(arr)),
        "max": float(np.max(arr)),
        "median": float(np.median(arr)),
        "q25": float(np.percentile(arr, 25)),
        "q75": float(np.percentile(arr, 75)),
        "count": len(values),
    }


def _class_summary(values: list[float]) -> dict[str, float]:
    """Build the reduced (mean/std/median) summary used per class."""
    arr = np.asarray(values, dtype=float)
    return {
        "mean": float(np.mean(arr)),
        "std": float(np.std(arr)) or 1.0,
        "median": float(np.median(arr)),
    }


def compute_stats(csv_path: str | Path, output_path: str | Path) -> None:
    """Compute per-feature statistics from a CSV and write them as JSON.

    Every column except ``file_path`` and ``label_int`` is treated as a
    feature. Cells that are missing, non-numeric, NaN or infinite are
    ignored. The output JSON maps each feature name to
    ``mean/std/min/max/median/q25/q75/count`` and adds a ``"_by_class"``
    entry holding ``mean/std/median`` per feature for rows labelled
    ``1`` (``"ai"``) and ``0`` (``"human"``). A feature with no usable
    values gets placeholder overall statistics (``count`` 0) and is omitted
    from the per-class sections.

    Args:
        csv_path: Input CSV with a header row.
        output_path: Destination JSON file; parent directories are created.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    csv_path = Path(csv_path)
    output_path = Path(output_path)

    # newline="" is what the csv module expects for correct handling of
    # quoted line breaks; utf-8-sig transparently drops a leading BOM so the
    # first header name is not corrupted (plain UTF-8 decodes unchanged).
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        fieldnames = reader.fieldnames or []

    excluded = {"file_path", "label_int"}
    feature_cols = [c for c in fieldnames if c not in excluded]

    # Resolve each row's class once instead of once per feature column.
    class_of_label = {"1": "ai", "0": "human"}
    row_classes = [
        class_of_label.get(_normalize_label(row.get("label_int")))
        for row in rows
    ]

    stats: dict[str, dict[str, float]] = {}
    by_class: dict[str, dict[str, dict[str, float]]] = {"ai": {}, "human": {}}

    for col in feature_cols:
        overall: list[float] = []
        per_class: dict[str, list[float]] = {"ai": [], "human": []}
        # Single pass: each cell is parsed exactly once and routed to both
        # the overall bucket and (when labelled) its class bucket.
        for row, cls_label in zip(rows, row_classes):
            value = _finite_float(row[col])
            if value is None:
                continue
            overall.append(value)
            if cls_label is not None:
                per_class[cls_label].append(value)

        stats[col] = _overall_summary(overall)
        for cls_label, cls_values in per_class.items():
            if cls_values:
                by_class[cls_label][col] = _class_summary(cls_values)

    output = {**stats, "_by_class": by_class}
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"Wrote {output_path} ({len(stats)} features, {len(rows)} samples)")
90
+
91
+
92
if __name__ == "__main__":
    # Command-line entry point: argv[1] is the input CSV, argv[2] the output
    # JSON; both fall back to project-relative defaults.
    # These names must NOT be called ``csv``: assigning to ``csv`` here would
    # rebind the module-level ``csv`` import to a string, and compute_stats()
    # would then fail on ``csv.DictReader`` with AttributeError.
    csv_arg = sys.argv[1] if len(sys.argv) > 1 else "../DataSet/features.csv"
    out_arg = sys.argv[2] if len(sys.argv) > 2 else "models/feature_stats_v1.json"
    compute_stats(csv_arg, out_arg)