Rthur2003 committed on
Commit
4b87da0
·
1 Parent(s): 983f3ef

feat: exclude additional metadata columns from feature loading to prevent data leakage

Browse files
app/training/evaluate.py CHANGED
@@ -45,9 +45,10 @@ def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
45
 
46
  with open(path, "r", encoding="utf-8") as f:
47
  reader = csv.DictReader(f)
 
48
  feature_cols = [
49
  c for c in reader.fieldnames
50
- if c not in ("file_path", "label_int")
51
  ]
52
 
53
  for row in reader:
 
45
 
46
  with open(path, "r", encoding="utf-8") as f:
47
  reader = csv.DictReader(f)
48
+ _EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"}
49
  feature_cols = [
50
  c for c in reader.fieldnames
51
+ if c not in _EXCLUDE
52
  ]
53
 
54
  for row in reader:
app/training/train_classifier.py CHANGED
@@ -95,9 +95,12 @@ def train(
95
 
96
  with open(features_csv, "r", encoding="utf-8") as f:
97
  reader = csv.DictReader(f)
 
 
 
98
  feature_cols = [
99
  c for c in reader.fieldnames
100
- if c not in ("file_path", "label_int")
101
  ]
102
 
103
  # ── Handle NaN/Inf ─────────────────────────────
 
95
 
96
  with open(features_csv, "r", encoding="utf-8") as f:
97
  reader = csv.DictReader(f)
98
+ # duration_sec and sample_rate are metadata, not audio features —
99
+ # including them causes data leakage (duration correlates with source, not content)
100
+ _EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"}
101
  feature_cols = [
102
  c for c in reader.fieldnames
103
+ if c not in _EXCLUDE
104
  ]
105
 
106
  # ── Handle NaN/Inf ─────────────────────────────