Spaces:
Sleeping
Sleeping
feat: exclude additional metadata columns from feature loading to prevent data leakage
Browse files
app/training/evaluate.py
CHANGED
|
@@ -45,9 +45,10 @@ def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
|
|
| 45 |
|
| 46 |
with open(path, "r", encoding="utf-8") as f:
|
| 47 |
reader = csv.DictReader(f)
|
|
|
|
| 48 |
feature_cols = [
|
| 49 |
c for c in reader.fieldnames
|
| 50 |
-
if c not in
|
| 51 |
]
|
| 52 |
|
| 53 |
for row in reader:
|
|
|
|
| 45 |
|
| 46 |
with open(path, "r", encoding="utf-8") as f:
|
| 47 |
reader = csv.DictReader(f)
|
| 48 |
+
_EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"}
|
| 49 |
feature_cols = [
|
| 50 |
c for c in reader.fieldnames
|
| 51 |
+
if c not in _EXCLUDE
|
| 52 |
]
|
| 53 |
|
| 54 |
for row in reader:
|
app/training/train_classifier.py
CHANGED
|
@@ -95,9 +95,12 @@ def train(
|
|
| 95 |
|
| 96 |
with open(features_csv, "r", encoding="utf-8") as f:
|
| 97 |
reader = csv.DictReader(f)
|
|
|
|
|
|
|
|
|
|
| 98 |
feature_cols = [
|
| 99 |
c for c in reader.fieldnames
|
| 100 |
-
if c not in
|
| 101 |
]
|
| 102 |
|
| 103 |
# ── Handle NaN/Inf ─────────────────────────────
|
|
|
|
| 95 |
|
| 96 |
with open(features_csv, "r", encoding="utf-8") as f:
|
| 97 |
reader = csv.DictReader(f)
|
| 98 |
+
# duration_sec and sample_rate are metadata, not audio features —
|
| 99 |
+
# including them causes data leakage (duration correlates with source, not content)
|
| 100 |
+
_EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"}
|
| 101 |
feature_cols = [
|
| 102 |
c for c in reader.fieldnames
|
| 103 |
+
if c not in _EXCLUDE
|
| 104 |
]
|
| 105 |
|
| 106 |
# ── Handle NaN/Inf ─────────────────────────────
|