Spaces:
Sleeping
Sleeping
feat: enhance feature extraction with additional spectral and temporal metrics
Browse files
app/services/feature_extractor.py
CHANGED
|
@@ -124,15 +124,35 @@ def extract_features(
|
|
| 124 |
duration_sec=duration_sec,
|
| 125 |
sample_rate=actual_sr,
|
| 126 |
rms_energy=spectral["rms_mean"],
|
|
|
|
| 127 |
tempo_bpm=temporal["tempo_bpm"],
|
| 128 |
tempo_stability=temporal["tempo_stability"],
|
|
|
|
| 129 |
spectral_centroid_mean=spectral["centroid_mean"],
|
| 130 |
spectral_centroid_std=spectral["centroid_std"],
|
| 131 |
spectral_flatness_mean=spectral["flatness_mean"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
mfcc_variance=spectral["mfcc_variance"],
|
|
|
|
|
|
|
| 133 |
chroma_entropy=harmonic["chroma_entropy"],
|
|
|
|
|
|
|
| 134 |
harmonic_ratio=harmonic["harmonic_ratio"],
|
|
|
|
| 135 |
zero_crossing_rate=temporal["zcr_mean"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
)
|
| 137 |
|
| 138 |
|
|
|
|
| 124 |
duration_sec=duration_sec,
|
| 125 |
sample_rate=actual_sr,
|
| 126 |
rms_energy=spectral["rms_mean"],
|
| 127 |
+
rms_std=spectral["rms_std"],
|
| 128 |
tempo_bpm=temporal["tempo_bpm"],
|
| 129 |
tempo_stability=temporal["tempo_stability"],
|
| 130 |
+
tempo_cv=temporal["tempo_cv"],
|
| 131 |
spectral_centroid_mean=spectral["centroid_mean"],
|
| 132 |
spectral_centroid_std=spectral["centroid_std"],
|
| 133 |
spectral_flatness_mean=spectral["flatness_mean"],
|
| 134 |
+
spectral_flatness_std=spectral["flatness_std"],
|
| 135 |
+
spectral_bandwidth_mean=spectral["bandwidth_mean"],
|
| 136 |
+
spectral_bandwidth_std=spectral["bandwidth_std"],
|
| 137 |
+
spectral_rolloff_mean=spectral["rolloff_mean"],
|
| 138 |
+
spectral_rolloff_std=spectral["rolloff_std"],
|
| 139 |
+
spectral_contrast_mean=spectral["contrast_mean"],
|
| 140 |
+
spectral_contrast_std=spectral["contrast_std"],
|
| 141 |
mfcc_variance=spectral["mfcc_variance"],
|
| 142 |
+
mfcc_delta_var=spectral["mfcc_delta_var"],
|
| 143 |
+
mfcc_delta2_var=spectral["mfcc_delta2_var"],
|
| 144 |
chroma_entropy=harmonic["chroma_entropy"],
|
| 145 |
+
chroma_std=harmonic["chroma_std"],
|
| 146 |
+
chroma_transition_rate=harmonic["chroma_transition_rate"],
|
| 147 |
harmonic_ratio=harmonic["harmonic_ratio"],
|
| 148 |
+
tonnetz_std=harmonic["tonnetz_std"],
|
| 149 |
zero_crossing_rate=temporal["zcr_mean"],
|
| 150 |
+
zero_crossing_std=temporal["zcr_std"],
|
| 151 |
+
onset_strength_mean=temporal["onset_mean"],
|
| 152 |
+
onset_strength_std=temporal["onset_std"],
|
| 153 |
+
rms_dynamic_range=temporal["rms_dynamic_range"],
|
| 154 |
+
beat_count=temporal["beat_count"],
|
| 155 |
+
mel_flatness=spectral["mel_flatness"],
|
| 156 |
)
|
| 157 |
|
| 158 |
|
app/training/extract_features_batch.py
CHANGED
|
@@ -27,26 +27,49 @@ from app.services.feature_extractor import extract_features
|
|
| 27 |
from app.services.vocal_analyzer import analyze_vocals
|
| 28 |
|
| 29 |
|
| 30 |
-
# All raw features we extract per sample
|
| 31 |
FEATURE_COLUMNS = [
|
| 32 |
-
#
|
| 33 |
"duration_sec",
|
| 34 |
"sample_rate",
|
|
|
|
| 35 |
"rms_energy",
|
| 36 |
-
"tempo_bpm",
|
| 37 |
-
"tempo_stability",
|
| 38 |
"spectral_centroid_mean",
|
| 39 |
"spectral_centroid_std",
|
| 40 |
"spectral_flatness_mean",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"mfcc_variance",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
"chroma_entropy",
|
|
|
|
|
|
|
| 43 |
"harmonic_ratio",
|
| 44 |
-
"zero_crossing_rate",
|
| 45 |
-
# Heuristic scores (kept as features)
|
| 46 |
"spectral_regularity",
|
| 47 |
"temporal_patterns",
|
| 48 |
"harmonic_structure",
|
| 49 |
-
#
|
| 50 |
"has_vocals",
|
| 51 |
"vocal_confidence",
|
| 52 |
"vocal_ai_score",
|
|
@@ -73,21 +96,41 @@ def extract_sample_features(audio_path: str) -> dict | None:
|
|
| 73 |
try:
|
| 74 |
path = Path(audio_path)
|
| 75 |
|
| 76 |
-
# Feature extraction
|
| 77 |
feat = extract_features(path)
|
| 78 |
row = {
|
| 79 |
"duration_sec": feat.duration_sec,
|
| 80 |
"sample_rate": feat.sample_rate,
|
| 81 |
"rms_energy": feat.rms_energy,
|
| 82 |
-
"tempo_bpm": feat.tempo_bpm,
|
| 83 |
-
"tempo_stability": feat.tempo_stability,
|
| 84 |
"spectral_centroid_mean": feat.spectral_centroid_mean,
|
| 85 |
"spectral_centroid_std": feat.spectral_centroid_std,
|
| 86 |
"spectral_flatness_mean": feat.spectral_flatness_mean,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
"mfcc_variance": feat.mfcc_variance,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
"chroma_entropy": feat.chroma_entropy,
|
|
|
|
|
|
|
| 89 |
"harmonic_ratio": feat.harmonic_ratio,
|
| 90 |
-
"zero_crossing_rate": feat.zero_crossing_rate,
|
| 91 |
"spectral_regularity": feat.spectral_regularity,
|
| 92 |
"temporal_patterns": feat.temporal_patterns,
|
| 93 |
"harmonic_structure": feat.harmonic_structure,
|
|
|
|
| 27 |
from app.services.vocal_analyzer import analyze_vocals
|
| 28 |
|
| 29 |
|
| 30 |
+
# All raw features we extract per sample — comprehensive set for paper
|
| 31 |
FEATURE_COLUMNS = [
|
| 32 |
+
# ── Basic metadata ──────────────────────────────────────────
|
| 33 |
"duration_sec",
|
| 34 |
"sample_rate",
|
| 35 |
+
# ── Spectral features ───────────────────────────────────────
|
| 36 |
"rms_energy",
|
| 37 |
+
"rms_std",
|
|
|
|
| 38 |
"spectral_centroid_mean",
|
| 39 |
"spectral_centroid_std",
|
| 40 |
"spectral_flatness_mean",
|
| 41 |
+
"spectral_flatness_std",
|
| 42 |
+
"spectral_bandwidth_mean",
|
| 43 |
+
"spectral_bandwidth_std",
|
| 44 |
+
"spectral_rolloff_mean",
|
| 45 |
+
"spectral_rolloff_std",
|
| 46 |
+
"spectral_contrast_mean",
|
| 47 |
+
"spectral_contrast_std",
|
| 48 |
"mfcc_variance",
|
| 49 |
+
"mfcc_delta_var",
|
| 50 |
+
"mfcc_delta2_var",
|
| 51 |
+
"mel_flatness",
|
| 52 |
+
# ── Temporal / rhythm features ──────────────────────────────
|
| 53 |
+
"tempo_bpm",
|
| 54 |
+
"tempo_stability",
|
| 55 |
+
"tempo_cv",
|
| 56 |
+
"zero_crossing_rate",
|
| 57 |
+
"zero_crossing_std",
|
| 58 |
+
"onset_strength_mean",
|
| 59 |
+
"onset_strength_std",
|
| 60 |
+
"rms_dynamic_range",
|
| 61 |
+
"beat_count",
|
| 62 |
+
# ── Harmonic / tonal features ───────────────────────────────
|
| 63 |
"chroma_entropy",
|
| 64 |
+
"chroma_std",
|
| 65 |
+
"chroma_transition_rate",
|
| 66 |
"harmonic_ratio",
|
| 67 |
+
"tonnetz_std",
|
| 68 |
+
# ── Heuristic composite scores (kept as features) ───────────
|
| 69 |
"spectral_regularity",
|
| 70 |
"temporal_patterns",
|
| 71 |
"harmonic_structure",
|
| 72 |
+
# ── Vocal analysis features ─────────────────────────────────
|
| 73 |
"has_vocals",
|
| 74 |
"vocal_confidence",
|
| 75 |
"vocal_ai_score",
|
|
|
|
| 96 |
try:
|
| 97 |
path = Path(audio_path)
|
| 98 |
|
| 99 |
+
# Feature extraction — all fields from AudioFeatures dataclass
|
| 100 |
feat = extract_features(path)
|
| 101 |
row = {
|
| 102 |
"duration_sec": feat.duration_sec,
|
| 103 |
"sample_rate": feat.sample_rate,
|
| 104 |
"rms_energy": feat.rms_energy,
|
| 105 |
+
"rms_std": feat.rms_std,
|
|
|
|
| 106 |
"spectral_centroid_mean": feat.spectral_centroid_mean,
|
| 107 |
"spectral_centroid_std": feat.spectral_centroid_std,
|
| 108 |
"spectral_flatness_mean": feat.spectral_flatness_mean,
|
| 109 |
+
"spectral_flatness_std": feat.spectral_flatness_std,
|
| 110 |
+
"spectral_bandwidth_mean": feat.spectral_bandwidth_mean,
|
| 111 |
+
"spectral_bandwidth_std": feat.spectral_bandwidth_std,
|
| 112 |
+
"spectral_rolloff_mean": feat.spectral_rolloff_mean,
|
| 113 |
+
"spectral_rolloff_std": feat.spectral_rolloff_std,
|
| 114 |
+
"spectral_contrast_mean": feat.spectral_contrast_mean,
|
| 115 |
+
"spectral_contrast_std": feat.spectral_contrast_std,
|
| 116 |
"mfcc_variance": feat.mfcc_variance,
|
| 117 |
+
"mfcc_delta_var": feat.mfcc_delta_var,
|
| 118 |
+
"mfcc_delta2_var": feat.mfcc_delta2_var,
|
| 119 |
+
"mel_flatness": feat.mel_flatness,
|
| 120 |
+
"tempo_bpm": feat.tempo_bpm,
|
| 121 |
+
"tempo_stability": feat.tempo_stability,
|
| 122 |
+
"tempo_cv": feat.tempo_cv,
|
| 123 |
+
"zero_crossing_rate": feat.zero_crossing_rate,
|
| 124 |
+
"zero_crossing_std": feat.zero_crossing_std,
|
| 125 |
+
"onset_strength_mean": feat.onset_strength_mean,
|
| 126 |
+
"onset_strength_std": feat.onset_strength_std,
|
| 127 |
+
"rms_dynamic_range": feat.rms_dynamic_range,
|
| 128 |
+
"beat_count": feat.beat_count,
|
| 129 |
"chroma_entropy": feat.chroma_entropy,
|
| 130 |
+
"chroma_std": feat.chroma_std,
|
| 131 |
+
"chroma_transition_rate": feat.chroma_transition_rate,
|
| 132 |
"harmonic_ratio": feat.harmonic_ratio,
|
| 133 |
+
"tonnetz_std": feat.tonnetz_std,
|
| 134 |
"spectral_regularity": feat.spectral_regularity,
|
| 135 |
"temporal_patterns": feat.temporal_patterns,
|
| 136 |
"harmonic_structure": feat.harmonic_structure,
|