Spaces:

Rthur2003
/

crowncode-backend

Running

App Files Files Community

Rthur2003 committed on Mar 28

Commit

bfbcec4

1 Parent(s): 621b77e

feat: add SONICS dataset loader for AURIS training pipeline

Browse files

Files changed (1) hide show

app/training/dataset_loader.py +171 -0

app/training/dataset_loader.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""
+SONICS dataset loader for AURIS training pipeline.
+Downloads AI-generated and human-composed music samples
+from the SONICS dataset on HuggingFace, saves audio files
+to disk, and creates a CSV manifest for training.
+SONICS: ~97K tracks from multiple AI generators and human sources.
+Paper: "SONICS: Synthetic Or Not — Identifying Counterfeit Songs"
+"""
+from __future__ import annotations
+import csv
+import io
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import soundfile as sf
+_AI_LABELS = frozenset({"ai", "fake", "generated", "synthetic"})
+def _infer_is_ai(sample) -> Optional[bool]:
+    """
+    Decide whether a dataset sample is AI-generated.
+    Returns True (AI), False (human), or None when the sample has no
+    usable label and should be skipped.
+    """
+    explicit = sample.get("is_ai")
+    if explicit is not None:
+        return bool(explicit)
+    label = sample.get("label")
+    if isinstance(label, str):
+        normalized = label.strip().lower()
+        if not normalized:
+            # A blank label carries no information; do not guess "human".
+            return None
+        # NOTE(review): any other non-empty string is treated as human,
+        # matching the original behaviour -- confirm against the dataset's
+        # actual label vocabulary.
+        return normalized in _AI_LABELS
+    if isinstance(label, (int, float)):
+        return bool(label)
+    return None
+def load_sonics(
+    output_dir: str | Path,
+    max_samples: int = 20_000,
+    split: str = "train",
+    seed: int = 42,
+) -> Path:
+    """
+    Download SONICS samples and write a class-balanced training manifest.
+    Streams the dataset, keeps at most ``max_samples // 2`` AI clips and
+    the same number of human clips, writes each kept clip (truncated to
+    30 seconds) as a WAV file under ``<output_dir>/audio`` and records it
+    in ``<output_dir>/manifest.csv``.
+    Args:
+        output_dir: Directory to save audio files and manifest.
+        max_samples: Maximum total samples (balanced AI/human). Values
+            below 2 produce a header-only manifest without touching
+            the dataset.
+        split: Dataset split to use.
+        seed: Accepted for interface compatibility; the current
+            implementation takes samples in stream order and does not
+            use it.
+    Returns:
+        Path to the manifest CSV file.
+    """
+    output_dir = Path(output_dir)
+    audio_dir = output_dir / "audio"
+    audio_dir.mkdir(parents=True, exist_ok=True)
+    manifest_path = output_dir / "manifest.csv"
+    half = max_samples // 2
+    if half > 0:
+        from datasets import load_dataset
+        print(f"Loading SONICS dataset (split={split})...")
+        # NOTE(security): trust_remote_code=True lets the dataset repo run
+        # its own loading code on this machine. Kept as-is; review before
+        # pointing this at a repo you do not control.
+        ds = load_dataset(
+            "awesomejjay/sonics",
+            split=split,
+            streaming=True,
+            trust_remote_code=True,
+        )
+    else:
+        # Nothing can be kept, so do not stream the whole dataset: the
+        # per-sample quota check below would skip every item and the
+        # loop's exit condition would never be evaluated.
+        ds = ()
+    ai_count = 0
+    human_count = 0
+    total = 0
+    with open(manifest_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=[
+            "file_path", "label", "label_int",
+            "generator", "duration_sec", "sample_rate",
+        ])
+        writer.writeheader()
+        for sample in ds:
+            is_ai = _infer_is_ai(sample)
+            if is_ai is None:
+                continue  # No usable label
+            # Balance classes: skip once this class has filled its quota
+            if (ai_count if is_ai else human_count) >= half:
+                continue
+            # Extract audio
+            audio_data = sample.get("audio", None)
+            if audio_data is None:
+                continue
+            array = audio_data.get("array", None)
+            sr = audio_data.get("sampling_rate", 16000)
+            if array is None or not sr or sr <= 0:
+                continue  # Missing samples or unusable sample rate
+            sr = int(sr)  # Slicing and the WAV header need an integer rate
+            if len(array) < sr:
+                continue  # Skip clips shorter than one second
+            generator = sample.get("generator", "unknown")
+            # Save audio file
+            label_tag = "ai" if is_ai else "human"
+            filename = f"{label_tag}_{total:06d}.wav"
+            filepath = audio_dir / filename
+            audio_array = np.asarray(array, dtype=np.float32)
+            # Truncate to 30 seconds max to save space
+            max_len = sr * 30
+            if len(audio_array) > max_len:
+                audio_array = audio_array[:max_len]
+            duration = len(audio_array) / sr
+            sf.write(str(filepath), audio_array, sr)
+            writer.writerow({
+                "file_path": str(filepath),
+                "label": label_tag,
+                "label_int": 1 if is_ai else 0,
+                "generator": generator,
+                "duration_sec": round(duration, 2),
+                "sample_rate": sr,
+            })
+            if is_ai:
+                ai_count += 1
+            else:
+                human_count += 1
+            total += 1
+            if total % 100 == 0:
+                print(
+                    f"  [{total}/{max_samples}] "
+                    f"AI={ai_count}, Human={human_count}"
+                )
+            if ai_count >= half and human_count >= half:
+                break
+    print(
+        f"\nDataset ready: {total} samples "
+        f"(AI={ai_count}, Human={human_count})"
+    )
+    print(f"Manifest: {manifest_path}")
+    print(f"Audio dir: {audio_dir}")
+    return manifest_path
+def load_manifest(manifest_path: str | Path) -> list[dict]:
+    """
+    Load a manifest CSV into a list of row dicts.
+    Args:
+        manifest_path: Path to a CSV file whose header includes the
+            columns label_int, duration_sec and sample_rate.
+    Returns:
+        One dict per CSV row. label_int and sample_rate are converted
+        to int and duration_sec to float; all other columns stay str.
+    Raises:
+        KeyError: If one of the converted columns is absent from the header.
+        ValueError: If a converted column holds a non-numeric value.
+    """
+    rows = []
+    # newline="" hands line-ending handling to the csv module, so quoted
+    # fields that contain "\r\n" are read back exactly as written.
+    with open(manifest_path, "r", newline="", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            row["label_int"] = int(row["label_int"])
+            row["duration_sec"] = float(row["duration_sec"])
+            row["sample_rate"] = int(row["sample_rate"])
+            rows.append(row)
+    return rows
+if __name__ == "__main__":
+    # Command-line entry: [output_dir] [max_samples], both optional.
+    cli_args = sys.argv[1:]
+    target_dir = cli_args[0] if cli_args else "data/sonics"
+    sample_cap = int(cli_args[1]) if len(cli_args) >= 2 else 2000
+    load_sonics(target_dir, max_samples=sample_cap)