Spaces:
Sleeping
Sleeping
File size: 4,980 Bytes
bfbcec4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | """
SONICS dataset loader for AURIS training pipeline.
Downloads AI-generated and human-composed music samples
from the SONICS dataset on HuggingFace, saves audio files
to disk, and creates a CSV manifest for training.
SONICS: ~97K tracks from multiple AI generators and human sources.
Paper: "SONICS: Synthetic Or Not — Identifying Counterfeit Songs"
"""
from __future__ import annotations
import csv
import io
import os
import sys
from pathlib import Path
from typing import Optional
import numpy as np
import soundfile as sf
def load_sonics(
output_dir: str | Path,
max_samples: int = 20_000,
split: str = "train",
seed: int = 42,
) -> Path:
"""
Download SONICS dataset and create training manifest.
Args:
output_dir: Directory to save audio files and manifest.
max_samples: Maximum total samples (balanced AI/human).
split: Dataset split to use.
seed: Random seed for reproducibility.
Returns:
Path to the manifest CSV file.
"""
from datasets import load_dataset
output_dir = Path(output_dir)
audio_dir = output_dir / "audio"
audio_dir.mkdir(parents=True, exist_ok=True)
manifest_path = output_dir / "manifest.csv"
print(f"Loading SONICS dataset (split={split})...")
ds = load_dataset(
"awesomejjay/sonics",
split=split,
streaming=True,
trust_remote_code=True,
)
half = max_samples // 2
ai_count = 0
human_count = 0
total = 0
with open(manifest_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=[
"file_path", "label", "label_int",
"generator", "duration_sec", "sample_rate",
])
writer.writeheader()
for sample in ds:
# Determine label
is_ai = sample.get("is_ai", None)
label_str = sample.get("label", "")
generator = sample.get("generator", "unknown")
if is_ai is None:
# Try to infer from label field
if isinstance(label_str, str):
is_ai = label_str.lower() in (
"ai", "fake", "generated", "synthetic",
)
elif isinstance(label_str, (int, float)):
is_ai = bool(label_str)
else:
continue
# Balance classes
if is_ai and ai_count >= half:
continue
if not is_ai and human_count >= half:
continue
# Extract audio
audio_data = sample.get("audio", None)
if audio_data is None:
continue
array = audio_data.get("array", None)
sr = audio_data.get("sampling_rate", 16000)
if array is None or len(array) < sr:
continue # Skip very short clips
# Save audio file
label_tag = "ai" if is_ai else "human"
filename = f"{label_tag}_{total:06d}.wav"
filepath = audio_dir / filename
audio_array = np.array(array, dtype=np.float32)
# Truncate to 30 seconds max to save space
max_len = sr * 30
if len(audio_array) > max_len:
audio_array = audio_array[:max_len]
duration = len(audio_array) / sr
sf.write(str(filepath), audio_array, sr)
writer.writerow({
"file_path": str(filepath),
"label": label_tag,
"label_int": 1 if is_ai else 0,
"generator": generator,
"duration_sec": round(duration, 2),
"sample_rate": sr,
})
if is_ai:
ai_count += 1
else:
human_count += 1
total += 1
if total % 100 == 0:
print(
f" [{total}/{max_samples}] "
f"AI={ai_count}, Human={human_count}"
)
if ai_count >= half and human_count >= half:
break
print(
f"\nDataset ready: {total} samples "
f"(AI={ai_count}, Human={human_count})"
)
print(f"Manifest: {manifest_path}")
print(f"Audio dir: {audio_dir}")
return manifest_path
def load_manifest(manifest_path: str | Path) -> list[dict]:
"""Load manifest CSV into list of dicts."""
rows = []
with open(manifest_path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
row["label_int"] = int(row["label_int"])
row["duration_sec"] = float(row["duration_sec"])
row["sample_rate"] = int(row["sample_rate"])
rows.append(row)
return rows
if __name__ == "__main__":
    # Usage: python <script> [output_dir] [max_samples]
    # Both arguments are optional and positional.
    cli_args = sys.argv[1:]
    target_dir = cli_args[0] if len(cli_args) >= 1 else "data/sonics"
    sample_cap = int(cli_args[1]) if len(cli_args) >= 2 else 2000
    load_sonics(target_dir, max_samples=sample_cap)
|