File size: 4,980 Bytes
bfbcec4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
SONICS dataset loader for AURIS training pipeline.

Downloads AI-generated and human-composed music samples
from the SONICS dataset on HuggingFace, saves audio files
to disk, and creates a CSV manifest for training.

SONICS: ~97K tracks from multiple AI generators and human sources.
Paper: "SONICS: Synthetic Or Not — Identifying Counterfeit Songs"
"""

from __future__ import annotations

import csv
import io
import os
import sys
from pathlib import Path
from typing import Optional

import numpy as np
import soundfile as sf


def load_sonics(
    output_dir: str | Path,
    max_samples: int = 20_000,
    split: str = "train",
    seed: int = 42,
) -> Path:
    """
    Download SONICS dataset and create training manifest.

    Args:
        output_dir: Directory to save audio files and manifest.
        max_samples: Maximum total samples (balanced AI/human).
        split: Dataset split to use.
        seed: Random seed for reproducibility.

    Returns:
        Path to the manifest CSV file.
    """
    from datasets import load_dataset

    output_dir = Path(output_dir)
    audio_dir = output_dir / "audio"
    audio_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = output_dir / "manifest.csv"

    print(f"Loading SONICS dataset (split={split})...")
    ds = load_dataset(
        "awesomejjay/sonics",
        split=split,
        streaming=True,
        trust_remote_code=True,
    )

    half = max_samples // 2
    ai_count = 0
    human_count = 0
    total = 0

    with open(manifest_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "file_path", "label", "label_int",
            "generator", "duration_sec", "sample_rate",
        ])
        writer.writeheader()

        for sample in ds:
            # Determine label
            is_ai = sample.get("is_ai", None)
            label_str = sample.get("label", "")
            generator = sample.get("generator", "unknown")

            if is_ai is None:
                # Try to infer from label field
                if isinstance(label_str, str):
                    is_ai = label_str.lower() in (
                        "ai", "fake", "generated", "synthetic",
                    )
                elif isinstance(label_str, (int, float)):
                    is_ai = bool(label_str)
                else:
                    continue

            # Balance classes
            if is_ai and ai_count >= half:
                continue
            if not is_ai and human_count >= half:
                continue

            # Extract audio
            audio_data = sample.get("audio", None)
            if audio_data is None:
                continue

            array = audio_data.get("array", None)
            sr = audio_data.get("sampling_rate", 16000)

            if array is None or len(array) < sr:
                continue  # Skip very short clips

            # Save audio file
            label_tag = "ai" if is_ai else "human"
            filename = f"{label_tag}_{total:06d}.wav"
            filepath = audio_dir / filename

            audio_array = np.array(array, dtype=np.float32)

            # Truncate to 30 seconds max to save space
            max_len = sr * 30
            if len(audio_array) > max_len:
                audio_array = audio_array[:max_len]

            duration = len(audio_array) / sr

            sf.write(str(filepath), audio_array, sr)

            writer.writerow({
                "file_path": str(filepath),
                "label": label_tag,
                "label_int": 1 if is_ai else 0,
                "generator": generator,
                "duration_sec": round(duration, 2),
                "sample_rate": sr,
            })

            if is_ai:
                ai_count += 1
            else:
                human_count += 1
            total += 1

            if total % 100 == 0:
                print(
                    f"  [{total}/{max_samples}] "
                    f"AI={ai_count}, Human={human_count}"
                )

            if ai_count >= half and human_count >= half:
                break

    print(
        f"\nDataset ready: {total} samples "
        f"(AI={ai_count}, Human={human_count})"
    )
    print(f"Manifest: {manifest_path}")
    print(f"Audio dir: {audio_dir}")

    return manifest_path


def load_manifest(manifest_path: str | Path) -> list[dict]:
    """Load manifest CSV into list of dicts."""
    rows = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            row["label_int"] = int(row["label_int"])
            row["duration_sec"] = float(row["duration_sec"])
            row["sample_rate"] = int(row["sample_rate"])
            rows.append(row)
    return rows


if __name__ == "__main__":
    # Command-line use: <script> [output_dir] [max_samples]
    # Both arguments are optional and positional.
    cli_args = sys.argv[1:]
    target_dir = cli_args[0] if cli_args else "data/sonics"
    sample_cap = int(cli_args[1]) if len(cli_args) > 1 else 2000
    load_sonics(target_dir, max_samples=sample_cap)