luckyhookin committed on
Commit
563ddb0
·
0 Parent(s):
Files changed (5) hide show
  1. .gitattributes +35 -0
  2. README.md +12 -0
  3. app.py +118 -0
  4. packages.txt +2 -0
  5. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Speaker-diarization-diar Sortformer
3
+ emoji: 📚
4
+ colorFrom: gray
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.49.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ import os
3
+ import gradio as gr
4
+ import spaces
5
+ from pydub import AudioSegment
6
+ import json
7
+ import requests
8
+ from nemo.collections.asr.models import SortformerEncLabelModel
9
+
10
# Load the pretrained streaming Sortformer diarization model once at import
# time and put it in evaluation mode.
diar_model = SortformerEncLabelModel.from_pretrained(
    "nvidia/diar_streaming_sortformer_4spk-v2"
)
diar_model.eval()

# Streaming parameters applied to the model's ``sortformer_modules``.
# They are set in this order and then validated by the module's own
# ``_check_streaming_parameters`` hook.
_streaming_modules = diar_model.sortformer_modules
for _param_name, _param_value in (
    ("chunk_len", 340),
    ("chunk_right_context", 40),
    ("fifo_len", 40),
    ("spkcache_update_period", 300),
    ("spkcache_len", 188),
):
    setattr(_streaming_modules, _param_name, _param_value)
_streaming_modules._check_streaming_parameters()
19
+
20
def preprocess_audio(audio_path):
    """Convert audio to a mono, 16 kHz WAV file for the diarization model.

    Args:
        audio_path: Either a filesystem path (``str``) or the raw encoded
            audio as a bytes-like object.

    Returns:
        Path of a newly created temporary WAV file. The caller owns the file
        and is responsible for deleting it.

    Raises:
        ValueError: If no audio was supplied or the audio cannot be decoded
            or exported.
    """
    import tempfile

    if not audio_path:
        raise ValueError("Error preprocessing audio: no audio input provided")

    temp_wav = None
    try:
        # A str is treated as a path; anything else is wrapped as in-memory bytes.
        source = audio_path if isinstance(audio_path, str) else BytesIO(audio_path)
        audio = AudioSegment.from_file(source)
        # Convert to mono and set sample rate to 16kHz
        audio = audio.set_channels(1).set_frame_rate(16000)

        # Use a unique file per call: requests may run concurrently, and a
        # shared fixed name would let one request overwrite or delete
        # another request's audio.
        fd, temp_wav = tempfile.mkstemp(prefix="diar_", suffix=".wav")
        os.close(fd)
        audio.export(temp_wav, format="wav")
        return temp_wav
    except Exception as e:
        # Do not leave a partially written file behind on failure.
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)
        raise ValueError(f"Error preprocessing audio: {str(e)}") from e
38
+
39
def handle_audio(url, audio_path):
    """Diarize audio supplied by URL or upload and return the result as JSON.

    Args:
        url: Optional URL of an audio file. When non-empty it takes
            precedence over ``audio_path`` and the response body is used as
            the audio data.
        audio_path: Filesystem path of the uploaded audio; may be ``None``
            when a URL is given.

    Returns:
        A JSON string serialising whatever ``diarize_audio_diar1`` returned.

    Raises:
        ValueError: If the audio cannot be preprocessed.
        requests.RequestException: If the download fails or the server
            answers with an error status.
    """
    url = (url or "").strip()
    if url:
        # NOTE(review): the URL comes straight from the user; consider
        # restricting allowed hosts if this runs inside a private network.
        response = requests.get(url, timeout=60)
        # Fail early on 4xx/5xx instead of feeding an error page to the decoder.
        response.raise_for_status()
        audio_path = response.content

    wav_path = preprocess_audio(audio_path)
    try:
        res = diarize_audio_diar1(wav_path)
    finally:
        # Clean up the temporary file even if diarization raises.
        if os.path.exists(wav_path):
            os.remove(wav_path)
    return json.dumps(res)
53
+
54
+
55
@spaces.GPU(duration=120)
def diarize_audio_diar1(audio_path):
    """Run the diarization model on one audio file and format its segments.

    Args:
        audio_path: Path of the audio file handed to ``diar_model.diarize``.

    Returns:
        The output of ``format_results`` for the first entry of the model's
        prediction, or the tuple ``("Error: <message>", "")`` if anything
        raised.
    """
    try:
        segments_per_input = diar_model.diarize(audio=audio_path, batch_size=1)
        # Only one file is submitted, so only the first result is used.
        first_input_segments = segments_per_input[0]
        return format_results(first_input_segments)
    except Exception as e:
        # Failures are reported as a value rather than raised.
        # NOTE(review): the two-element tuple looks like a leftover from a
        # two-output interface; callers currently serialise it as-is.
        error_text = "Error: " + str(e)
        return error_text, ""
64
+
65
def format_results(results):
    """Normalise raw diarization output into a sorted list of segment dicts.

    Args:
        results: A list of segments, or a JSON string that decodes to one.
            Each segment may be a whitespace-separated string
            ``"<start> <end> <speaker>"`` or a dict with ``start``, ``end``
            and ``speaker`` / ``speaker_id`` keys.

    Returns:
        A list of ``{"start", "end", "speaker_id"}`` dicts ordered by
        ``start``. Anything that is not a list yields ``[]``; items of an
        unrecognised type, and strings without exactly three fields, are
        skipped.
    """
    import json

    segments = json.loads(results) if isinstance(results, str) else results
    if not isinstance(segments, list):
        return []

    def _normalise(entry):
        """Return the segment dict for one raw entry, or None to skip it."""
        if isinstance(entry, str):
            fields = entry.strip().split()
            if len(fields) != 3:
                return None
            begin, finish, speaker = fields
            return {
                "start": float(begin),
                "end": float(finish),
                "speaker_id": speaker,
            }
        if isinstance(entry, dict):
            return {
                "start": entry.get("start", 0),
                "end": entry.get("end", 0),
                "speaker_id": entry.get("speaker", entry.get("speaker_id", "unknown")),
            }
        return None

    normalised = [seg for seg in map(_normalise, segments) if seg is not None]
    return sorted(normalised, key=lambda seg: seg["start"])
95
+
96
# Gradio interface: a URL box and an audio upload feed handle_audio, and the
# diarization result is shown as JSON text.
with gr.Blocks() as demo:
    gr.Markdown("# Speaker Diarization with nvidia/diar_streaming_sortformer_4spk-v2")
    # The description matches the inputs that actually exist: there is no
    # speaker-count control, and handle_audio prefers the URL when both are set.
    gr.Markdown(
        "Enter an audio URL or upload an audio file to diarize it. "
        "If both are provided, the URL is used."
    )

    with gr.Row():
        url_input = gr.Textbox(label="URL")
        audio_input = gr.Audio(label="Upload Audio File", type="filepath")

    submit_btn = gr.Button("Diarize")

    with gr.Row():
        json_output = gr.Textbox(label="Diarization Results (JSON)")

    submit_btn.click(
        fn=handle_audio,
        inputs=[url_input, audio_input],
        outputs=[json_output],
        concurrency_limit=2,
    )

# Launch the Gradio app
demo.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pydub
2
+ gradio
3
+ spaces
4
+ Cython
5
+ packaging
6
+ nemo_toolkit[asr]