import gradio as gr
import numpy as np
import torch
import torchaudio
import sys
def convert_to_16_bit_wav(data):
    # Based on: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.write.html
    if data.dtype == np.float32:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr,
        )
        # Normalize to [-1, 1], then scale to the int16 range.
        data = data / np.abs(data).max()
        data = data * 32767
        data = data.astype(np.int16)
    elif data.dtype == np.int32:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr,
        )
        # int32 full scale is 2**31 and int16 is 2**15, so divide by
        # 2**16 == 65536 (the original divisor 65538 was off by two).
        data = data / 65536
        data = data.astype(np.int16)
    elif data.dtype == np.int16:
        pass
    elif data.dtype == np.uint8:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr,
        )
        # Promote before scaling so the product cannot overflow, then map
        # [0, 255] onto the int16 range (0 -> -32768, 255 -> 32767).
        data = data.astype(np.int32) * 257 - 32768
        data = data.astype(np.int16)
    else:
        raise ValueError("Audio data cannot be converted to 16-bit int format.")
    return data
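# Quick sanity check (a sketch; the values are illustrative, not part of the
# original script): a full-scale float32 signal should map to the int16 extremes.
#
#   x = np.array([-1.0, 0.0, 1.0], dtype=np.float32)
#   convert_to_16_bit_wav(x)  # -> array([-32767, 0, 32767], dtype=int16)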
def pcm2float(sig, dtype='float32'):
    """
    Convert a PCM integer signal to floating point with a range of [-1, 1].

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")
    i = np.iinfo(sig.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max  # 0 for signed types, 2**(bits-1) for unsigned
    return (sig.astype(dtype) - offset) / abs_max
def float2pcm(sig, dtype='int16'):
    """
    Convert a floating point signal in [-1, 1] to a PCM integer signal.

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
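# Round-trip sketch (illustrative, not part of the original script): float2pcm
# inverts pcm2float up to one int16 quantization step (2**-15), since astype
# truncates toward zero.
#
#   x = np.array([-0.5, 0.0, 0.5], dtype=np.float32)
#   np.allclose(pcm2float(float2pcm(x)), x, atol=2 ** -15)  # -> True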
@torch.no_grad()
def inference(audio, model_tag="hifigan_bn_tdnnf_wav2vec2_vq_48_v1"):
    # Gradio delivers (sample_rate, numpy array); normalize to float32 in [-1, 1]
    # and resample to the 16 kHz rate expected by the anonymization models.
    sr, audio = audio
    audio = convert_to_16_bit_wav(audio)
    audio = pcm2float(audio)
    audio = torch.tensor(audio).unsqueeze(0)
    audio = torchaudio.transforms.Resample(orig_freq=sr,
                                           new_freq=16000)(audio)
    print(model_tag, file=sys.stderr)
    model = torch.hub.load("deep-privacy/SA-toolkit", "anonymization",
                           tag_version=model_tag, trust_repo=True,
                           force_reload=True)
    model.eval()
    wav_conv = model.convert(audio, target="6081")  # hard-coded target speaker
    return 16000, float2pcm(wav_conv.squeeze().cpu().numpy())
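# Local usage sketch (an assumption for illustration, not part of the original
# script; it presumes the example clip below is decodable by torchaudio):
#
#   sig, sr = torchaudio.load("3853-163249-0000.flac")  # float32, shape [1, T]
#   sr_out, pcm = inference((sr, sig.squeeze().numpy()))
#   torchaudio.save("anon.wav", torch.tensor(pcm2float(pcm)).unsqueeze(0), sr_out)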
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2308.04455' target='_blank'>PhD thesis: Anonymizing Speech: Evaluating and Designing Speaker Anonymization Techniques</a> | <a href='https://github.com/deep-privacy/SA-toolkit' target='_blank'>Github Repo</a></p>"
with gr.Blocks() as interface:
    gr.Markdown(
        """
        # SA-toolkit
        Demo: speaker anonymization toolkit in Python
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy",
                                   label="File", interactive=True,
                                   elem_id="melody-input")
            model_tag = gr.Dropdown([
                'hifigan_bn_tdnnf_wav2vec2_vq_48_v1+f0-transformation=quant_16_awgn_2',
                'hifigan_clean_bn_tdnnf_wav2vec2_train_600_vq_48_v1',
                'hifigan_clean_bn_tdnnf_wav2vec2_train_600_vq_48_v1+f0-transformation=quant_16_awgn_2',
                'hifigan_inception_bn_tdnnf_wav2vec2_train_600_vq_48_v1+f0-transformation=quant_16_awgn_2',
                'hifigan_bn_tdnnf_wav2vec2_vq_48_v1',
                'hifigan_bn_tdnnf_wav2vec2_100h_aug_v1',
                'hifigan_bn_tdnnf_600h_aug_v1',
                'hifigan_bn_tdnnf_600h_vq_48_v1',
                'hifigan_bn_tdnnf_100h_vq_64_v1',
                'hifigan_bn_tdnnf_100h_vq_256_v1',
                'hifigan_bn_tdnnf_100h_aug_v1'], type='value',
                value='hifigan_bn_tdnnf_wav2vec2_vq_48_v1',
                label='Model')
            with gr.Row():
                submit = gr.Button("Submit")
        with gr.Column():
            audio_output = gr.Audio(label="Output")
    submit.click(inference, inputs=[audio_input, model_tag],
                 outputs=[audio_output], batch=False)
    # Examples rows must supply a value per input component, and `inputs` must
    # be the components themselves (not a string value).
    gr.Examples(fn=inference,
                examples=[['3853-163249-0000.flac',
                           'hifigan_bn_tdnnf_wav2vec2_vq_48_v1']],
                inputs=[audio_input, model_tag],
                outputs=[audio_output], batch=False)
    gr.HTML(article)
interface.launch()