import gradio as gr
import numpy as np
import torch
import torchaudio
import sys


def convert_to_16_bit_wav(data):
    """Convert an audio array to 16-bit signed integer PCM.

    Based on: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.write.html
    """
    if data.dtype == np.float32:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr
        )
        # Peak-normalize, then scale to the int16 range.
        data = data / np.abs(data).max()
        data = data * 32767
        data = data.astype(np.int16)
    elif data.dtype == np.int32:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr
        )
        # Scale the int32 range down to the int16 range.
        data = data / 65538
        data = data.astype(np.int16)
    elif data.dtype == np.int16:
        pass
    elif data.dtype == np.uint8:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr
        )
        # Map unsigned 8-bit [0, 255] onto signed 16-bit [-32768, 32767].
        data = data * 257 - 32768
        data = data.astype(np.int16)
    else:
        raise ValueError("Audio data cannot be converted to 16-bit int format.")
    return data
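
# Illustrative example (not executed by the app): a float32 signal in [-1, 1]
# is peak-normalized and scaled to the int16 range, e.g.
#   >>> convert_to_16_bit_wav(np.array([0.0, 0.5, -1.0], dtype=np.float32))
#   array([     0,  16383, -32767], dtype=int16)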

def pcm2float(sig, dtype='float32'):
    """Convert an integer PCM signal to a floating point signal in [-1.0, 1.0).

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(sig.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max  # 0 for signed input types, half the range for unsigned
    return (sig.astype(dtype) - offset) / abs_max


def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal in [-1.0, 1.0] to integer PCM, clipping out-of-range values.

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
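
# Illustrative example (not executed by the app): pcm2float and float2pcm are
# near-inverses, so int16 samples round-trip unchanged, while float2pcm clips
# values that would overflow the int16 range (e.g. 1.0 -> 32767):
#   >>> float2pcm(pcm2float(np.array([0, 16384, -32768], dtype=np.int16)))
#   array([     0,  16384, -32768], dtype=int16)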


@torch.no_grad()
def inference(audio, model_tag="hifigan_bn_tdnnf_wav2vec2_vq_48_v1"):
    """Anonymize the input audio with the selected SA-toolkit model."""
    sr, audio = audio
    audio = convert_to_16_bit_wav(audio)
    audio = pcm2float(audio)
    audio = torch.tensor(audio).unsqueeze(0)
    # The anonymization models expect 16 kHz input.
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    print(model_tag, file=sys.stderr)
    # Note: force_reload=True re-downloads the model on every request.
    model = torch.hub.load("deep-privacy/SA-toolkit", "anonymization",
                           tag_version=model_tag, trust_repo=True, force_reload=True)
    model.eval()
    wav_conv = model.convert(audio, target="6081")  # hard-coded target speaker
    return 16000, float2pcm(wav_conv.squeeze().cpu().numpy())
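
# Illustrative stand-alone usage outside Gradio (sketch; "input.wav" and
# "anon.wav" are placeholder paths):
#   wav, sr = torchaudio.load("input.wav")           # float32, (channels, samples)
#   sr_out, pcm = inference((sr, wav[0].numpy()))    # anonymize the first channel
#   torchaudio.save("anon.wav",
#                   torch.from_numpy(pcm2float(pcm)).unsqueeze(0), sr_out)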


article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2308.04455' target='_blank'>PhD thesis: Anonymizing Speech: Evaluating and Designing Speaker Anonymization Techniques</a> | <a href='https://github.com/deep-privacy/SA-toolkit' target='_blank'>Github Repo</a></p>"


with gr.Blocks() as interface:
  gr.Markdown(
            """
            # SA-toolkit
            Demo: speaker anonymization toolkit in Python
            """
        )
  with gr.Row():
    with gr.Column():
      audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="File",
                        interactive=True, elem_id="melody-input")
      model_tag = gr.Dropdown([
                              'hifigan_bn_tdnnf_wav2vec2_vq_48_v1+f0-transformation=quant_16_awgn_2',
                              'hifigan_clean_bn_tdnnf_wav2vec2_train_600_vq_48_v1',
                              'hifigan_clean_bn_tdnnf_wav2vec2_train_600_vq_48_v1+f0-transformation=quant_16_awgn_2',
                              'hifigan_inception_bn_tdnnf_wav2vec2_train_600_vq_48_v1+f0-transformation=quant_16_awgn_2',
                              'hifigan_bn_tdnnf_wav2vec2_vq_48_v1',
                              'hifigan_bn_tdnnf_wav2vec2_100h_aug_v1',
                              'hifigan_bn_tdnnf_600h_aug_v1',
                              'hifigan_bn_tdnnf_600h_vq_48_v1',
                              'hifigan_bn_tdnnf_100h_vq_64_v1',
                              'hifigan_bn_tdnnf_100h_vq_256_v1',
                              'hifigan_bn_tdnnf_100h_aug_v1'], type='value',
                              value='hifigan_bn_tdnnf_wav2vec2_vq_48_v1',
                             label='Model')
      with gr.Row():
        submit = gr.Button("Submit")
    with gr.Column():
      audio_output = gr.Audio(label="Output")
  submit.click(inference, inputs=[audio_input, model_tag],
                 outputs=[audio_output], batch=False)
  gr.Examples(fn=inference,
              examples=[['3853-163249-0000.flac', 'hifigan_bn_tdnnf_wav2vec2_vq_48_v1']],
              inputs=[audio_input, model_tag],
              outputs=[audio_output], batch=False)


  gr.HTML(article)

interface.launch()