Spaces:

audeering
/

speech-analysis

Running

App Files Files Community

hagenw committed on Aug 29, 2024

Commit

f62c750

1 Parent(s): ba45a7b

Add expression model

Browse files

Files changed (1) hide show

app.py +92 -34

app.py CHANGED Viewed

@@ -12,8 +12,9 @@ import audresample
 device = 0 if torch.cuda.is_available() else "cpu"
-model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
 duration = 1  # limit processing of audio
 class AgeGenderHead(nn.Module):
@@ -66,10 +67,55 @@ class AgeGenderModel(Wav2Vec2PreTrainedModel):
         return hidden_states, logits_age, logits_gender
-# load model from hub
-processor = Wav2Vec2Processor.from_pretrained(model_name)
-model = AgeGenderModel.from_pretrained(model_name)
 def process_func(x: np.ndarray, sampling_rate: int) -> dict:
@@ -77,28 +123,38 @@ def process_func(x: np.ndarray, sampling_rate: int) -> dict:
     # run through processor to normalize signal
     # always returns a batch, so we just get the first entry
     # then we put it on the device
-    y = processor(x, sampling_rate=sampling_rate)
-    y = y['input_values'][0]
-    y = y.reshape(1, -1)
-    y = torch.from_numpy(y).to(device)
-    # run through model
-    with torch.no_grad():
-        y = model(y)
-        y = torch.hstack([y[1], y[2]])
-    # convert to numpy
-    y = y.detach().cpu().numpy()
-    # convert to dict
-    y = {
-        "age": 100 * y[0][0],
-        "female": y[0][1],
-        "male": y[0][2],
-        "child": y[0][3],
-    }
-    return y
 @spaces.GPU
@@ -117,17 +173,17 @@ def recognize(input_file):
     target_rate = 16000
     signal = audresample.resample(signal, sampling_rate, target_rate)
-    age_gender = process_func(signal, target_rate)
-    age = f"{round(age_gender['age'])} years"
-    gender = {k: v for k, v in age_gender.items() if k != "age"}
-    return age, gender
 outputs = gr.Label()
 title = "audEERING age and gender recognition"
 description = (
-    "Recognize age and gender of a microphone recording or audio file. "
-    f"Demo uses the checkpoint [{model_name}](https://huggingface.co/{model_name})."
 )
 allow_flagging = "never"
@@ -159,8 +215,8 @@ with gr.Blocks() as demo:
     gr.Markdown(description)
     with gr.Tab(label="Speech analysis"):
         with gr.Row():
-            gr.Markdown("Only the first second of the audio is processed.")
             with gr.Column():
                 input = gr.Audio(
                     sources=["upload", "microphone"],
                     type="filepath",
@@ -170,8 +226,10 @@ with gr.Blocks() as demo:
             with gr.Column():
                 output_age = gr.Textbox(label="Age")
                 output_gender = gr.Label(label="Gender")
-        submit_btn.click(recognize, input, [output_age, output_gender])
 demo.launch(debug=True)

 device = 0 if torch.cuda.is_available() else "cpu"
 duration = 1  # limit processing of audio
+age_gender_model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
+expression_model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
 class AgeGenderHead(nn.Module):
         return hidden_states, logits_age, logits_gender
class ExpressionHead(nn.Module):
    r"""Expression model head.

    Applies dropout, a dense layer followed by ``tanh``,
    dropout again, and a final linear projection.

    Args:
        config: model configuration;
            ``hidden_size``, ``final_dropout`` and ``num_labels``
            are read from it

    """

    def __init__(self, config):
        super().__init__()
        # Attribute names are kept unchanged:
        # they determine the parameter names of the module.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features: torch.Tensor, **kwargs) -> torch.Tensor:
        r"""Project ``features`` to ``config.num_labels`` outputs.

        Additional keyword arguments are accepted and ignored.

        """
        x = self.dropout(features)
        x = torch.tanh(self.dense(x))
        x = self.dropout(x)
        return self.out_proj(x)
class ExpressionModel(Wav2Vec2PreTrainedModel):
    r"""Speech expression model.

    Combines a ``Wav2Vec2Model`` backbone with an ``ExpressionHead``.

    Args:
        config: model configuration,
            passed on to the backbone and the head

    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = ExpressionHead(config)
        self.init_weights()

    def forward(self, input_values):
        r"""Return pooled hidden states and logits for ``input_values``.

        Returns:
            tuple ``(hidden_states, logits)``

        """
        outputs = self.wav2vec2(input_values)
        # Pool the first backbone output by averaging over dim 1
        # (presumably the time axis -- confirm against the backbone docs)
        hidden_states = torch.mean(outputs[0], dim=1)
        logits = self.classifier(hidden_states)
        return hidden_states, logits
+# Load models from hub
+age_gender_processor = Wav2Vec2Processor.from_pretrained(age_gender_model_name)
+age_gender_model = AgeGenderModel.from_pretrained(age_gender_model_name)
+expression_processor = Wav2Vec2Processor.from_pretrained(expression_model_name)
+expression_model = ExpressionModel.from_pretrained(expression_model_name)
 def process_func(x: np.ndarray, sampling_rate: int) -> tuple:
     r"""Predict age, gender and expression from an audio signal.

     The signal is passed through the age/gender model
     and through the expression model.

     Args:
         x: audio signal
         sampling_rate: sampling rate of ``x``

     Returns:
         tuple ``(age, gender, expression)``:
         ``age`` is the first age/gender output multiplied by 100,
         ``gender`` maps ``"female"``, ``"male"`` and ``"child"``
         to the remaining age/gender outputs,
         ``expression`` maps ``"arousal"``, ``"dominance"``
         and ``"valence"`` to the expression outputs

     """
     results = []
     for processor, model in (
         (age_gender_processor, age_gender_model),
         (expression_processor, expression_model),
     ):
         # run through processor to normalize signal
         # always returns a batch, so we just get the first entry
         # then we put it on the device
         y = processor(x, sampling_rate=sampling_rate)
         y = y['input_values'][0]
         y = y.reshape(1, -1)
         y = torch.from_numpy(y).to(device)
         # run through model
         with torch.no_grad():
             outputs = model(y)
         # The first output holds the hidden states,
         # everything after it is logits.
         # The age/gender model returns two logit tensors,
         # the expression model only one,
         # so indexing outputs[2] unconditionally would raise IndexError.
         logits = torch.hstack(list(outputs[1:]))
         # convert to numpy and drop the batch dimension
         results.append(logits.detach().cpu().numpy()[0])
     age_gender, expression = results
     return (
         100 * age_gender[0],  # age
         {
             "female": age_gender[1],
             "male": age_gender[2],
             "child": age_gender[3],
         },
         {
             "arousal": expression[0],
             "dominance": expression[1],
             "valence": expression[2],
         },
     )
 @spaces.GPU
     target_rate = 16000
     signal = audresample.resample(signal, sampling_rate, target_rate)
+    return process_func(signal, target_rate)
 outputs = gr.Label()
 title = "audEERING age and gender recognition"
 description = (
+    "Speech analysis of an audio file or microphone recording.  \n"
+    f"[{age_gender_model_name}](https://huggingface.co/{age_gender_model_name}) "
+    "is used for age and gender recognition, "
+    f"[{expression_model_name}](https://huggingface.co/{expression_model_name}) "
+    "is used for expression recognition."
 )
 allow_flagging = "never"
     gr.Markdown(description)
     with gr.Tab(label="Speech analysis"):
         with gr.Row():
             with gr.Column():
+                gr.Markdown("Only the first second of the audio is processed.")
                 input = gr.Audio(
                     sources=["upload", "microphone"],
                     type="filepath",
             with gr.Column():
                 output_age = gr.Textbox(label="Age")
                 output_gender = gr.Label(label="Gender")
+                output_expression = gr.Label(label="Expression")
+        outputs = [output_age, output_gender, output_expression]
+        submit_btn.click(recognize, input, outputs)
 demo.launch(debug=True)