Fix g2p issue
Files changed:
- inference_cli.py +26 -13
- modeling_uniflow_audio.py +27 -6
- utils/phonemize.py +87 -0
inference_cli.py
CHANGED
@@ -2,7 +2,6 @@

 from typing import Any, Callable
 import json
-import os

 import fire
 import torch
@@ -11,7 +10,8 @@ import soundfile as sf
 import numpy as np

 from modeling_uniflow_audio import UniFlowAudioModel
-from constants import TIME_ALIGNED_TASKS
+from constants import TIME_ALIGNED_TASKS
+from utils.phonemize import sentence_to_phones


 class InferenceCLI:
@@ -21,6 +21,7 @@ class InferenceCLI:
             "cuda" if torch.cuda.is_available() else "cpu"
         )
         self.g2p = None
+        self.word2phone = None
         self.speaker_model = None
         self.svs_processor = None
         self.singer_mapping = None
@@ -82,7 +83,7 @@ class InferenceCLI:
     @staticmethod
     def add_prehook(func: Callable, ):
         def wrapper(self, *args, **kwargs):
-            model_name = kwargs
+            model_name = kwargs.get("model_name", "UniFlow-Audio-large")
             self.on_inference_start(model_name)
             return func(self, *args, **kwargs)

@@ -144,22 +145,33 @@ class InferenceCLI:
         num_steps: int = 25,
         output_path: str = "./output.wav",
     ):
-
-        import
+
+        from montreal_forced_aligner.g2p.generator import PyniniConsoleGenerator

         self.init_speaker_model()

         if not self.g2p:
-
-
-
+            self.g2p = PyniniConsoleGenerator(
+                g2p_model_path=self.model.g2p_model_path,
+                strict_graphemes=False,
+                num_pronunciations=1,
+                include_bracketed=False
+            )
+            self.g2p.setup()
+
+        if not self.word2phone:
+            self.word2phone = json.load(
+                open(
+                    self.model.tts_word2phone_dict_path, "r", encoding="utf-8"
                 )
-            )
-
-
+            )
+
+        # OOV word will use a g2p model to predict phoneme
+        phonemes, OOV_list = sentence_to_phones(
+            transcript, self.word2phone, self.g2p
+        )

-
-        phonemes = [ph for ph in phonemes if ph != " "]
+        # print(phonemes)
         phone_indices = [
             self.model.tts_phone2id.get(
                 p, self.model.tts_phone2id.get("spn", 0)
@@ -370,6 +382,7 @@ class InferenceCLI:
         )
         waveform = waveform[0, 0].cpu().numpy()

+        output_path = output_path.__str__()
         if not output_path.endswith(".mp4"):
             sf.write(output_path, waveform, self.sample_rate)

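
Taken together, the new TTS path normalizes the transcript, looks each word up in the word2phone lexicon, falls back to the MFA Pynini G2P model for OOV words, and finally maps phones to ids with "spn" as the catch-all. A minimal sketch of that last mapping step, assuming a plain dict shaped like self.model.tts_phone2id (the helper name and toy inventory are illustrative, not part of the commit):

# Sketch of the phone-to-id lookup that follows sentence_to_phones in the
# diff above; "spn" (spoken noise) doubles as the fallback id for phones
# missing from the inventory.
def phones_to_indices(phonemes, phone2id):
    spn_id = phone2id.get("spn", 0)
    return [phone2id.get(p, spn_id) for p in phonemes]

phone2id = {"sil": 0, "spn": 1, "HH": 2, "AH0": 3, "L": 4, "OW1": 5}  # toy inventory
print(phones_to_indices(["sil", "HH", "AH0", "L", "OW1", "sil"], phone2id))
# -> [0, 2, 3, 4, 5, 0]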
modeling_uniflow_audio.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Any, Sequence
 from pathlib import Path
 import json
+import os
 import shutil

 import h5py
@@ -28,6 +29,17 @@ class UniFlowAudioModel(nn.Module):
         self.config["model"]["autoencoder"]["pretrained_ckpt"] = str(
             model_dir / self.config["model"]["autoencoder"]["pretrained_ckpt"]
         )
+        flan_t5_path = os.environ.get("FLAN_T5_PATH", "google/flan-t5-large")
+        try:
+            tokenizer = T5Tokenizer.from_pretrained(flan_t5_path)
+            encoder = T5EncoderModel.from_pretrained(flan_t5_path)
+        except Exception as e:
+            raise RuntimeError(
+                "Failed to initialize Flan-T5, please download it manually and set the `FLAN_T5_PATH`"
+                "environment variable to the path of the downloaded model."
+            ) from e
+        self.config["model"]["content_encoder"]["text_encoder"]["model_name"
+        ] = flan_t5_path
         self.model = hydra.utils.instantiate(
             self.config["model"], _convert_="all"
         )
@@ -42,6 +54,7 @@ class UniFlowAudioModel(nn.Module):
         shutil.copy(ori_model_path, self.g2p_model_path)

         self.tts_phone_set_path = model_dir / "mfa_g2p" / "phone_set.json"
+        self.tts_word2phone_dict_path = model_dir / "mfa_g2p" / "word2phone.json"
         self.build_tts_phone_mapping()
         self.svs_phone_set_path = model_dir / "svs" / "phone_set.json"
         singers = json.load(open(model_dir / "svs" / "spk_set.json", "r"))
@@ -65,12 +78,20 @@ class UniFlowAudioModel(nn.Module):
         self.tts_phone2id = {p: i for i, p in enumerate(phone_set)}

     def init_instruction_encoder(self):
-
-
-
-
-
-
+        flan_t5_path = os.environ.get("FLAN_T5_PATH", "google/flan-t5-large")
+        try:
+            self.instruction_tokenizer = T5Tokenizer.from_pretrained(
+                flan_t5_path
+            )
+            self.instruction_encoder = T5EncoderModel.from_pretrained(
+                flan_t5_path
+            )
+        except Exception as e:
+            raise RuntimeError(
+                "Failed to initialize Flan-T5, please download it manually and set the `FLAN_T5_PATH`"
+                "environment variable to the path of the downloaded model."
+            ) from e
+
         self.instruction_encoder.eval()

     @torch.inference_mode()
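
Both Flan-T5 loading sites now resolve the checkpoint through the same FLAN_T5_PATH environment variable, so an offline deployment only has to export one path before the model is constructed. A usage sketch, assuming a locally downloaded snapshot (the directory path is illustrative):

# Point UniFlow-Audio at a local Flan-T5 checkpoint; any directory holding
# the Hugging Face snapshot works. Set the variable before constructing the
# model, since both the config patch and init_instruction_encoder read it
# at load time.
import os
os.environ["FLAN_T5_PATH"] = "/data/checkpoints/flan-t5-large"

from modeling_uniflow_audio import UniFlowAudioModel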
utils/phonemize.py
ADDED
@@ -0,0 +1,87 @@
+import re
+
+
+def g2p_resolve(word, g2p_model):
+    """Call G2P to generate pronunciation (used for handling OOV words)."""
+    try:
+        result = g2p_model.rewriter(word.lower())
+        if result and result[0][0]:
+            return result[0][0].split()
+    except Exception:
+        return None
+    return None
+
+
+def text_norm(s):
+    """
+    Text normalization (keep internal apostrophes like don't, it's; remove quote-like apostrophes and other punctuation):
+    1. Lowercase the text
+    2. Keep apostrophes between letters (e.g. don't)
+    3. Remove apostrophes that are not between letters (used as quotes or standalone)
+    4. Remove other common punctuation marks (.,;!?()[]-"“” etc.)
+    5. Collapse multiple spaces into a single space
+    """
+    s = s.lower()
+
+    # First temporarily replace apostrophes between letters (a'b) with a placeholder to avoid deletion
+    # Support both ASCII ' and Unicode ’, ‘
+    APOST = "<<<APOST>>>"  # Placeholder string (ensured not to appear in normal sentences)
+    s = re.sub(r"(?<=[A-Za-z0-9])['\u2019\u2018](?=[A-Za-z0-9])", APOST, s)
+
+    # Remove all remaining apostrophes (these are quotes or isolated marks)
+    s = re.sub(r"['\u2019\u2018]", " ", s)
+
+    # Remove other punctuation (while keeping internal apostrophes protected by the placeholder)
+    s = re.sub(r"[,\.\!\?\;\:\(\)\[\]\"“”\-]", " ", s)
+
+    # Restore internal apostrophes back to ASCII apostrophe (or to the original character if needed)
+    s = s.replace(APOST, "'")
+
+    # Merge extra spaces
+    s = " ".join(s.split())
+
+    return s
+
+
+# ---------------- Core conversion ----------------
+def sentence_to_phones(sentence, word2phones, g2p_model):
+    """
+    Convert sentence to phones:
+    1. Split the original sentence and keep punctuation positions to insert sil later
+    2. Insert sil at punctuation positions
+    3. Add sil at the beginning and end of the sentence
+    """
+    original_sentence = sentence  # Save the original sentence
+    sentence = text_norm(sentence)
+
+    phone_sequence = ["sil"]  # Initial silence
+    oov_list = []
+
+    # Split the original sentence to locate punctuation positions
+
+    tokens = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[.,;!?]", original_sentence)
+
+    for token in tokens:
+        if re.match(r"[.,;!?]", token):  # Punctuation
+            phone_sequence.append("sil")
+        else:
+            word = text_norm(token)  # Normalize word
+
+            if word not in word2phones:
+                g2p_ph = g2p_resolve(word, g2p_model)
+                if g2p_ph:
+                    phone_sequence.extend(g2p_ph)
+                else:
+                    phone_sequence.append(
+                        "spn"
+                    )  # If it really cannot be handled, use a short pause
+
+                oov_list.append(word)
+
+            else:
+                pron, _ = max(word2phones[word].items(), key=lambda x: x[1])
+                phone_sequence.extend(pron.split())
+
+    if phone_sequence[-1] != 'sil':
+        phone_sequence.append("sil")  # Ending silence
+    return phone_sequence, oov_list
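
The word2phones lexicon implied by the lookup maps each word to a dict of pronunciation string -> frequency, and sentence_to_phones keeps the most frequent entry. A toy round trip, with illustrative lexicon entries and a stub standing in for PyniniConsoleGenerator (g2p_resolve only calls its rewriter method):

from utils.phonemize import sentence_to_phones, text_norm

# Illustrative lexicon: word -> {pronunciation: count}; the most frequent
# pronunciation wins.
word2phones = {
    "hello": {"HH AH0 L OW1": 3, "HH EH0 L OW1": 1},
    "world": {"W ER1 L D": 5},
}

class StubG2P:
    """Hypothetical stand-in for the MFA generator; rewriter() should return
    ranked (pronunciation, score) candidates."""
    def rewriter(self, word):
        return [("F UW1", 0.0)] if word == "foo" else []

print(text_norm('Don\'t "stop"!'))  # -> don't stop
phones, oov = sentence_to_phones("Hello, foo world!", word2phones, StubG2P())
print(phones)  # ['sil', 'HH', 'AH0', 'L', 'OW1', 'sil', 'F', 'UW1', 'W', 'ER1', 'L', 'D', 'sil']
print(oov)     # ['foo']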