UniFlow-Audio / utils /phonemize.py
wsntxxn's picture
Change to g2p_en
2d1110f
import re
def g2p_resolve(word, g2p_model):
    """Ask the G2P model for a pronunciation of ``word``.

    This is the fallback used for words that are out of vocabulary (OOV).

    Args:
        word: The word to pronounce.
        g2p_model: Callable invoked as ``g2p_model(word)``.

    Returns:
        Whatever ``g2p_model(word)`` returns, or ``None`` if the call raises
        any ``Exception``.
    """
    try:
        phones = g2p_model(word)
    except Exception:
        # Best-effort lookup: a failing model is reported as "no result".
        phones = None
    return phones
def text_norm(s):
    """Normalize text before lexicon lookup.

    Steps:
    1. Lowercase the text.
    2. Keep apostrophes that sit between two ASCII letters/digits
       (e.g. ``don't``, ``it's``); curly apostrophes (U+2019, U+2018) in that
       position are rewritten as the ASCII ``'``.
    3. Replace every other apostrophe (quote-like or standalone) with a space.
    4. Replace ``, . ! ? ; : ( ) [ ] " -`` and the curly double quotes
       (U+201C, U+201D) with a space.
    5. Collapse runs of whitespace into single spaces and strip both ends.

    Args:
        s: Input string.

    Returns:
        The normalized string.
    """
    s = s.lower()

    # Protect word-internal apostrophes with a placeholder so the next step
    # does not delete them. The placeholder contains uppercase letters, which
    # cannot occur in the already-lowercased text.
    placeholder = "<<<APOST>>>"
    s = re.sub(
        r"(?<=[A-Za-z0-9])['\u2019\u2018](?=[A-Za-z0-9])", placeholder, s
    )

    # Any apostrophe still present is a quote mark or an isolated character.
    s = re.sub(r"['\u2019\u2018]", " ", s)

    # Strip the remaining punctuation. The curly double quotes are written as
    # \u escapes so the pattern does not depend on the file's encoding.
    s = re.sub(r'[,.!?;:()\[\]"\u201c\u201d\-]', " ", s)

    # Restore the protected apostrophes as plain ASCII.
    s = s.replace(placeholder, "'")

    # split() without arguments also drops leading/trailing whitespace.
    return " ".join(s.split())
# ---------------- Core conversion ----------------
def sentence_to_phones(sentence, word2phones, g2p_model):
    """Convert a sentence into a phone sequence with silence markers.

    The sentence is tokenized into words (ASCII letters, optionally with one
    internal ASCII or curly apostrophe) and the punctuation marks
    ``. , ; ! ?``. The output starts and ends with ``"sil"``, and each
    punctuation mark contributes a ``"sil"``; consecutive silences are
    collapsed into one so that ``"..."`` or ``"?!"`` do not produce runs of
    ``"sil"``.

    Args:
        sentence: Raw input text.
        word2phones: Mapping from a normalized word to a mapping of
            space-separated pronunciation strings to a comparable score
            (presumably a count or probability -- confirm against the
            lexicon loader). The highest-scoring pronunciation is used.
        g2p_model: Callable passed to ``g2p_resolve`` for words missing from
            ``word2phones``.

    Returns:
        A tuple ``(phone_sequence, oov_list)``: the list of phones, and the
        list of normalized words that had no usable entry in ``word2phones``.
    """
    silence = "sil"
    punctuation = frozenset(".,;!?")

    phone_sequence = [silence]  # Initial silence
    oov_list = []

    # Tokenize the raw sentence (not the normalized one) so punctuation
    # positions are still available. Curly apostrophes are accepted inside
    # words so that e.g. "don\u2019t" stays one token; text_norm() below
    # rewrites them as the ASCII apostrophe.
    tokens = re.findall(
        r"[A-Za-z]+(?:['\u2019\u2018][A-Za-z]+)?|[.,;!?]", sentence
    )

    for token in tokens:
        if token in punctuation:
            # Avoid stacking silences for leading or repeated punctuation.
            if phone_sequence[-1] != silence:
                phone_sequence.append(silence)
            continue

        word = text_norm(token)
        pronunciations = word2phones[word] if word in word2phones else None

        if pronunciations:
            # Pick the pronunciation with the highest score; on ties max()
            # keeps the first one in iteration order.
            pron, _ = max(pronunciations.items(), key=lambda item: item[1])
            phone_sequence.extend(pron.split())
            continue

        # Missing word (or an entry with no pronunciations): fall back to G2P.
        g2p_ph = g2p_resolve(word, g2p_model)
        if g2p_ph:
            phone_sequence.extend(g2p_ph)
        else:
            # G2P failed or returned nothing: emit the "spn" placeholder.
            phone_sequence.append("spn")
        # NOTE(review): every lexicon miss is recorded, whether or not G2P
        # resolved it -- confirm this matches the intended OOV reporting.
        oov_list.append(word)

    if phone_sequence[-1] != silence:
        phone_sequence.append(silence)  # Ending silence
    return phone_sequence, oov_list