| | |
| | |
| | import re |
| | import unicodedata |
| | import os |
| |
|
| | from transformers import AutoTokenizer |
| |
|
| | from . import punctuation, symbols |
| |
|
| |
|
| | from num2words import num2words |
| | from .ko_dictionary import english_dictionary, etc_dictionary |
| | from anyascii import anyascii |
| | from jamo import hangul_to_jamo |
| |
|
def normalize(text):
    """Normalize raw Korean text for the G2P pipeline.

    Steps: strip surrounding whitespace, delete CJK ideographs (hanja),
    expand entries from ``etc_dictionary``, replace known English words
    via ``normalize_english``, and lowercase the result.
    """
    text = text.strip()
    # Remove CJK ideograph ranges (hanja, radicals, compatibility forms).
    # The original character class was encoding-corrupted (mojibake); it is
    # restored here to the intended Unicode ranges.
    text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    return text.lower()
| |
|
| |
|
def normalize_with_dictionary(text, dic):
    """Replace each dictionary key found in *text* with its mapped value.

    Keys are matched literally (regex-escaped) in dictionary insertion
    order; if no key occurs in *text*, it is returned unchanged.
    """
    if not any(key in text for key in dic):
        return text
    pattern = re.compile("|".join(map(re.escape, dic)))
    return pattern.sub(lambda match: dic[match.group()], text)
| |
|
| |
|
def normalize_english(text):
    """Substitute known English words with their dictionary readings.

    Every run of Latin letters in *text* is looked up in
    ``english_dictionary``; words not present are left as-is.
    """

    def replace(match):
        word = match.group()
        return english_dictionary.get(word, word)

    return re.sub("([A-Za-z]+)", replace, text)
| |
|
| |
|
# Lazily-created G2p instance, shared across calls.
g2p_kr = None


def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
    """Convert Korean text to a phoneme string.

    The pronounced hangeul produced by g2p is decomposed into individual
    jamo code points, so input and output may look identical while
    differing in Unicode (e.g. precomposed syllables vs. U+1100-block
    jamo).  With ``character="english"`` an ASCII transliteration of the
    pronunciation is returned instead of jamo.
    """
    global g2p_kr
    if g2p_kr is None:
        # Deferred import: g2pkk is expensive to load and only needed here.
        from g2pkk import G2p

        g2p_kr = G2p()

    normalized = normalize(text)
    pronounced = g2p_kr(normalized)

    if character == "english":
        return anyascii(pronounced)

    return "".join(hangul_to_jamo(pronounced))
| |
|
def text_normalize(text):
    """Public entry point: apply the module's Korean text normalization."""
    return normalize(text)
| |
|
| |
|
def distribute_phone(n_phone, n_word):
    """Distribute *n_phone* phones over *n_word* words as evenly as possible.

    Returns a list of *n_word* counts summing to *n_phone*, where the
    first ``n_phone % n_word`` words receive one extra phone.

    Closed-form replacement for the original O(n_phone * n_word) greedy
    loop: that loop always incremented the earliest current minimum,
    which produces exactly this front-loaded even split.
    """
    if n_word <= 0:
        # Mirror the original: zero words can absorb zero phones only.
        if n_phone:
            raise ValueError("cannot distribute phones over zero words")
        return []
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
| |
|
| |
|
| |
|
| | |
| |
|
# Hugging Face checkpoint name, reused as the local directory the tokenizer
# is mirrored into on first run (import-time side effect).
model_id = 'kykim/bert-kor-base'
if not os.path.exists(model_id):
    # First run: fetch the tokenizer from the Hub, then save a local copy
    # under ./kykim/bert-kor-base so later runs find the directory.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.save_pretrained(model_id)
else:
    # NOTE(review): this still loads by model id, merely pointing cache_dir
    # at the mirrored directory — presumably intended to reuse the saved
    # copy; verify it does not re-download when offline.
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=f"./{model_id}")
| |
|
def g2p(norm_text):
    """Convert normalized Korean text to ``(phones, tones, word2ph)``.

    The text is split with the BERT tokenizer; wordpiece continuations
    (tokens starting with '#') are merged back onto the preceding token
    so phonemes can be attributed to whole words.  Phones are padded
    with '_' on both ends, tones are all zero, and ``word2ph`` holds the
    phone count per original token, padded with 1 on each side.
    """
    tokenized = tokenizer.tokenize(norm_text)

    # Re-group wordpiece continuations with their leading token.
    ph_groups = []
    for token in tokenized:
        if token.startswith("#"):
            ph_groups[-1].append(token.replace("#", ""))
        else:
            ph_groups.append([token])

    phs = []
    word2ph = []
    for group in ph_groups:
        text = "".join(group)
        if text == '[UNK]':
            phs += ['_']
            word2ph += [1]
            continue
        if text in punctuation:
            phs += [text]
            word2ph += [1]
            continue

        phonemes = korean_text_to_phonemes(text)
        counts = distribute_phone(len(phonemes), len(group))
        assert len(counts) == len(group)
        word2ph += counts
        phs += phonemes

    phones = ["_"] + phs + ["_"]
    tones = [0] * len(phones)
    word2ph = [1] + word2ph + [1]
    assert len(word2ph) == len(tokenized) + 2
    return phones, tones, word2ph
| |
|
def get_bert_feature(text, word2ph, device='cuda'):
    """Extract BERT features for *text* aligned to ``word2ph``.

    Delegates to ``japanese_bert.get_bert_feature`` (the loader is
    model-agnostic), passing this module's Korean ``model_id``.
    """
    from . import japanese_bert

    return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id)
| |
|
| |
|
if __name__ == "__main__":
    # Ad-hoc symbol-coverage scan over a Genshin voice-script dump:
    # collects phones not yet present in text.symbols and checkpoints
    # them to korean_symbol.txt.
    from text.symbols import symbols
    import json

    genshin_data = json.load(
        open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json')
    )
    from tqdm import tqdm

    new_symbols = []
    for key, item in tqdm(genshin_data.items()):
        texts = item.get('voiceContent', '')
        if isinstance(texts, list):
            texts = ','.join(texts)
        if texts is None:
            continue
        if len(texts) == 0:
            continue

        # BUG FIX: the original normalized a loop-invariant sample
        # sentence (`text`) instead of the per-item `texts`, so every
        # iteration processed the same string.  A leftover
        # `pdb.set_trace()` debug breakpoint has also been removed.
        norm = text_normalize(texts)
        phones, tones, word2ph = g2p(norm)
        bert = get_bert_feature(norm, word2ph)

        for ph in phones:
            if ph not in symbols and ph not in new_symbols:
                new_symbols.append(ph)
                print('update!, now symbols:')
                print(new_symbols)
                # Checkpoint after each new symbol so progress survives
                # a crash mid-scan.
                with open('korean_symbol.txt', 'w') as f:
                    f.write(f'{new_symbols}')
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |