import json
import shutil
from collections.abc import Sequence
from pathlib import Path
from typing import Any

import h5py
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from safetensors.torch import load_file
import hydra
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import T5EncoderModel, T5Tokenizer


class UniFlowAudioModel(nn.Module):
    """Inference wrapper around a pretrained UniFlow-Audio checkpoint.

    Downloads the checkpoint and its auxiliary assets (MFA G2P model, phone
    sets, singer ids, pre-computed instruction embeddings) from the Hugging
    Face Hub and exposes a single `sample` entry point.
    """

    def __init__(self, model_name: str = "wsntxxn/UniFlow-Audio-large"):
        assert model_name in (
            "wsntxxn/UniFlow-Audio-large",
            "wsntxxn/UniFlow-Audio-medium",
            "wsntxxn/UniFlow-Audio-small",
        ), f"unsupported model name: {model_name}"
        super().__init__()
        # Fetch the checkpoint, config, and auxiliary assets from the Hub.
        model_dir = Path(snapshot_download(repo_id=model_name))
        self.config = OmegaConf.load(model_dir / "config.yaml")
        # The autoencoder checkpoint path in the config is relative to the
        # snapshot directory; rewrite it as an absolute path.
        self.config["model"]["autoencoder"]["pretrained_ckpt"] = str(
            model_dir / self.config["model"]["autoencoder"]["pretrained_ckpt"]
        )
        self.model = hydra.utils.instantiate(
            self.config["model"], _convert_="all"
        )
        state_dict = load_file(model_dir / "model.safetensors")
        self.model.load_pretrained(state_dict)
        self.model.eval()

        # Montreal Forced Aligner G2P model for English; create an
        # "unhashed" copy of the packaged archive on first use.
        self.g2p_model_path = model_dir / "mfa_g2p" / "english_us_arpa_unhashed.zip"
        if not self.g2p_model_path.exists():
            ori_model_path = (model_dir / "mfa_g2p" /
                              "english_us_arpa.zip").resolve()
            shutil.copy(ori_model_path, self.g2p_model_path)

        # Grapheme-to-phoneme assets for TTS.
        self.tts_phone_set_path = model_dir / "mfa_g2p" / "phone_set.json"
        self.build_tts_phone_mapping()

        # Singing voice synthesis (SVS) assets: phone set, singer ids, and
        # the M4Singer pinyin-to-phoneme table.
        self.svs_phone_set_path = model_dir / "svs" / "phone_set.json"
        with open(model_dir / "svs" / "spk_set.json", "r", encoding="utf-8") as f:
            singers = json.load(f)
        self.svs_singer_mapping = {
            singer: i for i, singer in enumerate(singers)
        }
        self.svs_pinyin2ph = model_dir / "svs" / "m4singer_pinyin2ph.txt"

        # Pre-computed T5 embeddings of the per-task instruction templates,
        # keyed as "{task}_{template_index}".
        self.task_to_instructions = {}
        with h5py.File(model_dir / "instructions" / "t5_embeddings.h5", "r") as hf:
            for key in hf.keys():
                self.task_to_instructions[key] = hf[key][()]

        self.init_instruction_encoder()

    def build_tts_phone_mapping(self):
        """Build the TTS phone-symbol-to-id lookup from the phone set file."""
        with open(self.tts_phone_set_path, "r", encoding="utf-8") as f:
            phone_set = json.load(f)
        self.tts_phone2id = {p: i for i, p in enumerate(phone_set)}

    def init_instruction_encoder(self):
        """Load the Flan-T5 encoder (kept in eval mode) used to embed
        free-form instructions."""
        self.instruction_tokenizer = T5Tokenizer.from_pretrained(
            "google/flan-t5-large"
        )
        self.instruction_encoder = T5EncoderModel.from_pretrained(
            "google/flan-t5-large"
        )
        self.instruction_encoder.eval()

    @torch.inference_mode()
    def encode_instruction(self, instruction: list[str], device: torch.device):
        """Tokenize and encode a batch of instruction strings with Flan-T5.

        Returns the padded last-hidden-state embeddings and the unpadded
        length of each sequence.
        """
        # Run the text encoder in full precision even under a surrounding
        # autocast context. Note: torch.amp.autocast requires device_type;
        # calling it with enabled=False alone raises a TypeError.
        with torch.amp.autocast(device_type=device.type, enabled=False):
            tokens = self.instruction_tokenizer(
                instruction,
                max_length=self.instruction_tokenizer.model_max_length,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = tokens.input_ids.to(device)
            attention_mask = tokens.attention_mask.to(device)
            output = self.instruction_encoder(
                input_ids=input_ids, attention_mask=attention_mask
            )
            embeddings = output.last_hidden_state
            lengths = attention_mask.sum(dim=1)
            return embeddings, lengths

    @torch.inference_mode()
    def sample(
        self,
        content: list[Any],
        task: list[str],
        is_time_aligned: Sequence[bool],
        instruction: list[str] | None = None,
        instruction_idx: list[int] | None = None,
        num_steps: int = 20,
        sway_sampling_coef: float | None = -1.0,
        guidance_scale: float = 3.0,
        disable_progress: bool = True,
    ):
        """Generate audio for a batch of task-conditioned inputs.

        If `instruction` is given, it is encoded on the fly with Flan-T5;
        otherwise the pre-computed embedding for each task is used, with
        `instruction_idx` selecting among that task's instruction templates
        (template 0 by default).
        """
        device = self.model.dummy_param.device

        if instruction is None:
            # Look up pre-computed instruction embeddings, one per sample.
            instructions = []
            instruction_lengths = []
            for sample_idx, task_ in enumerate(task):
                if instruction_idx:
                    instruction_idx_ = instruction_idx[sample_idx]
                else:
                    instruction_idx_ = 0
                instruction_ = self.task_to_instructions[
                    f"{task_}_{instruction_idx_}"
                ]
                instructions.append(torch.as_tensor(instruction_))
                instruction_lengths.append(instruction_.shape[0])
            instructions = pad_sequence(
                instructions, batch_first=True
            ).to(device)
            instruction_lengths = torch.as_tensor(instruction_lengths).to(device)
        else:
            instructions, instruction_lengths = self.encode_instruction(
                instruction, device
            )

        return self.model.inference(
            content, task, is_time_aligned, instructions, instruction_lengths,
            num_steps, sway_sampling_coef, guidance_scale, disable_progress
        )
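

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. The task name
    # "text_to_audio" and the plain-string content payload are illustrative
    # assumptions: valid task identifiers are whatever prefixes appear in
    # self.task_to_instructions, and each task defines its own content format.
    model = UniFlowAudioModel("wsntxxn/UniFlow-Audio-small")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    output = model.sample(
        content=["a dog barking in the distance"],  # hypothetical payload
        task=["text_to_audio"],                     # hypothetical task name
        is_time_aligned=[False],
        num_steps=20,
        guidance_scale=3.0,
    )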