Spaces: Build error
cantabile-kwok committed · Commit 05005db · 1 Parent: 8bd60fe
prepare demo page

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- app.py +51 -0
- pretrained/WavLM-Large.pt +3 -0
- pretrained/config.yml +201 -0
- pretrained/generator.ckpt +3 -0
- pretrained/vq-wav2vec_kmeans.pt +3 -0
- requirements.txt +25 -0
- vec2wav2/__init__.py +3 -0
- vec2wav2/__pycache__/__init__.cpython-310.pyc +0 -0
- vec2wav2/__pycache__/__init__.cpython-311.pyc +0 -0
- vec2wav2/__pycache__/__init__.cpython-39.pyc +0 -0
- vec2wav2/bin/.DS_Store +0 -0
- vec2wav2/bin/__init__.py +0 -0
- vec2wav2/bin/__pycache__/__init__.cpython-310.pyc +0 -0
- vec2wav2/bin/__pycache__/vc.cpython-310.pyc +0 -0
- vec2wav2/bin/decode.py +163 -0
- vec2wav2/bin/gradio_app.py +51 -0
- vec2wav2/bin/train.py +1007 -0
- vec2wav2/bin/vc.py +128 -0
- vec2wav2/datasets/__init__.py +1 -0
- vec2wav2/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- vec2wav2/datasets/__pycache__/__init__.cpython-39.pyc +0 -0
- vec2wav2/datasets/__pycache__/scp_dataset.cpython-310.pyc +0 -0
- vec2wav2/datasets/__pycache__/scp_dataset.cpython-39.pyc +0 -0
- vec2wav2/datasets/scp_dataset.py +300 -0
- vec2wav2/distributed/__init__.py +0 -0
- vec2wav2/distributed/launch.py +163 -0
- vec2wav2/layers/__init__.py +6 -0
- vec2wav2/layers/__pycache__/__init__.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/__init__.cpython-39.pyc +0 -0
- vec2wav2/layers/__pycache__/activations.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/causal_conv.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/causal_conv.cpython-39.pyc +0 -0
- vec2wav2/layers/__pycache__/pqmf.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/pqmf.cpython-39.pyc +0 -0
- vec2wav2/layers/__pycache__/residual_block.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/residual_block.cpython-39.pyc +0 -0
- vec2wav2/layers/__pycache__/residual_stack.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/residual_stack.cpython-39.pyc +0 -0
- vec2wav2/layers/__pycache__/tade_res_block.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/tade_res_block.cpython-39.pyc +0 -0
- vec2wav2/layers/__pycache__/upsample.cpython-310.pyc +0 -0
- vec2wav2/layers/__pycache__/upsample.cpython-39.pyc +0 -0
- vec2wav2/layers/activations.py +197 -0
- vec2wav2/layers/causal_conv.py +66 -0
- vec2wav2/layers/pqmf.py +150 -0
- vec2wav2/layers/residual_block.py +222 -0
- vec2wav2/layers/residual_stack.py +85 -0
- vec2wav2/layers/tade_res_block.py +160 -0
- vec2wav2/layers/upsample.py +194 -0
- vec2wav2/losses/__init__.py +4 -0
app.py
ADDED
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import gradio as gr
+import logging
+import yaml
+import soundfile as sf
+import os
+from pathlib import Path
+from vec2wav2.bin.vc import VoiceConverter, configure_logging, vc_args
+
+# Create Gradio interface
+def create_interface():
+    args = vc_args()
+    logger = configure_logging(args.verbose)
+    voice_converter = VoiceConverter(
+        expdir=args.expdir,
+        token_extractor=args.token_extractor,
+        prompt_extractor=args.prompt_extractor,
+        prompt_output_layer=args.prompt_output_layer,
+        checkpoint=args.checkpoint,
+        script_logger=logger
+    )
+    with gr.Blocks(title="Voice Conversion") as demo:
+        gr.Markdown("# vec2wav 2.0 Voice Conversion Demo")
+        gr.Markdown("Upload source audio and target speaker audio to convert the voice.")
+
+        with gr.Row():
+            source_audio = gr.Audio(label="Source Audio", type="filepath")
+            target_audio = gr.Audio(label="Target Speaker Audio", type="filepath")
+
+        examples = [
+            ["examples/Zuckerberg.wav", "examples/Rachel.wav"],
+            ["examples/TheresaMay.wav", "examples/OptimusPrime.wav"]
+        ]
+        gr.Examples(examples, label="Examples", inputs=[source_audio, target_audio])
+
+        convert_btn = gr.Button("Convert Voice")
+        output_audio = gr.Audio(label="Converted Audio")
+
+        convert_btn.click(
+            fn=voice_converter.voice_conversion,
+            inputs=[source_audio, target_audio],
+            outputs=output_audio
+        )
+
+    return demo
+
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(share=True)
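
The same interface builder is reused by vec2wav2/bin/gradio_app.py below. As a minimal local-launch sketch (assuming vc_args() can resolve its checkpoint arguments from the files shipped under pretrained/, which this commit does not show; the server options are illustrative):

    # Hypothetical local launch of the demo, not part of the commit itself.
    from app import create_interface

    if __name__ == "__main__":
        demo = create_interface()
        demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
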
pretrained/WavLM-Large.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fb4b3c3e6aa567f0a997b30855859cb81528ee8078802af439f7b2da0bf100f
+size 1261965425
pretrained/config.yml
ADDED
@@ -0,0 +1,201 @@
+allow_cache: false
+batch_frames: 3600
+config: conf/ctxv2w.v1.yaml
+crop_max_frames: 100
+discriminator_adv_loss_params:
+  average_by_discriminators: false
+discriminator_grad_norm: -1
+discriminator_optimizer_params:
+  betas:
+  - 0.5
+  - 0.9
+  lr: 0.0002
+  weight_decay: 0.0
+discriminator_optimizer_type: Adam
+discriminator_params:
+  follow_official_norm: true
+  period_discriminator_params:
+    bias: true
+    channels: 32
+    downsample_scales:
+    - 3
+    - 3
+    - 3
+    - 3
+    - 1
+    in_channels: 1
+    kernel_sizes:
+    - 5
+    - 3
+    max_downsample_channels: 1024
+    nonlinear_activation: LeakyReLU
+    nonlinear_activation_params:
+      negative_slope: 0.1
+    out_channels: 1
+    use_spectral_norm: false
+    use_weight_norm: true
+  periods:
+  - 2
+  - 3
+  - 5
+  - 7
+  - 11
+  scale_discriminator_params:
+    bias: true
+    channels: 128
+    downsample_scales:
+    - 4
+    - 4
+    - 4
+    - 4
+    - 1
+    in_channels: 1
+    kernel_sizes:
+    - 15
+    - 41
+    - 5
+    - 3
+    max_downsample_channels: 1024
+    max_groups: 16
+    nonlinear_activation: LeakyReLU
+    nonlinear_activation_params:
+      negative_slope: 0.1
+    out_channels: 1
+  scale_downsample_pooling: AvgPool1d
+  scale_downsample_pooling_params:
+    kernel_size: 4
+    padding: 2
+    stride: 2
+  scales: 3
+discriminator_scheduler_params:
+  gamma: 0.5
+  milestones:
+  - 200000
+  - 400000
+  - 600000
+  - 800000
+discriminator_scheduler_type: MultiStepLR
+discriminator_train_start_steps: 0
+discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
+distributed: true
+dropout_features: 0.0
+eval_interval_steps: 100000
+feat_match_loss_params:
+  average_by_discriminators: false
+  average_by_layers: false
+  include_final_outputs: false
+frontend_mel_prediction_stop_steps: 200000
+frontend_params:
+  conformer_params:
+    activation_type: swish
+    attention_dim: 184
+    attention_dropout_rate: 0.2
+    attention_heads: 2
+    cnn_module_kernel: 31
+    concat_after: false
+    dropout_rate: 0.2
+    linear_units: 1536
+    macaron_style: true
+    normalize_before: true
+    num_blocks: 2
+    pos_enc_layer_type: rel_pos
+    positional_dropout_rate: 0.2
+    positionwise_conv_kernel_size: 3
+    positionwise_layer_type: conv1d
+    selfattention_layer_type: rel_selfattn
+    use_cnn_module: true
+  prompt_channels: 1024
+  vqvec_channels: 512
+generator_adv_loss_params:
+  average_by_discriminators: false
+generator_grad_norm: -1
+generator_optimizer_params:
+  betas:
+  - 0.5
+  - 0.9
+  lr: 0.0002
+  weight_decay: 0.0
+generator_optimizer_type: Adam
+generator_params:
+  bias: true
+  channels: 512
+  condition_dim: 1024
+  in_channels: 184
+  kernel_size: 7
+  nonlinear_activation: snakebeta-condition
+  out_channels: 1
+  resblock: '1'
+  resblock_dilations:
+  - - 1
+    - 3
+    - 5
+  - - 1
+    - 3
+    - 5
+  - - 1
+    - 3
+    - 5
+  resblock_kernel_sizes:
+  - 3
+  - 7
+  - 11
+  snake_logscale: true
+  upsample_kernel_sizes:
+  - 16
+  - 10
+  - 6
+  - 4
+  upsample_scales:
+  - 8
+  - 5
+  - 3
+  - 2
+  use_additional_convs: true
+  use_weight_norm: true
+generator_scheduler_params:
+  gamma: 0.5
+  milestones:
+  - 200000
+  - 400000
+  - 600000
+  - 800000
+generator_scheduler_type: MultiStepLR
+generator_train_start_steps: 1
+generator_type: BigVGAN
+hop_size: 240
+lambda_adv: 1.0
+lambda_aux: 45.0
+lambda_feat_match: 2.0
+lambda_frontend_mel_prediction: 60
+log_interval_steps: 1000
+max_num_frames: 3000
+mel_loss_params:
+  fft_size: 2048
+  fmax: 8000
+  fmin: 40
+  fs: 24000
+  hop_size: 300
+  log_base: null
+  num_mels: 80
+  win_length: 1200
+  window: hann
+min_num_frames: 600
+num_mels: 80
+num_save_intermediate_results: 4
+num_workers: 8
+outdir: exp/train_all_ctxv2w.v1
+pin_memory: true
+pretrain: ''
+prompt_fold_by_2: true
+prompt_net_type: ConvPromptPrenet
+rank: 0
+sampling_rate: 24000
+save_interval_steps: 10000
+use_feat_match_loss: true
+use_mel_loss: true
+use_stft_loss: false
+verbose: 1
+version: 0.5.3
+vq_codebook: feats/vqidx/codebook.npy
+win_length: 697
+world_size: 4
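
The configuration above is consumed as a plain dictionary by the scripts in this commit (decode.py reads it with yaml.load and then indexes keys such as sampling_rate, hop_size, and vq_codebook). A short inspection sketch, assuming the file sits at pretrained/config.yml as added here:

    # Load the shipped config and print a few generation-related keys.
    import yaml

    with open("pretrained/config.yml") as f:
        config = yaml.load(f, Loader=yaml.Loader)

    # sampling_rate=24000 and hop_size=240 imply 100 feature frames per second of audio.
    print(config["sampling_rate"], config["hop_size"], config["generator_type"])
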
pretrained/generator.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a10b9df62462bbf48382970ffba267b458b00b361bcb245701e3d3c0b6bd19f
+size 161604549
pretrained/vq-wav2vec_kmeans.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c975a93479dc5f3cfc4339032e1547c6034eddd15eb1cba73364c20786b42a5a
+size 336509919
requirements.txt
ADDED
@@ -0,0 +1,25 @@
+torchaudio==0.13.1
+auraloss==0.4.0
+cython==3.0.10
+einops
+debugpy==1.8.0
+fairseq==0.12.2
+filelock~=3.12.2
+h5py
+kaldiio~=2.18.0
+librosa==0.8.1
+matplotlib~=3.4.3
+nltk==3.8.1
+numpy
+pathlib~=1.0.1
+pyyaml~=6.0
+scikit-learn
+scipy~=1.7.1
+setuptools==65.6.3
+six==1.16.0
+soundfile~=0.10.3.post1
+sox
+tensorboard
+tensorboardx~=2.5.1
+tqdm~=4.62.3
+transformers==4.42.3
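
A quick environment check against a few of the pins above can be done with the standard library; this is an illustrative helper, not part of the commit:

    # Report installed versions of selected pinned requirements.
    import importlib.metadata as md

    for pkg in ("torchaudio", "fairseq", "librosa", "transformers", "kaldiio"):
        try:
            print(f"{pkg}=={md.version(pkg)}")
        except md.PackageNotFoundError:
            print(f"{pkg} is not installed")
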
vec2wav2/__init__.py
ADDED
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+__version__ = ""
vec2wav2/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (211 Bytes).

vec2wav2/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (289 Bytes).

vec2wav2/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (225 Bytes).

vec2wav2/bin/.DS_Store
ADDED
Binary file (6.15 kB).

vec2wav2/bin/__init__.py
ADDED
File without changes.

vec2wav2/bin/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (199 Bytes).

vec2wav2/bin/__pycache__/vc.cpython-310.pyc
ADDED
Binary file (4.76 kB).
vec2wav2/bin/decode.py
ADDED
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+
+# Modified by Yiwei Guo, 2024
+
+"""Decode with trained vec2wav Generator."""
+
+import argparse
+import logging
+import os
+import time
+
+import numpy as np
+import soundfile as sf
+import torch
+import yaml
+
+from tqdm import tqdm
+
+from vec2wav2.datasets import MelSCPDataset
+from vec2wav2.utils import load_model, load_feat_codebook, idx2vec
+
+
+def set_loglevel(verbose):
+    # set logger
+    if verbose > 1:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+
+def main():
+    """Run decoding process."""
+    parser = argparse.ArgumentParser(
+        description="Decode from audio tokens and acoustic prompts with trained vec2wav model"
+        "(See detail in vec2wav2/bin/decode.py)."
+    )
+    parser.add_argument(
+        "--feats-scp",
+        "--scp",
+        default=None,
+        type=str,
+        required=True,
+        help="kaldi-style feats.scp file. "
+    )
+    parser.add_argument(
+        "--prompt-scp",
+        default=None,
+        type=str,
+        help="kaldi-style prompt.scp file. Similar to feats.scp."
+    )
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        required=True,
+        help="directory to save generated speech.",
+    )
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        required=True,
+        help="checkpoint file to be loaded.",
+    )
+    parser.add_argument(
+        "--config",
+        default=None,
+        type=str,
+        help="yaml format configuration file. if not explicitly provided, "
+        "it will be searched in the checkpoint directory. (default=None)",
+    )
+    parser.add_argument(
+        "--verbose",
+        type=int,
+        default=1,
+        help="logging level. higher is more logging. (default=1)",
+    )
+    args = parser.parse_args()
+    set_loglevel(args.verbose)
+
+    # check directory existence
+    if not os.path.exists(args.outdir):
+        os.makedirs(args.outdir)
+
+    # load config
+    if args.config is None:
+        dirname = os.path.dirname(args.checkpoint)
+        args.config = os.path.join(dirname, "config.yml")
+    with open(args.config) as f:
+        config = yaml.load(f, Loader=yaml.Loader)
+    config.update(vars(args))
+
+    # get dataset
+    dataset = MelSCPDataset(
+        vqidx_scp=args.feats_scp,
+        prompt_scp=args.prompt_scp,
+        return_utt_id=True,
+    )
+    logging.info(f"The number of features to be decoded = {len(dataset)}.")
+
+    # setup model
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    logging.info(f"Using {'GPU' if torch.cuda.is_available() else 'CPU'}.")
+
+    model = load_model(args.checkpoint, config)
+    logging.info(f"Loaded model parameters from {args.checkpoint}.")
+
+    model.backend.remove_weight_norm()
+    model = model.eval().to(device)
+
+    # load vq codebook
+    feat_codebook, feat_codebook_numgroups = load_feat_codebook(np.load(config["vq_codebook"], allow_pickle=True), device)
+
+    # start generation
+    total_rtf = 0.0
+    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
+        for idx, batch in enumerate(pbar, 1):
+            utt_id, vqidx, prompt = batch[0], batch[1], batch[2]
+
+            vqidx = torch.tensor(vqidx).to(device)  # (L, G)
+            prompt = torch.tensor(prompt).unsqueeze(0).to(device)  # (1, L', D')
+
+            vqidx = vqidx.long()
+            vqvec = idx2vec(feat_codebook, vqidx, feat_codebook_numgroups).unsqueeze(0)  # (1, L, D)
+
+            # generate
+            start = time.time()
+            y = model.inference(vqvec, prompt)[-1].view(-1)
+            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
+            pbar.set_postfix({"RTF": rtf})
+            total_rtf += rtf
+
+            tgt_dir = os.path.dirname(os.path.join(config["outdir"], f"{utt_id}.wav"))
+            os.makedirs(tgt_dir, exist_ok=True)
+            basename = os.path.basename(f"{utt_id}.wav")
+            # save as PCM 16 bit wav file
+            sf.write(
+                os.path.join(tgt_dir, basename),
+                y.cpu().numpy(),
+                config["sampling_rate"],
+                "PCM_16",
+            )
+
+    # report average RTF
+    logging.info(f"Finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f}).")
+
+
+if __name__ == "__main__":
+    main()
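
decode.py consumes kaldi-style .scp files that map utterance IDs to stored feature matrices. As a rough sketch of that input format, assuming the dataset reads archives with kaldiio (which is pinned in requirements.txt; the actual MelSCPDataset implementation is not shown in this commit):

    # A feats.scp line has the form "<utt-id> <ark-path>:<byte-offset>", e.g.
    #   utt001 feats/vqidx/feats.1.ark:42
    import kaldiio

    feats = kaldiio.load_scp("feats.scp")  # lazy mapping: utt-id -> numpy array
    vqidx = feats["utt001"]                # (L, G) token indices, as in the decode loop above
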
vec2wav2/bin/gradio_app.py
ADDED
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import gradio as gr
+import logging
+import yaml
+import soundfile as sf
+import os
+from pathlib import Path
+from vec2wav2.bin.vc import VoiceConverter, configure_logging, vc_args
+
+# Create Gradio interface
+def create_interface():
+    args = vc_args()
+    logger = configure_logging(args.verbose)
+    voice_converter = VoiceConverter(
+        expdir=args.expdir,
+        token_extractor=args.token_extractor,
+        prompt_extractor=args.prompt_extractor,
+        prompt_output_layer=args.prompt_output_layer,
+        checkpoint=args.checkpoint,
+        script_logger=logger
+    )
+    with gr.Blocks(title="Voice Conversion") as demo:
+        gr.Markdown("# vec2wav 2.0 Voice Conversion Demo")
+        gr.Markdown("Upload source audio and target speaker audio to convert the voice.")
+
+        with gr.Row():
+            source_audio = gr.Audio(label="Source Audio", type="filepath")
+            target_audio = gr.Audio(label="Target Speaker Audio", type="filepath")
+
+        examples = [
+            ["examples/Zuckerberg.wav", "examples/Rachel.wav"],
+            ["examples/TheresaMay.wav", "examples/OptimusPrime.wav"]
+        ]
+        gr.Examples(examples, label="Examples", inputs=[source_audio, target_audio])
+
+        convert_btn = gr.Button("Convert Voice")
+        output_audio = gr.Audio(label="Converted Audio")
+
+        convert_btn.click(
+            fn=voice_converter.voice_conversion,
+            inputs=[source_audio, target_audio],
+            outputs=output_audio
+        )
+
+    return demo
+
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(share=True)
vec2wav2/bin/train.py
ADDED
|
@@ -0,0 +1,1007 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
# Copyright 2019 Tomoki Hayashi
|
| 5 |
+
# MIT License (https://opensource.org/licenses/MIT)
|
| 6 |
+
|
| 7 |
+
# Modified by Yiwei Guo, 2024
|
| 8 |
+
|
| 9 |
+
"""Train vec2wav."""
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import logging
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import random
|
| 16 |
+
|
| 17 |
+
from collections import defaultdict
|
| 18 |
+
|
| 19 |
+
import matplotlib
|
| 20 |
+
import numpy as np
|
| 21 |
+
import soundfile as sf
|
| 22 |
+
import torch
|
| 23 |
+
import torch.nn.functional as F
|
| 24 |
+
import yaml
|
| 25 |
+
import torch.multiprocessing as mp
|
| 26 |
+
from tensorboardX import SummaryWriter
|
| 27 |
+
from torch.utils.data import DataLoader
|
| 28 |
+
from tqdm import tqdm
|
| 29 |
+
|
| 30 |
+
import vec2wav2
|
| 31 |
+
import vec2wav2.models
|
| 32 |
+
import vec2wav2.optimizers
|
| 33 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 34 |
+
|
| 35 |
+
from vec2wav2.datasets import AudioMelSCPDataset
|
| 36 |
+
from vec2wav2.layers import PQMF
|
| 37 |
+
from vec2wav2.losses import DiscriminatorAdversarialLoss
|
| 38 |
+
from vec2wav2.losses import FeatureMatchLoss
|
| 39 |
+
from vec2wav2.losses import GeneratorAdversarialLoss
|
| 40 |
+
from vec2wav2.losses import MelSpectrogramLoss
|
| 41 |
+
from vec2wav2.losses import MultiResolutionSTFTLoss
|
| 42 |
+
from vec2wav2.utils import crop_seq, load_feat_codebook, idx2vec
|
| 43 |
+
|
| 44 |
+
from vec2wav2.utils.espnet_utils import pad_list, make_non_pad_mask
|
| 45 |
+
|
| 46 |
+
# set to avoid matplotlib error in CLI environment
|
| 47 |
+
matplotlib.use("Agg")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def set_loglevel(verbose):
|
| 51 |
+
# set logger
|
| 52 |
+
if verbose > 1:
|
| 53 |
+
logging.basicConfig(
|
| 54 |
+
level=logging.DEBUG,
|
| 55 |
+
stream=sys.stdout,
|
| 56 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
| 57 |
+
)
|
| 58 |
+
elif verbose > 0:
|
| 59 |
+
logging.basicConfig(
|
| 60 |
+
level=logging.INFO,
|
| 61 |
+
stream=sys.stdout,
|
| 62 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
| 63 |
+
)
|
| 64 |
+
else:
|
| 65 |
+
logging.basicConfig(
|
| 66 |
+
level=logging.WARN,
|
| 67 |
+
stream=sys.stdout,
|
| 68 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
| 69 |
+
)
|
| 70 |
+
logging.warning("Skip DEBUG/INFO messages")
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class Trainer(object):
|
| 74 |
+
"""Customized trainer module for Parallel WaveGAN training."""
|
| 75 |
+
|
| 76 |
+
def __init__(
|
| 77 |
+
self,
|
| 78 |
+
steps,
|
| 79 |
+
epochs,
|
| 80 |
+
data_loader,
|
| 81 |
+
sampler,
|
| 82 |
+
model,
|
| 83 |
+
criterion,
|
| 84 |
+
optimizer,
|
| 85 |
+
scheduler,
|
| 86 |
+
config,
|
| 87 |
+
device=torch.device("cpu"),
|
| 88 |
+
):
|
| 89 |
+
"""Initialize trainer.
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
steps (int): Initial global steps.
|
| 93 |
+
epochs (int): Initial global epochs.
|
| 94 |
+
data_loader (dict): Dict of data loaders. It must contain "train" and "dev" loaders.
|
| 95 |
+
model (dict): Dict of models. It must contain "generator" and "discriminator" models.
|
| 96 |
+
criterion (dict): Dict of criteria. It must contain "stft" and "mse" criteria.
|
| 97 |
+
optimizer (dict): Dict of optimizers. It must contain "generator" and "discriminator" optimizers.
|
| 98 |
+
scheduler (dict): Dict of schedulers. It must contain "generator" and "discriminator" schedulers.
|
| 99 |
+
config (dict): Config dict loaded from yaml format configuration file.
|
| 100 |
+
device (torch.deive): Pytorch device instance.
|
| 101 |
+
|
| 102 |
+
"""
|
| 103 |
+
self.steps = steps
|
| 104 |
+
self.epochs = epochs
|
| 105 |
+
self.data_loader = data_loader
|
| 106 |
+
self.sampler = sampler
|
| 107 |
+
self.model = model
|
| 108 |
+
self.criterion = criterion
|
| 109 |
+
self.optimizer = optimizer
|
| 110 |
+
self.scheduler = scheduler
|
| 111 |
+
self.config = config
|
| 112 |
+
self.device = device
|
| 113 |
+
self.writer = SummaryWriter(config["outdir"])
|
| 114 |
+
self.finish_train = False
|
| 115 |
+
self.total_train_loss = defaultdict(float)
|
| 116 |
+
self.total_eval_loss = defaultdict(float)
|
| 117 |
+
|
| 118 |
+
# load vq codebook
|
| 119 |
+
feat_codebook_path = self.config["vq_codebook"]
|
| 120 |
+
|
| 121 |
+
self.feat_codebook, self.feat_codebook_numgroups = load_feat_codebook(np.load(feat_codebook_path, allow_pickle=True), device)
|
| 122 |
+
|
| 123 |
+
def run(self):
|
| 124 |
+
"""Run training."""
|
| 125 |
+
self.tqdm = tqdm(initial=self.steps, total=self.config["train_max_steps"], desc="[train]")
|
| 126 |
+
while True:
|
| 127 |
+
# train one epoch
|
| 128 |
+
self._train_epoch()
|
| 129 |
+
|
| 130 |
+
# check whether training is finished
|
| 131 |
+
if self.finish_train:
|
| 132 |
+
break
|
| 133 |
+
|
| 134 |
+
self.tqdm.close()
|
| 135 |
+
logging.info("Finished training.")
|
| 136 |
+
|
| 137 |
+
def save_checkpoint(self, checkpoint_path):
|
| 138 |
+
"""Save checkpoint.
|
| 139 |
+
Args:
|
| 140 |
+
checkpoint_path (str): Checkpoint path to be saved.
|
| 141 |
+
"""
|
| 142 |
+
state_dict = {
|
| 143 |
+
"optimizer": {
|
| 144 |
+
"generator": self.optimizer["generator"].state_dict(),
|
| 145 |
+
"discriminator": self.optimizer["discriminator"].state_dict(),
|
| 146 |
+
},
|
| 147 |
+
"scheduler": {
|
| 148 |
+
"generator": self.scheduler["generator"].state_dict(),
|
| 149 |
+
"discriminator": self.scheduler["discriminator"].state_dict(),
|
| 150 |
+
},
|
| 151 |
+
"steps": self.steps,
|
| 152 |
+
"epochs": self.epochs,
|
| 153 |
+
}
|
| 154 |
+
if self.config["distributed"]:
|
| 155 |
+
state_dict["model"] = {
|
| 156 |
+
"generator": self.model["generator"].module.state_dict(),
|
| 157 |
+
"discriminator": self.model["discriminator"].module.state_dict(),
|
| 158 |
+
}
|
| 159 |
+
else:
|
| 160 |
+
state_dict["model"] = {
|
| 161 |
+
"generator": self.model["generator"].state_dict(),
|
| 162 |
+
"discriminator": self.model["discriminator"].state_dict(),
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
if not os.path.exists(os.path.dirname(checkpoint_path)):
|
| 166 |
+
os.makedirs(os.path.dirname(checkpoint_path))
|
| 167 |
+
torch.save(state_dict, checkpoint_path)
|
| 168 |
+
|
| 169 |
+
def load_checkpoint(self, checkpoint_path, load_only_params=False):
|
| 170 |
+
"""Load checkpoint.
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
checkpoint_path (str): Checkpoint path to be loaded.
|
| 174 |
+
load_only_params (bool): Whether to load only model parameters.
|
| 175 |
+
|
| 176 |
+
"""
|
| 177 |
+
state_dict = torch.load(checkpoint_path, map_location="cpu")
|
| 178 |
+
if self.config["distributed"]:
|
| 179 |
+
self.model["generator"].module.load_state_dict(
|
| 180 |
+
state_dict["model"]["generator"]
|
| 181 |
+
)
|
| 182 |
+
self.model["discriminator"].module.load_state_dict(
|
| 183 |
+
state_dict["model"]["discriminator"]
|
| 184 |
+
)
|
| 185 |
+
else:
|
| 186 |
+
self.model["generator"].load_state_dict(state_dict["model"]["generator"])
|
| 187 |
+
self.model["discriminator"].load_state_dict(
|
| 188 |
+
state_dict["model"]["discriminator"]
|
| 189 |
+
)
|
| 190 |
+
if not load_only_params:
|
| 191 |
+
self.steps = state_dict["steps"]
|
| 192 |
+
self.epochs = state_dict["epochs"]
|
| 193 |
+
self.optimizer["generator"].load_state_dict(state_dict["optimizer"]["generator"])
|
| 194 |
+
self.optimizer["discriminator"].load_state_dict(state_dict["optimizer"]["discriminator"])
|
| 195 |
+
self.scheduler["generator"].load_state_dict(state_dict["scheduler"]["generator"])
|
| 196 |
+
self.scheduler["discriminator"].load_state_dict(state_dict["scheduler"]["discriminator"])
|
| 197 |
+
|
| 198 |
+
def _train_step(self, batch):
|
| 199 |
+
"""Train model one step."""
|
| 200 |
+
# parse batch
|
| 201 |
+
vqidx, mel, prompt, y, xlens, prompt_lens = batch
|
| 202 |
+
vqidx = vqidx.to(self.device)
|
| 203 |
+
mel = mel.to(self.device)
|
| 204 |
+
prompt = prompt.to(self.device)
|
| 205 |
+
vqvec = idx2vec(self.feat_codebook, vqidx, self.feat_codebook_numgroups) # (B, L, D)
|
| 206 |
+
y = y.unsqueeze(-2).to(self.device) # (B, 1, T)
|
| 207 |
+
|
| 208 |
+
# build mask
|
| 209 |
+
mask = make_non_pad_mask(xlens).to(self.device) # (B, L)
|
| 210 |
+
prompt_mask = make_non_pad_mask(prompt_lens).to(self.device) # (B, L_prompt)
|
| 211 |
+
|
| 212 |
+
# crop wav sequence
|
| 213 |
+
crop_xlen = min(self.config["crop_max_frames"], min(xlens))
|
| 214 |
+
x_offsets = [np.random.randint(0, l - crop_xlen + 1) for l in xlens]
|
| 215 |
+
crop_ylen = crop_xlen * self.config["hop_size"]
|
| 216 |
+
y_offsets = [o * self.config["hop_size"] for o in x_offsets]
|
| 217 |
+
y = crop_seq(y, y_offsets, crop_ylen)
|
| 218 |
+
|
| 219 |
+
#######################
|
| 220 |
+
# Generator #
|
| 221 |
+
#######################
|
| 222 |
+
if self.steps > self.config.get("generator_train_start_steps", 0):
|
| 223 |
+
mel_, _, y_ = self.model["generator"](vqvec, prompt, mask, prompt_mask, crop_xlen, x_offsets) # (B, L, 80), (B, C, T)
|
| 224 |
+
|
| 225 |
+
# initialize
|
| 226 |
+
gen_loss, aux_loss = 0.0, 0.0
|
| 227 |
+
|
| 228 |
+
# frontend mel prediction loss
|
| 229 |
+
if self.steps <= self.config.get("frontend_mel_prediction_stop_steps", 0):
|
| 230 |
+
frontend_mel_pred_loss = F.l1_loss(torch.masked_select(mel, mask.unsqueeze(-1)),
|
| 231 |
+
torch.masked_select(mel_, mask.unsqueeze(-1)))
|
| 232 |
+
self.total_train_loss["train/frontend_mel_pred_loss"] += frontend_mel_pred_loss.item()
|
| 233 |
+
gen_loss += self.config["lambda_frontend_mel_prediction"] * frontend_mel_pred_loss
|
| 234 |
+
|
| 235 |
+
# multi-resolution sfft loss
|
| 236 |
+
if self.config["use_stft_loss"]:
|
| 237 |
+
sc_loss, mag_loss = self.criterion["stft"](y_, y)
|
| 238 |
+
aux_loss += sc_loss + mag_loss
|
| 239 |
+
self.total_train_loss["train/spectral_convergence_loss"] += sc_loss.item()
|
| 240 |
+
self.total_train_loss["train/log_stft_magnitude_loss"] += mag_loss.item()
|
| 241 |
+
|
| 242 |
+
# subband multi-resolution stft loss
|
| 243 |
+
if self.config["use_subband_stft_loss"]:
|
| 244 |
+
aux_loss *= 0.5 # for balancing with subband stft loss
|
| 245 |
+
y_mb = self.criterion["pqmf"].analysis(y)
|
| 246 |
+
y_mb_ = self.criterion["pqmf"].analysis(y_)
|
| 247 |
+
sub_sc_loss, sub_mag_loss = self.criterion["sub_stft"](y_mb_, y_mb)
|
| 248 |
+
aux_loss += 0.5 * (sub_sc_loss + sub_mag_loss)
|
| 249 |
+
self.total_train_loss["train/sub_spectral_convergence_loss"] += sub_sc_loss.item()
|
| 250 |
+
self.total_train_loss["train/sub_log_stft_magnitude_loss"] += sub_mag_loss.item()
|
| 251 |
+
|
| 252 |
+
# mel spectrogram loss
|
| 253 |
+
if self.config["use_mel_loss"]:
|
| 254 |
+
mel_loss = self.criterion["mel"](y_, y)
|
| 255 |
+
aux_loss += mel_loss
|
| 256 |
+
self.total_train_loss["train/mel_loss"] += mel_loss.item()
|
| 257 |
+
|
| 258 |
+
# weighting aux loss
|
| 259 |
+
gen_loss += self.config.get("lambda_aux", 1.0) * aux_loss
|
| 260 |
+
|
| 261 |
+
# adversarial loss
|
| 262 |
+
if self.steps > self.config["discriminator_train_start_steps"]:
|
| 263 |
+
p_ = self.model["discriminator"](y_)
|
| 264 |
+
adv_loss = self.criterion["gen_adv"](p_)
|
| 265 |
+
self.total_train_loss["train/adversarial_loss"] += adv_loss.item()
|
| 266 |
+
|
| 267 |
+
# feature matching loss
|
| 268 |
+
if self.config["use_feat_match_loss"]:
|
| 269 |
+
# no need to track gradients
|
| 270 |
+
with torch.no_grad():
|
| 271 |
+
p = self.model["discriminator"](y)
|
| 272 |
+
fm_loss = self.criterion["feat_match"](p_, p)
|
| 273 |
+
self.total_train_loss["train/feature_matching_loss"] += fm_loss.item()
|
| 274 |
+
adv_loss += self.config["lambda_feat_match"] * fm_loss
|
| 275 |
+
|
| 276 |
+
# add adversarial loss to generator loss
|
| 277 |
+
gen_loss += self.config["lambda_adv"] * adv_loss
|
| 278 |
+
|
| 279 |
+
self.total_train_loss["train/generator_loss"] += gen_loss.item()
|
| 280 |
+
|
| 281 |
+
# update generator
|
| 282 |
+
self.optimizer["generator"].zero_grad()
|
| 283 |
+
gen_loss.backward()
|
| 284 |
+
if self.config["generator_grad_norm"] > 0:
|
| 285 |
+
torch.nn.utils.clip_grad_norm_(
|
| 286 |
+
self.model["generator"].parameters(),
|
| 287 |
+
self.config["generator_grad_norm"],
|
| 288 |
+
)
|
| 289 |
+
self.optimizer["generator"].step()
|
| 290 |
+
self.scheduler["generator"].step()
|
| 291 |
+
|
| 292 |
+
#######################
|
| 293 |
+
# Discriminator #
|
| 294 |
+
#######################
|
| 295 |
+
if self.steps > self.config["discriminator_train_start_steps"]:
|
| 296 |
+
# re-compute y_ which leads better quality
|
| 297 |
+
with torch.no_grad():
|
| 298 |
+
# logging.info(f"{vqvec.shape, prompt.shape, mask.shape, prompt_mask.shape}")
|
| 299 |
+
_, _, y_ = self.model["generator"](vqvec, prompt, mask, prompt_mask, crop_xlen, x_offsets) # (B, L, 80), (B, C, T)
|
| 300 |
+
|
| 301 |
+
if self.config["generator_params"]["out_channels"] > 1:
|
| 302 |
+
y_ = self.criterion["pqmf"].synthesis(y_)
|
| 303 |
+
|
| 304 |
+
# discriminator loss
|
| 305 |
+
p = self.model["discriminator"](y)
|
| 306 |
+
p_ = self.model["discriminator"](y_.detach())
|
| 307 |
+
real_loss, fake_loss = self.criterion["dis_adv"](p_, p)
|
| 308 |
+
dis_loss = real_loss + fake_loss
|
| 309 |
+
self.total_train_loss["train/real_loss"] += real_loss.item()
|
| 310 |
+
self.total_train_loss["train/fake_loss"] += fake_loss.item()
|
| 311 |
+
self.total_train_loss["train/discriminator_loss"] += dis_loss.item()
|
| 312 |
+
|
| 313 |
+
# update discriminator
|
| 314 |
+
self.optimizer["discriminator"].zero_grad()
|
| 315 |
+
dis_loss.backward()
|
| 316 |
+
if self.config["discriminator_grad_norm"] > 0:
|
| 317 |
+
torch.nn.utils.clip_grad_norm_(
|
| 318 |
+
self.model["discriminator"].parameters(),
|
| 319 |
+
self.config["discriminator_grad_norm"],
|
| 320 |
+
)
|
| 321 |
+
self.optimizer["discriminator"].step()
|
| 322 |
+
self.scheduler["discriminator"].step()
|
| 323 |
+
|
| 324 |
+
# update counts
|
| 325 |
+
self.steps += 1
|
| 326 |
+
self.tqdm.update(1)
|
| 327 |
+
self._check_train_finish()
|
| 328 |
+
|
| 329 |
+
def _train_epoch(self):
|
| 330 |
+
"""Train model one epoch."""
|
| 331 |
+
for train_steps_per_epoch, batch in enumerate(self.data_loader["train"], 1):
|
| 332 |
+
# train one step
|
| 333 |
+
self._train_step(batch)
|
| 334 |
+
|
| 335 |
+
# check interval
|
| 336 |
+
if self.config["rank"] == 0:
|
| 337 |
+
self._check_log_interval()
|
| 338 |
+
self._check_eval_interval()
|
| 339 |
+
self._check_save_interval()
|
| 340 |
+
|
| 341 |
+
# check whether training is finished
|
| 342 |
+
if self.finish_train:
|
| 343 |
+
return
|
| 344 |
+
|
| 345 |
+
# update
|
| 346 |
+
self.epochs += 1
|
| 347 |
+
self.train_steps_per_epoch = train_steps_per_epoch
|
| 348 |
+
logging.info(
|
| 349 |
+
f"(Steps: {self.steps}) Finished {self.epochs} epoch training "
|
| 350 |
+
f"({self.train_steps_per_epoch} steps per epoch)."
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
# needed for shuffle in distributed training
|
| 354 |
+
if self.config["distributed"]:
|
| 355 |
+
self.sampler["train"].set_epoch(self.epochs)
|
| 356 |
+
|
| 357 |
+
@torch.no_grad()
|
| 358 |
+
def _eval_step(self, batch):
|
| 359 |
+
"""Evaluate model one step."""
|
| 360 |
+
# parse batch
|
| 361 |
+
vqidx, mel, prompt, y, xlens, prompt_lens = batch
|
| 362 |
+
vqidx = vqidx.to(self.device).long()
|
| 363 |
+
mel = mel.to(self.device)
|
| 364 |
+
prompt = prompt.to(self.device)
|
| 365 |
+
vqvec = idx2vec(self.feat_codebook, vqidx, self.feat_codebook_numgroups)
|
| 366 |
+
y = y.unsqueeze(-2).to(self.device) # (B, 1, T)
|
| 367 |
+
|
| 368 |
+
# build mask
|
| 369 |
+
mask = make_non_pad_mask(xlens).to(self.device) # (B, L)
|
| 370 |
+
prompt_mask = make_non_pad_mask(prompt_lens).to(self.device) # (B, L_prompt)
|
| 371 |
+
|
| 372 |
+
#######################
|
| 373 |
+
# Generator #
|
| 374 |
+
#######################
|
| 375 |
+
mel_, _, y_ = self.model["generator"](vqvec, prompt, mask, prompt_mask) # (B, L, 80), (B, C, T)
|
| 376 |
+
|
| 377 |
+
# reconstruct the signal from multi-band signal
|
| 378 |
+
if self.config["generator_params"]["out_channels"] > 1:
|
| 379 |
+
y_mb_ = y_
|
| 380 |
+
y_ = self.criterion["pqmf"].synthesis(y_mb_)
|
| 381 |
+
|
| 382 |
+
# initialize
|
| 383 |
+
gen_loss = 0.0
|
| 384 |
+
aux_loss = 0.0
|
| 385 |
+
|
| 386 |
+
# frontend mel prediction loss
|
| 387 |
+
frontend_mel_pred_loss = F.l1_loss(torch.masked_select(mel, mask.unsqueeze(-1)),
|
| 388 |
+
torch.masked_select(mel_, mask.unsqueeze(-1)))
|
| 389 |
+
self.total_eval_loss["eval/frontend_mel_pred_loss"] += frontend_mel_pred_loss.item()
|
| 390 |
+
gen_loss += self.config["lambda_frontend_mel_prediction"] * frontend_mel_pred_loss
|
| 391 |
+
|
| 392 |
+
# multi-resolution stft loss
|
| 393 |
+
if self.config["use_stft_loss"]:
|
| 394 |
+
sc_loss, mag_loss = self.criterion["stft"](y_, y)
|
| 395 |
+
aux_loss += sc_loss + mag_loss
|
| 396 |
+
self.total_eval_loss["eval/spectral_convergence_loss"] += sc_loss.item()
|
| 397 |
+
self.total_eval_loss["eval/log_stft_magnitude_loss"] += mag_loss.item()
|
| 398 |
+
|
| 399 |
+
# subband multi-resolution stft loss
|
| 400 |
+
if self.config.get("use_subband_stft_loss", False):
|
| 401 |
+
aux_loss *= 0.5 # for balancing with subband stft loss
|
| 402 |
+
y_mb = self.criterion["pqmf"].analysis(y)
|
| 403 |
+
sub_sc_loss, sub_mag_loss = self.criterion["sub_stft"](y_mb_, y_mb)
|
| 404 |
+
self.total_eval_loss["eval/sub_spectral_convergence_loss"] += sub_sc_loss.item()
|
| 405 |
+
self.total_eval_loss["eval/sub_log_stft_magnitude_loss"] += sub_mag_loss.item()
|
| 406 |
+
aux_loss += 0.5 * (sub_sc_loss + sub_mag_loss)
|
| 407 |
+
|
| 408 |
+
# mel spectrogram loss
|
| 409 |
+
if self.config["use_mel_loss"]:
|
| 410 |
+
mel_loss = self.criterion["mel"](y_, y)
|
| 411 |
+
aux_loss += mel_loss
|
| 412 |
+
self.total_eval_loss["eval/mel_loss"] += mel_loss.item()
|
| 413 |
+
|
| 414 |
+
# weighting stft loss
|
| 415 |
+
gen_loss += aux_loss * self.config.get("lambda_aux", 1.0)
|
| 416 |
+
|
| 417 |
+
# adversarial loss
|
| 418 |
+
p_ = self.model["discriminator"](y_)
|
| 419 |
+
adv_loss = self.criterion["gen_adv"](p_)
|
| 420 |
+
gen_loss += self.config["lambda_adv"] * adv_loss
|
| 421 |
+
|
| 422 |
+
# feature matching loss
|
| 423 |
+
if self.config["use_feat_match_loss"]:
|
| 424 |
+
p = self.model["discriminator"](y)
|
| 425 |
+
fm_loss = self.criterion["feat_match"](p_, p)
|
| 426 |
+
self.total_eval_loss["eval/feature_matching_loss"] += fm_loss.item()
|
| 427 |
+
gen_loss += (
|
| 428 |
+
self.config["lambda_adv"] * self.config["lambda_feat_match"] * fm_loss
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
#######################
|
| 432 |
+
# Discriminator #
|
| 433 |
+
#######################
|
| 434 |
+
p = self.model["discriminator"](y)
|
| 435 |
+
p_ = self.model["discriminator"](y_)
|
| 436 |
+
|
| 437 |
+
# discriminator loss
|
| 438 |
+
real_loss, fake_loss = self.criterion["dis_adv"](p_, p)
|
| 439 |
+
dis_loss = real_loss + fake_loss
|
| 440 |
+
|
| 441 |
+
# add to total eval loss
|
| 442 |
+
self.total_eval_loss["eval/adversarial_loss"] += adv_loss.item()
|
| 443 |
+
self.total_eval_loss["eval/generator_loss"] += gen_loss.item()
|
| 444 |
+
self.total_eval_loss["eval/real_loss"] += real_loss.item()
|
| 445 |
+
self.total_eval_loss["eval/fake_loss"] += fake_loss.item()
|
| 446 |
+
self.total_eval_loss["eval/discriminator_loss"] += dis_loss.item()
|
| 447 |
+
|
| 448 |
+
def _eval_epoch(self):
|
| 449 |
+
"""Evaluate model one epoch."""
|
| 450 |
+
logging.info(f"(Steps: {self.steps}) Start evaluation.")
|
| 451 |
+
# change mode
|
| 452 |
+
for key in self.model.keys():
|
| 453 |
+
self.model[key].eval()
|
| 454 |
+
|
| 455 |
+
# calculate loss for each batch
|
| 456 |
+
for eval_steps_per_epoch, batch in enumerate(tqdm(self.data_loader["dev"], desc="[eval]"), 1):
|
| 457 |
+
# eval one step
|
| 458 |
+
self._eval_step(batch)
|
| 459 |
+
|
| 460 |
+
logging.info(
|
| 461 |
+
f"(Steps: {self.steps}) Finished evaluation "
|
| 462 |
+
f"({eval_steps_per_epoch} steps per epoch)."
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
# average loss
|
| 466 |
+
for key in self.total_eval_loss.keys():
|
| 467 |
+
self.total_eval_loss[key] /= eval_steps_per_epoch
|
| 468 |
+
logging.info(f"(Steps: {self.steps}) {key} = {self.total_eval_loss[key]:.4f}.")
|
| 469 |
+
|
| 470 |
+
# record
|
| 471 |
+
self._write_to_tensorboard(self.total_eval_loss)
|
| 472 |
+
|
| 473 |
+
# reset
|
| 474 |
+
self.total_eval_loss = defaultdict(float)
|
| 475 |
+
|
| 476 |
+
# restore mode
|
| 477 |
+
for key in self.model.keys():
|
| 478 |
+
self.model[key].train()
|
| 479 |
+
|
| 480 |
+
def _write_to_tensorboard(self, loss):
|
| 481 |
+
"""Write to tensorboard."""
|
| 482 |
+
for key, value in loss.items():
|
| 483 |
+
self.writer.add_scalar(key, value, self.steps)
|
| 484 |
+
|
| 485 |
+
def _check_save_interval(self):
|
| 486 |
+
if self.steps % self.config["save_interval_steps"] == 0:
|
| 487 |
+
self.save_checkpoint(os.path.join(self.config["outdir"],
|
| 488 |
+
f"checkpoint-{self.steps}steps.pkl"))
|
| 489 |
+
logging.info(f"Successfully saved checkpoint @ {self.steps} steps.")
|
| 490 |
+
|
| 491 |
+
def _check_eval_interval(self):
|
| 492 |
+
if self.steps % self.config["eval_interval_steps"] == 0:
|
| 493 |
+
self._eval_epoch()
|
| 494 |
+
|
| 495 |
+
def _check_log_interval(self):
|
| 496 |
+
if self.steps % self.config["log_interval_steps"] == 0:
|
| 497 |
+
for key in self.total_train_loss.keys():
|
| 498 |
+
self.total_train_loss[key] /= self.config["log_interval_steps"]
|
| 499 |
+
logging.info(f"(Steps: {self.steps}) {key} = {self.total_train_loss[key]:.4f}.")
|
| 500 |
+
self._write_to_tensorboard(self.total_train_loss)
|
| 501 |
+
|
| 502 |
+
# reset
|
| 503 |
+
self.total_train_loss = defaultdict(float)
|
| 504 |
+
|
| 505 |
+
def _check_train_finish(self):
|
| 506 |
+
if self.steps >= self.config["train_max_steps"]:
|
| 507 |
+
self.finish_train = True
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
class Collator(object):
|
| 511 |
+
"""Customized collator for Pytorch DataLoader in training."""
|
| 512 |
+
|
| 513 |
+
def __init__(
|
| 514 |
+
self,
|
| 515 |
+
hop_size=256,
|
| 516 |
+
win_length=1024,
|
| 517 |
+
sampling_rate=16000,
|
| 518 |
+
prompt_dim=1024,
|
| 519 |
+
prompt_fold_by_2=False
|
| 520 |
+
):
|
| 521 |
+
"""Initialize customized collator for PyTorch DataLoader.
|
| 522 |
+
|
| 523 |
+
Args:
|
| 524 |
+
hop_size (int): Hop size of features, in sampling points.
|
| 525 |
+
win_length (int): window length of features.
|
| 526 |
+
sampling_rate (int): sampling rate of waveform data
|
| 527 |
+
prompt_dim (int): number of prompt embedding dimensions
|
| 528 |
+
"""
|
| 529 |
+
self.hop_size = hop_size
|
| 530 |
+
self.win_length = win_length
|
| 531 |
+
self.sampling_rate = sampling_rate
|
| 532 |
+
self.prompt_dim = prompt_dim
|
| 533 |
+
if prompt_fold_by_2:
|
| 534 |
+
self.prompt_len_factor = 2
|
| 535 |
+
else:
|
| 536 |
+
self.prompt_len_factor = 1
|
| 537 |
+
|
| 538 |
+
def construct_prompt(self, mel_lens):
|
| 539 |
+
prompt_lens = [random.randint(int(l / (3 * self.prompt_len_factor)), int(l / (2 * self.prompt_len_factor))) for l in mel_lens]
|
| 540 |
+
prompt_starts = []
|
| 541 |
+
is_from_start = []
|
| 542 |
+
for ml, pl in zip(mel_lens, prompt_lens):
|
| 543 |
+
if random.random() > 0.5:
|
| 544 |
+
# from start
|
| 545 |
+
prompt_start = random.randint(0, 1 * self.sampling_rate // (self.hop_size * self.prompt_len_factor))
|
| 546 |
+
is_from_start.append(True)
|
| 547 |
+
else:
|
| 548 |
+
# from ending
|
| 549 |
+
prompt_start = random.randint((ml - 1 * self.sampling_rate // self.hop_size) // self.prompt_len_factor, ml // self.prompt_len_factor) - pl
|
| 550 |
+
is_from_start.append(False)
|
| 551 |
+
prompt_starts.append(prompt_start)
|
| 552 |
+
return prompt_lens, prompt_starts, is_from_start
|
| 553 |
+
|
| 554 |
+
def __call__(self, batch):
|
| 555 |
+
"""Convert into batch tensors.
|
| 556 |
+
|
| 557 |
+
Args:
|
| 558 |
+
batch (list): list of tuple of the pair of audio and features.
|
| 559 |
+
|
| 560 |
+
This collator will automatically determine the prompt segment (acoustic context) for each utterance.
|
| 561 |
+
The prompt is cut off from the current utterance, ranging from one third to half of the original utterance.
|
| 562 |
+
The prompt can be cut from either the starting or the ending of the utterance, within 1 second margin.
|
| 563 |
+
The other features include 2-dim VQ features (2 is the number of groups), and D-dim prompts (e.g. WavLM features)
|
| 564 |
+
|
| 565 |
+
Returns:
|
| 566 |
+
Tensor ys: waveform batch (B, T).
|
| 567 |
+
Tensors vqs, mels: Auxiliary feature batch (B, C, T'), where T' = T / hop_size.
|
| 568 |
+
Tensor prompts: prompt feature batch (B, C, T'')
|
| 569 |
+
List c_lengths, prompt_lengths: list of lengths
|
| 570 |
+
"""
|
| 571 |
+
batch = batch[0]
|
| 572 |
+
|
| 573 |
+
# check length
|
| 574 |
+
batch = [self._adjust_length(*b) for b in batch]
|
| 575 |
+
ys, vqs, mels, prompts_old = list(map(list, zip(*batch))) # [(a,b), (c,d)] -> [a, c], [b, d]
|
| 576 |
+
|
| 577 |
+
batch_size = len(vqs)
|
| 578 |
+
|
| 579 |
+
prompt_lengths, prompt_starts, is_from_starts = self.construct_prompt([len(m) for m in mels])
|
| 580 |
+
c_lengths = []
|
| 581 |
+
prompts = torch.zeros(batch_size, max(prompt_lengths), self.prompt_dim)
|
| 582 |
+
for i in range(batch_size):
|
| 583 |
+
prompts[i, :prompt_lengths[i]] = torch.tensor(prompts_old[i][prompt_starts[i]:prompt_starts[i]+prompt_lengths[i], :])
|
| 584 |
+
if is_from_starts[i]:
|
| 585 |
+
start_idx = (prompt_starts[i] + prompt_lengths[i])*self.prompt_len_factor
|
| 586 |
+
mels[i] = mels[i][start_idx:]
|
| 587 |
+
vqs[i] = vqs[i][start_idx:]
|
| 588 |
+
ys[i] = ys[i][start_idx * self.hop_size: ]
|
| 589 |
+
else:
|
| 590 |
+
end_idx = prompt_starts[i]*self.prompt_len_factor
|
| 591 |
+
mels[i] = mels[i][:end_idx]
|
| 592 |
+
vqs[i] = vqs[i][:end_idx]
|
| 593 |
+
ys[i] = ys[i][:end_idx * self.hop_size]
|
| 594 |
+
c_lengths.append(len(mels[i]))
|
| 595 |
+
|
| 596 |
+
vqs = pad_list([torch.tensor(c) for c in vqs], pad_value=0) # (B, L, Groups)
|
| 597 |
+
vqs = vqs.long()
|
| 598 |
+
mels = pad_list([torch.tensor(c) for c in mels], pad_value=0) # (B, L, 80)
|
| 599 |
+
|
| 600 |
+
ys = pad_list([torch.tensor(y, dtype=torch.float) for y in ys], pad_value=0)[:, :mels.size(1) * self.hop_size] # (B, T)
|
| 601 |
+
assert ys.size(1) == mels.size(1) * self.hop_size == vqs.size(1) * self.hop_size
|
| 602 |
+
|
| 603 |
+
return vqs, mels, prompts, ys, c_lengths, prompt_lengths
|
| 604 |
+
|
| 605 |
+
def _adjust_length(self, x, c, *args):
|
| 606 |
+
"""Adjust the audio and feature lengths.
|
| 607 |
+
|
| 608 |
+
Note:
|
| 609 |
+
Basically we assume that the length of x and c are adjusted
|
| 610 |
+
through preprocessing stage, but if we use other library processed
|
| 611 |
+
features, this process will be needed.
|
| 612 |
+
|
| 613 |
+
"""
|
| 614 |
+
if len(x) > len(c) * self.hop_size:
|
| 615 |
+
x = x[(self.win_length - self.hop_size) // 2:]
|
| 616 |
+
x = x[:len(c) * self.hop_size]
|
| 617 |
+
|
| 618 |
+
# check the legnth is valid
|
| 619 |
+
assert len(x) == len(c) * self.hop_size
|
| 620 |
+
|
| 621 |
+
return x, c, *args
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
def main(rank, n_gpus):
|
| 625 |
+
"""Run training process."""
|
| 626 |
+
parser = argparse.ArgumentParser(
|
| 627 |
+
description="Train vec2wav2 (See detail in vec2wav2/bin/train.py)."
|
| 628 |
+
)
|
| 629 |
+
parser.add_argument(
|
| 630 |
+
"--train-wav-scp",
|
| 631 |
+
default=None,
|
| 632 |
+
type=str,
|
| 633 |
+
help="kaldi-style wav.scp file for training. "
|
| 634 |
+
)
|
| 635 |
+
parser.add_argument(
|
| 636 |
+
"--train-vqidx-scp",
|
| 637 |
+
default=None,
|
| 638 |
+
type=str,
|
| 639 |
+
help="kaldi-style feats.scp file for training. "
|
| 640 |
+
)
|
| 641 |
+
parser.add_argument(
|
| 642 |
+
"--train-mel-scp",
|
| 643 |
+
default=None,
|
| 644 |
+
type=str,
|
| 645 |
+
help="kaldi-style feats.scp file for training. "
|
| 646 |
+
)
|
| 647 |
+
parser.add_argument(
|
| 648 |
+
"--train-prompt-scp",
|
| 649 |
+
default=None,
|
| 650 |
+
type=str,
|
| 651 |
+
help="prompt scp (in this case, utt to path)"
|
| 652 |
+
)
|
| 653 |
+
parser.add_argument(
|
| 654 |
+
"--train-segments",
|
| 655 |
+
default=None,
|
| 656 |
+
type=str,
|
| 657 |
+
help="kaldi-style segments file for training.",
|
| 658 |
+
)
|
| 659 |
+
parser.add_argument(
|
| 660 |
+
"--train-num-frames",
|
| 661 |
+
default=None,
|
| 662 |
+
type=str,
|
| 663 |
+
help="kaldi-style utt2num_frames file for training.",
|
| 664 |
+
)
|
| 665 |
+
parser.add_argument(
|
| 666 |
+
"--dev-wav-scp",
|
| 667 |
+
default=None,
|
| 668 |
+
type=str,
|
| 669 |
+
help="kaldi-style wav.scp file for validation. "
|
| 670 |
+
)
|
| 671 |
+
parser.add_argument(
|
| 672 |
+
"--dev-vqidx-scp",
|
| 673 |
+
default=None,
|
| 674 |
+
type=str,
|
| 675 |
+
help="kaldi-style feats.scp file for vaidation. "
|
| 676 |
+
)
|
| 677 |
+
parser.add_argument(
|
| 678 |
+
"--dev-mel-scp",
|
| 679 |
+
default=None,
|
| 680 |
+
type=str,
|
| 681 |
+
help="kaldi-style feats.scp file for vaidation. "
|
| 682 |
+
)
|
| 683 |
+
parser.add_argument(
|
| 684 |
+
"--dev-prompt-scp",
|
| 685 |
+
default=None,
|
| 686 |
+
type=str,
|
| 687 |
+
help="prompt scp (in this case, utt to path)"
|
| 688 |
+
)
|
| 689 |
+
parser.add_argument(
|
| 690 |
+
"--dev-segments",
|
| 691 |
+
default=None,
|
| 692 |
+
type=str,
|
| 693 |
+
help="kaldi-style segments file for validation.",
|
| 694 |
+
)
|
| 695 |
+
parser.add_argument(
|
| 696 |
+
"--dev-num-frames",
|
| 697 |
+
default=None,
|
| 698 |
+
type=str,
|
| 699 |
+
help="kaldi-style utt2num_frames file for validation.",
|
| 700 |
+
)
|
| 701 |
+
parser.add_argument(
|
| 702 |
+
"--outdir",
|
| 703 |
+
type=str,
|
| 704 |
+
required=True,
|
| 705 |
+
help="directory to save checkpoints.",
|
| 706 |
+
)
|
| 707 |
+
parser.add_argument(
|
| 708 |
+
"--config",
|
| 709 |
+
type=str,
|
| 710 |
+
required=True,
|
| 711 |
+
help="yaml format configuration file.",
|
| 712 |
+
)
|
| 713 |
+
parser.add_argument(
|
| 714 |
+
"--pretrain",
|
| 715 |
+
default="",
|
| 716 |
+
type=str,
|
| 717 |
+
nargs="?",
|
| 718 |
+
help='checkpoint file path to load pretrained params. (default="")',
|
| 719 |
+
)
|
| 720 |
+
parser.add_argument(
|
| 721 |
+
"--resume",
|
| 722 |
+
default="",
|
| 723 |
+
type=str,
|
| 724 |
+
nargs="?",
|
| 725 |
+
help='checkpoint file path to resume training. (default="")',
|
| 726 |
+
)
|
| 727 |
+
parser.add_argument(
|
| 728 |
+
"--verbose",
|
| 729 |
+
type=int,
|
| 730 |
+
default=1,
|
| 731 |
+
help="logging level. higher is more logging. (default=1)",
|
| 732 |
+
)
|
| 733 |
+
parser.add_argument("--vq-codebook", default=None, type=str)
|
| 734 |
+
# parser.add_argument("--sampling-rate", type=int)
|
| 735 |
+
# parser.add_argument("--num-mels", type=int)
|
| 736 |
+
# parser.add_argument("--hop-size", type=int)
|
| 737 |
+
# parser.add_argument("--win-length", type=int)
|
| 738 |
+
args = parser.parse_args()
|
| 739 |
+
|
| 740 |
+
# init distributed training
|
| 741 |
+
device = torch.device("cuda")
|
| 742 |
+
# effective when using fixed size inputs
|
| 743 |
+
# see https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
|
| 744 |
+
torch.backends.cudnn.benchmark = True
|
| 745 |
+
# setup for distributed training
|
| 746 |
+
# see example: https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed
|
| 747 |
+
if n_gpus == 1:
|
| 748 |
+
assert rank == 0
|
| 749 |
+
|
| 750 |
+
set_loglevel(args.verbose)
|
| 751 |
+
|
| 752 |
+
# check directory existence
|
| 753 |
+
if not os.path.exists(args.outdir):
|
| 754 |
+
os.makedirs(args.outdir)
|
| 755 |
+
|
| 756 |
+
# init process group
|
| 757 |
+
logging.info("Synchronizing between all workers.")
|
| 758 |
+
torch.distributed.init_process_group(backend="nccl", init_method="env://", world_size=n_gpus, rank=rank)
|
| 759 |
+
torch.cuda.set_device(rank)
|
| 760 |
+
logging.info("Finished init process group.")
|
| 761 |
+
|
| 762 |
+
# load and save config
|
| 763 |
+
with open(args.config) as f:
|
| 764 |
+
config = yaml.load(f, Loader=yaml.Loader)
|
| 765 |
+
config.update(vars(args))
|
| 766 |
+
config['rank'] = rank
|
| 767 |
+
config['distributed'] = True
|
| 768 |
+
config['world_size'] = n_gpus
|
| 769 |
+
config["version"] = vec2wav2.__version__ # add version info
|
| 770 |
+
if rank == 0:
|
| 771 |
+
with open(os.path.join(args.outdir, "config.yml"), "w") as f:
|
| 772 |
+
yaml.dump(config, f, Dumper=yaml.Dumper)
|
| 773 |
+
for key, value in config.items():
|
| 774 |
+
logging.info(f"{key} = {value}")
|
| 775 |
+
|
| 776 |
+
# get dataset
|
| 777 |
+
train_dataset = AudioMelSCPDataset(
|
| 778 |
+
wav_scp=args.train_wav_scp,
|
| 779 |
+
vqidx_scp=args.train_vqidx_scp,
|
| 780 |
+
mel_scp=args.train_mel_scp,
|
| 781 |
+
prompt_scp=args.train_prompt_scp,
|
| 782 |
+
utt2num_frames=args.train_num_frames,
|
| 783 |
+
segments=args.train_segments,
|
| 784 |
+
batch_frames=config.get("batch_frames", None),
|
| 785 |
+
batch_size=config.get("batch_size", None),
|
| 786 |
+
min_num_frames=config.get("min_num_frames", None),
|
| 787 |
+
max_num_frames=config.get("max_num_frames", None),
|
| 788 |
+
allow_cache=config.get("allow_cache", False), # keep compatibility
|
| 789 |
+
length_tolerance=config.get("length_tolerance", 2),
|
| 790 |
+
prompt_fold_by_2=config.get("prompt_fold_by_2", True)
|
| 791 |
+
)
|
| 792 |
+
if rank == 0:
|
| 793 |
+
logging.info(f"The number of training batches = {len(train_dataset)}.")
|
| 794 |
+
dev_dataset = AudioMelSCPDataset(
|
| 795 |
+
wav_scp=args.dev_wav_scp,
|
| 796 |
+
vqidx_scp=args.dev_vqidx_scp,
|
| 797 |
+
mel_scp=args.dev_mel_scp,
|
| 798 |
+
prompt_scp=args.dev_prompt_scp,
|
| 799 |
+
utt2num_frames=args.dev_num_frames,
|
| 800 |
+
segments=args.dev_segments,
|
| 801 |
+
min_num_frames=config.get("min_num_frames", None),
|
| 802 |
+
max_num_frames=config.get("max_num_frames", None),
|
| 803 |
+
allow_cache=config.get("allow_cache", False), # keep compatibility
|
| 804 |
+
length_tolerance=config.get("length_tolerance", 2),
|
| 805 |
+
prompt_fold_by_2=config.get("prompt_fold_by_2", True)
|
| 806 |
+
)
|
| 807 |
+
if rank == 0:
|
| 808 |
+
logging.info(f"The number of development batches = {len(dev_dataset)}.")
|
| 809 |
+
dataset = {
|
| 810 |
+
"train": train_dataset,
|
| 811 |
+
"dev": dev_dataset,
|
| 812 |
+
}
|
| 813 |
+
|
| 814 |
+
# get data loader
|
| 815 |
+
collator = Collator(
|
| 816 |
+
hop_size=config["hop_size"],
|
| 817 |
+
win_length=config["win_length"],
|
| 818 |
+
sampling_rate=config["sampling_rate"],
|
| 819 |
+
prompt_dim=config['frontend_params']['prompt_channels'],
|
| 820 |
+
prompt_fold_by_2=config.get("prompt_fold_by_2", True)
|
| 821 |
+
)
|
| 822 |
+
|
| 823 |
+
sampler = {
|
| 824 |
+
"train": DistributedSampler(
|
| 825 |
+
dataset=dataset["train"],
|
| 826 |
+
num_replicas=n_gpus,
|
| 827 |
+
rank=rank,
|
| 828 |
+
shuffle=True,
|
| 829 |
+
),
|
| 830 |
+
"dev": DistributedSampler(
|
| 831 |
+
dataset=dataset["dev"],
|
| 832 |
+
num_replicas=n_gpus,
|
| 833 |
+
rank=rank,
|
| 834 |
+
shuffle=False,
|
| 835 |
+
)}
|
| 836 |
+
data_loader = {
|
| 837 |
+
"train": DataLoader(
|
| 838 |
+
dataset=dataset["train"],
|
| 839 |
+
shuffle=False,
|
| 840 |
+
collate_fn=collator,
|
| 841 |
+
num_workers=config["num_workers"],
|
| 842 |
+
sampler=sampler["train"],
|
| 843 |
+
pin_memory=config["pin_memory"],
|
| 844 |
+
),
|
| 845 |
+
"dev": DataLoader(
|
| 846 |
+
dataset=dataset["dev"],
|
| 847 |
+
shuffle=False,
|
| 848 |
+
collate_fn=collator,
|
| 849 |
+
num_workers=config["num_workers"],
|
| 850 |
+
sampler=sampler["dev"],
|
| 851 |
+
pin_memory=config["pin_memory"],
|
| 852 |
+
),
|
| 853 |
+
}
|
| 854 |
+
|
| 855 |
+
# define models
|
| 856 |
+
generator_class = getattr(
|
| 857 |
+
vec2wav2.models,
|
| 858 |
+
# keep compatibility
|
| 859 |
+
config.get("generator_type", "ParallelWaveGANGenerator"),
|
| 860 |
+
)
|
| 861 |
+
discriminator_class = getattr(
|
| 862 |
+
vec2wav2.models,
|
| 863 |
+
# keep compatibility
|
| 864 |
+
config.get("discriminator_type", "ParallelWaveGANDiscriminator"),
|
| 865 |
+
)
|
| 866 |
+
model = {
|
| 867 |
+
"generator": vec2wav2.models.VEC2WAV2Generator(
|
| 868 |
+
vec2wav2.models.CTXVEC2WAVFrontend(config["prompt_net_type"], config["num_mels"], **config["frontend_params"]),
|
| 869 |
+
generator_class(**config["generator_params"])
|
| 870 |
+
).to(device),
|
| 871 |
+
"discriminator": discriminator_class(
|
| 872 |
+
**config["discriminator_params"],
|
| 873 |
+
).to(device),
|
| 874 |
+
}
|
| 875 |
+
|
| 876 |
+
# define criteria
|
| 877 |
+
criterion = {
|
| 878 |
+
"gen_adv": GeneratorAdversarialLoss(
|
| 879 |
+
# keep compatibility
|
| 880 |
+
**config.get("generator_adv_loss_params", {})
|
| 881 |
+
).to(device),
|
| 882 |
+
"dis_adv": DiscriminatorAdversarialLoss(
|
| 883 |
+
# keep compatibility
|
| 884 |
+
**config.get("discriminator_adv_loss_params", {})
|
| 885 |
+
).to(device),
|
| 886 |
+
}
|
| 887 |
+
if config.get("use_stft_loss", True): # keep compatibility
|
| 888 |
+
config["use_stft_loss"] = True
|
| 889 |
+
criterion["stft"] = MultiResolutionSTFTLoss(**config["stft_loss_params"]).to(device)
|
| 890 |
+
if config.get("use_subband_stft_loss", False): # keep compatibility
|
| 891 |
+
assert config["generator_params"]["out_channels"] > 1
|
| 892 |
+
criterion["sub_stft"] = MultiResolutionSTFTLoss(**config["subband_stft_loss_params"]).to(device)
|
| 893 |
+
else:
|
| 894 |
+
config["use_subband_stft_loss"] = False
|
| 895 |
+
if config.get("use_feat_match_loss", False): # keep compatibility
|
| 896 |
+
criterion["feat_match"] = FeatureMatchLoss(
|
| 897 |
+
# keep compatibility
|
| 898 |
+
**config.get("feat_match_loss_params", {}),
|
| 899 |
+
).to(device)
|
| 900 |
+
else:
|
| 901 |
+
config["use_feat_match_loss"] = False
|
| 902 |
+
if config.get("use_mel_loss", False): # keep compatibility
|
| 903 |
+
criterion["mel"] = MelSpectrogramLoss(**config["mel_loss_params"],).to(device)
|
| 904 |
+
else:
|
| 905 |
+
config["use_mel_loss"] = False
|
| 906 |
+
|
| 907 |
+
# define optimizers and schedulers
|
| 908 |
+
generator_optimizer_class = getattr(
|
| 909 |
+
vec2wav2.optimizers,
|
| 910 |
+
# keep compatibility
|
| 911 |
+
config.get("generator_optimizer_type", "RAdam"),
|
| 912 |
+
)
|
| 913 |
+
discriminator_optimizer_class = getattr(
|
| 914 |
+
vec2wav2.optimizers,
|
| 915 |
+
# keep compatibility
|
| 916 |
+
config.get("discriminator_optimizer_type", "RAdam"),
|
| 917 |
+
)
|
| 918 |
+
optimizer = {
|
| 919 |
+
"generator": generator_optimizer_class(
|
| 920 |
+
model["generator"].parameters(),
|
| 921 |
+
**config["generator_optimizer_params"],
|
| 922 |
+
),
|
| 923 |
+
"discriminator": discriminator_optimizer_class(
|
| 924 |
+
model["discriminator"].parameters(),
|
| 925 |
+
**config["discriminator_optimizer_params"],
|
| 926 |
+
),
|
| 927 |
+
}
|
| 928 |
+
generator_scheduler_class = getattr(
|
| 929 |
+
torch.optim.lr_scheduler,
|
| 930 |
+
# keep compatibility
|
| 931 |
+
config.get("generator_scheduler_type", "StepLR"),
|
| 932 |
+
)
|
| 933 |
+
discriminator_scheduler_class = getattr(
|
| 934 |
+
torch.optim.lr_scheduler,
|
| 935 |
+
# keep compatibility
|
| 936 |
+
config.get("discriminator_scheduler_type", "StepLR"),
|
| 937 |
+
)
|
| 938 |
+
scheduler = {
|
| 939 |
+
"generator": generator_scheduler_class(
|
| 940 |
+
optimizer=optimizer["generator"],
|
| 941 |
+
**config["generator_scheduler_params"],
|
| 942 |
+
),
|
| 943 |
+
"discriminator": discriminator_scheduler_class(
|
| 944 |
+
optimizer=optimizer["discriminator"],
|
| 945 |
+
**config["discriminator_scheduler_params"],
|
| 946 |
+
),
|
| 947 |
+
}
|
| 948 |
+
from torch.nn.parallel import DistributedDataParallel
|
| 949 |
+
model["generator"] = DistributedDataParallel(model["generator"], device_ids=[rank], find_unused_parameters=True)
|
| 950 |
+
model["discriminator"] = DistributedDataParallel(model["discriminator"], device_ids=[rank], find_unused_parameters=True)
|
| 951 |
+
|
| 952 |
+
if rank == 0:
|
| 953 |
+
# show settings
|
| 954 |
+
logging.info(model["generator"])
|
| 955 |
+
logging.info(f"Generator has nparams: {sum([p.numel() for p in model['generator'].parameters()])}")
|
| 956 |
+
logging.info(model["discriminator"])
|
| 957 |
+
logging.info(f"Discriminator has nparams: {sum([p.numel() for p in model['discriminator'].parameters()])}")
|
| 958 |
+
logging.info(optimizer["generator"])
|
| 959 |
+
logging.info(optimizer["discriminator"])
|
| 960 |
+
|
| 961 |
+
# define trainer
|
| 962 |
+
trainer = Trainer(
|
| 963 |
+
steps=0,
|
| 964 |
+
epochs=0,
|
| 965 |
+
data_loader=data_loader,
|
| 966 |
+
sampler=sampler,
|
| 967 |
+
model=model,
|
| 968 |
+
criterion=criterion,
|
| 969 |
+
optimizer=optimizer,
|
| 970 |
+
scheduler=scheduler,
|
| 971 |
+
config=config,
|
| 972 |
+
device=device,
|
| 973 |
+
)
|
| 974 |
+
|
| 975 |
+
# load pretrained parameters from checkpoint
|
| 976 |
+
if len(args.pretrain) != 0:
|
| 977 |
+
trainer.load_checkpoint(args.pretrain, load_only_params=True)
|
| 978 |
+
if rank == 0:
|
| 979 |
+
logging.info(f"Successfully load parameters from {args.pretrain}.")
|
| 980 |
+
|
| 981 |
+
# resume from checkpoint
|
| 982 |
+
if len(args.resume) != 0:
|
| 983 |
+
trainer.load_checkpoint(args.resume)
|
| 984 |
+
if rank == 0:
|
| 985 |
+
logging.info(f"Successfully resumed from {args.resume}.")
|
| 986 |
+
|
| 987 |
+
# run training loop
|
| 988 |
+
try:
|
| 989 |
+
trainer.run()
|
| 990 |
+
finally:
|
| 991 |
+
if rank == 0:
|
| 992 |
+
trainer.save_checkpoint(os.path.join(config["outdir"], f"checkpoint-{trainer.steps}steps.pkl"))
|
| 993 |
+
logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
|
| 994 |
+
|
| 995 |
+
|
| 996 |
+
if __name__ == "__main__":
|
| 997 |
+
assert torch.cuda.is_available(), "CPU training is not allowed."
|
| 998 |
+
n_gpus = torch.cuda.device_count()
|
| 999 |
+
print(f"============> using {n_gpus} GPUS")
|
| 1000 |
+
os.environ["MASTER_ADDR"] = "localhost"
|
| 1001 |
+
os.environ["MASTER_PORT"] = "8000"
|
| 1002 |
+
|
| 1003 |
+
mp.spawn(
|
| 1004 |
+
main,
|
| 1005 |
+
nprocs=n_gpus,
|
| 1006 |
+
args=(n_gpus,)
|
| 1007 |
+
)
|
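
Note on the prompt segmentation described in the Collator.__call__ docstring above: its behaviour can be summarised with a small self-contained sketch. This helper is illustrative only and is not part of the committed train.py; the hop size, sampling rate, and margin values are placeholders chosen to match the docstring's "1 second margin" wording.

import random

def sketch_prompt_segment(mel_len, hop_size=256, sampling_rate=16000):
    # The prompt spans roughly one third to one half of the utterance (in mel frames).
    prompt_len = random.randint(mel_len // 3, mel_len // 2)
    margin = sampling_rate // hop_size          # about 1 second, expressed in mel frames
    from_start = random.random() < 0.5          # cut the prompt from the start or from the end
    if from_start:
        prompt_start = random.randint(0, min(margin, mel_len - prompt_len))
    else:
        prompt_start = random.randint(max(0, mel_len - prompt_len - margin), mel_len - prompt_len)
    return prompt_start, prompt_len, from_start
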
vec2wav2/bin/vc.py
ADDED
|
@@ -0,0 +1,128 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
# Copyright 2024 Yiwei Guo
|
| 4 |
+
|
| 5 |
+
""" Run VC inference with trained model """
|
| 6 |
+
|
| 7 |
+
import vec2wav2
|
| 8 |
+
from vec2wav2.ssl_models.vqw2v_extractor import Extractor as VQW2VExtractor
|
| 9 |
+
from vec2wav2.ssl_models.wavlm_extractor import Extractor as WavLMExtractor
|
| 10 |
+
# from vec2wav2.ssl_models.w2v2_extractor import Extractor as W2V2Extractor
|
| 11 |
+
import torch
|
| 12 |
+
import logging
|
| 13 |
+
import argparse
|
| 14 |
+
from vec2wav2.utils.utils import load_model, load_feat_codebook, idx2vec, read_wav_16k
|
| 15 |
+
import soundfile as sf
|
| 16 |
+
import yaml
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def configure_logging(verbose):
|
| 21 |
+
if verbose:
|
| 22 |
+
logging.getLogger("vec2wav2.ssl_models.WavLM").setLevel(logging.DEBUG)
|
| 23 |
+
logging.getLogger().setLevel(logging.DEBUG)
|
| 24 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 25 |
+
else:
|
| 26 |
+
logging.getLogger("vec2wav2.ssl_models.WavLM").setLevel(logging.ERROR)
|
| 27 |
+
logging.getLogger().setLevel(logging.ERROR)
|
| 28 |
+
logging.basicConfig(level=logging.ERROR)
|
| 29 |
+
|
| 30 |
+
script_logger = logging.getLogger("script_logger")
|
| 31 |
+
handler = logging.StreamHandler()
|
| 32 |
+
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s | %(levelname)s | %(message)s'))
|
| 33 |
+
script_logger.addHandler(handler)
|
| 34 |
+
script_logger.setLevel(logging.INFO)
|
| 35 |
+
script_logger.propagate = False
|
| 36 |
+
return script_logger
|
| 37 |
+
|
| 38 |
+
def vc_args():
|
| 39 |
+
parser = argparse.ArgumentParser()
|
| 40 |
+
# required arguments
|
| 41 |
+
parser.add_argument("-s", "--source", default="examples/source.wav", type=str,
|
| 42 |
+
help="source wav path")
|
| 43 |
+
parser.add_argument("-t", "--target", default="examples/target.wav", type=str,
|
| 44 |
+
help="target speaker prompt path")
|
| 45 |
+
parser.add_argument("-o", "--output", default="output.wav", type=str,
|
| 46 |
+
help="path of the output wav file")
|
| 47 |
+
|
| 48 |
+
# optional arguments
|
| 49 |
+
parser.add_argument("--expdir", default="pretrained/", type=str,
|
| 50 |
+
help="path to find model checkpoints and configs. Will load expdir/generator.ckpt and expdir/config.yml.")
|
| 51 |
+
parser.add_argument('--checkpoint', default=None, type=str, help="checkpoint path (.pkl). If provided, will override expdir.")
|
| 52 |
+
parser.add_argument("--token-extractor", default="pretrained/vq-wav2vec_kmeans.pt", type=str,
|
| 53 |
+
help="checkpoint or model flag of input token extractor")
|
| 54 |
+
parser.add_argument("--prompt-extractor", default="pretrained/WavLM-Large.pt", type=str,
|
| 55 |
+
help="checkpoint or model flag of speaker prompt extractor")
|
| 56 |
+
parser.add_argument("--prompt-output-layer", default=6, type=int,
|
| 57 |
+
help="output layer when prompt is extracted from WavLM.")
|
| 58 |
+
|
| 59 |
+
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
|
| 60 |
+
|
| 61 |
+
args = parser.parse_args()
|
| 62 |
+
return args
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class VoiceConverter:
|
| 66 |
+
def __init__(self, expdir="pretrained/", token_extractor="pretrained/vq-wav2vec_kmeans.pt",
|
| 67 |
+
prompt_extractor="pretrained/WavLM-Large.pt", prompt_output_layer=6,
|
| 68 |
+
checkpoint=None, script_logger=None):
|
| 69 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 70 |
+
self.script_logger = script_logger
|
| 71 |
+
self.log_if_possible(f"Using device: {self.device}")
|
| 72 |
+
|
| 73 |
+
# set up token extractor
|
| 74 |
+
self.token_extractor = VQW2VExtractor(checkpoint=token_extractor, device=self.device)
|
| 75 |
+
feat_codebook, feat_codebook_numgroups = load_feat_codebook(self.token_extractor.get_codebook(), self.device)
|
| 76 |
+
self.feat_codebook = feat_codebook
|
| 77 |
+
self.feat_codebook_numgroups = feat_codebook_numgroups
|
| 78 |
+
self.log_if_possible(f"Successfully set up token extractor from {token_extractor}")
|
| 79 |
+
|
| 80 |
+
# set up prompt extractor
|
| 81 |
+
self.prompt_extractor = WavLMExtractor(prompt_extractor, device=self.device, output_layer=prompt_output_layer)
|
| 82 |
+
self.log_if_possible(f"Successfully set up prompt extractor from {prompt_extractor}")
|
| 83 |
+
|
| 84 |
+
# load VC model
|
| 85 |
+
self.config_path = os.path.join(expdir, "config.yml")
|
| 86 |
+
with open(self.config_path) as f:
|
| 87 |
+
self.config = yaml.load(f, Loader=yaml.Loader)
|
| 88 |
+
if checkpoint is not None:
|
| 89 |
+
checkpoint = os.path.join(expdir, checkpoint)
|
| 90 |
+
else:
|
| 91 |
+
checkpoint = os.path.join(expdir, "generator.ckpt")
|
| 92 |
+
self.model = load_model(checkpoint, self.config)
|
| 93 |
+
self.log_if_possible(f"Successfully set up VC model from {checkpoint}")
|
| 94 |
+
|
| 95 |
+
self.model.backend.remove_weight_norm()
|
| 96 |
+
self.model.eval().to(self.device)
|
| 97 |
+
|
| 98 |
+
@torch.no_grad()
|
| 99 |
+
def voice_conversion(self, source_audio, target_audio, output_path="output.wav"):
|
| 100 |
+
self.log_if_possible(f"Performing VC from {source_audio} to {target_audio}")
|
| 101 |
+
source_wav = read_wav_16k(source_audio)
|
| 102 |
+
target_wav = read_wav_16k(target_audio)
|
| 103 |
+
vq_idx = self.token_extractor.extract(source_wav).long().to(self.device)
|
| 104 |
+
|
| 105 |
+
vqvec = idx2vec(self.feat_codebook, vq_idx, self.feat_codebook_numgroups).unsqueeze(0)
|
| 106 |
+
prompt = self.prompt_extractor.extract(target_wav).unsqueeze(0).to(self.device)
|
| 107 |
+
converted = self.model.inference(vqvec, prompt)[-1].view(-1)
|
| 108 |
+
sf.write(output_path, converted.cpu().numpy(), self.config['sampling_rate'])
|
| 109 |
+
self.log_if_possible(f"Saved audio file to {output_path}")
|
| 110 |
+
return output_path
|
| 111 |
+
|
| 112 |
+
def log_if_possible(self, msg):
|
| 113 |
+
if self.script_logger is not None:
|
| 114 |
+
self.script_logger.info(msg)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
if __name__ == "__main__":
|
| 118 |
+
args = vc_args()
|
| 119 |
+
script_logger = configure_logging(args.verbose)
|
| 120 |
+
|
| 121 |
+
source_wav = read_wav_16k(args.source)
|
| 122 |
+
target_prompt = read_wav_16k(args.target)
|
| 123 |
+
|
| 124 |
+
with torch.no_grad():
|
| 125 |
+
voice_converter = VoiceConverter(expdir=args.expdir, token_extractor=args.token_extractor,
|
| 126 |
+
prompt_extractor=args.prompt_extractor, prompt_output_layer=args.prompt_output_layer,
|
| 127 |
+
checkpoint=args.checkpoint, script_logger=script_logger)
|
| 128 |
+
voice_converter.voice_conversion(args.source, args.target, args.output)
|
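
For reference, the VoiceConverter class defined above can also be driven directly from Python. A minimal sketch follows; the file paths mirror the argparse defaults in vc.py and are placeholders for real files.

import torch
from vec2wav2.bin.vc import VoiceConverter, configure_logging

logger = configure_logging(verbose=False)
with torch.no_grad():
    vc = VoiceConverter(
        expdir="pretrained/",                               # must contain config.yml and generator.ckpt
        token_extractor="pretrained/vq-wav2vec_kmeans.pt",
        prompt_extractor="pretrained/WavLM-Large.pt",
        prompt_output_layer=6,
        script_logger=logger,
    )
    # convert the source utterance to the target speaker's voice
    vc.voice_conversion("examples/source.wav", "examples/target.wav", "output.wav")
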
vec2wav2/datasets/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
from .scp_dataset import * # NOQA
|
vec2wav2/datasets/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (288 Bytes).
vec2wav2/datasets/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (241 Bytes).
vec2wav2/datasets/__pycache__/scp_dataset.cpython-310.pyc
ADDED
Binary file (8.4 kB).
vec2wav2/datasets/__pycache__/scp_dataset.cpython-39.pyc
ADDED
Binary file (8.95 kB).
vec2wav2/datasets/scp_dataset.py
ADDED
|
@@ -0,0 +1,300 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
# Copyright 2019 Tomoki Hayashi
|
| 4 |
+
# MIT License (https://opensource.org/licenses/MIT)
|
| 5 |
+
|
| 6 |
+
# Modified by Yiwei Guo, 2024
|
| 7 |
+
|
| 8 |
+
"""Dataset modules based on kaldi-style scp files."""
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
import random
|
| 12 |
+
import copy
|
| 13 |
+
from multiprocessing import Manager
|
| 14 |
+
|
| 15 |
+
import kaldiio
|
| 16 |
+
import numpy as np
|
| 17 |
+
|
| 18 |
+
from torch.utils.data import Dataset
|
| 19 |
+
from tqdm import tqdm
|
| 20 |
+
from vec2wav2.utils import HDF5ScpLoader
|
| 21 |
+
from vec2wav2.utils import NpyScpLoader
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _get_feats_scp_loader(feats_scp):
|
| 25 |
+
# read the first line of feats.scp file
|
| 26 |
+
with open(feats_scp) as f:
|
| 27 |
+
key, value = f.readlines()[0].replace("\n", "").split()
|
| 28 |
+
|
| 29 |
+
# check scp type
|
| 30 |
+
if ":" in value:
|
| 31 |
+
value_1, value_2 = value.split(":")
|
| 32 |
+
if value_1.endswith(".ark"):
|
| 33 |
+
# kaldi-ark case: utt_id_1 /path/to/utt_id_1.ark:index
|
| 34 |
+
return kaldiio.load_scp(feats_scp)
|
| 35 |
+
elif value_1.endswith(".h5"):
|
| 36 |
+
# hdf5 case with path in hdf5: utt_id_1 /path/to/utt_id_1.h5:feats
|
| 37 |
+
return HDF5ScpLoader(feats_scp)
|
| 38 |
+
else:
|
| 39 |
+
raise ValueError("Not supported feats.scp type.")
|
| 40 |
+
else:
|
| 41 |
+
if value.endswith(".h5"):
|
| 42 |
+
# hdf5 case without path in hdf5: utt_id_1 /path/to/utt_id_1.h5
|
| 43 |
+
return HDF5ScpLoader(feats_scp)
|
| 44 |
+
elif value.endswith(".npy"):
|
| 45 |
+
# npy case: utt_id_1 /path/to/utt_id_1.npy
|
| 46 |
+
return NpyScpLoader(feats_scp)
|
| 47 |
+
else:
|
| 48 |
+
raise ValueError("Not supported feats.scp type.")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class AudioMelSCPDataset(Dataset):
|
| 52 |
+
"""PyTorch compatible audio and feat dataset based on kaldi-stype scp files."""
|
| 53 |
+
|
| 54 |
+
def __init__(
|
| 55 |
+
self,
|
| 56 |
+
wav_scp,
|
| 57 |
+
vqidx_scp,
|
| 58 |
+
mel_scp,
|
| 59 |
+
prompt_scp,
|
| 60 |
+
utt2num_frames=None,
|
| 61 |
+
segments=None,
|
| 62 |
+
batch_frames=None,
|
| 63 |
+
batch_size=None,
|
| 64 |
+
min_num_frames=None,
|
| 65 |
+
max_num_frames=None,
|
| 66 |
+
return_utt_id=False,
|
| 67 |
+
return_sampling_rate=False,
|
| 68 |
+
allow_cache=False,
|
| 69 |
+
length_tolerance=2,
|
| 70 |
+
prompt_fold_by_2=True
|
| 71 |
+
):
|
| 72 |
+
"""Initialize dataset.
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
wav_scp (str): Kaldi-style wav.scp file.
|
| 76 |
+
vqidx_scp (str): Kaldi-style feats.scp file.
|
| 77 |
+
mel_scp (str): Kaldi-style feats.scp file.
|
| 78 |
+
segments (str): Kaldi-style segments file.
|
| 79 |
+
min_num_frames (int): Threshold to remove short feature files.
|
| 80 |
+
max_num_frames (int): Threshold to remove long feature files.
|
| 81 |
+
return_utt_id (bool): Whether to return utterance id.
|
| 82 |
+
return_sampling_rate (bool): Whether to return sampling rate.
|
| 83 |
+
allow_cache (bool): Whether to allow cache of the loaded files.
|
| 84 |
+
prompt_fold_by_2 (bool): If true, the prompt has half the length of the vqidx sequence.
|
| 85 |
+
|
| 86 |
+
"""
|
| 87 |
+
# load scp as lazy dict
|
| 88 |
+
self.audio_loader = kaldiio.load_scp(wav_scp, segments=segments)
|
| 89 |
+
self.vqidx_loader = _get_feats_scp_loader(vqidx_scp)
|
| 90 |
+
self.mel_loader = _get_feats_scp_loader(mel_scp)
|
| 91 |
+
|
| 92 |
+
self.prompt_loader = _get_feats_scp_loader(prompt_scp)
|
| 93 |
+
|
| 94 |
+
self.utt_ids = list(self.mel_loader.keys())
|
| 95 |
+
self.return_utt_id = return_utt_id
|
| 96 |
+
self.return_sampling_rate = return_sampling_rate
|
| 97 |
+
self.allow_cache = allow_cache
|
| 98 |
+
|
| 99 |
+
utt2num_frames_loader = None
|
| 100 |
+
if utt2num_frames is not None:
|
| 101 |
+
with open(utt2num_frames, 'r') as f:
|
| 102 |
+
utt2num_frames_loader = dict([(x.split()[0], int(x.split()[1])) for x in f.readlines()])
|
| 103 |
+
else:
|
| 104 |
+
utt2num_frames_loader = dict([(k, mel.shape[0]) for k, mel in self.mel_loader.items()])
|
| 105 |
+
|
| 106 |
+
self.utt2num_frames_loader = utt2num_frames_loader
|
| 107 |
+
|
| 108 |
+
# filter by threshold
|
| 109 |
+
if (min_num_frames or max_num_frames) is not None:
|
| 110 |
+
mel_lengths = [utt2num_frames_loader[key] for key in self.utt_ids]
|
| 111 |
+
idxs = [
|
| 112 |
+
idx
|
| 113 |
+
for idx in range(len(self.utt_ids))
|
| 114 |
+
if (min_num_frames is None or mel_lengths[idx] >= min_num_frames) and (max_num_frames is None or mel_lengths[idx] <= max_num_frames)
|
| 115 |
+
]
|
| 116 |
+
if len(self.utt_ids) != len(idxs):
|
| 117 |
+
logging.warning(
|
| 118 |
+
f"Some files are filtered by mel length threshold "
|
| 119 |
+
f"({len(self.utt_ids)} -> {len(idxs)})."
|
| 120 |
+
)
|
| 121 |
+
self.utt_ids = [self.utt_ids[idx] for idx in idxs]
|
| 122 |
+
|
| 123 |
+
# batchify
|
| 124 |
+
if batch_frames is not None:
|
| 125 |
+
self.batches = self.batchify(utt2num_frames_loader, batch_frames=batch_frames)
|
| 126 |
+
elif batch_size is not None:
|
| 127 |
+
self.batches = self.batchify(utt2num_frames_loader, batch_size=batch_size)
|
| 128 |
+
else:
|
| 129 |
+
self.batches = [[utt_id] for utt_id in self.utt_ids]
|
| 130 |
+
|
| 131 |
+
if allow_cache:
|
| 132 |
+
# NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0
|
| 133 |
+
self.manager = Manager()
|
| 134 |
+
self.caches = self.manager.dict()
|
| 135 |
+
self.length_tolerance = length_tolerance
|
| 136 |
+
if prompt_fold_by_2:
|
| 137 |
+
self.prompt_len_factor = 2
|
| 138 |
+
else:
|
| 139 |
+
self.prompt_len_factor = 1
|
| 140 |
+
|
| 141 |
+
def batchify(self, utt2num_frames_loader, batch_frames=None, batch_size=None, min_batch_size=1, drop_last=True):
|
| 142 |
+
|
| 143 |
+
assert batch_size is None or batch_size > min_batch_size
|
| 144 |
+
|
| 145 |
+
batches = []
|
| 146 |
+
batch = []
|
| 147 |
+
accum_num_frames = 0
|
| 148 |
+
utt_ids_set = set(self.utt_ids)
|
| 149 |
+
for utt_id, mel_length in tqdm(sorted(list(utt2num_frames_loader.items()), key=lambda x: x[1], reverse=True)):
|
| 150 |
+
if utt_id not in utt_ids_set:
|
| 151 |
+
continue
|
| 152 |
+
if (batch_frames is not None and accum_num_frames + mel_length > batch_frames and len(batch) > min_batch_size) or (batch_size is not None and len(batch) == batch_size):
|
| 153 |
+
batches.append(batch)
|
| 154 |
+
batch = []
|
| 155 |
+
accum_num_frames = 0
|
| 156 |
+
batch.append(utt_id)
|
| 157 |
+
accum_num_frames += mel_length
|
| 158 |
+
if len(batch) > min_batch_size and not drop_last:
|
| 159 |
+
batches.append(batch)
|
| 160 |
+
return batches
|
| 161 |
+
|
| 162 |
+
def __getitem__(self, idx):
|
| 163 |
+
"""Get specified idx items.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
idx (int): Index of the item.
|
| 167 |
+
|
| 168 |
+
Returns:
|
| 169 |
+
str: Utterance id (only in return_utt_id = True).
|
| 170 |
+
ndarray or tuple: Audio signal (T,) or (w/ sampling rate if return_sampling_rate = True).
|
| 171 |
+
ndarrays: Features (T', C).
|
| 172 |
+
|
| 173 |
+
"""
|
| 174 |
+
batch = self.batches[idx]
|
| 175 |
+
batch_items = []
|
| 176 |
+
|
| 177 |
+
for utt_id in batch:
|
| 178 |
+
if self.allow_cache and self.caches.get(utt_id) is not None:
|
| 179 |
+
items = self.caches[utt_id]
|
| 180 |
+
else:
|
| 181 |
+
fs, audio = self.audio_loader[utt_id]
|
| 182 |
+
mel = self.mel_loader[utt_id]
|
| 183 |
+
prompt = self.prompt_loader[utt_id]
|
| 184 |
+
vqidx = self.vqidx_loader[utt_id]
|
| 185 |
+
|
| 186 |
+
min_len = min(len(mel), len(vqidx), len(prompt)*self.prompt_len_factor)
|
| 187 |
+
assert ((abs(len(mel) - min_len) <= self.length_tolerance) and
|
| 188 |
+
(abs(len(vqidx) - min_len) <= self.length_tolerance) and
|
| 189 |
+
(abs(len(prompt)*self.prompt_len_factor - min_len) <= self.length_tolerance)), \
|
| 190 |
+
f"Audio feature lengths difference exceeds length tolerance for {utt_id}"
|
| 191 |
+
mel, vqidx, prompt = mel[:min_len], vqidx[:min_len], prompt[:min_len//self.prompt_len_factor]
|
| 192 |
+
|
| 193 |
+
# normalize audio signal to be [-1, 1]
|
| 194 |
+
audio = audio.astype(np.float32)
|
| 195 |
+
audio /= 1 << (16 - 1) # assume that wav is PCM 16 bit
|
| 196 |
+
|
| 197 |
+
if self.return_sampling_rate:
|
| 198 |
+
audio = (audio, fs)
|
| 199 |
+
|
| 200 |
+
if self.return_utt_id:
|
| 201 |
+
items = utt_id, audio, vqidx, mel, prompt
|
| 202 |
+
else:
|
| 203 |
+
items = audio, vqidx, mel, prompt
|
| 204 |
+
|
| 205 |
+
if self.allow_cache:
|
| 206 |
+
self.caches[utt_id] = items
|
| 207 |
+
|
| 208 |
+
batch_items.append(items)
|
| 209 |
+
|
| 210 |
+
return batch_items
|
| 211 |
+
|
| 212 |
+
def __len__(self):
|
| 213 |
+
"""Return dataset length.
|
| 214 |
+
Returns:
|
| 215 |
+
int: The length of dataset.
|
| 216 |
+
"""
|
| 217 |
+
return len(self.batches)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
class MelSCPDataset(Dataset):
|
| 221 |
+
"""PyTorch compatible feat dataset based on kaldi-stype scp files."""
|
| 222 |
+
|
| 223 |
+
def __init__(
|
| 224 |
+
self,
|
| 225 |
+
vqidx_scp,
|
| 226 |
+
prompt_scp,
|
| 227 |
+
return_utt_id=False,
|
| 228 |
+
allow_cache=False,
|
| 229 |
+
):
|
| 230 |
+
"""Initialize dataset.
|
| 231 |
+
|
| 232 |
+
Args:
|
| 233 |
+
vqidx_scp (str): Kaldi-style feats.scp file.
|
| 234 |
+
prompt_scp (str): Kaldi-style scp file. In this file, every utt is associated with its prompt's mel-spectrogram.
|
| 235 |
+
min_num_frames (int): Threshold to remove short feature files.
|
| 236 |
+
max_num_frames (int): Threshold to remove long feature files.
|
| 237 |
+
return_utt_id (bool): Whether to return utterance id.
|
| 238 |
+
allow_cache (bool): Whether to allow cache of the loaded files.
|
| 239 |
+
"""
|
| 240 |
+
# load scp as lazy dict
|
| 241 |
+
vqidx_loader = _get_feats_scp_loader(vqidx_scp)
|
| 242 |
+
self.prompt_loader = _get_feats_scp_loader(prompt_scp)
|
| 243 |
+
# self.prompt_loader = dict()
|
| 244 |
+
# with open(prompt_scp, 'r') as fr:
|
| 245 |
+
# for line in fr.readlines():
|
| 246 |
+
# terms = line.strip().split()
|
| 247 |
+
# self.prompt_loader[terms[0]] = terms[1]
|
| 248 |
+
vqidx_keys = list(set(self.prompt_loader.keys()) & set(vqidx_loader.keys()))
|
| 249 |
+
|
| 250 |
+
# NOTE: this dataset does not apply filtering, because it is usually used for decoding
|
| 251 |
+
|
| 252 |
+
self.vqidx_loader = vqidx_loader
|
| 253 |
+
self.utt_ids = vqidx_keys
|
| 254 |
+
self.return_utt_id = return_utt_id
|
| 255 |
+
self.allow_cache = allow_cache
|
| 256 |
+
|
| 257 |
+
if allow_cache:
|
| 258 |
+
# NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0
|
| 259 |
+
self.manager = Manager()
|
| 260 |
+
self.caches = self.manager.list()
|
| 261 |
+
self.caches += [() for _ in range(len(self.utt_ids))]
|
| 262 |
+
|
| 263 |
+
def __getitem__(self, idx):
|
| 264 |
+
"""Get specified idx items.
|
| 265 |
+
|
| 266 |
+
Args:
|
| 267 |
+
idx (int): Index of the item.
|
| 268 |
+
|
| 269 |
+
Returns:
|
| 270 |
+
str: Utterance id (only in return_utt_id = True).
|
| 271 |
+
ndarray: Feature (T', C).
|
| 272 |
+
|
| 273 |
+
"""
|
| 274 |
+
if self.allow_cache and len(self.caches[idx]) != 0:
|
| 275 |
+
return self.caches[idx]
|
| 276 |
+
|
| 277 |
+
utt_id = self.utt_ids[idx]
|
| 278 |
+
vqidx = self.vqidx_loader[utt_id].astype(int)
|
| 279 |
+
|
| 280 |
+
# prompt = torch.load(self.prompt_loader[utt_id]).float().numpy()
|
| 281 |
+
prompt = self.prompt_loader[utt_id]
|
| 282 |
+
|
| 283 |
+
if self.return_utt_id:
|
| 284 |
+
items = utt_id, vqidx, prompt
|
| 285 |
+
else:
|
| 286 |
+
items = vqidx, prompt
|
| 287 |
+
|
| 288 |
+
if self.allow_cache:
|
| 289 |
+
self.caches[idx] = items
|
| 290 |
+
|
| 291 |
+
return items
|
| 292 |
+
|
| 293 |
+
def __len__(self):
|
| 294 |
+
"""Return dataset length.
|
| 295 |
+
|
| 296 |
+
Returns:
|
| 297 |
+
int: The length of dataset.
|
| 298 |
+
|
| 299 |
+
"""
|
| 300 |
+
return len(self.utt_ids)
|
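
For context, all three feature loaders above dispatch on the second column of a kaldi-style scp file, as noted in the comments of _get_feats_scp_loader. Illustrative entries (utterance ids and paths are hypothetical) look like:

utt_id_1 /path/to/feats_1.ark:1024      -> kaldiio.load_scp (kaldi ark with byte offset)
utt_id_2 /path/to/utt_id_2.h5:feats     -> HDF5ScpLoader (HDF5 file with internal dataset path)
utt_id_3 /path/to/utt_id_3.npy          -> NpyScpLoader (one numpy array per utterance)
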
vec2wav2/distributed/__init__.py
ADDED
|
File without changes
|
vec2wav2/distributed/launch.py
ADDED
|
@@ -0,0 +1,163 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
"""Distributed process launcher.
|
| 5 |
+
|
| 6 |
+
This code is modified from https://github.com/pytorch/pytorch/blob/v1.3.0/torch/distributed/launch.py.
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
import os
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
from argparse import ArgumentParser
|
| 14 |
+
from argparse import REMAINDER
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_args():
|
| 18 |
+
"""Parse arguments."""
|
| 19 |
+
parser = ArgumentParser(
|
| 20 |
+
description="PyTorch distributed training launch "
|
| 21 |
+
"helper utilty that will spawn up "
|
| 22 |
+
"multiple distributed processes"
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# Optional arguments for the launch helper
|
| 26 |
+
parser.add_argument(
|
| 27 |
+
"--nnodes",
|
| 28 |
+
type=int,
|
| 29 |
+
default=1,
|
| 30 |
+
help="The number of nodes to use for distributed " "training",
|
| 31 |
+
)
|
| 32 |
+
parser.add_argument(
|
| 33 |
+
"--node_rank",
|
| 34 |
+
type=int,
|
| 35 |
+
default=0,
|
| 36 |
+
help="The rank of the node for multi-node distributed " "training",
|
| 37 |
+
)
|
| 38 |
+
parser.add_argument(
|
| 39 |
+
"--nproc_per_node",
|
| 40 |
+
type=int,
|
| 41 |
+
default=1,
|
| 42 |
+
help="The number of processes to launch on each node, "
|
| 43 |
+
"for GPU training, this is recommended to be set "
|
| 44 |
+
"to the number of GPUs in your system so that "
|
| 45 |
+
"each process can be bound to a single GPU.",
|
| 46 |
+
)
|
| 47 |
+
parser.add_argument(
|
| 48 |
+
"--master_addr",
|
| 49 |
+
default="127.0.0.1",
|
| 50 |
+
type=str,
|
| 51 |
+
help="Master node (rank 0)'s address, should be either "
|
| 52 |
+
"the IP address or the hostname of node 0, for "
|
| 53 |
+
"single node multi-proc training, the "
|
| 54 |
+
"--master_addr can simply be 127.0.0.1",
|
| 55 |
+
)
|
| 56 |
+
parser.add_argument(
|
| 57 |
+
"--master_port",
|
| 58 |
+
default=29500,
|
| 59 |
+
type=int,
|
| 60 |
+
help="Master node (rank 0)'s free port that needs to "
|
| 61 |
+
"be used for communciation during distributed "
|
| 62 |
+
"training",
|
| 63 |
+
)
|
| 64 |
+
parser.add_argument(
|
| 65 |
+
"--use_env",
|
| 66 |
+
default=False,
|
| 67 |
+
action="store_true",
|
| 68 |
+
help="Use environment variable to pass "
|
| 69 |
+
"'local rank'. For legacy reasons, the default value is False. "
|
| 70 |
+
"If set to True, the script will not pass "
|
| 71 |
+
"--local_rank as argument, and will instead set LOCAL_RANK.",
|
| 72 |
+
)
|
| 73 |
+
parser.add_argument(
|
| 74 |
+
"-m",
|
| 75 |
+
"--module",
|
| 76 |
+
default=False,
|
| 77 |
+
action="store_true",
|
| 78 |
+
help="Changes each process to interpret the launch script "
|
| 79 |
+
"as a python module, executing with the same behavior as"
|
| 80 |
+
"'python -m'.",
|
| 81 |
+
)
|
| 82 |
+
parser.add_argument(
|
| 83 |
+
"-c",
|
| 84 |
+
"--command",
|
| 85 |
+
default=False,
|
| 86 |
+
action="store_true",
|
| 87 |
+
help="Changes each process to interpret the launch script " "as a command.",
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# positional
|
| 91 |
+
parser.add_argument(
|
| 92 |
+
"training_script",
|
| 93 |
+
type=str,
|
| 94 |
+
help="The full path to the single GPU training "
|
| 95 |
+
"program/script/command to be launched in parallel, "
|
| 96 |
+
"followed by all the arguments for the "
|
| 97 |
+
"training script",
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# rest from the training program
|
| 101 |
+
parser.add_argument("training_script_args", nargs=REMAINDER)
|
| 102 |
+
return parser.parse_args()
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def main():
|
| 106 |
+
"""Launch distributed processes."""
|
| 107 |
+
args = parse_args()
|
| 108 |
+
|
| 109 |
+
# world size in terms of number of processes
|
| 110 |
+
dist_world_size = args.nproc_per_node * args.nnodes
|
| 111 |
+
|
| 112 |
+
# set PyTorch distributed related environmental variables
|
| 113 |
+
current_env = os.environ.copy()
|
| 114 |
+
current_env["MASTER_ADDR"] = args.master_addr
|
| 115 |
+
current_env["MASTER_PORT"] = str(args.master_port)
|
| 116 |
+
current_env["WORLD_SIZE"] = str(dist_world_size)
|
| 117 |
+
|
| 118 |
+
processes = []
|
| 119 |
+
|
| 120 |
+
if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
|
| 121 |
+
current_env["OMP_NUM_THREADS"] = str(1)
|
| 122 |
+
print(
|
| 123 |
+
"*****************************************\n"
|
| 124 |
+
"Setting OMP_NUM_THREADS environment variable for each process "
|
| 125 |
+
"to be {} in default, to avoid your system being overloaded, "
|
| 126 |
+
"please further tune the variable for optimal performance in "
|
| 127 |
+
"your application as needed. \n"
|
| 128 |
+
"*****************************************".format(
|
| 129 |
+
current_env["OMP_NUM_THREADS"]
|
| 130 |
+
)
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
for local_rank in range(0, args.nproc_per_node):
|
| 134 |
+
# each process's rank
|
| 135 |
+
dist_rank = args.nproc_per_node * args.node_rank + local_rank
|
| 136 |
+
current_env["RANK"] = str(dist_rank)
|
| 137 |
+
current_env["LOCAL_RANK"] = str(local_rank)
|
| 138 |
+
|
| 139 |
+
# spawn the processes
|
| 140 |
+
if args.command:
|
| 141 |
+
cmd = [args.training_script]
|
| 142 |
+
else:
|
| 143 |
+
cmd = [sys.executable, "-u"]
|
| 144 |
+
if args.module:
|
| 145 |
+
cmd.append("-m")
|
| 146 |
+
cmd.append(args.training_script)
|
| 147 |
+
|
| 148 |
+
if not args.use_env:
|
| 149 |
+
cmd.append("--local_rank={}".format(local_rank))
|
| 150 |
+
|
| 151 |
+
cmd.extend(args.training_script_args)
|
| 152 |
+
|
| 153 |
+
process = subprocess.Popen(cmd, env=current_env)
|
| 154 |
+
processes.append(process)
|
| 155 |
+
|
| 156 |
+
for process in processes:
|
| 157 |
+
process.wait()
|
| 158 |
+
if process.returncode != 0:
|
| 159 |
+
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
if __name__ == "__main__":
|
| 163 |
+
main()
|
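
Every process spawned by this launcher receives MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK, and LOCAL_RANK through its environment. A minimal sketch of the worker-side handshake (matching the env:// initialisation used in train.py above; this snippet is illustrative and not part of the commit):

import os
import torch

# Read back the variables exported by the launcher and join the process group.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
torch.distributed.init_process_group(
    backend="nccl",
    init_method="env://",
    world_size=int(os.environ["WORLD_SIZE"]),
    rank=int(os.environ["RANK"]),
)
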
vec2wav2/layers/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
| 1 |
+
from .causal_conv import * # NOQA
|
| 2 |
+
from .pqmf import * # NOQA
|
| 3 |
+
from .residual_block import * # NOQA
|
| 4 |
+
from .residual_stack import * # NOQA
|
| 5 |
+
from .tade_res_block import * # NOQA
|
| 6 |
+
from .upsample import * # NOQA
|
vec2wav2/layers/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (344 Bytes).
vec2wav2/layers/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (353 Bytes).
vec2wav2/layers/__pycache__/activations.cpython-310.pyc
ADDED
Binary file (6.64 kB).
vec2wav2/layers/__pycache__/causal_conv.cpython-310.pyc
ADDED
Binary file (2.23 kB).
vec2wav2/layers/__pycache__/causal_conv.cpython-39.pyc
ADDED
Binary file (2.24 kB).
vec2wav2/layers/__pycache__/pqmf.cpython-310.pyc
ADDED
Binary file (4.14 kB).
vec2wav2/layers/__pycache__/pqmf.cpython-39.pyc
ADDED
Binary file (4.14 kB).
vec2wav2/layers/__pycache__/residual_block.cpython-310.pyc
ADDED
Binary file (6.21 kB).
vec2wav2/layers/__pycache__/residual_block.cpython-39.pyc
ADDED
Binary file (6.18 kB).
vec2wav2/layers/__pycache__/residual_stack.cpython-310.pyc
ADDED
Binary file (2.51 kB).
vec2wav2/layers/__pycache__/residual_stack.cpython-39.pyc
ADDED
Binary file (2.51 kB).
vec2wav2/layers/__pycache__/tade_res_block.cpython-310.pyc
ADDED
Binary file (3.59 kB).
vec2wav2/layers/__pycache__/tade_res_block.cpython-39.pyc
ADDED
Binary file (3.56 kB).
vec2wav2/layers/__pycache__/upsample.cpython-310.pyc
ADDED
Binary file (6.01 kB).
vec2wav2/layers/__pycache__/upsample.cpython-39.pyc
ADDED
Binary file (6 kB).
vec2wav2/layers/activations.py
ADDED
|
@@ -0,0 +1,197 @@
|
| 1 |
+
# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
|
| 2 |
+
# LICENSE is in incl_licenses directory.
|
| 3 |
+
|
| 4 |
+
# Modified by Yiwei Guo, 2024
|
| 5 |
+
# including conditioned snakebeta activation
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from torch import nn, sin, pow
|
| 9 |
+
from torch.nn import Parameter
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Snake(nn.Module):
|
| 13 |
+
'''
|
| 14 |
+
Implementation of a sine-based periodic activation function
|
| 15 |
+
Shape:
|
| 16 |
+
- Input: (B, C, T)
|
| 17 |
+
- Output: (B, C, T), same shape as the input
|
| 18 |
+
Parameters:
|
| 19 |
+
- alpha - trainable parameter
|
| 20 |
+
References:
|
| 21 |
+
- This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
|
| 22 |
+
https://arxiv.org/abs/2006.08195
|
| 23 |
+
Examples:
|
| 24 |
+
>>> a1 = snake(256)
|
| 25 |
+
>>> x = torch.randn(256)
|
| 26 |
+
>>> x = a1(x)
|
| 27 |
+
'''
|
| 28 |
+
def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
|
| 29 |
+
'''
|
| 30 |
+
Initialization.
|
| 31 |
+
INPUT:
|
| 32 |
+
- in_features: shape of the input
|
| 33 |
+
- alpha: trainable parameter
|
| 34 |
+
alpha is initialized to 1 by default, higher values = higher-frequency.
|
| 35 |
+
alpha will be trained along with the rest of your model.
|
| 36 |
+
'''
|
| 37 |
+
super(Snake, self).__init__()
|
| 38 |
+
self.in_features = in_features
|
| 39 |
+
|
| 40 |
+
# initialize alpha
|
| 41 |
+
self.alpha_logscale = alpha_logscale
|
| 42 |
+
if self.alpha_logscale: # log scale alphas initialized to zeros
|
| 43 |
+
self.alpha = Parameter(torch.zeros(in_features) * alpha)
|
| 44 |
+
else: # linear scale alphas initialized to ones
|
| 45 |
+
self.alpha = Parameter(torch.ones(in_features) * alpha)
|
| 46 |
+
|
| 47 |
+
self.alpha.requires_grad = alpha_trainable
|
| 48 |
+
|
| 49 |
+
self.no_div_by_zero = 0.000000001
|
| 50 |
+
|
| 51 |
+
def forward(self, x):
|
| 52 |
+
'''
|
| 53 |
+
Forward pass of the function.
|
| 54 |
+
Applies the function to the input elementwise.
|
| 55 |
+
Snake := x + 1/a * sin^2 (xa)
|
| 56 |
+
'''
|
| 57 |
+
alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
|
| 58 |
+
if self.alpha_logscale:
|
| 59 |
+
alpha = torch.exp(alpha)
|
| 60 |
+
x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
|
| 61 |
+
|
| 62 |
+
return x
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class SnakeBeta(nn.Module):
|
| 66 |
+
'''
|
| 67 |
+
A modified Snake function which uses separate parameters for the magnitude of the periodic components
|
| 68 |
+
Shape:
|
| 69 |
+
- Input: (B, C, T)
|
| 70 |
+
- Output: (B, C, T), same shape as the input
|
| 71 |
+
Parameters:
|
| 72 |
+
- alpha - trainable parameter that controls frequency
|
| 73 |
+
- beta - trainable parameter that controls magnitude
|
| 74 |
+
References:
|
| 75 |
+
- This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
|
| 76 |
+
https://arxiv.org/abs/2006.08195
|
| 77 |
+
Examples:
|
| 78 |
+
>>> a1 = snakebeta(256)
|
| 79 |
+
>>> x = torch.randn(256)
|
| 80 |
+
>>> x = a1(x)
|
| 81 |
+
'''
|
| 82 |
+
def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
|
| 83 |
+
'''
|
| 84 |
+
Initialization.
|
| 85 |
+
INPUT:
|
| 86 |
+
- in_features: shape of the input
|
| 87 |
+
- alpha - trainable parameter that controls frequency
|
| 88 |
+
- beta - trainable parameter that controls magnitude
|
| 89 |
+
alpha is initialized to 1 by default, higher values = higher-frequency.
|
| 90 |
+
beta is initialized to 1 by default, higher values = higher-magnitude.
|
| 91 |
+
alpha will be trained along with the rest of your model.
|
| 92 |
+
'''
|
| 93 |
+
super(SnakeBeta, self).__init__()
|
| 94 |
+
self.in_features = in_features
|
| 95 |
+
|
| 96 |
+
# initialize alpha
|
| 97 |
+
self.alpha_logscale = alpha_logscale
|
| 98 |
+
if self.alpha_logscale: # log scale alphas initialized to zeros
|
| 99 |
+
self.alpha = Parameter(torch.zeros(in_features) * alpha)
|
| 100 |
+
self.beta = Parameter(torch.zeros(in_features) * alpha)
|
| 101 |
+
else: # linear scale alphas initialized to ones
|
| 102 |
+
self.alpha = Parameter(torch.ones(in_features) * alpha)
|
| 103 |
+
self.beta = Parameter(torch.ones(in_features) * alpha)
|
| 104 |
+
|
| 105 |
+
self.alpha.requires_grad = alpha_trainable
|
| 106 |
+
self.beta.requires_grad = alpha_trainable
|
| 107 |
+
|
| 108 |
+
self.no_div_by_zero = 0.000000001
|
| 109 |
+
|
| 110 |
+
def forward(self, x, cond=None):
|
| 111 |
+
'''
|
| 112 |
+
Forward pass of the function.
|
| 113 |
+
Applies the function to the input elementwise.
|
| 114 |
+
SnakeBeta := x + 1/b * sin^2 (xa)
|
| 115 |
+
'''
|
| 116 |
+
alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
|
| 117 |
+
beta = self.beta.unsqueeze(0).unsqueeze(-1)
|
| 118 |
+
if self.alpha_logscale:
|
| 119 |
+
alpha = torch.exp(alpha)
|
| 120 |
+
beta = torch.exp(beta)
|
| 121 |
+
x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
|
| 122 |
+
|
| 123 |
+
return x
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class SnakeBetaWithCondition(nn.Module):
|
| 127 |
+
'''
|
| 128 |
+
A modified Snake function which uses separate parameters for the magnitude of the periodic components
|
| 129 |
+
Shape:
|
| 130 |
+
- Input: (B, C, T)
|
| 131 |
+
- Condition: (B, D), where D-dimension will be mapped to C dimensions
|
| 132 |
+
- Output: (B, C, T), same shape as the input
|
| 133 |
+
Parameters:
|
| 134 |
+
- alpha - trainable parameter that controls frequency
|
| 135 |
+
- beta - trainable parameter that controls magnitude
|
| 136 |
+
- condition_alpha_prenet - trainable parameter that controls alpha and beta using condition
|
| 137 |
+
References:
|
| 138 |
+
- This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
|
| 139 |
+
https://arxiv.org/abs/2006.08195
|
| 140 |
+
Examples:
|
| 141 |
+
>>> a1 = snakebeta(256, 128)
|
| 142 |
+
>>> x = torch.randn(256)
|
| 143 |
+
>>> cond = torch.randn(128)
|
| 144 |
+
>>> x = a1(x, cond)
|
| 145 |
+
'''
|
| 146 |
+
def __init__(self, in_features, condition_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
|
| 147 |
+
'''
|
| 148 |
+
Initialization.
|
| 149 |
+
INPUT:
|
| 150 |
+
- in_features: dimension of the input
|
| 151 |
+
- condition_features: dimension of the condition vectors
|
| 152 |
+
- alpha - trainable parameter that controls frequency
|
| 153 |
+
- beta - trainable parameter that controls magnitude
|
| 154 |
+
alpha is initialized to 1 by default, higher values = higher-frequency.
|
| 155 |
+
beta is initialized to 1 by default, higher values = higher-magnitude.
|
| 156 |
+
alpha, beta will be trained along with the rest of your model.
|
| 157 |
+
'''
|
| 158 |
+
super(SnakeBetaWithCondition, self).__init__()
|
| 159 |
+
self.in_features = in_features
|
| 160 |
+
|
| 161 |
+
self.condition_alpha_prenet = torch.nn.Linear(condition_features, in_features)
|
| 162 |
+
# self.condition_beta_prenet = torch.nn.Linear(condition_features, in_features)
|
| 163 |
+
|
| 164 |
+
# initialize alpha
|
| 165 |
+
self.alpha_logscale = alpha_logscale
|
| 166 |
+
if self.alpha_logscale: # log scale alphas initialized to zeros
|
| 167 |
+
self.alpha = Parameter(torch.zeros(in_features) * alpha)
|
| 168 |
+
self.beta = Parameter(torch.zeros(in_features) * alpha)
|
| 169 |
+
else: # linear scale alphas initialized to ones
|
| 170 |
+
self.alpha = Parameter(torch.ones(in_features) * alpha)
|
| 171 |
+
self.beta = Parameter(torch.ones(in_features) * alpha)
|
| 172 |
+
|
| 173 |
+
self.alpha.requires_grad = alpha_trainable
|
| 174 |
+
self.beta.requires_grad = alpha_trainable
|
| 175 |
+
|
| 176 |
+
self.no_div_by_zero = 0.000000001
|
| 177 |
+
|
| 178 |
+
def forward(self, x, condition):
|
| 179 |
+
'''
|
| 180 |
+
condition: [B, D]
|
| 181 |
+
Forward pass of the function.
|
| 182 |
+
Applies the function to the input elementwise.
|
| 183 |
+
SnakeBeta := x + 1/b * sin^2 (xa)
|
| 184 |
+
'''
|
| 185 |
+
alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
|
| 186 |
+
beta = self.beta.unsqueeze(0).unsqueeze(-1)
|
| 187 |
+
if self.alpha_logscale:
|
| 188 |
+
alpha = torch.exp(alpha)
|
| 189 |
+
beta = torch.exp(beta)
|
| 190 |
+
|
| 191 |
+
condition = torch.tanh(self.condition_alpha_prenet(condition).unsqueeze(-1)) # Same prenet for both alpha and beta, to save parameters
|
| 192 |
+
alpha = alpha + condition
|
| 193 |
+
beta = beta + 0.5 * condition # multiply 0.5 for avoiding beta being too small
|
| 194 |
+
|
| 195 |
+
x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
|
| 196 |
+
|
| 197 |
+
return x
|
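
A small usage sketch of the conditioned activation defined above (tensor sizes are arbitrary and chosen only to match the documented (B, C, T) input and (B, D) condition shapes; not part of the committed file):

import torch
from vec2wav2.layers.activations import SnakeBetaWithCondition

act = SnakeBetaWithCondition(in_features=256, condition_features=128, alpha_logscale=True)
x = torch.randn(4, 256, 100)   # (B, C, T) feature map
cond = torch.randn(4, 128)     # (B, D) condition, e.g. a speaker prompt embedding
y = act(x, cond)               # output keeps the (B, C, T) shape
assert y.shape == x.shape
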
vec2wav2/layers/causal_conv.py
ADDED
|
@@ -0,0 +1,66 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
# Copyright 2020 Tomoki Hayashi
|
| 4 |
+
# MIT License (https://opensource.org/licenses/MIT)
|
| 5 |
+
|
| 6 |
+
"""Causal convolusion layer modules."""
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class CausalConv1d(torch.nn.Module):
|
| 13 |
+
"""CausalConv1d module with customized initialization."""
|
| 14 |
+
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
in_channels,
|
| 18 |
+
out_channels,
|
| 19 |
+
kernel_size,
|
| 20 |
+
dilation=1,
|
| 21 |
+
bias=True,
|
| 22 |
+
pad="ConstantPad1d",
|
| 23 |
+
pad_params={"value": 0.0},
|
| 24 |
+
):
|
| 25 |
+
"""Initialize CausalConv1d module."""
|
| 26 |
+
super(CausalConv1d, self).__init__()
|
| 27 |
+
self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params)
|
| 28 |
+
self.conv = torch.nn.Conv1d(
|
| 29 |
+
in_channels, out_channels, kernel_size, dilation=dilation, bias=bias
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
def forward(self, x):
|
| 33 |
+
"""Calculate forward propagation.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
x (Tensor): Input tensor (B, in_channels, T).
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
Tensor: Output tensor (B, out_channels, T).
|
| 40 |
+
|
| 41 |
+
"""
|
| 42 |
+
return self.conv(self.pad(x))[:, :, : x.size(2)]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class CausalConvTranspose1d(torch.nn.Module):
|
| 46 |
+
"""CausalConvTranspose1d module with customized initialization."""
|
| 47 |
+
|
| 48 |
+
def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True):
|
| 49 |
+
"""Initialize CausalConvTranspose1d module."""
|
| 50 |
+
super(CausalConvTranspose1d, self).__init__()
|
| 51 |
+
self.deconv = torch.nn.ConvTranspose1d(
|
| 52 |
+
in_channels, out_channels, kernel_size, stride, bias=bias
|
| 53 |
+
)
|
| 54 |
+
self.stride = stride
|
| 55 |
+
|
| 56 |
+
def forward(self, x):
|
| 57 |
+
"""Calculate forward propagation.
|
| 58 |
+
|
| 59 |
+
Args:
|
| 60 |
+
x (Tensor): Input tensor (B, in_channels, T_in).
|
| 61 |
+
|
| 62 |
+
Returns:
|
| 63 |
+
Tensor: Output tensor (B, out_channels, T_out).
|
| 64 |
+
|
| 65 |
+
"""
|
| 66 |
+
return self.deconv(x)[:, :, : -self.stride]
|
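
A minimal usage sketch for CausalConv1d added above; left-only padding keeps the output length equal to the input length and prevents the layer from seeing future samples:

```python
import torch
from vec2wav2.layers.causal_conv import CausalConv1d

conv = CausalConv1d(in_channels=64, out_channels=64, kernel_size=3, dilation=2)
x = torch.randn(1, 64, 100)    # (B, in_channels, T)
y = conv(x)                    # (1, 64, 100): output at time t depends only on inputs <= t
```
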
vec2wav2/layers/pqmf.py
ADDED
@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+
+"""Pseudo QMF modules."""
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from scipy.signal import kaiser
+
+
+def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
+    """Design prototype filter for PQMF.
+
+    This method is based on `A Kaiser window approach for the design of prototype
+    filters of cosine modulated filterbanks`_.
+
+    Args:
+        taps (int): The number of filter taps.
+        cutoff_ratio (float): Cut-off frequency ratio.
+        beta (float): Beta coefficient for kaiser window.
+
+    Returns:
+        ndarray: Impulse response of prototype filter (taps + 1,).
+
+    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+        https://ieeexplore.ieee.org/abstract/document/681427
+
+    """
+    # check the arguments are valid
+    assert taps % 2 == 0, "The number of taps must be an even number."
+    assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
+
+    # make initial filter
+    omega_c = np.pi * cutoff_ratio
+    with np.errstate(invalid="ignore"):
+        h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / (
+            np.pi * (np.arange(taps + 1) - 0.5 * taps)
+        )
+    h_i[taps // 2] = np.cos(0) * cutoff_ratio  # fix nan due to indeterminate form
+
+    # apply kaiser window
+    w = kaiser(taps + 1, beta)
+    h = h_i * w
+
+    return h
+
+
+class PQMF(torch.nn.Module):
+    """PQMF module.
+
+    This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
+
+    .. _`Near-perfect-reconstruction pseudo-QMF banks`:
+        https://ieeexplore.ieee.org/document/258122
+
+    """
+
+    def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0):
+        """Initialize PQMF module.
+
+        The cutoff_ratio and beta parameters are optimized for #subbands = 4.
+        See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
+
+        Args:
+            subbands (int): The number of subbands.
+            taps (int): The number of filter taps.
+            cutoff_ratio (float): Cut-off frequency ratio.
+            beta (float): Beta coefficient for kaiser window.
+
+        """
+        super(PQMF, self).__init__()
+
+        # build analysis & synthesis filter coefficients
+        h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
+        h_analysis = np.zeros((subbands, len(h_proto)))
+        h_synthesis = np.zeros((subbands, len(h_proto)))
+        for k in range(subbands):
+            h_analysis[k] = (
+                2
+                * h_proto
+                * np.cos(
+                    (2 * k + 1)
+                    * (np.pi / (2 * subbands))
+                    * (np.arange(taps + 1) - (taps / 2))
+                    + (-1) ** k * np.pi / 4
+                )
+            )
+            h_synthesis[k] = (
+                2
+                * h_proto
+                * np.cos(
+                    (2 * k + 1)
+                    * (np.pi / (2 * subbands))
+                    * (np.arange(taps + 1) - (taps / 2))
+                    - (-1) ** k * np.pi / 4
+                )
+            )
+
+        # convert to tensor
+        analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1)
+        synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0)
+
+        # register coefficients as buffer
+        self.register_buffer("analysis_filter", analysis_filter)
+        self.register_buffer("synthesis_filter", synthesis_filter)
+
+        # filter for downsampling & upsampling
+        updown_filter = torch.zeros((subbands, subbands, subbands)).float()
+        for k in range(subbands):
+            updown_filter[k, k, 0] = 1.0
+        self.register_buffer("updown_filter", updown_filter)
+        self.subbands = subbands
+
+        # keep padding info
+        self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
+
+    def analysis(self, x):
+        """Analysis with PQMF.
+
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+
+        Returns:
+            Tensor: Output tensor (B, subbands, T // subbands).
+
+        """
+        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
+        return F.conv1d(x, self.updown_filter, stride=self.subbands)
+
+    def synthesis(self, x):
+        """Synthesis with PQMF.
+
+        Args:
+            x (Tensor): Input tensor (B, subbands, T // subbands).
+
+        Returns:
+            Tensor: Output tensor (B, 1, T).
+
+        """
+        # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands.
+        #   Not sure this is the correct way, it is better to check again.
+        # TODO(kan-bayashi): Understand the reconstruction procedure
+        x = F.conv_transpose1d(
+            x, self.updown_filter * self.subbands, stride=self.subbands
+        )
+        return F.conv1d(self.pad_fn(x), self.synthesis_filter)
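
A minimal round-trip sketch for the PQMF module above, assuming the default 4-subband configuration:

```python
import torch
from vec2wav2.layers.pqmf import PQMF

pqmf = PQMF(subbands=4)              # default taps/cutoff_ratio/beta are tuned for 4 subbands
wav = torch.randn(1, 1, 16000)       # (B, 1, T) waveform
bands = pqmf.analysis(wav)           # (1, 4, 4000) subband signals
recon = pqmf.synthesis(bands)        # (1, 1, 16000) near-perfect reconstruction
```
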
vec2wav2/layers/residual_block.py
ADDED
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+
+"""Residual block modules.
+
+References:
+    - https://github.com/r9y9/wavenet_vocoder
+    - https://github.com/jik876/hifi-gan
+
+"""
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+
+class Conv1d(torch.nn.Conv1d):
+    """Conv1d module with customized initialization."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize Conv1d module."""
+        super(Conv1d, self).__init__(*args, **kwargs)
+
+    def reset_parameters(self):
+        """Reset parameters."""
+        torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            torch.nn.init.constant_(self.bias, 0.0)
+
+
+class Conv1d1x1(Conv1d):
+    """1x1 Conv1d with customized initialization."""
+
+    def __init__(self, in_channels, out_channels, bias):
+        """Initialize 1x1 Conv1d module."""
+        super(Conv1d1x1, self).__init__(
+            in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias
+        )
+
+
+class WaveNetResidualBlock(torch.nn.Module):
+    """Residual block module in WaveNet."""
+
+    def __init__(
+        self,
+        kernel_size=3,
+        residual_channels=64,
+        gate_channels=128,
+        skip_channels=64,
+        aux_channels=80,
+        dropout=0.0,
+        dilation=1,
+        bias=True,
+        use_causal_conv=False,
+    ):
+        """Initialize WaveNetResidualBlock module.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            residual_channels (int): Number of channels for residual connection.
+            gate_channels (int): Number of channels for the gated activation.
+            skip_channels (int): Number of channels for skip connection.
+            aux_channels (int): Local conditioning channels, i.e., auxiliary input dimension.
+            dropout (float): Dropout probability.
+            dilation (int): Dilation factor.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            use_causal_conv (bool): Whether to use causal or non-causal convolution.
+
+        """
+        super().__init__()
+        self.dropout = dropout
+        # no future time stamps available
+        if use_causal_conv:
+            padding = (kernel_size - 1) * dilation
+        else:
+            assert (kernel_size - 1) % 2 == 0, "Kernel size must be an odd number."
+            padding = (kernel_size - 1) // 2 * dilation
+        self.use_causal_conv = use_causal_conv
+
+        # dilation conv
+        self.conv = Conv1d(
+            residual_channels,
+            gate_channels,
+            kernel_size,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+        # local conditioning
+        if aux_channels > 0:
+            self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False)
+        else:
+            self.conv1x1_aux = None
+
+        # conv output is split into two groups
+        gate_out_channels = gate_channels // 2
+        self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias)
+        self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias)
+
+    def forward(self, x, c):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, residual_channels, T).
+            c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T).
+
+        Returns:
+            Tensor: Output tensor for residual connection (B, residual_channels, T).
+            Tensor: Output tensor for skip connection (B, skip_channels, T).
+
+        """
+        residual = x
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = self.conv(x)
+
+        # remove future time steps if causal convolution is used
+        x = x[:, :, : residual.size(-1)] if self.use_causal_conv else x
+
+        # split into two parts for gated activation
+        splitdim = 1
+        xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)
+
+        # local conditioning
+        if c is not None:
+            assert self.conv1x1_aux is not None
+            c = self.conv1x1_aux(c)
+            ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
+            xa, xb = xa + ca, xb + cb
+
+        x = torch.tanh(xa) * torch.sigmoid(xb)
+
+        # for skip connection
+        s = self.conv1x1_skip(x)
+
+        # for residual connection
+        x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5)
+
+        return x, s
+
+
+class HiFiGANResidualBlock(torch.nn.Module):
+    """Residual block module in HiFiGAN."""
+
+    def __init__(
+        self,
+        kernel_size=3,
+        channels=512,
+        dilations=(1, 3, 5),
+        bias=True,
+        use_additional_convs=True,
+        nonlinear_activation="LeakyReLU",
+        nonlinear_activation_params={"negative_slope": 0.1},
+    ):
+        """Initialize HiFiGANResidualBlock module.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            dilations (List[int]): List of dilation factors.
+            use_additional_convs (bool): Whether to use additional convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+
+        """
+        super().__init__()
+        self.use_additional_convs = use_additional_convs
+        self.convs1 = torch.nn.ModuleList()
+        if use_additional_convs:
+            self.convs2 = torch.nn.ModuleList()
+        assert kernel_size % 2 == 1, "Kernel size must be an odd number."
+        for dilation in dilations:
+            self.convs1 += [
+                torch.nn.Sequential(
+                    getattr(torch.nn, nonlinear_activation)(
+                        **nonlinear_activation_params
+                    ),
+                    torch.nn.Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation,
+                        bias=bias,
+                        padding=(kernel_size - 1) // 2 * dilation,
+                    ),
+                )
+            ]
+            if use_additional_convs:
+                self.convs2 += [
+                    torch.nn.Sequential(
+                        getattr(torch.nn, nonlinear_activation)(
+                            **nonlinear_activation_params
+                        ),
+                        torch.nn.Conv1d(
+                            channels,
+                            channels,
+                            kernel_size,
+                            1,
+                            dilation=1,
+                            bias=bias,
+                            padding=(kernel_size - 1) // 2,
+                        ),
+                    )
+                ]
+
+    def forward(self, x):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+
+        Returns:
+            Tensor: Output tensor (B, channels, T).
+
+        """
+        for idx in range(len(self.convs1)):
+            xt = self.convs1[idx](x)
+            if self.use_additional_convs:
+                xt = self.convs2[idx](xt)
+            x = xt + x
+        return x
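
A minimal usage sketch for HiFiGANResidualBlock above; the block keeps the time resolution of its input unchanged:

```python
import torch
from vec2wav2.layers.residual_block import HiFiGANResidualBlock

block = HiFiGANResidualBlock(kernel_size=3, channels=512, dilations=(1, 3, 5))
x = torch.randn(1, 512, 100)   # (B, channels, T)
y = block(x)                   # (1, 512, 100)
```
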
vec2wav2/layers/residual_stack.py
ADDED
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+
+"""Residual stack module in MelGAN."""
+
+import torch
+
+from vec2wav2.layers import CausalConv1d
+
+
+class ResidualStack(torch.nn.Module):
+    """Residual stack module introduced in MelGAN."""
+
+    def __init__(
+        self,
+        kernel_size=3,
+        channels=32,
+        dilation=1,
+        bias=True,
+        nonlinear_activation="LeakyReLU",
+        nonlinear_activation_params={"negative_slope": 0.2},
+        pad="ReflectionPad1d",
+        pad_params={},
+        use_causal_conv=False,
+    ):
+        """Initialize ResidualStack module.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels of convolution layers.
+            dilation (int): Dilation factor.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
+            use_causal_conv (bool): Whether to use causal convolution.
+
+        """
+        super(ResidualStack, self).__init__()
+
+        # define residual stack part
+        if not use_causal_conv:
+            assert (kernel_size - 1) % 2 == 0, "Kernel size must be an odd number."
+            self.stack = torch.nn.Sequential(
+                getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params),
+                torch.nn.Conv1d(
+                    channels, channels, kernel_size, dilation=dilation, bias=bias
+                ),
+                getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                torch.nn.Conv1d(channels, channels, 1, bias=bias),
+            )
+        else:
+            self.stack = torch.nn.Sequential(
+                getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                CausalConv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    dilation=dilation,
+                    bias=bias,
+                    pad=pad,
+                    pad_params=pad_params,
+                ),
+                getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                torch.nn.Conv1d(channels, channels, 1, bias=bias),
+            )
+
+        # define extra layer for skip connection
+        self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias)
+
+    def forward(self, c):
+        """Calculate forward propagation.
+
+        Args:
+            c (Tensor): Input tensor (B, channels, T).
+
+        Returns:
+            Tensor: Output tensor (B, channels, T).
+
+        """
+        return self.stack(c) + self.skip_layer(c)
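
A minimal usage sketch for ResidualStack above:

```python
import torch
from vec2wav2.layers.residual_stack import ResidualStack

stack = ResidualStack(kernel_size=3, channels=32, dilation=2)
c = torch.randn(1, 32, 200)    # (B, channels, T)
out = stack(c)                 # (1, 32, 200)
```
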
vec2wav2/layers/tade_res_block.py
ADDED
@@ -0,0 +1,160 @@
+# Copyright 2021 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+
+"""StyleMelGAN's TADEResBlock Modules."""
+
+from functools import partial
+
+import torch
+
+
+class TADELayer(torch.nn.Module):
+    """TADE Layer module."""
+
+    def __init__(
+        self,
+        in_channels=64,
+        aux_channels=80,
+        kernel_size=9,
+        bias=True,
+        upsample_factor=2,
+        upsample_mode="nearest",
+    ):
+        """Initialize TADE layer."""
+        super().__init__()
+        self.norm = torch.nn.InstanceNorm1d(in_channels)
+        self.aux_conv = torch.nn.Sequential(
+            torch.nn.Conv1d(
+                aux_channels,
+                in_channels,
+                kernel_size,
+                1,
+                bias=bias,
+                padding=(kernel_size - 1) // 2,
+            ),
+            # NOTE(kan-bayashi): Use non-linear activation?
+        )
+        self.gated_conv = torch.nn.Sequential(
+            torch.nn.Conv1d(
+                in_channels,
+                in_channels * 2,
+                kernel_size,
+                1,
+                bias=bias,
+                padding=(kernel_size - 1) // 2,
+            ),
+            # NOTE(kan-bayashi): Use non-linear activation?
+        )
+        self.upsample = torch.nn.Upsample(
+            scale_factor=upsample_factor, mode=upsample_mode
+        )
+
+    def forward(self, x, c):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T').
+
+        Returns:
+            Tensor: Output tensor (B, in_channels, T * in_upsample_factor).
+            Tensor: Upsampled aux tensor (B, in_channels, T * aux_upsample_factor).
+
+        """
+        x = self.norm(x)
+        c = self.upsample(c)
+        c = self.aux_conv(c)
+        cg = self.gated_conv(c)
+        cg1, cg2 = cg.split(cg.size(1) // 2, dim=1)
+        # NOTE(kan-bayashi): Use upsample for noise input here?
+        y = cg1 * self.upsample(x) + cg2
+        # NOTE(kan-bayashi): Return upsampled aux here?
+        return y, c
+
+
+class TADEResBlock(torch.nn.Module):
+    """TADEResBlock module."""
+
+    def __init__(
+        self,
+        in_channels=64,
+        aux_channels=80,
+        kernel_size=9,
+        dilation=2,
+        bias=True,
+        upsample_factor=2,
+        upsample_mode="nearest",
+        gated_function="softmax",
+    ):
+        """Initialize TADEResBlock module."""
+        super().__init__()
+        self.tade1 = TADELayer(
+            in_channels=in_channels,
+            aux_channels=aux_channels,
+            kernel_size=kernel_size,
+            bias=bias,
+            # NOTE(kan-bayashi): Use upsample in the first TADE layer?
+            upsample_factor=1,
+            upsample_mode=upsample_mode,
+        )
+        self.gated_conv1 = torch.nn.Conv1d(
+            in_channels,
+            in_channels * 2,
+            kernel_size,
+            1,
+            bias=bias,
+            padding=(kernel_size - 1) // 2,
+        )
+        self.tade2 = TADELayer(
+            in_channels=in_channels,
+            aux_channels=in_channels,
+            kernel_size=kernel_size,
+            bias=bias,
+            upsample_factor=upsample_factor,
+            upsample_mode=upsample_mode,
+        )
+        self.gated_conv2 = torch.nn.Conv1d(
+            in_channels,
+            in_channels * 2,
+            kernel_size,
+            1,
+            bias=bias,
+            dilation=dilation,
+            padding=(kernel_size - 1) // 2 * dilation,
+        )
+        self.upsample = torch.nn.Upsample(
+            scale_factor=upsample_factor, mode=upsample_mode
+        )
+        if gated_function == "softmax":
+            self.gated_function = partial(torch.softmax, dim=1)
+        elif gated_function == "sigmoid":
+            self.gated_function = torch.sigmoid
+        else:
+            raise ValueError(f"{gated_function} is not supported.")
+
+    def forward(self, x, c):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T').
+
+        Returns:
+            Tensor: Output tensor (B, in_channels, T * in_upsample_factor).
+            Tensor: Upsampled auxiliary tensor (B, in_channels, T * in_upsample_factor).
+
+        """
+        residual = x
+
+        x, c = self.tade1(x, c)
+        x = self.gated_conv1(x)
+        xa, xb = x.split(x.size(1) // 2, dim=1)
+        x = self.gated_function(xa) * torch.tanh(xb)
+
+        x, c = self.tade2(x, c)
+        x = self.gated_conv2(x)
+        xa, xb = x.split(x.size(1) // 2, dim=1)
+        x = self.gated_function(xa) * torch.tanh(xb)
+
+        # NOTE(kan-bayashi): Return upsampled aux here?
+        return self.upsample(residual) + x, c
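
A minimal usage sketch for TADEResBlock above, assuming one 2x upsampling stage conditioned on 80-dimensional auxiliary features:

```python
import torch
from vec2wav2.layers.tade_res_block import TADEResBlock

block = TADEResBlock(in_channels=64, aux_channels=80, upsample_factor=2)
x = torch.randn(1, 64, 50)     # (B, in_channels, T) hidden features
c = torch.randn(1, 80, 50)     # (B, aux_channels, T') conditioning features
y, c_up = block(x, c)          # y: (1, 64, 100); the conditioning is also returned upsampled
```
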
vec2wav2/layers/upsample.py
ADDED
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+
+"""Upsampling module.
+
+This code is modified from https://github.com/r9y9/wavenet_vocoder.
+
+"""
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from vec2wav2.layers import Conv1d
+
+
+class Stretch2d(torch.nn.Module):
+    """Stretch2d module."""
+
+    def __init__(self, x_scale, y_scale, mode="nearest"):
+        """Initialize Stretch2d module.
+
+        Args:
+            x_scale (int): X scaling factor (Time axis in spectrogram).
+            y_scale (int): Y scaling factor (Frequency axis in spectrogram).
+            mode (str): Interpolation mode.
+
+        """
+        super(Stretch2d, self).__init__()
+        self.x_scale = x_scale
+        self.y_scale = y_scale
+        self.mode = mode
+
+    def forward(self, x):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, C, F, T).
+
+        Returns:
+            Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale).
+
+        """
+        return F.interpolate(
+            x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode
+        )
+
+
+class Conv2d(torch.nn.Conv2d):
+    """Conv2d module with customized initialization."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize Conv2d module."""
+        super(Conv2d, self).__init__(*args, **kwargs)
+
+    def reset_parameters(self):
+        """Reset parameters."""
+        self.weight.data.fill_(1.0 / np.prod(self.kernel_size))
+        if self.bias is not None:
+            torch.nn.init.constant_(self.bias, 0.0)
+
+
+class UpsampleNetwork(torch.nn.Module):
+    """Upsampling network module."""
+
+    def __init__(
+        self,
+        upsample_scales,
+        nonlinear_activation=None,
+        nonlinear_activation_params={},
+        interpolate_mode="nearest",
+        freq_axis_kernel_size=1,
+        use_causal_conv=False,
+    ):
+        """Initialize upsampling network module.
+
+        Args:
+            upsample_scales (list): List of upsampling scales.
+            nonlinear_activation (str): Activation function name.
+            nonlinear_activation_params (dict): Arguments for specified activation function.
+            interpolate_mode (str): Interpolation mode.
+            freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
+
+        """
+        super(UpsampleNetwork, self).__init__()
+        self.use_causal_conv = use_causal_conv
+        self.up_layers = torch.nn.ModuleList()
+        for scale in upsample_scales:
+            # interpolation layer
+            stretch = Stretch2d(scale, 1, interpolate_mode)
+            self.up_layers += [stretch]
+
+            # conv layer
+            assert (
+                freq_axis_kernel_size - 1
+            ) % 2 == 0, "Freq axis kernel size must be an odd number."
+            freq_axis_padding = (freq_axis_kernel_size - 1) // 2
+            kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
+            if use_causal_conv:
+                padding = (freq_axis_padding, scale * 2)
+            else:
+                padding = (freq_axis_padding, scale)
+            conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
+            self.up_layers += [conv]
+
+            # nonlinear
+            if nonlinear_activation is not None:
+                nonlinear = getattr(torch.nn, nonlinear_activation)(
+                    **nonlinear_activation_params
+                )
+                self.up_layers += [nonlinear]
+
+    def forward(self, c):
+        """Calculate forward propagation.
+
+        Args:
+            c : Input tensor (B, C, T).
+
+        Returns:
+            Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales).
+
+        """
+        c = c.unsqueeze(1)  # (B, 1, C, T)
+        for f in self.up_layers:
+            if self.use_causal_conv and isinstance(f, Conv2d):
+                c = f(c)[..., : c.size(-1)]
+            else:
+                c = f(c)
+        return c.squeeze(1)  # (B, C, T')
+
+
+class ConvInUpsampleNetwork(torch.nn.Module):
+    """Convolution + upsampling network module."""
+
+    def __init__(
+        self,
+        upsample_scales,
+        nonlinear_activation=None,
+        nonlinear_activation_params={},
+        interpolate_mode="nearest",
+        freq_axis_kernel_size=1,
+        aux_channels=80,
+        aux_context_window=0,
+        use_causal_conv=False,
+    ):
+        """Initialize convolution + upsampling network module.
+
+        Args:
+            upsample_scales (list): List of upsampling scales.
+            nonlinear_activation (str): Activation function name.
+            nonlinear_activation_params (dict): Arguments for specified activation function.
+            interpolate_mode (str): Interpolation mode.
+            freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
+            aux_channels (int): Number of channels of pre-convolutional layer.
+            aux_context_window (int): Context window size of the pre-convolutional layer.
+            use_causal_conv (bool): Whether to use causal structure.
+
+        """
+        super(ConvInUpsampleNetwork, self).__init__()
+        self.aux_context_window = aux_context_window
+        self.use_causal_conv = use_causal_conv and aux_context_window > 0
+        # To capture wide-context information in conditional features
+        kernel_size = (
+            aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
+        )
+        # NOTE(kan-bayashi): Here do not use padding because the input is already padded
+        self.conv_in = Conv1d(
+            aux_channels, aux_channels, kernel_size=kernel_size, bias=False
+        )
+        self.upsample = UpsampleNetwork(
+            upsample_scales=upsample_scales,
+            nonlinear_activation=nonlinear_activation,
+            nonlinear_activation_params=nonlinear_activation_params,
+            interpolate_mode=interpolate_mode,
+            freq_axis_kernel_size=freq_axis_kernel_size,
+            use_causal_conv=use_causal_conv,
+        )
+
+    def forward(self, c):
+        """Calculate forward propagation.
+
+        Args:
+            c : Input tensor (B, C, T').
+
+        Returns:
+            Tensor: Upsampled tensor (B, C, T),
+                where T = (T' - aux_context_window * 2) * prod(upsample_scales).
+
+        Note:
+            The length of inputs considers the context window size.
+
+        """
+        c_ = self.conv_in(c)
+        c = c_[:, :, : -self.aux_context_window] if self.use_causal_conv else c_
+        return self.upsample(c)
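
A minimal usage sketch for ConvInUpsampleNetwork above; the hop size of 320 samples per frame and its scale decomposition are hypothetical values chosen for illustration:

```python
import torch
from vec2wav2.layers.upsample import ConvInUpsampleNetwork

upsampler = ConvInUpsampleNetwork(
    upsample_scales=[4, 4, 4, 5],   # prod = 320 samples per frame (hypothetical hop size)
    aux_channels=80,
    aux_context_window=2,
)
c = torch.randn(1, 80, 104)          # 100 frames plus 2 context frames on each side
out = upsampler(c)                   # (1, 80, 32000) = (1, 80, 100 * 320)
```
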
vec2wav2/losses/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .adversarial_loss import *  # NOQA
+from .feat_match_loss import *  # NOQA
+from .mel_loss import *  # NOQA
+from .stft_loss import *  # NOQA