Spaces · Running on Zero

Commit 56cfa73 — Mehdi Lakbar committed
Parent(s): 4905304
Initial demo of Lina-speech (pardi-speech)

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- codec/__init__.py +2 -0
- codec/__pycache__/__init__.cpython-312.pyc +0 -0
- codec/__pycache__/train_patchvae.cpython-312.pyc +0 -0
- codec/__pycache__/train_wavvae.cpython-312.pyc +0 -0
- codec/__pycache__/train_zflowae.cpython-312.pyc +0 -0
- codec/datamodules.py +249 -0
- codec/models/__init__.py +2 -0
- codec/models/__pycache__/__init__.cpython-312.pyc +0 -0
- codec/models/components/__init__.py +0 -0
- codec/models/components/__pycache__/__init__.cpython-312.pyc +0 -0
- codec/models/components/__pycache__/convnext.cpython-312.pyc +0 -0
- codec/models/components/convnext.py +221 -0
- codec/models/components/transformer.py +224 -0
- codec/models/pardi_tokenizer.py +10 -0
- codec/models/patchvae/__pycache__/model.cpython-312.pyc +0 -0
- codec/models/patchvae/__pycache__/modules.cpython-312.pyc +0 -0
- codec/models/patchvae/model.py +262 -0
- codec/models/patchvae/modules.py +396 -0
- codec/models/wavvae/__init__.py +0 -0
- codec/models/wavvae/__pycache__/__init__.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/discriminators.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/heads.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/layers.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/loss.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/model.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/modules.cpython-312.pyc +0 -0
- codec/models/wavvae/__pycache__/spectral_ops.cpython-312.pyc +0 -0
- codec/models/wavvae/dataset.py +84 -0
- codec/models/wavvae/discriminators.py +211 -0
- codec/models/wavvae/experiment.py +3 -0
- codec/models/wavvae/heads.py +194 -0
- codec/models/wavvae/helpers.py +71 -0
- codec/models/wavvae/layers.py +282 -0
- codec/models/wavvae/loss.py +142 -0
- codec/models/wavvae/model.py +140 -0
- codec/models/wavvae/modules.py +213 -0
- codec/models/wavvae/spectral_ops.py +192 -0
- codec/scripts/compare_codecs.py +441 -0
- codec/scripts/compare_wavvae.py +264 -0
- codec/scripts/compare_zcodec.py +312 -0
- codec/scripts/compute_stats.py +76 -0
- codec/scripts/compute_wer.py +48 -0
- codec/scripts/compute_wer_from_refs.py +64 -0
- codec/scripts/download_expresso.py +10 -0
- codec/scripts/download_gigaspeech.py +14 -0
- codec/scripts/download_lj.py +9 -0
- codec/scripts/download_ltts.py +16 -0
- codec/scripts/download_mlseng10k.py +13 -0
- codec/scripts/eval_asr.py +100 -0
- codec/scripts/eval_asr_from_filelist.py +60 -0
codec/__init__.py  ADDED
@@ -0,0 +1,2 @@
from .train_patchvae import TrainPatchVAE
from .train_wavvae import TrainWavVAE
codec/__pycache__/__init__.cpython-312.pyc  ADDED  (binary file, 260 Bytes)
codec/__pycache__/train_patchvae.cpython-312.pyc  ADDED  (binary file, 12.2 kB)
codec/__pycache__/train_wavvae.cpython-312.pyc  ADDED  (binary file, 15 kB)
codec/__pycache__/train_zflowae.cpython-312.pyc  ADDED  (binary file, 12.2 kB)
codec/datamodules.py  ADDED
@@ -0,0 +1,249 @@
import itertools
import random
import time
from dataclasses import dataclass
from functools import partial
from pathlib import Path

import numpy as np
import pytorch_lightning as ptl
import torch
import torchaudio
from safetensors.torch import safe_open
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from datasets import load_dataset, load_from_disk


@dataclass
class WavVAEDataConfig:
    filelist_path: str
    sampling_rate: int
    num_samples: int
    batch_size: int
    num_workers: int


class WavVAEDataModule(ptl.LightningDataModule):
    def __init__(self, train_params: WavVAEDataConfig, val_params: WavVAEDataConfig):
        super().__init__()
        self.train_config = train_params
        self.val_config = val_params

    def _get_dataloder(self, cfg: WavVAEDataConfig, train: bool):
        dataset = WavVAEDataset(cfg, train=train)
        dataloader = DataLoader(
            dataset,
            batch_size=cfg.batch_size,
            num_workers=cfg.num_workers,
            shuffle=train,
            pin_memory=True,
        )
        return dataloader

    def train_dataloader(self) -> DataLoader:
        return self._get_dataloder(self.train_config, train=True)

    def val_dataloader(self) -> DataLoader:
        return self._get_dataloder(self.val_config, train=False)


class WavVAEDataset(Dataset):
    def __init__(self, cfg: WavVAEDataConfig, train: bool):
        with open(cfg.filelist_path) as f:
            self.filelist = f.read().splitlines()
        self.sampling_rate = cfg.sampling_rate
        self.num_samples = cfg.num_samples
        self.train = train

    def __len__(self) -> int:
        return len(self.filelist)

    def __getitem__(self, index: int) -> torch.Tensor:
        audio_path = self.filelist[index]
        y, sr = torchaudio.load(audio_path)
        if y.size(0) > 1:
            # mix to mono
            y = y.mean(dim=0, keepdim=True)
        gain = np.random.uniform(-1, -6) if self.train else -3
        y, _ = torchaudio.sox_effects.apply_effects_tensor(
            y, sr, [["norm", f"{gain:.2f}"]]
        )
        if sr != self.sampling_rate:
            y = torchaudio.functional.resample(
                y, orig_freq=sr, new_freq=self.sampling_rate
            )
        if y.size(-1) < self.num_samples:
            pad_length = self.num_samples - y.size(-1)
            padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
            y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
        elif self.train:
            start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
            y = y[:, start : start + self.num_samples]
        else:
            # During validation, take always the first segment for determinism
            y = y[:, : self.num_samples]

        return y[0]


def pad_tensor_list_raw(
    tensor_list: list[tuple[torch.Tensor, torch.Tensor]], pad_idx: int = 0
) -> dict[str, torch.Tensor | None]:
    audio, hubert_maybe = zip(*tensor_list)
    audio = torch.cat(audio, dim=0)
    if hubert_maybe[0] is not None:
        hubert_maybe = torch.stack(hubert_maybe, dim=0)
    else:
        hubert_maybe = None
    return {"audio_z": audio, "hubert": hubert_maybe}


class SafeTensorDataset(Dataset):
    """
    On __getitem__, opens the safetensor, uses get_slice() to inspect shape,
    then either drops too-short files (return None) or returns a random subsequence slice.
    """

    def __init__(
        self,
        file_paths: list[str],
        key: str,
        hubert_path: str | None = None,
        hubert_key: str = "layer_9",
        min_length: int = 1,
        subseq_length: int | None = None,
    ):
        self.file_paths = file_paths
        self.key = key
        self.min_length = min_length
        self.subseq_length = subseq_length
        self.hubert_path = hubert_path
        self.hubert_key = hubert_key

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx: int) -> torch.Tensor | None:
        path = self.file_paths[idx]
        # open file, get a slice wrapper for full tensor
        with safe_open(path, framework="pt") as f:
            tensor_slice = f.get_slice(self.key)
            Q, N, D = tensor_slice.get_shape()  # full shape [K, N]

            # drop too-short
            if N < self.min_length:
                return None

            L = self.subseq_length or N
            if L < N:
                # sample random start
                start = torch.randint(0, max(1, N - L - 1), ()).item()
                start -= start % 2
                # this yields a torch.Tensor of shape [K, L]
                seq = tensor_slice[:, start : start + L]
            else:
                # full length
                start = 0
                seq = tensor_slice[:, :]

        if self.hubert_path is not None:
            path = Path(self.hubert_path) / Path(path).name
            with safe_open(path, framework="pt") as f:
                tensor_slice = f.get_slice(self.hubert_key)
                hubert_N, hubert_D = tensor_slice.get_shape()  # full shape [K, N]
                seq_hubert = tensor_slice[start // 2 : start // 2 + L // 2]
            return (seq, seq_hubert)

        return (seq, None)


class SafeTensorDataModule(ptl.LightningDataModule):
    """
    LightningDataModule using raw .safetensors file list + get_slice inside Dataset.
    """

    def __init__(
        self,
        train_file_list: str,
        val_file_list: str | None = None,
        hubert_path: str | None = None,
        key: str = "audio_z",
        hubert_key: str = "layer_9",
        val_split: float = 0.1,
        batch_size: int = 32,
        num_workers: int = 4,
        shuffle: bool = True,
        seed: int = 1234,
        min_length: int = 1,
        subseq_length: int | None = None,
    ):
        super().__init__()
        self.train_file_list = train_file_list
        self.val_file_list = val_file_list
        self.hubert_path = hubert_path
        self.key = key
        self.val_split = val_split
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.seed = seed
        self.min_length = min_length
        self.subseq_length = subseq_length

    def setup(self, stage=None):
        with open(self.train_file_list, "r") as f:
            train_paths = [line.strip() for line in f if line.strip()]
        val_paths = None
        if self.val_file_list is not None:
            with open(self.train_file_list, "r") as f:
                val_paths = [line.strip() for line in f if line.strip()]
        # Split into train/val
        if (
            isinstance(self.val_split, float)
            and 0 < self.val_split < 1
            and val_paths is None
        ):
            train_paths, val_paths = train_test_split(
                train_paths, test_size=self.val_split, random_state=self.seed
            )

        self.train_ds = SafeTensorDataset(
            train_paths,
            key=self.key,
            min_length=self.min_length,
            subseq_length=self.subseq_length,
            hubert_path=self.hubert_path,
        )
        self.val_ds = SafeTensorDataset(
            val_paths,
            key=self.key,
            min_length=self.min_length,
            subseq_length=self.subseq_length,
        )

    def _collate_fn(
        self, batch: list[torch.Tensor | None]
    ) -> tuple[torch.Tensor, torch.BoolTensor]:
        seqs = [s for s in batch if s is not None]
        return pad_tensor_list_raw(seqs, pad_idx=0)

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )
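Editor's note — a minimal usage sketch for the waveform datamodule above (not part of the commit). The file-list paths, sample rate, and batch settings are placeholder values, and it assumes the module is importable as codec.datamodules.

# Hedged usage sketch; paths and hyperparameters are hypothetical.
from codec.datamodules import WavVAEDataConfig, WavVAEDataModule

train_cfg = WavVAEDataConfig(
    filelist_path="filelists/train.txt",  # placeholder: one audio path per line
    sampling_rate=24000,                  # assumed sample rate
    num_samples=72000,                    # e.g. 3-second crops at 24 kHz
    batch_size=16,
    num_workers=4,
)
val_cfg = WavVAEDataConfig(
    filelist_path="filelists/val.txt",    # placeholder
    sampling_rate=24000,
    num_samples=72000,
    batch_size=16,
    num_workers=4,
)

datamodule = WavVAEDataModule(train_params=train_cfg, val_params=val_cfg)
# Each dataset item is a mono waveform of length num_samples, so the default
# collate yields batches of shape (batch_size, num_samples).
train_loader = datamodule.train_dataloader()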
codec/models/__init__.py  ADDED
@@ -0,0 +1,2 @@
from .patchvae.model import PatchVAE, PatchVAEConfig
from .wavvae.model import WavVAE, WavVAEConfig
codec/models/__pycache__/__init__.cpython-312.pyc  ADDED  (binary file, 308 Bytes)
codec/models/components/__init__.py  ADDED  (empty file)
codec/models/components/__pycache__/__init__.cpython-312.pyc  ADDED  (binary file, 174 Bytes)
codec/models/components/__pycache__/convnext.cpython-312.pyc  ADDED  (binary file, 11.3 kB)
codec/models/components/convnext.py  ADDED
@@ -0,0 +1,221 @@
import torch
from torch import nn


class ConvNeXtBlock(nn.Module):
    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.

    Args:
        dim (int): Number of input channels.
        intermediate_dim (int): Dimensionality of the intermediate layer.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
            None means non-conditional LayerNorm. Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int | None = None,
        layer_scale_init_value: float = 0.0,
        elementwise_affine_ln: bool = True,
        is_causal: bool = False,
    ):
        super().__init__()
        intermediate_dim = intermediate_dim if intermediate_dim is not None else dim * 3
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=0 if is_causal else 3, groups=dim
        )  # depthwise conv
        self.norm = nn.LayerNorm(
            dim, eps=1e-6, elementwise_affine=elementwise_affine_ln
        )
        self.pwconv1 = nn.Linear(
            dim, intermediate_dim
        )  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(intermediate_dim, dim)
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.is_causal = is_causal

    def forward(
        self,
        x: torch.Tensor,
        scale_shift: tuple[torch.Tensor, torch.Tensor] | None = None,
        gate: torch.Tensor | None = None,
    ) -> torch.Tensor:
        residual = x
        if self.is_causal:
            x = torch.nn.functional.pad(x, (6, 0))
        x = self.dwconv(x)
        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
        x = self.norm(x)
        if scale_shift is not None:
            scale, shift = scale_shift
            x = x * scale[:, None] + shift[:, None]
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        if gate is not None:
            x = gate[:, None] * x
        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)

        x = residual + x
        return x


class ConvNextNet(nn.Module):
    def __init__(self, n_layers, dim, intermediate_dim: int | None = None):
        super().__init__()
        self.net = nn.Sequential(
            *[
                ConvNeXtBlock(
                    dim,
                    intermediate_dim,
                )
                for _ in range(n_layers)
            ]
        )

    def forward(self, x):
        return self.net(x)


class ConvNextPatchEncoder(nn.Module):
    def __init__(
        self,
        patch_sizes: list[int],
        n_layers_per_patch: int,
        patch_expansion_factor: float = 1.5,
        is_decoder: bool = False,
    ):
        super().__init__()
        patch_to_dim = []
        convnext = []
        for i, patch_size in enumerate(patch_sizes):
            in_dim = int((patch_expansion_factor if i > 0 else 1.0) * patch_size)
            out_dim = int(patch_expansion_factor * patch_size)
            if is_decoder:
                in_dim, out_dim = out_dim, in_dim
            patch_to_dim.append(
                nn.Linear(
                    in_dim,
                    out_dim,
                )
            )
            convnext += [
                nn.Sequential(
                    *[
                        ConvNeXtBlock(int(patch_size * patch_expansion_factor))
                        for _ in range(n_layers_per_patch)
                    ]
                )
            ]
        self.is_decoder = is_decoder
        self.patch_sizes = patch_sizes
        self.patch_expansion_factor = patch_expansion_factor
        self.patch_to_dim = nn.ModuleList(patch_to_dim)
        self.convnext = nn.ModuleList(convnext)

    def forward(self, x):
        if self.is_decoder:
            for i, patch_size in reversed(list(enumerate(self.patch_sizes))):
                B, P, N = x.shape
                patch_expansion_factor_maybe = (
                    self.patch_expansion_factor if i > 0 else 1.0
                )
                x = x.reshape(B, int(patch_size * self.patch_expansion_factor), -1)
                x = self.convnext[i](x)
                x = self.patch_to_dim[i](x.transpose(1, 2)).transpose(1, 2)
        else:
            for i, patch_size in enumerate(self.patch_sizes):
                B, P, N = x.shape
                patch_expansion_factor_maybe = (
                    self.patch_expansion_factor if i > 0 else 1.0
                )
                x = x.reshape(B, int(patch_size * patch_expansion_factor_maybe), -1)
                x = self.patch_to_dim[i](x.transpose(1, 2)).transpose(1, 2)
                x = self.convnext[i](x)
        return x


class ConvNextEncoder(nn.Module):
    def __init__(
        self,
        in_dim: int,
        dim: int,
        n_layers: int,
        intermediate_dim: int | None = None,
        stride: int = 1,
    ):
        super().__init__()
        self.in_proj = nn.Linear(in_dim, dim)
        if stride > 1:
            self.stride = nn.Conv1d(
                in_channels=dim,
                out_channels=dim,
                kernel_size=(stride * 2) + 1,
                stride=stride,
                padding=stride // 2,
            )
        else:
            self.stride = nn.Identity()
        self.net = ConvNextNet(n_layers, dim, intermediate_dim)

    def forward(self, x):
        x = self.in_proj(x.transpose(1, 2)).transpose(1, 2)
        x = self.stride(x)
        return self.net(x)


class ConvNextDecoder(nn.Module):
    def __init__(
        self,
        out_dim: int,
        dim: int,
        n_layers: int,
        intermediate_dim: int | None = None,
        stride: int = 1,
        stride_position: str = "before",
    ):
        super().__init__()
        self.out_proj = nn.Linear(dim, out_dim)
        if stride > 1:
            self.stride = nn.ConvTranspose1d(
                in_channels=dim,
                out_channels=dim,
                kernel_size=(stride * 2) + 1,
                stride=stride,
                padding=stride // 2,
                output_padding=stride // 2,
            )
        else:
            self.stride = nn.Identity()
        self.stride_position = stride_position

        self.net = ConvNextNet(n_layers, dim, intermediate_dim)

    def forward(self, x):
        if self.stride_position == "before":
            x = self.stride(x)
        x = self.net(x)
        if self.stride_position == "after":
            x = self.stride(x)
        return self.out_proj(x.transpose(1, 2)).transpose(1, 2)


class SwiGLU(nn.Module):
    def __init__(self, d_model: int, ffn_expansion_factor: int = 4):
        super().__init__()
        self.p_in = nn.Linear(d_model, (d_model * ffn_expansion_factor // 3) * 2)
        self.p_out = nn.Linear(d_model * ffn_expansion_factor // 3, d_model)

    def forward(self, x):
        gate, x = self.p_in(x).chunk(2, dim=-1)
        return self.p_out(nn.functional.silu(gate) * x)
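Editor's note — a small shape-check sketch for the 1D ConvNeXt modules above (not part of the commit). Channel counts, sequence length, and the import path are assumptions.

# Hedged sanity-check sketch: ConvNeXtBlock preserves the (B, C, T) layout.
import torch
from codec.models.components.convnext import ConvNeXtBlock, ConvNextEncoder  # import path assumed

x = torch.randn(2, 128, 200)              # (batch, channels=dim, time)
block = ConvNeXtBlock(dim=128)
y = block(x)                               # residual block, shape preserved: (2, 128, 200)

enc = ConvNextEncoder(in_dim=128, dim=256, n_layers=4, stride=2)
z = enc(x)                                 # (2, 256, T'), time axis downsampled ~2x by the strided conv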
codec/models/components/transformer.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Optional
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from einops import rearrange
|
| 7 |
+
from rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class LocalSelfAttention(nn.Module):
|
| 11 |
+
def __init__(
|
| 12 |
+
self,
|
| 13 |
+
dim: int,
|
| 14 |
+
heads: int,
|
| 15 |
+
window_len: int = 32,
|
| 16 |
+
rotary: bool = True,
|
| 17 |
+
is_causal: bool = False,
|
| 18 |
+
):
|
| 19 |
+
super().__init__()
|
| 20 |
+
self.heads = heads
|
| 21 |
+
assert dim % heads == 0, "dim must be divisible by heads"
|
| 22 |
+
self.qkv = nn.Linear(dim, 3 * dim)
|
| 23 |
+
self.o = nn.Linear(dim, dim)
|
| 24 |
+
self.rotary = RotaryEmbedding((dim // heads) // 2) if rotary else None
|
| 25 |
+
self.is_causal = is_causal
|
| 26 |
+
self.window_len = window_len
|
| 27 |
+
|
| 28 |
+
def forward(
|
| 29 |
+
self,
|
| 30 |
+
x: torch.Tensor,
|
| 31 |
+
mask: Optional[torch.Tensor] = None,
|
| 32 |
+
pos: Optional[torch.Tensor] = None,
|
| 33 |
+
cache: Optional[Dict[int, torch.Tensor]] = None,
|
| 34 |
+
layer_idx: Optional[int] = None,
|
| 35 |
+
time_step: int = 0,
|
| 36 |
+
) -> torch.Tensor:
|
| 37 |
+
# x: (batch, seq_len, dim)
|
| 38 |
+
b, n, dim = x.shape
|
| 39 |
+
b, t_len, hd = x.shape
|
| 40 |
+
pad_len = (self.window_len - t_len % self.window_len) % self.window_len
|
| 41 |
+
padded_x = torch.nn.functional.pad(x, (0, 0, 0, pad_len)) # pad on time dim
|
| 42 |
+
mask = torch.ones(t_len, dtype=torch.bool, device=x.device)
|
| 43 |
+
mask = torch.nn.functional.pad(
|
| 44 |
+
mask, (0, pad_len), value=False
|
| 45 |
+
) # False = masked
|
| 46 |
+
mask = mask.expand(b, -1) # [b, padded_len]
|
| 47 |
+
mask = rearrange(mask, "b (w n) -> b n 1 1 w", w=self.window_len)
|
| 48 |
+
qkv = self.qkv(padded_x).chunk(3, dim=-1)
|
| 49 |
+
q, k, v = [
|
| 50 |
+
rearrange(t, "b (w n) (h d) -> b n h w d", h=self.heads, w=self.window_len)
|
| 51 |
+
for t in qkv
|
| 52 |
+
]
|
| 53 |
+
if cache is not None:
|
| 54 |
+
assert layer_idx is not None, "layer_idx must be set when using cache"
|
| 55 |
+
cache[layer_idx]["k"] = torch.cat([cache[layer_idx]["k"], k], dim=2)
|
| 56 |
+
cache[layer_idx]["v"] = torch.cat([cache[layer_idx]["v"], v], dim=2)
|
| 57 |
+
k, v = cache[layer_idx]["k"], cache[layer_idx]["v"]
|
| 58 |
+
|
| 59 |
+
# apply rotary embeddings
|
| 60 |
+
if self.rotary is not None:
|
| 61 |
+
if pos is not None:
|
| 62 |
+
rot = self.rotary(pos) # (b,1,n,head_dim)
|
| 63 |
+
q = apply_rotary_emb(rot, q)
|
| 64 |
+
k = apply_rotary_emb(rot, k)
|
| 65 |
+
else:
|
| 66 |
+
q = self.rotary.rotate_queries_or_keys(q, offset=time_step)
|
| 67 |
+
k = self.rotary.rotate_queries_or_keys(k, offset=time_step)
|
| 68 |
+
|
| 69 |
+
# scaled dot-product attention
|
| 70 |
+
y = F.scaled_dot_product_attention(
|
| 71 |
+
q,
|
| 72 |
+
k,
|
| 73 |
+
v,
|
| 74 |
+
attn_mask=None if self.is_causal else mask,
|
| 75 |
+
is_causal=self.is_causal,
|
| 76 |
+
)
|
| 77 |
+
y = rearrange(y, "b n h w d -> b (w n) (h d)")
|
| 78 |
+
y = self.o(y)
|
| 79 |
+
y = y[:, :t_len]
|
| 80 |
+
return y
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class SelfAttention(nn.Module):
|
| 84 |
+
def __init__(
|
| 85 |
+
self, dim: int, heads: int, rotary: bool = True, is_causal: bool = False
|
| 86 |
+
):
|
| 87 |
+
super().__init__()
|
| 88 |
+
self.heads = heads
|
| 89 |
+
assert dim % heads == 0, "dim must be divisible by heads"
|
| 90 |
+
self.qkv = nn.Linear(dim, 3 * dim)
|
| 91 |
+
self.o = nn.Linear(dim, dim)
|
| 92 |
+
self.rotary = RotaryEmbedding((dim // heads) // 2) if rotary else None
|
| 93 |
+
self.is_causal = is_causal
|
| 94 |
+
|
| 95 |
+
def forward(
|
| 96 |
+
self,
|
| 97 |
+
x: torch.Tensor,
|
| 98 |
+
mask: Optional[torch.Tensor] = None,
|
| 99 |
+
pos: Optional[torch.Tensor] = None,
|
| 100 |
+
cache: Optional[Dict[int, torch.Tensor]] = None,
|
| 101 |
+
layer_idx: Optional[int] = None,
|
| 102 |
+
time_step: int = 0,
|
| 103 |
+
) -> torch.Tensor:
|
| 104 |
+
# x: (batch, seq_len, dim)
|
| 105 |
+
b, n, dim = x.shape
|
| 106 |
+
b, t_len, hd = x.shape
|
| 107 |
+
pad_len = (32 - t_len % 32) % 32
|
| 108 |
+
padded_x = torch.nn.functional.pad(x, (0, 0, 0, pad_len)) # pad on time dim
|
| 109 |
+
mask = torch.ones(t_len, dtype=torch.bool, device=x.device)
|
| 110 |
+
mask = torch.nn.functional.pad(
|
| 111 |
+
mask, (0, pad_len), value=False
|
| 112 |
+
) # False = masked
|
| 113 |
+
mask = mask.expand(b, -1) # [b, padded_len]
|
| 114 |
+
mask = rearrange(mask, "b (w n) -> b n 1 1 w", w=32)
|
| 115 |
+
qkv = self.qkv(padded_x).chunk(3, dim=-1)
|
| 116 |
+
q, k, v = [
|
| 117 |
+
rearrange(t, "b (w n) (h d) -> b n h w d", h=self.heads, w=32) for t in qkv
|
| 118 |
+
]
|
| 119 |
+
# caching for fast autoregressive
|
| 120 |
+
if cache is not None:
|
| 121 |
+
assert layer_idx is not None, "layer_idx must be set when using cache"
|
| 122 |
+
# append new keys/values
|
| 123 |
+
cache[layer_idx]["k"] = torch.cat([cache[layer_idx]["k"], k], dim=2)
|
| 124 |
+
cache[layer_idx]["v"] = torch.cat([cache[layer_idx]["v"], v], dim=2)
|
| 125 |
+
k, v = cache[layer_idx]["k"], cache[layer_idx]["v"]
|
| 126 |
+
|
| 127 |
+
# apply rotary embeddings
|
| 128 |
+
if self.rotary is not None:
|
| 129 |
+
if pos is not None:
|
| 130 |
+
rot = self.rotary(pos) # .unsqueeze(1) # (b,1,n,head_dim)
|
| 131 |
+
q = apply_rotary_emb(rot, q)
|
| 132 |
+
k = apply_rotary_emb(rot, k)
|
| 133 |
+
else:
|
| 134 |
+
q = self.rotary.rotate_queries_or_keys(q, offset=time_step)
|
| 135 |
+
k = self.rotary.rotate_queries_or_keys(k, offset=time_step)
|
| 136 |
+
|
| 137 |
+
# scaled dot-product attention
|
| 138 |
+
y = F.scaled_dot_product_attention(
|
| 139 |
+
q,
|
| 140 |
+
k,
|
| 141 |
+
v,
|
| 142 |
+
attn_mask=None if self.is_causal else mask,
|
| 143 |
+
is_causal=self.is_causal,
|
| 144 |
+
)
|
| 145 |
+
y = rearrange(y, "b n h w d -> b (w n) (h d)")
|
| 146 |
+
y = self.o(y)
|
| 147 |
+
y = y[:, :t_len]
|
| 148 |
+
return y
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class SwiGLU(nn.Module):
|
| 152 |
+
def __init__(self, d_model: int):
|
| 153 |
+
super().__init__()
|
| 154 |
+
hidden = d_model * 4 // 3
|
| 155 |
+
self.p_in = nn.Linear(d_model, hidden * 2)
|
| 156 |
+
self.p_out = nn.Linear(hidden, d_model)
|
| 157 |
+
|
| 158 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 159 |
+
gate, data = self.p_in(x).chunk(2, dim=-1)
|
| 160 |
+
return self.p_out(F.silu(gate) * data)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
class TransformerBlock(nn.Module):
|
| 164 |
+
"""
|
| 165 |
+
Transformer block using custom SelfAttention and SwiGLU FFN.
|
| 166 |
+
|
| 167 |
+
Args:
|
| 168 |
+
dim: embedding dimension
|
| 169 |
+
heads: number of attention heads
|
| 170 |
+
rotary: whether to use rotary embeddings
|
| 171 |
+
is_causal: whether to apply causal masking
|
| 172 |
+
"""
|
| 173 |
+
|
| 174 |
+
def __init__(
|
| 175 |
+
self,
|
| 176 |
+
dim: int,
|
| 177 |
+
head_size: int,
|
| 178 |
+
rotary: bool = True,
|
| 179 |
+
is_causal: bool = False,
|
| 180 |
+
elementwise_affine_ln: bool = True,
|
| 181 |
+
):
|
| 182 |
+
super().__init__()
|
| 183 |
+
assert dim % head_size == 0
|
| 184 |
+
heads = dim // head_size
|
| 185 |
+
self.norm1 = nn.LayerNorm(dim, elementwise_affine=elementwise_affine_ln)
|
| 186 |
+
self.attn = LocalSelfAttention(dim, heads, rotary=rotary, is_causal=is_causal)
|
| 187 |
+
self.norm2 = nn.LayerNorm(dim, elementwise_affine=elementwise_affine_ln)
|
| 188 |
+
self.ffn = SwiGLU(dim)
|
| 189 |
+
|
| 190 |
+
def forward(
|
| 191 |
+
self,
|
| 192 |
+
x: torch.Tensor,
|
| 193 |
+
mask: Optional[torch.Tensor] = None,
|
| 194 |
+
pos: Optional[torch.Tensor] = None,
|
| 195 |
+
cache: Optional[Dict[int, Dict[str, torch.Tensor]]] = None,
|
| 196 |
+
layer_idx: Optional[int] = None,
|
| 197 |
+
scale_shift: tuple[torch.Tensor, torch.Tensor] | None = None,
|
| 198 |
+
gate: torch.Tensor = None,
|
| 199 |
+
time_step: int = 0,
|
| 200 |
+
) -> torch.Tensor:
|
| 201 |
+
# Self-attention block
|
| 202 |
+
norm1_x = self.norm1(x)
|
| 203 |
+
if scale_shift is not None:
|
| 204 |
+
scale, shift = scale_shift
|
| 205 |
+
norm1_x = norm1_x * scale[:, None] + shift[:, None]
|
| 206 |
+
|
| 207 |
+
attn_out = self.attn(
|
| 208 |
+
norm1_x,
|
| 209 |
+
mask=mask,
|
| 210 |
+
pos=pos,
|
| 211 |
+
cache=cache,
|
| 212 |
+
layer_idx=layer_idx,
|
| 213 |
+
time_step=time_step,
|
| 214 |
+
)
|
| 215 |
+
x = x + attn_out
|
| 216 |
+
|
| 217 |
+
norm2_x = self.norm2(x)
|
| 218 |
+
if gate is not None:
|
| 219 |
+
norm2_x = gate[:, None] * norm2_x
|
| 220 |
+
|
| 221 |
+
# Feedforward block
|
| 222 |
+
ffn_out = self.ffn(norm2_x)
|
| 223 |
+
x = x + ffn_out
|
| 224 |
+
return x
|
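Editor's note — a brief shape-check sketch for TransformerBlock (not part of the commit). It assumes einops and rotary-embedding-torch are installed and that the module is importable under the path shown; the dimensions are arbitrary.

# Hedged shape-check sketch.
import torch
from codec.models.components.transformer import TransformerBlock  # import path assumed

block = TransformerBlock(dim=256, head_size=64)   # 4 heads of size 64, windowed local attention
x = torch.randn(2, 100, 256)                      # (batch, seq_len, dim)
y = block(x)                                      # same shape; the sequence is padded internally to a multiple of 32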
codec/models/pardi_tokenizer.py  ADDED
@@ -0,0 +1,10 @@
import torch

from zcodec.models import WavVAE, ZFlowAutoEncoder
from zcodec.models.wavvae.model import WavVAEConfig
from zcodec.models.zflowae.model import ZFlowAutoEncoderConfig


class PardiTokenizer(nn.Module):
    def __init__(self, wavvae_cfg: WavVAEConfig, zflowae_cfg: ZFlowAutoEncoderConfig):
codec/models/patchvae/__pycache__/model.cpython-312.pyc  ADDED  (binary file, 13 kB)
codec/models/patchvae/__pycache__/modules.cpython-312.pyc  ADDED  (binary file, 20.8 kB)
codec/models/patchvae/model.py  ADDED
@@ -0,0 +1,262 @@
import json
import math
import os
import sys
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path

import torch
from safetensors.torch import load_file
from torch import nn
from torchdyn.core import NeuralODE

from .modules import AdaLNFlowPredictor, AutoEncoder


@contextmanager
def suppress_stdout():
    original_stdout = sys.stdout
    try:
        sys.stdout = open(os.devnull, "w")
        yield
    finally:
        sys.stdout.close()
        sys.stdout = original_stdout


def cosine_schedule_with_warmup(warmup_steps, total_steps, start_lr, end_lr):
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))
        return (start_lr - end_lr) * cosine_decay / start_lr + end_lr / start_lr

    return lr_lambda


@dataclass
class PatchVAEConfig:
    latent_dim: int
    hidden_dim: int
    latent_scaling: tuple[list[float], list[float]] | None
    flow_factory: str
    num_flow_layers: int
    autoencoder_factory: str
    num_autoencoder_layers: int
    convnextformer_num_conv_per_transformer: int = 3
    wavvae_path: str | None = None
    fsq_levels: list[int] | None = None
    bottleneck_size: int | None = None
    latent_stride: int = 2
    vae: bool = False
    causal_transformer: bool = False
    cond_dim: int | None = None
    is_causal: bool = False


class PatchVAE(nn.Module):
    def __init__(self, cfg: PatchVAEConfig):
        super().__init__()
        self.flow_net = AdaLNFlowPredictor(
            feat_dim=cfg.latent_dim * cfg.latent_stride,
            dim=cfg.hidden_dim,
            n_layer=cfg.num_flow_layers,
            layer_factory=cfg.flow_factory,
            cond_dim=cfg.cond_dim,
            is_causal=cfg.is_causal,
        )
        self.autoencoder = AutoEncoder(
            cfg.latent_dim * cfg.latent_stride,
            cfg.hidden_dim,
            cfg.num_autoencoder_layers,
            cfg.autoencoder_factory,
            out_dim=cfg.cond_dim,
            vae=cfg.vae,
            bottleneck_size=cfg.bottleneck_size,
            convnextformer_num_conv_per_transformer=cfg.convnextformer_num_conv_per_transformer,
            is_causal=cfg.is_causal,
        )
        if cfg.latent_scaling is not None:
            mean, std = cfg.latent_scaling
            self.register_buffer("mean_latent_scaling", torch.tensor(mean))
            self.register_buffer("std_latent_scaling", torch.tensor(std))
        else:
            self.mean_latent_scaling = None
            self.std_latent_scaling = None

        self.latent_stride = cfg.latent_stride
        self.latent_dim = cfg.latent_dim
        self.wavvae = None

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        map_location: str = "cpu",
    ):
        if Path(pretrained_model_name_or_path).exists():
            path = pretrained_model_name_or_path
        else:
            from huggingface_hub import snapshot_download

            path = snapshot_download(pretrained_model_name_or_path)

        with open(Path(path) / "config.json", "r") as f:
            config = json.load(f)
        config = PatchVAEConfig(**config)
        model = cls(config).to(map_location)
        state_dict = load_file(
            Path(path) / "model.st",
            device=map_location,
        )
        model.load_state_dict(state_dict, assign=True)
        if config.wavvae_path is not None:
            from .. import WavVAE

            model.wavvae = WavVAE.from_pretrained(config.wavvae_path).to(map_location)
        else:
            model.wavvae = None

        return model

    def wavvae_from_pretrained(
        self,
        pretrained_model_name_or_path: str,
        *args,
        **kwargs,
    ):
        from .. import WavVAE

        self.wavvae = WavVAE.from_pretrained(
            pretrained_model_name_or_path,
            *args,
            **kwargs,
        )

    def encode(self, wav: torch.Tensor):
        assert self.wavvae is not None, (
            "please provide WavVAE model to encode from waveform"
        )
        z = self.wavvae.encode(wav)
        zz = self.encode_patch(z)
        return zz

    def decode(self, patchvae_latent: torch.Tensor, **kwargs):
        assert self.wavvae is not None, (
            "please provide WavVAE model to decode to waveform"
        )
        z = self.decode_patch(patchvae_latent, **kwargs)
        wav = self.wavvae.decode(z)
        return wav

    def normalize_z(self, z: torch.Tensor):
        if self.mean_latent_scaling is not None:
            z = (z - self.mean_latent_scaling) / self.std_latent_scaling
        return z

    def denormalize_z(self, z: torch.Tensor):
        if self.std_latent_scaling is not None:
            z = z * self.std_latent_scaling + self.mean_latent_scaling
        return z

    def encode_patch(self, z: torch.Tensor, deterministic: bool = False):
        B, T, D = z.shape
        z = self.normalize_z(z)
        if self.latent_stride > 1:
            z = z[:, : T - T % self.latent_stride]
            z = z.reshape(B, T // self.latent_stride, D * self.latent_stride)
        return self.autoencoder.encode(z, deterministic=deterministic)

    def decode_patch(
        self,
        latent: torch.Tensor,
        cfg: float = 2.0,
        num_steps: int = 15,
        solver: str = "euler",
        sensitivity: str = "adjoint",
        temperature: float = 1.0,
        **kwargs,
    ):
        with torch.no_grad():
            z_cond = self.autoencoder.decode(latent).transpose(1, 2)
            if cfg == 1.0:

                def solver_fn(t, Xt, *args, **kwargs):
                    flow = self.flow_net(Xt, z_cond, t.unsqueeze(0))
                    return flow
            else:
                z_cond_uncond = torch.cat((z_cond, torch.zeros_like(z_cond)), dim=0)

                def solver_fn(t, Xt, *args, **kwargs):
                    flow = self.flow_net(
                        Xt.repeat(2, 1, 1), z_cond_uncond, t.unsqueeze(0)
                    )
                    cond, uncond = flow.chunk(2, dim=0)

                    return uncond + cfg * (cond - uncond)

            with suppress_stdout():
                node_ = NeuralODE(
                    solver_fn,
                    solver=solver,
                    sensitivity=sensitivity,
                    **kwargs,
                )
            t_span = torch.linspace(0, 1, num_steps + 1, device=z_cond.device)
            patch_dim = self.latent_dim * self.latent_stride
            x0 = torch.randn(
                z_cond.shape[0],
                patch_dim,
                z_cond.shape[2],
                device=z_cond.device,
            )
            traj = node_.trajectory(
                x0 * temperature,
                t_span=t_span,
            )

            y_hat = traj[-1]
            y_hat = y_hat.transpose(1, 2)
            B, T, D = y_hat.shape
            y_hat = y_hat.reshape(B, T * self.latent_stride, D // self.latent_stride)
            y_hat = self.denormalize_z(y_hat)
            return y_hat

    def forward(
        self,
        z: torch.Tensor,
        t: torch.Tensor,
        drop_cond_rate: float = 0.0,
        drop_vae_rate: float = 0.0,
        sigma: float = 1e-4,
    ):
        z = self.normalize_z(z)
        B, T, D = z.shape
        if self.latent_stride > 1:
            z = z.reshape(B, T // self.latent_stride, D * self.latent_stride)

        prior, ae_loss = self.autoencoder(z, drop_vae_rate=drop_vae_rate)

        if drop_cond_rate > 0.0:
            to_drop = torch.rand(prior.shape[0], device=prior.device) < drop_cond_rate
            prior[to_drop] = 0.0

        x0 = torch.randn_like(z)
        x1 = z

        flow_target = x1 - (1 - sigma) * x0

        alpha = (1 - (1 - sigma) * t).view(-1, 1, 1)
        xt = alpha * x0 + t.view(-1, 1, 1) * x1

        pred = self.flow_net(
            xt.transpose(1, 2),
            prior.transpose(1, 2),
            t,
        )

        flow_loss = nn.functional.mse_loss(flow_target.transpose(1, 2), pred)

        return flow_loss, ae_loss, prior
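Editor's note — a rough latent-level inference sketch for PatchVAE (not part of the commit). The checkpoint path is hypothetical, the latent shapes are stand-ins for WavVAE outputs, and it assumes AutoEncoder.encode/decode (defined in modules.py below, partly outside this view) round-trip the bottleneck as their names suggest.

# Hedged inference sketch; checkpoint path and shapes are hypothetical.
import torch
from codec.models import PatchVAE

model = PatchVAE.from_pretrained("path/to/patchvae_checkpoint").eval()  # local dir or HF repo id

z = torch.randn(1, 64, model.latent_dim)                      # stand-in for WavVAE latents, shape (B, T, D)
latent = model.encode_patch(z, deterministic=True)            # groups latent_stride frames per patch, then bottlenecks
z_hat = model.decode_patch(latent, cfg=2.0, num_steps=15)     # flow-matching ODE decode back to (B, T, D)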
codec/models/patchvae/modules.py  ADDED
@@ -0,0 +1,396 @@
from random import random
from typing import Literal

import torch
import torch.nn.functional as F
from torch import nn
from vector_quantize_pytorch import FSQ

from zcodec.models.components.transformer import TransformerBlock


class AdaLayerNormScale(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.linear = nn.Linear(dim, dim * 3)
        self.norm = nn.LayerNorm(dim, elementwise_affine=False)

    def forward(self, x, c):
        x = self.norm(x)
        scale, bias, gate = self.linear(F.silu(c)).chunk(3, dim=1)
        shape = x.shape[0] + [1] * (x.dim() - 2) + x.shape[-1]
        scale, bias, gate = map(lambda x: x.view(*shape), (scale, bias, gate))
        x = x * (1 + scale) + bias
        return x, gate


class GaussianFourierTimeEmbedding(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim), requires_grad=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x[:, None] * self.weight[None, :] * 2 * torch.pi
        x = torch.cat((torch.sin(x), torch.cos(x)), dim=1)
        return x


LAYER_FACTORIES = {}


def register_flow_layer_factory(name):
    def decorator(fn):
        LAYER_FACTORIES[name] = fn
        return fn

    return decorator


@register_flow_layer_factory("convnext")
def SimpleConvNextFactory(dim: int, i: int, n_layer: int, is_causal: bool = False):
    return ConvNeXtBlock(dim, elementwise_affine_ln=False, is_causal=is_causal)


@register_flow_layer_factory("mlp")
def MLP(dim: int, i: int, n_layer: int, is_causal: bool = False):
    return AdaLNMLP(dim)


@register_flow_layer_factory("sa_transformer")
def SelfAttentionTransformer(dim: int, i: int, n_layer: int, is_causal: bool = False):
    return TransformerBlock(dim, 64, elementwise_affine_ln=False, is_causal=is_causal)


def init_weights(m: nn.Module):
    if isinstance(m, (nn.Conv1d, nn.Linear)):
        nn.init.trunc_normal_(m.weight, std=0.02)
        nn.init.constant_(m.bias, 0)


def init_adaln_weights(m: nn.Module):
    nn.init.trunc_normal_(m.weight, std=0.02)
    nn.init.zeros_(m.bias)


def modulate(x, scale, shift):
    return x * (1 + scale[:, None]) + shift[:, None]


class AdaLNFlowPredictor(nn.Module):
    def __init__(
        self,
        feat_dim: int,
        dim: int,
        n_layer: int,
        layer_factory: str,
        cond_dim: int | None = None,
        is_causal: bool = False,
    ):
        super().__init__()

        layer_factory = LAYER_FACTORIES[layer_factory]
        self.layers = nn.ModuleList(
            [
                layer_factory(dim, i, n_layer, is_causal=is_causal)
                for i in range(n_layer)
            ]
        )
        if cond_dim is None:
            cond_dim = feat_dim
        self.initial_proj = nn.Linear(feat_dim + cond_dim, dim)
        self.adaln_proj = nn.ModuleList([nn.Linear(dim, dim * 3) for _ in self.layers])
        self.final_adaln_proj = nn.Linear(dim, dim * 2)
        self.out_proj = nn.Linear(dim, feat_dim)
        self.final_norm = nn.LayerNorm(dim, elementwise_affine=False)
        self.time_emb = GaussianFourierTimeEmbedding(dim // 2)

        self.apply(init_weights)
        for l in self.adaln_proj:
            init_adaln_weights(l)
        init_adaln_weights(self.final_adaln_proj)

    def forward(
        self,
        x_t: torch.Tensor,
        x_mu: torch.Tensor,
        t: torch.Tensor,
    ):
        x_t, x_mu = map(lambda x: x.transpose(1, 2), (x_t, x_mu))
        x = self.initial_proj(torch.cat((x_t, x_mu), dim=-1)).transpose(1, 2)

        t_emb = self.time_emb(t)

        for i, (l, adaln) in enumerate(zip(self.layers, self.adaln_proj)):
            scale, shift, gate = F.silu(adaln(t_emb)).chunk(3, dim=1)
            x = l(x, scale_shift=(scale, shift), gate=gate)

        scale, shift = F.silu(self.final_adaln_proj(t_emb)).chunk(2, dim=1)
        x = self.final_norm(x.transpose(1, 2))
        x = modulate(x, scale, shift)

        x = self.out_proj(x).transpose(1, 2)

        return x


class AdaLNMLP(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.in_ln = nn.LayerNorm(hidden_dim, eps=1e-6, elementwise_affine=False)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim, bias=True),
        )

        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(), nn.Linear(hidden_dim, 4 * hidden_dim, bias=True)
        )

    def forward(self, x, scale_shift, gate):
        x = x.transpose(-1, -2)
        h = modulate(self.in_ln(x), *scale_shift)
        h = self.mlp(h)
        return (x + gate[:, None] * h).transpose(-1, -2)


class ConvNeXtBlock(nn.Module):
    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.

    Args:
        dim (int): Number of input channels.
        intermediate_dim (int): Dimensionality of the intermediate layer.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
            None means non-conditional LayerNorm. Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int | None = None,
        layer_scale_init_value: float = 0.0,
        elementwise_affine_ln: bool = True,
        is_causal: bool = False,
    ):
        super().__init__()
        intermediate_dim = intermediate_dim if intermediate_dim is not None else dim * 3
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=0 if is_causal else 3, groups=dim
        )  # depthwise conv
        self.norm = nn.LayerNorm(
            dim, eps=1e-6, elementwise_affine=elementwise_affine_ln
        )
        self.pwconv1 = nn.Linear(
            dim, intermediate_dim
        )  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(intermediate_dim, dim)
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.is_causal = is_causal

    def forward(
        self,
        x: torch.Tensor,
        scale_shift: tuple[torch.Tensor, torch.Tensor] | None = None,
        gate: torch.Tensor | None = None,
    ) -> torch.Tensor:
        residual = x
        if self.is_causal:
            x = torch.nn.functional.pad(x, (6, 0))
        x = self.dwconv(x)
        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
        x = self.norm(x)
        if scale_shift is not None:
            scale, shift = scale_shift
            x = x * scale[:, None] + shift[:, None]
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        if gate is not None:
            x = gate[:, None] * x
        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)

        x = residual + x
        return x


class ConvNextNet(nn.Module):
    def __init__(
        self,
        dim: int,
        n_layers: int,
        intermediate_dim: int | None = None,
        is_causal: bool = False,
    ):
        super().__init__()
        self.net = nn.Sequential(
            *[
                ConvNeXtBlock(dim, intermediate_dim, is_causal=is_causal)
                for _ in range(n_layers)
            ]
        )

    def forward(self, x):
        return self.net(x.transpose(1, 2)).transpose(1, 2)


def convnext_factory(dim, n_layers, is_causal=False):
    return ConvNextNet(dim, n_layers, is_causal=is_causal)


def convnextformer_factory(
    dim, n_layers, n_convnext_per_transformer_block, is_causal=False
):
    layers = []
    for i in range(0, n_layers, n_convnext_per_transformer_block + 1):
        layers.append(
            ConvNextNet(dim, n_convnext_per_transformer_block, is_causal=is_causal)
        )
        layers.append(TransformerBlock(dim, 64, is_causal=is_causal))
    return nn.Sequential(*layers)


class AutoEncoder(nn.Module):
    def __init__(
        self,
        feat_dim: int,
        hidden_dim: int,
        num_layers: int,
        net_factory: Literal["convnext", "convnextformer_decoder", "convnextformer"],
        out_dim: int | None = None,
        convnextformer_num_conv_per_transformer: int = 3,
        causal_transformer: bool = False,
        bottleneck_size: int | None = None,
        vae: bool = False,
        is_causal: bool = False,
    ):
        super().__init__()

        self.embed = nn.Linear(feat_dim, hidden_dim)
        if out_dim is None:
            out_dim = feat_dim
        self.unembed = nn.Linear(hidden_dim, out_dim)

        if net_factory == "convnext":
            self.encoder_net = convnext_factory(
                hidden_dim, num_layers, is_causal=is_causal
            )
            self.decoder_net = convnext_factory(
                hidden_dim, num_layers, is_causal=is_causal
            )
        elif net_factory == "convnextformer_decoder":
            self.encoder_net = convnext_factory(
                hidden_dim, num_layers, is_causal=is_causal
            )
            self.decoder_net = convnextformer_factory(
                hidden_dim,
                num_layers,
                convnextformer_num_conv_per_transformer,
                is_causal=is_causal,
            )
        elif net_factory == "convnextformer":
            self.encoder_net = convnextformer_factory(
                hidden_dim,
                num_layers,
                convnextformer_num_conv_per_transformer,
                is_causal=is_causal,
            )
            self.decoder_net = convnextformer_factory(
                hidden_dim,
                num_layers,
                convnextformer_num_conv_per_transformer,
                is_causal=is_causal,
            )

        self.bottleneck = (
            nn.Linear(hidden_dim, bottleneck_size * (1 + vae))
            if bottleneck_size is not None
            else nn.Identity()
        )
        self.unbottleneck = (
            nn.Linear(bottleneck_size, hidden_dim)
            if bottleneck_size is not None
            else nn.Identity()
        )
        self.vae = vae

    def reparameterize(
        self,
        mu: torch.Tensor,
        logvar: torch.Tensor,
        deterministic: bool = False,
        drop_vae_rate: float = 0.0,
    ) -> torch.Tensor:
logvar = torch.clamp(logvar, -30.0, 20.0)
|
| 335 |
+
std = torch.exp(0.5 * logvar)
|
| 336 |
+
if drop_vae_rate > 0.0:
|
| 337 |
+
to_drop = torch.rand(std.shape[0], device=std.device) < drop_vae_rate
|
| 338 |
+
eps = torch.randn_like(std)
|
| 339 |
+
eps[to_drop] = 0.0
|
| 340 |
+
else:
|
| 341 |
+
if deterministic:
|
| 342 |
+
eps = torch.zeros_like(std)
|
| 343 |
+
else:
|
| 344 |
+
eps = torch.randn_like(std)
|
| 345 |
+
return mu + eps * std
|
| 346 |
+
|
| 347 |
+
def kl_divergence(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
|
| 348 |
+
kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp())
|
| 349 |
+
return kl.sum(dim=-1).mean()
|
| 350 |
+
|
| 351 |
+
def forward(self, x: torch.Tensor, drop_vae_rate: float = 0.0) -> tuple[torch.Tensor, dict]:
|
| 352 |
+
# Encode
|
| 353 |
+
x = self.embed(x)
|
| 354 |
+
x = self.encoder_net(x)
|
| 355 |
+
x = self.bottleneck(x)
|
| 356 |
+
if self.vae:
|
| 357 |
+
mu, logvar = x.chunk(2, dim=-1)
|
| 358 |
+
loss = {
|
| 359 |
+
"kl_div": self.kl_divergence(mu, logvar),
|
| 360 |
+
"_mu_mean": mu.mean(),
|
| 361 |
+
"_mu_std": mu.std(),
|
| 362 |
+
"_logvar_mean": logvar.mean(),
|
| 363 |
+
"_logvar_std": logvar.std(),
|
| 364 |
+
}
|
| 365 |
+
x = self.reparameterize(
|
| 366 |
+
mu,
|
| 367 |
+
logvar,
|
| 368 |
+
drop_vae_rate=drop_vae_rate,
|
| 369 |
+
)
|
| 370 |
+
else:
|
| 371 |
+
loss = {}
|
| 372 |
+
|
| 373 |
+
# Decode
|
| 374 |
+
x = self.unbottleneck(x)
|
| 375 |
+
x = self.decoder_net(x)
|
| 376 |
+
x = self.unembed(x)
|
| 377 |
+
|
| 378 |
+
return x, loss
|
| 379 |
+
|
| 380 |
+
def encode(self, x: torch.Tensor, deterministic: bool = False):
|
| 381 |
+
x = self.embed(x)
|
| 382 |
+
x = self.encoder_net(x)
|
| 383 |
+
x = self.bottleneck(x)
|
| 384 |
+
|
| 385 |
+
if self.vae:
|
| 386 |
+
x = self.reparameterize(*x.chunk(2, dim=-1), deterministic=deterministic)
|
| 387 |
+
return x
|
| 388 |
+
|
| 389 |
+
def decode(
|
| 390 |
+
self,
|
| 391 |
+
latent: torch.Tensor | None = None,
|
| 392 |
+
):
|
| 393 |
+
x = self.unbottleneck(latent)
|
| 394 |
+
x = self.decoder_net(x)
|
| 395 |
+
x = self.unembed(x)
|
| 396 |
+
return x
|
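For orientation, a minimal usage sketch of the AutoEncoder defined above (the feature size, hidden size and bottleneck below are illustrative assumptions, not values taken from the training configs):

import torch

ae = AutoEncoder(
    feat_dim=80,            # per-frame feature size (assumed)
    hidden_dim=256,
    num_layers=4,
    net_factory="convnext",
    bottleneck_size=32,
    vae=True,
)
feats = torch.randn(2, 120, 80)            # (batch, frames, feat_dim)
recon, losses = ae(feats)                  # recon: (2, 120, 80); losses["kl_div"] is a scalar
z = ae.encode(feats, deterministic=True)   # (2, 120, 32) latent sequence
recon_from_z = ae.decode(z)                # (2, 120, 80)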
codec/models/wavvae/__init__.py
ADDED
|
File without changes
|
codec/models/wavvae/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (170 Bytes)
codec/models/wavvae/__pycache__/discriminators.cpython-312.pyc
ADDED
Binary file (13.1 kB)
codec/models/wavvae/__pycache__/heads.cpython-312.pyc
ADDED
Binary file (10.4 kB)
codec/models/wavvae/__pycache__/layers.cpython-312.pyc
ADDED
Binary file (13.1 kB)
codec/models/wavvae/__pycache__/loss.cpython-312.pyc
ADDED
Binary file (7.01 kB)
codec/models/wavvae/__pycache__/model.cpython-312.pyc
ADDED
Binary file (8.79 kB)
codec/models/wavvae/__pycache__/modules.cpython-312.pyc
ADDED
Binary file (11.1 kB)
codec/models/wavvae/__pycache__/spectral_ops.cpython-312.pyc
ADDED
Binary file (12.4 kB)
codec/models/wavvae/dataset.py
ADDED
|
@@ -0,0 +1,84 @@
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
from pytorch_lightning import LightningDataModule
|
| 7 |
+
from torch.utils.data import DataLoader, Dataset
|
| 8 |
+
|
| 9 |
+
torch.set_num_threads(1)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class WavVAEDataConfig:
|
| 14 |
+
filelist_path: str
|
| 15 |
+
sampling_rate: int
|
| 16 |
+
num_samples: int
|
| 17 |
+
batch_size: int
|
| 18 |
+
num_workers: int
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class WavVAEDataModule(LightningDataModule):
|
| 22 |
+
def __init__(self, train_params: WavVAEDataConfig, val_params: WavVAEDataConfig):
|
| 23 |
+
super().__init__()
|
| 24 |
+
self.train_config = train_params
|
| 25 |
+
self.val_config = val_params
|
| 26 |
+
|
| 27 |
+
def _get_dataloader(self, cfg: WavVAEDataConfig, train: bool):
|
| 28 |
+
dataset = WavVAEDataset(cfg, train=train)
|
| 29 |
+
dataloader = DataLoader(
|
| 30 |
+
dataset,
|
| 31 |
+
batch_size=cfg.batch_size,
|
| 32 |
+
num_workers=cfg.num_workers,
|
| 33 |
+
shuffle=train,
|
| 34 |
+
pin_memory=True,
|
| 35 |
+
)
|
| 36 |
+
return dataloader
|
| 37 |
+
|
| 38 |
+
def train_dataloader(self) -> DataLoader:
|
| 39 |
+
return self._get_dataloader(self.train_config, train=True)
|
| 40 |
+
|
| 41 |
+
def val_dataloader(self) -> DataLoader:
|
| 42 |
+
return self._get_dataloader(self.val_config, train=False)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class WavVAEDataset(Dataset):
|
| 46 |
+
def __init__(self, cfg: WavVAEDataConfig, train: bool):
|
| 47 |
+
with open(cfg.filelist_path) as f:
|
| 48 |
+
self.filelist = f.read().splitlines()
|
| 49 |
+
self.sampling_rate = cfg.sampling_rate
|
| 50 |
+
self.num_samples = cfg.num_samples
|
| 51 |
+
self.train = train
|
| 52 |
+
|
| 53 |
+
def __len__(self) -> int:
|
| 54 |
+
return len(self.filelist)
|
| 55 |
+
|
| 56 |
+
def __getitem__(self, index: int) -> torch.Tensor:
|
| 57 |
+
audio_path = self.filelist[index]
|
| 58 |
+
y, sr = torchaudio.load(audio_path)
|
| 59 |
+
if y.size(0) > 1:
|
| 60 |
+
# mix to mono
|
| 61 |
+
y = y.mean(dim=0, keepdim=True)
|
| 62 |
+
gain = np.random.uniform(-6, -1) if self.train else -3  # random peak level (dB) in training, fixed -3 dB for validation
|
| 63 |
+
y, _ = torchaudio.sox_effects.apply_effects_tensor(
|
| 64 |
+
y, sr, [["norm", f"{gain:.2f}"]]
|
| 65 |
+
)
|
| 66 |
+
try:
|
| 67 |
+
if sr != self.sampling_rate:
|
| 68 |
+
y = torchaudio.functional.resample(
|
| 69 |
+
y, orig_freq=sr, new_freq=self.sampling_rate
|
| 70 |
+
)
|
| 71 |
+
except Exception:
|
| 72 |
+
print(audio_path, y.shape)
|
| 73 |
+
if y.size(-1) < self.num_samples:
|
| 74 |
+
pad_length = self.num_samples - y.size(-1)
|
| 75 |
+
padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
|
| 76 |
+
y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
|
| 77 |
+
elif self.train:
|
| 78 |
+
start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
|
| 79 |
+
y = y[:, start : start + self.num_samples]
|
| 80 |
+
else:
|
| 81 |
+
# During validation, take always the first segment for determinism
|
| 82 |
+
y = y[:, : self.num_samples]
|
| 83 |
+
|
| 84 |
+
return y[0]
|
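A hedged wiring example for the data module above (file paths and sizes are placeholders):

train_cfg = WavVAEDataConfig(
    filelist_path="filelists/train.txt",   # one audio path per line (assumed layout)
    sampling_rate=24000,
    num_samples=2 * 24000,                 # 2-second crops
    batch_size=16,
    num_workers=4,
)
val_cfg = WavVAEDataConfig(
    filelist_path="filelists/val.txt",
    sampling_rate=24000,
    num_samples=2 * 24000,
    batch_size=16,
    num_workers=4,
)
dm = WavVAEDataModule(train_params=train_cfg, val_params=val_cfg)
batch = next(iter(dm.train_dataloader()))  # (batch_size, num_samples) mono waveforms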
codec/models/wavvae/discriminators.py
ADDED
|
@@ -0,0 +1,211 @@
| 1 |
+
from typing import List, Optional, Tuple
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from einops import rearrange
|
| 5 |
+
from torch import nn
|
| 6 |
+
from torch.nn import Conv2d
|
| 7 |
+
from torch.nn.utils import weight_norm
|
| 8 |
+
from torchaudio.transforms import Spectrogram
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class MultiPeriodDiscriminator(nn.Module):
|
| 12 |
+
"""
|
| 13 |
+
Multi-Period Discriminator module adapted from https://github.com/jik876/hifi-gan.
|
| 14 |
+
Additionally, it allows incorporating conditional information with a learned embeddings table.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
periods (tuple[int]): Tuple of periods for each discriminator.
|
| 18 |
+
num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
|
| 19 |
+
Defaults to None.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, periods: Tuple[int, ...] = (2, 3, 5, 7, 11), num_embeddings: Optional[int] = None):
|
| 23 |
+
super().__init__()
|
| 24 |
+
self.discriminators = nn.ModuleList([DiscriminatorP(period=p, num_embeddings=num_embeddings) for p in periods])
|
| 25 |
+
|
| 26 |
+
def forward(
|
| 27 |
+
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: Optional[torch.Tensor] = None
|
| 28 |
+
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
|
| 29 |
+
y_d_rs = []
|
| 30 |
+
y_d_gs = []
|
| 31 |
+
fmap_rs = []
|
| 32 |
+
fmap_gs = []
|
| 33 |
+
for d in self.discriminators:
|
| 34 |
+
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
|
| 35 |
+
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
|
| 36 |
+
y_d_rs.append(y_d_r)
|
| 37 |
+
fmap_rs.append(fmap_r)
|
| 38 |
+
y_d_gs.append(y_d_g)
|
| 39 |
+
fmap_gs.append(fmap_g)
|
| 40 |
+
|
| 41 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class DiscriminatorP(nn.Module):
|
| 45 |
+
def __init__(
|
| 46 |
+
self,
|
| 47 |
+
period: int,
|
| 48 |
+
in_channels: int = 1,
|
| 49 |
+
kernel_size: int = 5,
|
| 50 |
+
stride: int = 3,
|
| 51 |
+
lrelu_slope: float = 0.1,
|
| 52 |
+
num_embeddings: Optional[int] = None,
|
| 53 |
+
):
|
| 54 |
+
super().__init__()
|
| 55 |
+
self.period = period
|
| 56 |
+
self.convs = nn.ModuleList(
|
| 57 |
+
[
|
| 58 |
+
weight_norm(Conv2d(in_channels, 32, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
|
| 59 |
+
weight_norm(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
|
| 60 |
+
weight_norm(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
|
| 61 |
+
weight_norm(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
|
| 62 |
+
weight_norm(Conv2d(1024, 1024, (kernel_size, 1), (1, 1), padding=(kernel_size // 2, 0))),
|
| 63 |
+
]
|
| 64 |
+
)
|
| 65 |
+
if num_embeddings is not None:
|
| 66 |
+
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=1024)
|
| 67 |
+
torch.nn.init.zeros_(self.emb.weight)
|
| 68 |
+
|
| 69 |
+
self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
| 70 |
+
self.lrelu_slope = lrelu_slope
|
| 71 |
+
|
| 72 |
+
def forward(
|
| 73 |
+
self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
|
| 74 |
+
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
|
| 75 |
+
x = x.unsqueeze(1)
|
| 76 |
+
fmap = []
|
| 77 |
+
# 1d to 2d
|
| 78 |
+
b, c, t = x.shape
|
| 79 |
+
if t % self.period != 0: # pad first
|
| 80 |
+
n_pad = self.period - (t % self.period)
|
| 81 |
+
x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
|
| 82 |
+
t = t + n_pad
|
| 83 |
+
x = x.view(b, c, t // self.period, self.period)
|
| 84 |
+
|
| 85 |
+
for i, l in enumerate(self.convs):
|
| 86 |
+
x = l(x)
|
| 87 |
+
x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
|
| 88 |
+
if i > 0:
|
| 89 |
+
fmap.append(x)
|
| 90 |
+
if cond_embedding_id is not None:
|
| 91 |
+
emb = self.emb(cond_embedding_id)
|
| 92 |
+
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
|
| 93 |
+
else:
|
| 94 |
+
h = 0
|
| 95 |
+
x = self.conv_post(x)
|
| 96 |
+
fmap.append(x)
|
| 97 |
+
x += h
|
| 98 |
+
x = torch.flatten(x, 1, -1)
|
| 99 |
+
|
| 100 |
+
return x, fmap
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class MultiResolutionDiscriminator(nn.Module):
|
| 104 |
+
def __init__(
|
| 105 |
+
self,
|
| 106 |
+
fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
|
| 107 |
+
num_embeddings: Optional[int] = None,
|
| 108 |
+
):
|
| 109 |
+
"""
|
| 110 |
+
Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
|
| 111 |
+
Additionally, it allows incorporating conditional information with a learned embeddings table.
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
|
| 115 |
+
num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
|
| 116 |
+
Defaults to None.
|
| 117 |
+
"""
|
| 118 |
+
|
| 119 |
+
super().__init__()
|
| 120 |
+
self.discriminators = nn.ModuleList(
|
| 121 |
+
[DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
def forward(
|
| 125 |
+
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
| 126 |
+
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
|
| 127 |
+
y_d_rs = []
|
| 128 |
+
y_d_gs = []
|
| 129 |
+
fmap_rs = []
|
| 130 |
+
fmap_gs = []
|
| 131 |
+
|
| 132 |
+
for d in self.discriminators:
|
| 133 |
+
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
|
| 134 |
+
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
|
| 135 |
+
y_d_rs.append(y_d_r)
|
| 136 |
+
fmap_rs.append(fmap_r)
|
| 137 |
+
y_d_gs.append(y_d_g)
|
| 138 |
+
fmap_gs.append(fmap_g)
|
| 139 |
+
|
| 140 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class DiscriminatorR(nn.Module):
|
| 144 |
+
def __init__(
|
| 145 |
+
self,
|
| 146 |
+
window_length: int,
|
| 147 |
+
num_embeddings: Optional[int] = None,
|
| 148 |
+
channels: int = 32,
|
| 149 |
+
hop_factor: float = 0.25,
|
| 150 |
+
bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
|
| 151 |
+
):
|
| 152 |
+
super().__init__()
|
| 153 |
+
self.window_length = window_length
|
| 154 |
+
self.hop_factor = hop_factor
|
| 155 |
+
self.spec_fn = Spectrogram(
|
| 156 |
+
n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
|
| 157 |
+
)
|
| 158 |
+
n_fft = window_length // 2 + 1
|
| 159 |
+
bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
|
| 160 |
+
self.bands = bands
|
| 161 |
+
convs = lambda: nn.ModuleList(
|
| 162 |
+
[
|
| 163 |
+
weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
|
| 164 |
+
weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
|
| 165 |
+
weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
|
| 166 |
+
weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
|
| 167 |
+
weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
|
| 168 |
+
]
|
| 169 |
+
)
|
| 170 |
+
self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
|
| 171 |
+
|
| 172 |
+
if num_embeddings is not None:
|
| 173 |
+
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
|
| 174 |
+
torch.nn.init.zeros_(self.emb.weight)
|
| 175 |
+
|
| 176 |
+
self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
|
| 177 |
+
|
| 178 |
+
def spectrogram(self, x):
|
| 179 |
+
# Remove DC offset
|
| 180 |
+
x = x - x.mean(dim=-1, keepdims=True)
|
| 181 |
+
# Peak normalize the volume of input audio
|
| 182 |
+
x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
|
| 183 |
+
x = self.spec_fn(x)
|
| 184 |
+
x = torch.view_as_real(x)
|
| 185 |
+
x = rearrange(x, "b f t c -> b c t f")
|
| 186 |
+
# Split into bands
|
| 187 |
+
x_bands = [x[..., b[0] : b[1]] for b in self.bands]
|
| 188 |
+
return x_bands
|
| 189 |
+
|
| 190 |
+
def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
|
| 191 |
+
x_bands = self.spectrogram(x)
|
| 192 |
+
fmap = []
|
| 193 |
+
x = []
|
| 194 |
+
for band, stack in zip(x_bands, self.band_convs):
|
| 195 |
+
for i, layer in enumerate(stack):
|
| 196 |
+
band = layer(band)
|
| 197 |
+
band = torch.nn.functional.leaky_relu(band, 0.1)
|
| 198 |
+
if i > 0:
|
| 199 |
+
fmap.append(band)
|
| 200 |
+
x.append(band)
|
| 201 |
+
x = torch.cat(x, dim=-1)
|
| 202 |
+
if cond_embedding_id is not None:
|
| 203 |
+
emb = self.emb(cond_embedding_id)
|
| 204 |
+
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
|
| 205 |
+
else:
|
| 206 |
+
h = 0
|
| 207 |
+
x = self.conv_post(x)
|
| 208 |
+
fmap.append(x)
|
| 209 |
+
x += h
|
| 210 |
+
|
| 211 |
+
return x, fmap
|
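A brief sketch of how these discriminators are driven during adversarial training (waveform shapes are assumptions; the matching hinge-style losses are defined in loss.py further down):

import torch

mpd = MultiPeriodDiscriminator()
mrd = MultiResolutionDiscriminator()
y = torch.randn(4, 24000)        # real waveforms (B, T)
y_hat = torch.randn(4, 24000)    # generated waveforms (B, T)
real_scores, fake_scores, fmap_real, fmap_fake = mpd(y, y_hat)
# each element of real_scores / fake_scores is one sub-discriminator's flattened logits,
# and each element of fmap_real / fmap_fake is that sub-discriminator's intermediate feature maps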
codec/models/wavvae/experiment.py
ADDED
|
@@ -0,0 +1,3 @@
| 1 |
+
|
| 2 |
+
|
| 3 |
+
|
codec/models/wavvae/heads.py
ADDED
|
@@ -0,0 +1,194 @@
| 1 |
+
from typing import Optional
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from einops import rearrange
|
| 5 |
+
from torch import nn
|
| 6 |
+
from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz
|
| 7 |
+
|
| 8 |
+
from .modules import symexp
|
| 9 |
+
from .spectral_ops import IMDCT, ISTFT
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FourierHead(nn.Module):
|
| 13 |
+
"""Base class for inverse fourier modules."""
|
| 14 |
+
|
| 15 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 16 |
+
"""
|
| 17 |
+
Args:
|
| 18 |
+
x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
|
| 19 |
+
L is the sequence length, and H denotes the model dimension.
|
| 20 |
+
|
| 21 |
+
Returns:
|
| 22 |
+
Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
|
| 23 |
+
"""
|
| 24 |
+
raise NotImplementedError("Subclasses must implement the forward method.")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class LinearNoBiasHead(FourierHead):
|
| 28 |
+
def __init__(self, dim: int, hop_length: int, n_fft: int):
|
| 29 |
+
super().__init__()
|
| 30 |
+
self.pre_head = nn.Linear(dim, n_fft + 2)
|
| 31 |
+
self.head = nn.Linear(n_fft + 2, hop_length, bias=False)
|
| 32 |
+
|
| 33 |
+
def forward(self, x):
|
| 34 |
+
y = self.pre_head(x)
|
| 35 |
+
y = self.head(y).clamp(min=-1.0, max=1.0)
|
| 36 |
+
B, _, _ = y.shape
|
| 37 |
+
return y.reshape(B, -1)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class ISTFTHead(FourierHead):
|
| 41 |
+
"""
|
| 42 |
+
ISTFT Head module for predicting STFT complex coefficients.
|
| 43 |
+
|
| 44 |
+
Args:
|
| 45 |
+
dim (int): Hidden dimension of the model.
|
| 46 |
+
n_fft (int): Size of Fourier transform.
|
| 47 |
+
hop_length (int): The distance between neighboring sliding window frames, which should align with
|
| 48 |
+
the resolution of the input features.
|
| 49 |
+
padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
|
| 53 |
+
super().__init__()
|
| 54 |
+
out_dim = n_fft + 2
|
| 55 |
+
self.out = torch.nn.Linear(dim, out_dim)
|
| 56 |
+
self.hop_length = hop_length
|
| 57 |
+
self.istft = ISTFT(
|
| 58 |
+
n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 62 |
+
"""
|
| 63 |
+
Forward pass of the ISTFTHead module.
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
|
| 67 |
+
L is the sequence length, and H denotes the model dimension.
|
| 68 |
+
|
| 69 |
+
Returns:
|
| 70 |
+
Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
|
| 71 |
+
"""
|
| 72 |
+
x = self.out(x).transpose(1, 2)
|
| 73 |
+
mag, p = x.chunk(2, dim=1)
|
| 74 |
+
mag = torch.exp(mag)
|
| 75 |
+
mag = torch.clip(
|
| 76 |
+
mag, max=1e2
|
| 77 |
+
) # safeguard to prevent excessively large magnitudes
|
| 78 |
+
# wrapping happens here. These two lines produce real and imaginary value
|
| 79 |
+
x = torch.cos(p)
|
| 80 |
+
y = torch.sin(p)
|
| 81 |
+
# recalculating phase here does not produce anything new
|
| 82 |
+
# only costs time
|
| 83 |
+
# phase = torch.atan2(y, x)
|
| 84 |
+
# S = mag * torch.exp(phase * 1j)
|
| 85 |
+
# better directly produce the complex value
|
| 86 |
+
S = mag * (x + 1j * y)
|
| 87 |
+
audio = self.istft(S)
|
| 88 |
+
audio = nn.functional.pad(audio, (self.hop_length // 2, self.hop_length // 2))
|
| 89 |
+
return audio
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class IMDCTSymExpHead(FourierHead):
|
| 93 |
+
"""
|
| 94 |
+
IMDCT Head module for predicting MDCT coefficients with symmetric exponential function
|
| 95 |
+
|
| 96 |
+
Args:
|
| 97 |
+
dim (int): Hidden dimension of the model.
|
| 98 |
+
mdct_frame_len (int): Length of the MDCT frame.
|
| 99 |
+
padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
|
| 100 |
+
sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized
|
| 101 |
+
based on perceptual scaling. Defaults to None.
|
| 102 |
+
clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
|
| 103 |
+
"""
|
| 104 |
+
|
| 105 |
+
def __init__(
|
| 106 |
+
self,
|
| 107 |
+
dim: int,
|
| 108 |
+
mdct_frame_len: int,
|
| 109 |
+
padding: str = "same",
|
| 110 |
+
sample_rate: Optional[int] = None,
|
| 111 |
+
clip_audio: bool = False,
|
| 112 |
+
):
|
| 113 |
+
super().__init__()
|
| 114 |
+
out_dim = mdct_frame_len // 2
|
| 115 |
+
self.out = nn.Linear(dim, out_dim)
|
| 116 |
+
self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
|
| 117 |
+
self.clip_audio = clip_audio
|
| 118 |
+
|
| 119 |
+
if sample_rate is not None:
|
| 120 |
+
# optionally init the last layer following mel-scale
|
| 121 |
+
m_max = _hz_to_mel(sample_rate // 2)
|
| 122 |
+
m_pts = torch.linspace(0, m_max, out_dim)
|
| 123 |
+
f_pts = _mel_to_hz(m_pts)
|
| 124 |
+
scale = 1 - (f_pts / f_pts.max())
|
| 125 |
+
|
| 126 |
+
with torch.no_grad():
|
| 127 |
+
self.out.weight.mul_(scale.view(-1, 1))
|
| 128 |
+
|
| 129 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 130 |
+
"""
|
| 131 |
+
Forward pass of the IMDCTSymExpHead module.
|
| 132 |
+
|
| 133 |
+
Args:
|
| 134 |
+
x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
|
| 135 |
+
L is the sequence length, and H denotes the model dimension.
|
| 136 |
+
|
| 137 |
+
Returns:
|
| 138 |
+
Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
|
| 139 |
+
"""
|
| 140 |
+
x = self.out(x)
|
| 141 |
+
x = symexp(x)
|
| 142 |
+
x = torch.clip(
|
| 143 |
+
x, min=-1e2, max=1e2
|
| 144 |
+
) # safeguard to prevent excessively large magnitudes
|
| 145 |
+
audio = self.imdct(x)
|
| 146 |
+
if self.clip_audio:
|
| 147 |
+
audio = torch.clip(audio, min=-1.0, max=1.0)
|
| 148 |
+
|
| 149 |
+
return audio
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class IMDCTCosHead(FourierHead):
|
| 153 |
+
"""
|
| 154 |
+
IMDCT Head module for predicting MDCT coefficients with parametrizing MDCT = exp(m) · cos(p)
|
| 155 |
+
|
| 156 |
+
Args:
|
| 157 |
+
dim (int): Hidden dimension of the model.
|
| 158 |
+
mdct_frame_len (int): Length of the MDCT frame.
|
| 159 |
+
padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
|
| 160 |
+
clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
|
| 161 |
+
"""
|
| 162 |
+
|
| 163 |
+
def __init__(
|
| 164 |
+
self,
|
| 165 |
+
dim: int,
|
| 166 |
+
mdct_frame_len: int,
|
| 167 |
+
padding: str = "same",
|
| 168 |
+
clip_audio: bool = False,
|
| 169 |
+
):
|
| 170 |
+
super().__init__()
|
| 171 |
+
self.clip_audio = clip_audio
|
| 172 |
+
self.out = nn.Linear(dim, mdct_frame_len)
|
| 173 |
+
self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
|
| 174 |
+
|
| 175 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 176 |
+
"""
|
| 177 |
+
Forward pass of the IMDCTCosHead module.
|
| 178 |
+
|
| 179 |
+
Args:
|
| 180 |
+
x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
|
| 181 |
+
L is the sequence length, and H denotes the model dimension.
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
|
| 185 |
+
"""
|
| 186 |
+
x = self.out(x)
|
| 187 |
+
m, p = x.chunk(2, dim=2)
|
| 188 |
+
m = torch.exp(m).clip(
|
| 189 |
+
max=1e2
|
| 190 |
+
) # safeguard to prevent excessively large magnitudes
|
| 191 |
+
audio = self.imdct(m * torch.cos(p))
|
| 192 |
+
if self.clip_audio:
|
| 193 |
+
audio = torch.clip(audio, min=-1.0, max=1.0)
|
| 194 |
+
return audio
|
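All heads map per-frame hidden vectors to a waveform; a minimal shape sketch for ISTFTHead (sizes are illustrative):

import torch

head = ISTFTHead(dim=768, n_fft=1024, hop_length=256, padding="same")
feats = torch.randn(2, 100, 768)   # (B, frames, hidden)
audio = head(feats)                # (B, ~frames * hop_length) time-domain signal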
codec/models/wavvae/helpers.py
ADDED
|
@@ -0,0 +1,71 @@
| 1 |
+
import matplotlib
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
from matplotlib import pyplot as plt
|
| 5 |
+
from pytorch_lightning import Callback
|
| 6 |
+
|
| 7 |
+
matplotlib.use("Agg")
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def save_figure_to_numpy(fig: plt.Figure) -> np.ndarray:
|
| 11 |
+
"""
|
| 12 |
+
Save a matplotlib figure to a numpy array.
|
| 13 |
+
|
| 14 |
+
Args:
|
| 15 |
+
fig (Figure): Matplotlib figure object.
|
| 16 |
+
|
| 17 |
+
Returns:
|
| 18 |
+
ndarray: Numpy array representing the figure.
|
| 19 |
+
"""
|
| 20 |
+
data = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
|
| 21 |
+
data = data.reshape(fig.canvas.get_width_height()[::-1] + (4,))[:, :, :3]
|
| 22 |
+
return data
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def plot_spectrogram_to_numpy(spectrogram: np.ndarray) -> np.ndarray:
|
| 26 |
+
"""
|
| 27 |
+
Plot a spectrogram and convert it to a numpy array.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
spectrogram (ndarray): Spectrogram data.
|
| 31 |
+
|
| 32 |
+
Returns:
|
| 33 |
+
ndarray: Numpy array representing the plotted spectrogram.
|
| 34 |
+
"""
|
| 35 |
+
spectrogram = spectrogram.astype(np.float32)
|
| 36 |
+
fig, ax = plt.subplots(figsize=(12, 3))
|
| 37 |
+
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
|
| 38 |
+
plt.colorbar(im, ax=ax)
|
| 39 |
+
plt.xlabel("Frames")
|
| 40 |
+
plt.ylabel("Channels")
|
| 41 |
+
plt.tight_layout()
|
| 42 |
+
|
| 43 |
+
fig.canvas.draw()
|
| 44 |
+
data = save_figure_to_numpy(fig)
|
| 45 |
+
plt.close()
|
| 46 |
+
return data
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class GradNormCallback(Callback):
|
| 50 |
+
"""
|
| 51 |
+
Callback to log the gradient norm.
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
def on_after_backward(self, trainer, model):
|
| 55 |
+
model.log("grad_norm", gradient_norm(model))
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def gradient_norm(model: torch.nn.Module, norm_type: float = 2.0) -> torch.Tensor:
|
| 59 |
+
"""
|
| 60 |
+
Compute the gradient norm.
|
| 61 |
+
|
| 62 |
+
Args:
|
| 63 |
+
model (Module): PyTorch model.
|
| 64 |
+
norm_type (float, optional): Type of the norm. Defaults to 2.0.
|
| 65 |
+
|
| 66 |
+
Returns:
|
| 67 |
+
Tensor: Gradient norm.
|
| 68 |
+
"""
|
| 69 |
+
grads = [p.grad for p in model.parameters() if p.grad is not None]
|
| 70 |
+
total_norm = torch.norm(torch.stack([torch.norm(g.detach(), norm_type) for g in grads]), norm_type)
|
| 71 |
+
return total_norm
|
codec/models/wavvae/layers.py
ADDED
|
@@ -0,0 +1,282 @@
| 1 |
+
import math
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.nn.utils.parametrizations import weight_norm
|
| 6 |
+
|
| 7 |
+
class VocosDecoder(nn.Module):
|
| 8 |
+
def __init__(
|
| 9 |
+
self,
|
| 10 |
+
dim: int,
|
| 11 |
+
intermediate_dim: int,
|
| 12 |
+
num_layers: int,
|
| 13 |
+
):
|
| 14 |
+
super().__init__()
|
| 15 |
+
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
| 16 |
+
self.convnext = nn.ModuleList(
|
| 17 |
+
[
|
| 18 |
+
ConvNeXtBlock(
|
| 19 |
+
dim=dim,
|
| 20 |
+
intermediate_dim=intermediate_dim,
|
| 21 |
+
layer_scale_init_value=0.0,
|
| 22 |
+
)
|
| 23 |
+
for _ in range(num_layers)
|
| 24 |
+
]
|
| 25 |
+
)
|
| 26 |
+
self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
|
| 27 |
+
|
| 28 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 29 |
+
x = self.norm(x)
|
| 30 |
+
x = x.transpose(1, 2)
|
| 31 |
+
for conv_block in self.convnext:
|
| 32 |
+
x = conv_block(x)
|
| 33 |
+
x = self.final_layer_norm(x.transpose(1, 2))
|
| 34 |
+
return x
|
| 35 |
+
|
| 36 |
+
class ConvNeXtBlock(nn.Module):
|
| 37 |
+
"""ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
|
| 38 |
+
|
| 39 |
+
Args:
|
| 40 |
+
dim (int): Number of input channels.
|
| 41 |
+
intermediate_dim (int): Dimensionality of the intermediate layer.
|
| 42 |
+
layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
|
| 43 |
+
Defaults to None.
|
| 44 |
+
adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
|
| 45 |
+
None means non-conditional LayerNorm. Defaults to None.
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
def __init__(
|
| 49 |
+
self,
|
| 50 |
+
dim: int,
|
| 51 |
+
intermediate_dim: int | None = None,
|
| 52 |
+
layer_scale_init_value: float = 0.0,
|
| 53 |
+
elementwise_affine_ln: bool = True,
|
| 54 |
+
):
|
| 55 |
+
super().__init__()
|
| 56 |
+
intermediate_dim = intermediate_dim if intermediate_dim is not None else dim * 3
|
| 57 |
+
self.dwconv = nn.Conv1d(
|
| 58 |
+
dim, dim, kernel_size=7, padding=3, groups=dim
|
| 59 |
+
) # depthwise conv
|
| 60 |
+
self.norm = nn.LayerNorm(
|
| 61 |
+
dim, eps=1e-6, elementwise_affine=elementwise_affine_ln
|
| 62 |
+
)
|
| 63 |
+
self.pwconv1 = nn.Linear(
|
| 64 |
+
dim, intermediate_dim
|
| 65 |
+
) # pointwise/1x1 convs, implemented with linear layers
|
| 66 |
+
self.act = nn.GELU()
|
| 67 |
+
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
| 68 |
+
self.gamma = (
|
| 69 |
+
nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
| 70 |
+
if layer_scale_init_value > 0
|
| 71 |
+
else None
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
def forward(
|
| 75 |
+
self,
|
| 76 |
+
x: torch.Tensor,
|
| 77 |
+
scale_shift: tuple[torch.Tensor, torch.Tensor] | None = None,
|
| 78 |
+
gate: torch.Tensor | None = None,
|
| 79 |
+
) -> torch.Tensor:
|
| 80 |
+
residual = x
|
| 81 |
+
x = self.dwconv(x)
|
| 82 |
+
x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
|
| 83 |
+
x = self.norm(x)
|
| 84 |
+
if scale_shift is not None:
|
| 85 |
+
scale, shift = scale_shift
|
| 86 |
+
x = x * scale[:, None] + shift[:, None]
|
| 87 |
+
x = self.pwconv1(x)
|
| 88 |
+
x = self.act(x)
|
| 89 |
+
x = self.pwconv2(x)
|
| 90 |
+
if self.gamma is not None:
|
| 91 |
+
x = self.gamma * x
|
| 92 |
+
if gate is not None:
|
| 93 |
+
x = gate[:, None] * x
|
| 94 |
+
x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
|
| 95 |
+
|
| 96 |
+
x = residual + x
|
| 97 |
+
return x
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class Encoder(nn.Module):
|
| 101 |
+
def __init__(
|
| 102 |
+
self,
|
| 103 |
+
d_model=32,
|
| 104 |
+
strides=[2, 4, 4, 8],
|
| 105 |
+
depthwise=False,
|
| 106 |
+
):
|
| 107 |
+
super().__init__()
|
| 108 |
+
layers = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
|
| 109 |
+
for stride in strides:
|
| 110 |
+
d_model *= 2
|
| 111 |
+
groups = d_model // 2 if depthwise else 1
|
| 112 |
+
layers += [EncoderBlock(output_dim=d_model, stride=stride, groups=groups)]
|
| 113 |
+
groups = d_model if depthwise else 1
|
| 114 |
+
layers += [
|
| 115 |
+
WNConv1d(d_model, d_model, kernel_size=7, padding=3, groups=groups),
|
| 116 |
+
]
|
| 117 |
+
self.block = nn.Sequential(*layers)
|
| 118 |
+
|
| 119 |
+
def forward(self, x):
|
| 120 |
+
return self.block(x)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class Decoder(nn.Module):
|
| 124 |
+
def __init__(
|
| 125 |
+
self,
|
| 126 |
+
input_channel,
|
| 127 |
+
channels,
|
| 128 |
+
rates,
|
| 129 |
+
noise=False,
|
| 130 |
+
depthwise=False,
|
| 131 |
+
d_out=1,
|
| 132 |
+
):
|
| 133 |
+
super().__init__()
|
| 134 |
+
if depthwise:
|
| 135 |
+
layers = [
|
| 136 |
+
WNConv1d(
|
| 137 |
+
input_channel,
|
| 138 |
+
input_channel,
|
| 139 |
+
kernel_size=7,
|
| 140 |
+
padding=3,
|
| 141 |
+
groups=input_channel,
|
| 142 |
+
),
|
| 143 |
+
WNConv1d(input_channel, channels, kernel_size=1),
|
| 144 |
+
]
|
| 145 |
+
else:
|
| 146 |
+
layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
|
| 147 |
+
|
| 148 |
+
for i, stride in enumerate(rates):
|
| 149 |
+
input_dim = channels // 2**i
|
| 150 |
+
output_dim = channels // 2 ** (i + 1)
|
| 151 |
+
groups = output_dim if depthwise else 1
|
| 152 |
+
layers.append(
|
| 153 |
+
DecoderBlock(input_dim, output_dim, stride, noise, groups=groups)
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
layers += [
|
| 157 |
+
Snake1d(output_dim),
|
| 158 |
+
WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
|
| 159 |
+
nn.Tanh(),
|
| 160 |
+
]
|
| 161 |
+
self.model = nn.Sequential(*layers)
|
| 162 |
+
|
| 163 |
+
def forward(self, x):
|
| 164 |
+
x = self.model(x)
|
| 165 |
+
return x
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
class ResidualUnit(nn.Module):
|
| 169 |
+
def __init__(self, dim=16, dilation=1, kernel=7, groups=1):
|
| 170 |
+
super().__init__()
|
| 171 |
+
pad = ((kernel - 1) * dilation) // 2
|
| 172 |
+
self.block = nn.Sequential(
|
| 173 |
+
Snake1d(dim),
|
| 174 |
+
WNConv1d(
|
| 175 |
+
dim,
|
| 176 |
+
dim,
|
| 177 |
+
kernel_size=kernel,
|
| 178 |
+
dilation=dilation,
|
| 179 |
+
padding=pad,
|
| 180 |
+
groups=groups,
|
| 181 |
+
),
|
| 182 |
+
Snake1d(dim),
|
| 183 |
+
WNConv1d(dim, dim, kernel_size=1),
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
def forward(self, x):
|
| 187 |
+
y = self.block(x)
|
| 188 |
+
pad = (x.shape[-1] - y.shape[-1]) // 2
|
| 189 |
+
if pad > 0:
|
| 190 |
+
x = x[..., pad:-pad]
|
| 191 |
+
return x + y
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class EncoderBlock(nn.Module):
|
| 195 |
+
def __init__(self, output_dim=16, input_dim=None, stride=1, groups=1):
|
| 196 |
+
super().__init__()
|
| 197 |
+
input_dim = input_dim or output_dim // 2
|
| 198 |
+
self.block = nn.Sequential(
|
| 199 |
+
ResidualUnit(input_dim, dilation=1, groups=groups),
|
| 200 |
+
ResidualUnit(input_dim, dilation=3, groups=groups),
|
| 201 |
+
ResidualUnit(input_dim, dilation=9, groups=groups),
|
| 202 |
+
Snake1d(input_dim),
|
| 203 |
+
WNConv1d(
|
| 204 |
+
input_dim,
|
| 205 |
+
output_dim,
|
| 206 |
+
kernel_size=2 * stride,
|
| 207 |
+
stride=stride,
|
| 208 |
+
padding=math.ceil(stride / 2),
|
| 209 |
+
),
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
def forward(self, x):
|
| 213 |
+
return self.block(x)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class NoiseBlock(nn.Module):
|
| 217 |
+
def __init__(self, dim):
|
| 218 |
+
super().__init__()
|
| 219 |
+
self.linear = WNConv1d(dim, dim, kernel_size=1, bias=False)
|
| 220 |
+
|
| 221 |
+
def forward(self, x):
|
| 222 |
+
B, C, T = x.shape
|
| 223 |
+
noise = torch.randn((B, 1, T), device=x.device, dtype=x.dtype)
|
| 224 |
+
h = self.linear(x)
|
| 225 |
+
n = noise * h
|
| 226 |
+
x = x + n
|
| 227 |
+
return x
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
class DecoderBlock(nn.Module):
|
| 231 |
+
def __init__(self, input_dim=16, output_dim=8, stride=1, noise=False, groups=1):
|
| 232 |
+
super().__init__()
|
| 233 |
+
layers = [
|
| 234 |
+
Snake1d(input_dim),
|
| 235 |
+
WNConvTranspose1d(
|
| 236 |
+
input_dim,
|
| 237 |
+
output_dim,
|
| 238 |
+
kernel_size=2 * stride,
|
| 239 |
+
stride=stride,
|
| 240 |
+
padding=math.ceil(stride / 2),
|
| 241 |
+
output_padding=stride % 2,
|
| 242 |
+
),
|
| 243 |
+
]
|
| 244 |
+
if noise:
|
| 245 |
+
layers.append(NoiseBlock(output_dim))
|
| 246 |
+
layers.extend(
|
| 247 |
+
[
|
| 248 |
+
ResidualUnit(output_dim, dilation=1, groups=groups),
|
| 249 |
+
ResidualUnit(output_dim, dilation=3, groups=groups),
|
| 250 |
+
ResidualUnit(output_dim, dilation=9, groups=groups),
|
| 251 |
+
]
|
| 252 |
+
)
|
| 253 |
+
self.block = nn.Sequential(*layers)
|
| 254 |
+
|
| 255 |
+
def forward(self, x):
|
| 256 |
+
return self.block(x)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def WNConv1d(*args, **kwargs):
|
| 260 |
+
return weight_norm(nn.Conv1d(*args, **kwargs))
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def WNConvTranspose1d(*args, **kwargs):
|
| 264 |
+
return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
@torch.jit.script
|
| 268 |
+
def snake(x, alpha):
|
| 269 |
+
shape = x.shape
|
| 270 |
+
x = x.reshape(shape[0], shape[1], -1)
|
| 271 |
+
x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
|
| 272 |
+
x = x.reshape(shape)
|
| 273 |
+
return x
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
class Snake1d(nn.Module):
|
| 277 |
+
def __init__(self, channels):
|
| 278 |
+
super().__init__()
|
| 279 |
+
self.alpha = nn.Parameter(torch.ones(1, channels, 1))
|
| 280 |
+
|
| 281 |
+
def forward(self, x):
|
| 282 |
+
return snake(x, self.alpha)
|
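Note on the DAC-style Encoder above: each EncoderBlock doubles the channel count and downsamples by its stride, so the overall hop is the product of the strides. A quick shape sketch (input length is illustrative):

import torch

enc = Encoder(d_model=32, strides=[2, 4, 4, 8], depthwise=True)
wav = torch.randn(1, 1, 24000)    # (B, 1, T) mono audio
feats = enc(wav)                  # (1, 32 * 2**4, ~T / 256) = (1, 512, ~T / 256) latent frames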
codec/models/wavvae/loss.py
ADDED
|
@@ -0,0 +1,142 @@
| 1 |
+
from typing import List, Optional, Tuple
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torchaudio
|
| 5 |
+
from torch import nn
|
| 6 |
+
|
| 7 |
+
from .modules import safe_log
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MelSpecReconstructionLoss(nn.Module):
|
| 11 |
+
"""
|
| 12 |
+
L1 distance between the mel-scaled magnitude spectrograms of the ground truth sample and the generated sample
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
sample_rate: int = 24000,
|
| 18 |
+
n_fft: int | None = None,
|
| 19 |
+
hop_length: int = 256,
|
| 20 |
+
n_mels: int = 100,
|
| 21 |
+
f_min: int = 0,
|
| 22 |
+
f_max: Optional[int] = None,
|
| 23 |
+
):
|
| 24 |
+
super().__init__()
|
| 25 |
+
self.mel_spec = torchaudio.transforms.MelSpectrogram(
|
| 26 |
+
sample_rate=sample_rate,
|
| 27 |
+
n_fft=hop_length * 4 if n_fft is None else n_fft,
|
| 28 |
+
hop_length=hop_length,
|
| 29 |
+
n_mels=n_mels,
|
| 30 |
+
center=True,
|
| 31 |
+
power=1,
|
| 32 |
+
f_min=f_min,
|
| 33 |
+
f_max=f_max,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
def forward(self, y_hat, y) -> torch.Tensor:
|
| 37 |
+
"""
|
| 38 |
+
Args:
|
| 39 |
+
y_hat (Tensor): Predicted audio waveform.
|
| 40 |
+
y (Tensor): Ground truth audio waveform.
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
Tensor: L1 loss between the mel-scaled magnitude spectrograms.
|
| 44 |
+
"""
|
| 45 |
+
# B, C, Th = y_hat.shape
|
| 46 |
+
# B, C, T = y.shape
|
| 47 |
+
# crop = (Th - T) // 2
|
| 48 |
+
mel_hat = safe_log(self.mel_spec(y_hat))
|
| 49 |
+
# mel_hat = safe_log(self.mel_spec(y_hat[..., crop:-crop]))
|
| 50 |
+
# mel = safe_log(self.mel_spec(y[..., crop:-crop]))
|
| 51 |
+
mel = safe_log(self.mel_spec(y))
|
| 52 |
+
|
| 53 |
+
loss = torch.nn.functional.l1_loss(mel, mel_hat)
|
| 54 |
+
|
| 55 |
+
return loss
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class GeneratorLoss(nn.Module):
|
| 59 |
+
"""
|
| 60 |
+
Generator Loss module. Calculates the loss for the generator based on discriminator outputs.
|
| 61 |
+
"""
|
| 62 |
+
|
| 63 |
+
def forward(
|
| 64 |
+
self, disc_outputs: List[torch.Tensor]
|
| 65 |
+
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
|
| 66 |
+
"""
|
| 67 |
+
Args:
|
| 68 |
+
disc_outputs (List[Tensor]): List of discriminator outputs.
|
| 69 |
+
|
| 70 |
+
Returns:
|
| 71 |
+
Tuple[Tensor, List[Tensor]]: Tuple containing the total loss and a list of loss values from
|
| 72 |
+
the sub-discriminators
|
| 73 |
+
"""
|
| 74 |
+
loss = torch.zeros(
|
| 75 |
+
1, device=disc_outputs[0].device, dtype=disc_outputs[0].dtype
|
| 76 |
+
)
|
| 77 |
+
gen_losses = []
|
| 78 |
+
for dg in disc_outputs:
|
| 79 |
+
l = torch.mean(torch.clamp(1 - dg, min=0))
|
| 80 |
+
gen_losses.append(l)
|
| 81 |
+
loss += l
|
| 82 |
+
|
| 83 |
+
return loss, gen_losses
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class DiscriminatorLoss(nn.Module):
|
| 87 |
+
"""
|
| 88 |
+
Discriminator Loss module. Calculates the loss for the discriminator based on real and generated outputs.
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
def forward(
|
| 92 |
+
self,
|
| 93 |
+
disc_real_outputs: List[torch.Tensor],
|
| 94 |
+
disc_generated_outputs: List[torch.Tensor],
|
| 95 |
+
) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
|
| 96 |
+
"""
|
| 97 |
+
Args:
|
| 98 |
+
disc_real_outputs (List[Tensor]): List of discriminator outputs for real samples.
|
| 99 |
+
disc_generated_outputs (List[Tensor]): List of discriminator outputs for generated samples.
|
| 100 |
+
|
| 101 |
+
Returns:
|
| 102 |
+
Tuple[Tensor, List[Tensor], List[Tensor]]: A tuple containing the total loss, a list of loss values from
|
| 103 |
+
the sub-discriminators for real outputs, and a list of
|
| 104 |
+
loss values for generated outputs.
|
| 105 |
+
"""
|
| 106 |
+
loss = torch.zeros(
|
| 107 |
+
1, device=disc_real_outputs[0].device, dtype=disc_real_outputs[0].dtype
|
| 108 |
+
)
|
| 109 |
+
r_losses = []
|
| 110 |
+
g_losses = []
|
| 111 |
+
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
| 112 |
+
r_loss = torch.mean(torch.clamp(1 - dr, min=0))
|
| 113 |
+
g_loss = torch.mean(torch.clamp(1 + dg, min=0))
|
| 114 |
+
loss += r_loss + g_loss
|
| 115 |
+
r_losses.append(r_loss)
|
| 116 |
+
g_losses.append(g_loss)
|
| 117 |
+
|
| 118 |
+
return loss, r_losses, g_losses
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class FeatureMatchingLoss(nn.Module):
|
| 122 |
+
"""
|
| 123 |
+
Feature Matching Loss module. Calculates the feature matching loss between feature maps of the sub-discriminators.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
def forward(
|
| 127 |
+
self, fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]
|
| 128 |
+
) -> torch.Tensor:
|
| 129 |
+
"""
|
| 130 |
+
Args:
|
| 131 |
+
fmap_r (List[List[Tensor]]): List of feature maps from real samples.
|
| 132 |
+
fmap_g (List[List[Tensor]]): List of feature maps from generated samples.
|
| 133 |
+
|
| 134 |
+
Returns:
|
| 135 |
+
Tensor: The calculated feature matching loss.
|
| 136 |
+
"""
|
| 137 |
+
loss = torch.zeros(1, device=fmap_r[0][0].device, dtype=fmap_r[0][0].dtype)
|
| 138 |
+
for dr, dg in zip(fmap_r, fmap_g):
|
| 139 |
+
for rl, gl in zip(dr, dg):
|
| 140 |
+
loss += torch.mean(torch.abs(rl - gl))
|
| 141 |
+
|
| 142 |
+
return loss
|
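A hedged sketch of how these losses combine in one GAN training step (the mel-loss weight and shapes are assumptions, not the project's configuration; MultiPeriodDiscriminator comes from discriminators.py above):

import torch

mpd = MultiPeriodDiscriminator()
mel_loss_fn = MelSpecReconstructionLoss(sample_rate=24000, hop_length=256)
gen_loss_fn, disc_loss_fn, fm_loss_fn = GeneratorLoss(), DiscriminatorLoss(), FeatureMatchingLoss()

y = torch.randn(4, 24000)        # real audio (B, T)
y_hat = torch.randn(4, 24000)    # generator output (B, T)

# discriminator step: hinge loss on real vs. detached fake scores
real_s, fake_s, _, _ = mpd(y, y_hat.detach())
d_loss, _, _ = disc_loss_fn(real_s, fake_s)

# generator step: adversarial + feature-matching + mel reconstruction terms
real_s, fake_s, fmap_r, fmap_g = mpd(y, y_hat)
g_adv, _ = gen_loss_fn(fake_s)
g_loss = g_adv + fm_loss_fn(fmap_r, fmap_g) + 45.0 * mel_loss_fn(y_hat, y)  # 45.0 is an assumed weight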
codec/models/wavvae/model.py
ADDED
|
@@ -0,0 +1,140 @@
| 1 |
+
import json
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from safetensors.torch import load_file
|
| 8 |
+
from torch import nn
|
| 9 |
+
|
| 10 |
+
from .heads import ISTFTHead, LinearNoBiasHead
|
| 11 |
+
from .layers import Encoder, VocosDecoder
|
| 12 |
+
from .modules import ConvNeXtBlock
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
|
| 16 |
+
class WavVAEConfig:
|
| 17 |
+
conv_dim: int = 48
|
| 18 |
+
latent_dim: int = 32
|
| 19 |
+
decoder_hidden_dim: int = 768
|
| 20 |
+
decoder_intermediate_dim: int = 1536
|
| 21 |
+
decoder_num_layers: int = 8
|
| 22 |
+
n_fft: int = 1024
|
| 23 |
+
hop_length: int = 256
|
| 24 |
+
padding: str = "center"
|
| 25 |
+
head_type: Literal["istft", "linear"] = "istft"
|
| 26 |
+
strides: list[int] = field(default_factory=lambda: [2, 4, 4, 8])
|
| 27 |
+
learnable_pre_norm: bool = False
|
| 28 |
+
sampling_rate: int = 24000
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class WavVAE(nn.Module):
|
| 32 |
+
def __init__(self, cfg: WavVAEConfig):
|
| 33 |
+
super().__init__()
|
| 34 |
+
self.conv_encoder = Encoder(cfg.conv_dim, strides=cfg.strides, depthwise=True)
|
| 35 |
+
conv_final_dim = cfg.conv_dim * 2 ** len(cfg.strides)
|
| 36 |
+
self.bottleneck = nn.Linear(conv_final_dim, cfg.latent_dim * 2)
|
| 37 |
+
self.unbottleneck = nn.Linear(cfg.latent_dim, cfg.decoder_hidden_dim)
|
| 38 |
+
self.latent_norm = nn.LayerNorm(conv_final_dim)
|
| 39 |
+
self.vocos_decoder = VocosDecoder(
|
| 40 |
+
cfg.decoder_hidden_dim,
|
| 41 |
+
cfg.decoder_intermediate_dim,
|
| 42 |
+
cfg.decoder_num_layers,
|
| 43 |
+
)
|
| 44 |
+
if cfg.head_type == "istft":
|
| 45 |
+
self.head = ISTFTHead(
|
| 46 |
+
cfg.decoder_hidden_dim,
|
| 47 |
+
cfg.n_fft,
|
| 48 |
+
cfg.hop_length,
|
| 49 |
+
padding=cfg.padding,
|
| 50 |
+
)
|
| 51 |
+
elif cfg.head_type == "linear":
|
| 52 |
+
self.head = LinearNoBiasHead(
|
| 53 |
+
cfg.decoder_hidden_dim,
|
| 54 |
+
cfg.hop_length,
|
| 55 |
+
cfg.n_fft,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
self._sampling_rate = cfg.sampling_rate
|
| 59 |
+
self._strides = cfg.strides
|
| 60 |
+
self.apply(self._init_weights)
|
| 61 |
+
|
| 62 |
+
def _init_weights(self, m):
|
| 63 |
+
if isinstance(m, (nn.Conv1d, nn.Linear)):
|
| 64 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
| 65 |
+
if m.bias is not None:
|
| 66 |
+
nn.init.constant_(m.bias, 0)
|
| 67 |
+
|
| 68 |
+
@property
|
| 69 |
+
def sampling_rate(self) -> int:
|
| 70 |
+
return self._sampling_rate
|
| 71 |
+
|
| 72 |
+
@property
|
| 73 |
+
def hop_length(self) -> int:
|
| 74 |
+
hop_length = 1
|
| 75 |
+
for s in self._strides:
|
| 76 |
+
hop_length *= s
|
| 77 |
+
return hop_length
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def frame_rate(self) -> float:
|
| 81 |
+
return self.sampling_rate / self.hop_length
|
| 82 |
+
|
| 83 |
+
@classmethod
|
| 84 |
+
def from_pretrained(
|
| 85 |
+
cls,
|
| 86 |
+
pretrained_model_name_or_path: str,
|
| 87 |
+
device: str = "cpu",
|
| 88 |
+
):
|
| 89 |
+
if Path(pretrained_model_name_or_path).exists():
|
| 90 |
+
path = pretrained_model_name_or_path
|
| 91 |
+
else:
|
| 92 |
+
from huggingface_hub import snapshot_download
|
| 93 |
+
|
| 94 |
+
path = snapshot_download(pretrained_model_name_or_path)
|
| 95 |
+
|
| 96 |
+
with open(Path(path) / "config.json", "r") as f:
|
| 97 |
+
config = json.load(f)
|
| 98 |
+
config = WavVAEConfig(**config)
|
| 99 |
+
model = cls(config)
|
| 100 |
+
state_dict = load_file(
|
| 101 |
+
Path(path) / "model.st",
|
| 102 |
+
device=device,
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
model.load_state_dict(state_dict, assign=True)
|
| 106 |
+
return model
|
| 107 |
+
|
| 108 |
+
def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
|
| 109 |
+
logvar = torch.clamp(logvar, -30.0, 20.0)
|
| 110 |
+
std = torch.exp(0.5 * logvar)
|
| 111 |
+
eps = torch.randn_like(std)
|
| 112 |
+
return mu + eps * std
|
| 113 |
+
|
| 114 |
+
def kl_divergence(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
|
| 115 |
+
kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp())
|
| 116 |
+
return kl.sum(dim=-1).mean()
|
| 117 |
+
|
| 118 |
+
def encode(self, audio: torch.Tensor) -> torch.Tensor:
|
| 119 |
+
y = self.conv_encoder(audio.unsqueeze(1)).transpose(1, 2)
|
| 120 |
+
y = self.latent_norm(y)
|
| 121 |
+
mu, logvar = self.bottleneck(y).chunk(2, dim=-1)
|
| 122 |
+
z = self.reparameterize(mu, logvar)
|
| 123 |
+
return z
|
| 124 |
+
|
| 125 |
+
def decode(self, z: torch.Tensor) -> torch.Tensor:
|
| 126 |
+
y = self.unbottleneck(z)
|
| 127 |
+
y = self.vocos_decoder(y)
|
| 128 |
+
return self.head(y)
|
| 129 |
+
|
| 130 |
+
def forward(self, audio_input: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
| 131 |
+
y = self.conv_encoder(audio_input.unsqueeze(1)).transpose(1, 2)
|
| 132 |
+
y = self.latent_norm(y)
|
| 133 |
+
mu, logvar = self.bottleneck(y).chunk(2, dim=-1)
|
| 134 |
+
kl_div = self.kl_divergence(mu, logvar)
|
| 135 |
+
z = self.reparameterize(mu, logvar)
|
| 136 |
+
y = self.unbottleneck(z)
|
| 137 |
+
y = self.vocos_decoder(y)
|
| 138 |
+
audio_output = self.head(y)
|
| 139 |
+
|
| 140 |
+
return audio_output, kl_div
|
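Putting the pieces together, a hedged round-trip sketch for the WavVAE (the checkpoint path is a placeholder; shapes assume the default config):

import torch

model = WavVAE(WavVAEConfig())            # or WavVAE.from_pretrained("<path-or-hub-id>")
audio = torch.randn(1, 24000)             # (B, T) at cfg.sampling_rate
z = model.encode(audio)                   # (B, ~T / 256, latent_dim) with the default strides
recon = model.decode(z)                   # (B, ~T) waveform
recon2, kl_div = model(audio)             # forward also returns the KL divergence term
print(model.frame_rate)                   # sampling_rate / prod(strides) = 93.75 Hz by default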
codec/models/wavvae/modules.py
ADDED
|
@@ -0,0 +1,213 @@
from typing import Optional, Tuple

import torch
from torch import nn
from torch.nn.utils import weight_norm, remove_weight_norm


class ConvNeXtBlock(nn.Module):
    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.

    Args:
        dim (int): Number of input channels.
        intermediate_dim (int): Dimensionality of the intermediate layer.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
            None means non-conditional LayerNorm. Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        layer_scale_init_value: float,
        adanorm_num_embeddings: Optional[int] = None,
    ):
        super().__init__()
        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
        self.adanorm = adanorm_num_embeddings is not None
        if adanorm_num_embeddings:
            self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
        else:
            self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(intermediate_dim, dim)
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )

    def forward(self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None) -> torch.Tensor:
        residual = x
        x = self.dwconv(x)
        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
        if self.adanorm:
            assert cond_embedding_id is not None
            x = self.norm(x, cond_embedding_id)
        else:
            x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)

        x = residual + x
        return x


class AdaLayerNorm(nn.Module):
    """
    Adaptive Layer Normalization module with learnable embeddings per `num_embeddings` classes

    Args:
        num_embeddings (int): Number of embeddings.
        embedding_dim (int): Dimension of the embeddings.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.dim = embedding_dim
        self.scale = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.shift = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        torch.nn.init.ones_(self.scale.weight)
        torch.nn.init.zeros_(self.shift.weight)

    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor) -> torch.Tensor:
        scale = self.scale(cond_embedding_id)
        shift = self.shift(cond_embedding_id)
        x = nn.functional.layer_norm(x, (self.dim,), eps=self.eps)
        x = x * scale + shift
        return x


class ResBlock1(nn.Module):
    """
    ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions,
    but without upsampling layers.

    Args:
        dim (int): Number of input channels.
        kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
        dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
            Defaults to (1, 3, 5).
        lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
            Defaults to 0.1.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        kernel_size: int = 3,
        dilation: Tuple[int, int, int] = (1, 3, 5),
        lrelu_slope: float = 0.1,
        layer_scale_init_value: Optional[float] = None,
    ):
        super().__init__()
        self.lrelu_slope = lrelu_slope
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv1d(
                        dim,
                        dim,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=self.get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    nn.Conv1d(
                        dim,
                        dim,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=self.get_padding(kernel_size, dilation[1]),
                    )
                ),
                weight_norm(
                    nn.Conv1d(
                        dim,
                        dim,
                        kernel_size,
                        1,
                        dilation=dilation[2],
                        padding=self.get_padding(kernel_size, dilation[2]),
                    )
                ),
            ]
        )

        self.convs2 = nn.ModuleList(
            [
                weight_norm(nn.Conv1d(dim, dim, kernel_size, 1, dilation=1, padding=self.get_padding(kernel_size, 1))),
                weight_norm(nn.Conv1d(dim, dim, kernel_size, 1, dilation=1, padding=self.get_padding(kernel_size, 1))),
                weight_norm(nn.Conv1d(dim, dim, kernel_size, 1, dilation=1, padding=self.get_padding(kernel_size, 1))),
            ]
        )

        self.gamma = nn.ParameterList(
            [
                nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True)
                if layer_scale_init_value is not None
                else None,
                nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True)
                if layer_scale_init_value is not None
                else None,
                nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True)
                if layer_scale_init_value is not None
                else None,
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for c1, c2, gamma in zip(self.convs1, self.convs2, self.gamma):
            xt = torch.nn.functional.leaky_relu(x, negative_slope=self.lrelu_slope)
            xt = c1(xt)
            xt = torch.nn.functional.leaky_relu(xt, negative_slope=self.lrelu_slope)
            xt = c2(xt)
            if gamma is not None:
                xt = gamma * xt
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)

    @staticmethod
    def get_padding(kernel_size: int, dilation: int = 1) -> int:
        return int((kernel_size * dilation - dilation) / 2)


def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    """
    Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.

    Args:
        x (Tensor): Input tensor.
        clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.

    Returns:
        Tensor: Element-wise logarithm of the input tensor with clipping applied.
    """
    return torch.log(torch.clip(x, min=clip_val))


def symlog(x: torch.Tensor) -> torch.Tensor:
    return torch.sign(x) * torch.log1p(x.abs())


def symexp(x: torch.Tensor) -> torch.Tensor:
    return torch.sign(x) * (torch.exp(x.abs()) - 1)
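Note: modules.py provides the encoder/decoder building blocks used by the WavVAE above. A small shape sketch (the dimensions are illustrative, and the import path assumes this repository layout):

import torch
from codec.models.wavvae.modules import ConvNeXtBlock, ResBlock1, symlog, symexp

x = torch.randn(2, 256, 100)                                    # (B, C, T)
block = ConvNeXtBlock(dim=256, intermediate_dim=768, layer_scale_init_value=1e-6)
y = block(x)                                                    # residual ConvNeXt update, same (B, C, T) shape
r = ResBlock1(dim=256)(x)                                       # dilated HiFi-GAN-style residual stack, same shape
t = torch.randn(5)
assert torch.allclose(symexp(symlog(t)), t, atol=1e-6)          # symlog and symexp are inverses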
codec/models/wavvae/spectral_ops.py
ADDED
|
@@ -0,0 +1,192 @@
import numpy as np
import scipy
import torch
from torch import nn, view_as_real, view_as_complex


class ISTFT(nn.Module):
    """
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    """

    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        window = torch.hann_window(win_length)
        self.register_buffer("window", window)

    def forward(self, spec: torch.Tensor) -> torch.Tensor:
        """
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.

        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                N is the number of frequency bins, and T is the number of time frames.

        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        """
        if self.padding == "center":
            # Fallback to pytorch native implementation
            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
        elif self.padding == "same":
            pad = (self.win_length - self.hop_length) // 2
        else:
            raise ValueError("Padding must be 'center' or 'same'.")

        assert spec.dim() == 3, "Expected a 3D tensor as input"
        B, N, T = spec.shape

        # Inverse FFT
        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
        ifft = ifft * self.window[None, :, None]

        # Overlap and Add
        output_size = (T - 1) * self.hop_length + self.win_length
        y = torch.nn.functional.fold(
            ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
        )[:, 0, 0, pad:-pad]

        # Window envelope
        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
        window_envelope = torch.nn.functional.fold(
            window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
        ).squeeze()[pad:-pad]

        # Normalize
        assert (window_envelope > 1e-11).all()
        y = y / window_envelope

        return y


class MDCT(nn.Module):
    """
    Modified Discrete Cosine Transform (MDCT) module.

    Args:
        frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    """

    def __init__(self, frame_len: int, padding: str = "same"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.frame_len = frame_len
        N = frame_len // 2
        n0 = (N + 1) / 2
        window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
        self.register_buffer("window", window)

        pre_twiddle = torch.exp(-1j * torch.pi * torch.arange(frame_len) / frame_len)
        post_twiddle = torch.exp(-1j * torch.pi * n0 * (torch.arange(N) + 0.5) / N)
        # view_as_real: NCCL Backend does not support ComplexFloat data type
        # https://github.com/pytorch/pytorch/issues/71613
        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
        self.register_buffer("post_twiddle", view_as_real(post_twiddle))

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        """
        Apply the Modified Discrete Cosine Transform (MDCT) to the input audio.

        Args:
            audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size
                and T is the length of the audio.

        Returns:
            Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames
                and N is the number of frequency bins.
        """
        if self.padding == "center":
            audio = torch.nn.functional.pad(audio, (self.frame_len // 2, self.frame_len // 2))
        elif self.padding == "same":
            # hop_length is 1/2 frame_len
            audio = torch.nn.functional.pad(audio, (self.frame_len // 4, self.frame_len // 4))
        else:
            raise ValueError("Padding must be 'center' or 'same'.")

        x = audio.unfold(-1, self.frame_len, self.frame_len // 2)
        N = self.frame_len // 2
        x = x * self.window.expand(x.shape)
        X = torch.fft.fft(x * view_as_complex(self.pre_twiddle).expand(x.shape), dim=-1)[..., :N]
        res = X * view_as_complex(self.post_twiddle).expand(X.shape) * np.sqrt(1 / N)
        return torch.real(res) * np.sqrt(2)


class IMDCT(nn.Module):
    """
    Inverse Modified Discrete Cosine Transform (IMDCT) module.

    Args:
        frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    """

    def __init__(self, frame_len: int, padding: str = "same"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.frame_len = frame_len
        N = frame_len // 2
        n0 = (N + 1) / 2
        window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
        self.register_buffer("window", window)

        pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N)
        post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2))
        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
        self.register_buffer("post_twiddle", view_as_real(post_twiddle))

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """
        Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients.

        Args:
            X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size,
                L is the number of frames, and N is the number of frequency bins.

        Returns:
            Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio.
        """
        B, L, N = X.shape
        Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device)
        Y[..., :N] = X
        Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,)))
        y = torch.fft.ifft(Y * view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1)
        y = torch.real(y * view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2)
        result = y * self.window.expand(y.shape)
        output_size = (1, (L + 1) * N)
        audio = torch.nn.functional.fold(
            result.transpose(1, 2),
            output_size=output_size,
            kernel_size=(1, self.frame_len),
            stride=(1, self.frame_len // 2),
        )[:, 0, 0, :]

        if self.padding == "center":
            pad = self.frame_len // 2
        elif self.padding == "same":
            pad = self.frame_len // 4
        else:
            raise ValueError("Padding must be 'center' or 'same'.")

        audio = audio[:, pad:-pad]
        return audio
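Note: ISTFT above exists to support "same" padding in neural vocoding, while MDCT/IMDCT form an analysis/synthesis pair over half-overlapping frames. A rough round-trip sketch (frame length and signal length are arbitrary; the import path assumes this repository layout):

import torch
from codec.models.wavvae.spectral_ops import MDCT, IMDCT, ISTFT

mdct, imdct = MDCT(frame_len=512), IMDCT(frame_len=512)
audio = torch.randn(1, 4096)          # (B, T)
coeffs = mdct(audio)                  # (B, L, N) = (1, 16, 256) with "same" padding
audio_hat = imdct(coeffs)             # overlap-add back to (1, 4096)

istft = ISTFT(n_fft=1024, hop_length=256, win_length=1024, padding="center")
spec = torch.stft(audio, 1024, 256, 1024, torch.hann_window(1024), return_complex=True)
recon = istft(spec)                   # "center" padding falls back to torch.istft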
codec/scripts/compare_codecs.py
ADDED
|
@@ -0,0 +1,441 @@
#!/usr/bin/env python3
import argparse
import json
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple

import torch
from torchaudio import load as ta_load
from torchaudio.functional import resample as ta_resample
import torchaudio

# Your libs
from zcodec.models import WavVAE, ZFlowAutoEncoder


# -------------------------
# Data structures
# -------------------------


@dataclass
class DecodeParams:
    num_steps: int = 10
    cfg: float = 2.0


@dataclass
class ModelPairSpec:
    name: str
    wavvae_dir: str
    zflowae_dir: str
    decode: DecodeParams


# -------------------------
# Utilities
# -------------------------


def load_json_if_exists(path: Path) -> Optional[Dict[str, Any]]:
    if path.is_file():
        try:
            with path.open("r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return None
    return None


def read_config_any(checkpoint_dir: str) -> Dict[str, Any]:
    """
    Try to read config.json (or a few common fallbacks) from a checkpoint dir.
    Returns {} if nothing could be parsed.
    """
    cand = [
        Path(checkpoint_dir) / "config.json",
        Path(checkpoint_dir)
        / "config.yaml",  # won't parse yaml here, we only display path
        Path(checkpoint_dir) / "model_config.json",
    ]
    for p in cand:
        if p.exists():
            if p.suffix == ".json":
                j = load_json_if_exists(p)
                if j is not None:
                    return j
            else:
                # For YAML or unknown, just show filename rather than failing
                return {"_config_file": str(p)}
    return {}


def sanitize_name(s: str) -> str:
    return "".join(c if c.isalnum() or c in "-_." else "_" for c in s)


def ensure_mono_and_resample(
    wav: torch.Tensor, sr: int, target_sr: int
) -> Tuple[torch.Tensor, int]:
    """
    wav: (channels, samples)
    returns mono float32 in [-1,1], resampled to target_sr
    """
    if wav.ndim != 2:
        raise ValueError(f"Expected 2D waveform (C, T), got shape {tuple(wav.shape)}")
    # to mono
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    # resample if needed
    if sr != target_sr:
        wav = ta_resample(wav, sr, target_sr)
        sr = target_sr
    return wav.to(torch.float32), sr


def save_wav(path: Path, wav: torch.Tensor, sr: int):
    path.parent.mkdir(parents=True, exist_ok=True)
    # (C, T)
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    # Clamp to [-1,1]
    wav = wav.clamp(-1, 1).contiguous().cpu()
    torchaudio.save(
        str(path), wav, sample_rate=sr, encoding="PCM_S", bits_per_sample=16
    )


# -------------------------
# Core inference
# -------------------------


@torch.inference_mode()
def reconstruct_full_pipeline(
    wav_mono: torch.Tensor,
    sr: int,
    wavvae: WavVAE,
    zflowae: ZFlowAutoEncoder,
    decode_params: DecodeParams,
    device: str,
) -> torch.Tensor:
    """
    Full path: audio -> WavVAE.encode -> ZFlowAE.encode -> ZFlowAE.decode -> WavVAE.decode -> audio_hat
    """
    wav_mono = wav_mono.to(device)
    # WavVAE expects (B, C, T); assume C=1
    x = wav_mono.unsqueeze(0)  # (1, 1, T)
    # Encode to high-framerate latents
    z = wavvae.encode(x)
    # Compress latents
    y = zflowae.encode(z)
    # Decompress
    z_hat = zflowae.decode(y, num_steps=decode_params.num_steps, cfg=decode_params.cfg)
    # Decode to waveform
    wav_hat = wavvae.decode(z_hat)  # (1, 1, T)
    # Return mono 1D
    return wav_hat.squeeze(0).squeeze(0).detach()


def load_model_pair(spec: ModelPairSpec, device: str):
    wavvae = WavVAE.from_pretrained_local(spec.wavvae_dir).to(device)
    zflowae = ZFlowAutoEncoder.from_pretrained_local(spec.zflowae_dir).to(device)
    # try to get sampling rate from WavVAE
    target_sr = getattr(wavvae, "sampling_rate", None)
    if target_sr is None:
        # reasonable fallback
        target_sr = 24000
    return wavvae, zflowae, int(target_sr)


def parse_manifest(path: str) -> List[ModelPairSpec]:
    """
    Manifest format (JSON list):
    [
      {
        "name": "zdim32x8",
        "wavvae": "/path/to/WavVAE_framerate100_zdim32/",
        "zflowae": "/path/to/ZFlowAutoEncoder_stride4_zdim32_vae8_.../",
        "decode": {"num_steps": 10, "cfg": 2.0}
      }
    ]
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    out: List[ModelPairSpec] = []
    for item in raw:
        name = item["name"]
        wavvae_dir = item["wavvae"]
        zflowae_dir = item["zflowae"]
        d = item.get("decode", {})
        out.append(
            ModelPairSpec(
                name=name,
                wavvae_dir=wavvae_dir,
                zflowae_dir=zflowae_dir,
                decode=DecodeParams(
                    num_steps=int(d.get("num_steps", 10)),
                    cfg=float(d.get("cfg", 2.0)),
                ),
            )
        )
    return out


# -------------------------
# HTML generation
# -------------------------


def html_escape(s: str) -> str:
    return (
        s.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#39;")
    )


def make_html(
    output_dir: Path,
    audio_files: List[Path],
    models: List[ModelPairSpec],
    sr_by_model: Dict[str, int],
    wavvae_cfg: Dict[str, Dict[str, Any]],
    zflow_cfg: Dict[str, Dict[str, Any]],
) -> str:
    """
    Build a static HTML page with a table:
      Row = input audio file
      Col 1 = Original
      Col 2..N = each model reconstruction
    Also shows minimal model config info above the table.
    """

    def player(src_rel: str, controls: bool = True) -> str:
        return f'<audio {"controls" if controls else ""} preload="none" src="{html_escape(src_rel)}"></audio>'

    # Model cards
    model_cards = []
    for spec in models:
        wcfg = wavvae_cfg.get(spec.name, {})
        zcfg = zflow_cfg.get(spec.name, {})
        w_short = json.dumps(wcfg if wcfg else {"_": "no JSON config found"}, indent=2)[
            :1200
        ]
        z_short = json.dumps(zcfg if zcfg else {"_": "no JSON config found"}, indent=2)[
            :1200
        ]
        card = f"""
        <div class="model-card">
          <h3>{html_escape(spec.name)}</h3>
          <p><b>Sample rate</b>: {sr_by_model.get(spec.name, "N/A")} Hz</p>
          <details>
            <summary>WavVAE config</summary>
            <pre>{html_escape(w_short)}</pre>
          </details>
          <details>
            <summary>ZFlowAE config</summary>
            <pre>{html_escape(z_short)}</pre>
          </details>
          <p><b>Decode</b>: num_steps={spec.decode.num_steps}, cfg={spec.decode.cfg}</p>
        </div>
        """
        model_cards.append(card)

    # Table header
    th = "<th>Input</th><th>Original</th>" + "".join(
        f"<th>{html_escape(m.name)}</th>" for m in models
    )

    # Rows
    rows = []
    for af in audio_files:
        base = af.stem
        orig_rel = f"original/{html_escape(af.name)}"
        tds = [f"<td>{html_escape(base)}</td>", f"<td>{player(orig_rel)}</td>"]
        for m in models:
            rec_rel = f"recon/{html_escape(m.name)}/{html_escape(base)}.wav"
            tds.append(f"<td>{player(rec_rel)}</td>")
        rows.append("<tr>" + "".join(tds) + "</tr>")

    # Simple CSS to keep it clean
    css = """
    body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; padding: 20px; }
    h1 { margin-bottom: 0.2rem; }
    .cards { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 12px; margin-bottom: 18px; }
    .model-card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #eee; padding: 8px; vertical-align: top; }
    th { background: #fafafa; position: sticky; top: 0; }
    audio { width: 260px; }
    """

    html = f"""<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<title>Codec Comparison</title>
<style>{css}</style>
</head>
<body>
<h1>Codec Comparison</h1>
<p>This page compares reconstructions across model checkpoints. Click play in each cell.</p>

<h2>Models</h2>
<div class="cards">
{"".join(model_cards)}
</div>

<h2>Audio</h2>
<table>
<thead><tr>{th}</tr></thead>
<tbody>
{"".join(rows)}
</tbody>
</table>
</body>
</html>
"""
    out = output_dir / "index.html"
    out.write_text(html, encoding="utf-8")
    return str(out)


# -------------------------
# Main
# -------------------------


def main():
    p = argparse.ArgumentParser(
        description="Compare Z-Codec configurations and generate a static HTML page."
    )
    p.add_argument(
        "--manifest",
        type=str,
        required=True,
        help="JSON file listing model pairs. See docstring in parse_manifest().",
    )
    p.add_argument(
        "--audio", type=str, nargs="+", required=True, help="List of input audio files."
    )
    p.add_argument(
        "--out",
        type=str,
        default="codec_compare_out",
        help="Output directory for reconstructions and HTML.",
    )
    p.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to run inference on (cuda or cpu).",
    )
    p.add_argument(
        "--force",
        action="store_true",
        help="Recompute even if target wav already exists.",
    )
    args = p.parse_args()

    device = "cuda" if args.device == "cuda" and torch.cuda.is_available() else "cpu"
    out_dir = Path(args.out)
    orig_dir = out_dir / "original"
    recon_dir = out_dir / "recon"
    orig_dir.mkdir(parents=True, exist_ok=True)
    recon_dir.mkdir(parents=True, exist_ok=True)

    # Parse models
    specs = parse_manifest(args.manifest)
    if not specs:
        print("No models in manifest.", file=sys.stderr)
        sys.exit(1)

    # Load models
    loaded: Dict[str, Dict[str, Any]] = {}
    sr_by_model: Dict[str, int] = {}
    wavvae_cfg: Dict[str, Dict[str, Any]] = {}
    zflow_cfg: Dict[str, Dict[str, Any]] = {}

    for spec in specs:
        print(f"[Load] {spec.name}")
        wavvae, zflowae, target_sr = load_model_pair(spec, device)
        loaded[spec.name] = {"wavvae": wavvae, "zflowae": zflowae, "sr": target_sr}
        sr_by_model[spec.name] = target_sr
        wavvae_cfg[spec.name] = read_config_any(spec.wavvae_dir)
        zflow_cfg[spec.name] = read_config_any(spec.zflowae_dir)

    # Process audio files
    audio_files = [Path(a) for a in args.audio]
    for af in audio_files:
        if not af.exists():
            print(f"[Skip] Missing: {af}", file=sys.stderr)
            continue

        # copy original (resampled per model? We'll store original as-is)
        # Just place the original file for direct playback
        # If it's not wav, we still copy a WAV version for compatibility.
        # But simplest: if not wav, we re-save as wav 16-bit for the page.
        out_orig = orig_dir / af.name
        if args.force or not out_orig.exists():
            # Load and resave as wav to ensure browser-compat
            wav, sr = ta_load(str(af))
            # make it mono for fair listening
            wav_mono, sr = ensure_mono_and_resample(wav, sr, sr)
            save_wav(out_orig.with_suffix(".wav"), wav_mono, sr)
            # keep the name consistent in the HTML (use .wav)
            af = af.with_suffix(".wav")
        # rename saved file to matched name
        if out_orig.suffix != ".wav":
            # Clean: ensure HTML references the .wav filename
            out_orig = out_orig.with_suffix(".wav")

        # For each model, run full pipeline and save
        base = af.stem
        # Re-load from disk to ensure consistent start-point (original .wav in out folder)
        wav0, sr0 = ta_load(str(out_orig if out_orig.exists() else orig_dir / af.name))
        # Make mono only once; resample per-model to each target SR
        if wav0.size(0) > 1:
            wav0 = wav0.mean(dim=0, keepdim=True)

        for spec in specs:
            mname = spec.name
            target_sr = sr_by_model[mname]
            # resample to model's SR
            if sr0 != target_sr:
                wav_mono = ta_resample(wav0, sr0, target_sr)
            else:
                wav_mono = wav0

            # reconstruct
            out_path = recon_dir / mname / f"{sanitize_name(base)}.wav"
            if args.force or not out_path.exists():
                print(f"[Reconstruct] {mname} ← {base}")
                wavvae = loaded[mname]["wavvae"]
                zflowae = loaded[mname]["zflowae"]
                wav_hat = reconstruct_full_pipeline(
                    wav_mono, target_sr, wavvae, zflowae, spec.decode, device
                )
                save_wav(out_path, wav_hat.unsqueeze(0), target_sr)

    # Build HTML
    # Rebuild the list of files actually present in original/ (use .wav names)
    actual_audio = sorted([p for p in (orig_dir).glob("*.wav")])
    html_path = make_html(
        out_dir,
        actual_audio,
        specs,
        sr_by_model,
        wavvae_cfg,
        zflow_cfg,
    )
    print(f"\nDone. Open: {html_path}")


if __name__ == "__main__":
    main()
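Note: compare_codecs.py chains each WavVAE with its ZFlowAutoEncoder and emits a browsable report. A typical invocation (paths are placeholders), with models.json following the JSON list documented in parse_manifest():

python codec/scripts/compare_codecs.py \
    --manifest models.json \
    --audio sample1.wav sample2.wav \
    --out codec_compare_out \
    --device cuda

The script saves mono 16-bit WAVs under <out>/original/ and <out>/recon/<model name>/, then writes <out>/index.html with one row per input and one column per model.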
codec/scripts/compare_wavvae.py
ADDED
|
@@ -0,0 +1,264 @@
#!/usr/bin/env python3
import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import torch
import torchaudio
from torchaudio import load as ta_load
from torchaudio.functional import resample as ta_resample
from zcodec.models import WavVAE

# -------------------------
# Data structures
# -------------------------


@dataclass
class WavVaeSpec:
    name: str
    wavvae_dir: str


# -------------------------
# Utilities
# -------------------------


def load_json_if_exists(path: Path) -> Optional[Dict[str, Any]]:
    if path.is_file():
        try:
            return json.load(path.open("r", encoding="utf-8"))
        except Exception:
            return None
    return None


def read_config_any(checkpoint_dir: str) -> Dict[str, Any]:
    cand = [
        Path(checkpoint_dir) / "config.json",
        Path(checkpoint_dir) / "model_config.json",
        Path(checkpoint_dir) / "config.yaml",  # shown as path only
    ]
    for p in cand:
        if p.exists():
            if p.suffix == ".json":
                j = load_json_if_exists(p)
                if j is not None:
                    return j
            else:
                return {"_config_file": str(p)}
    return {}


def sanitize_name(s: str) -> str:
    return "".join(c if c.isalnum() or c in "-_." else "_" for c in s)


def ensure_mono_and_resample(
    wav: torch.Tensor, sr: int, target_sr: int
) -> Tuple[torch.Tensor, int]:
    if wav.ndim != 2:
        raise ValueError(f"Expected (C,T), got {tuple(wav.shape)}")
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != target_sr:
        wav = ta_resample(wav, sr, target_sr)
        sr = target_sr
    return wav.to(torch.float32), sr


def save_wav(path: Path, wav: torch.Tensor, sr: int):
    path.parent.mkdir(parents=True, exist_ok=True)
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    wav = wav.clamp(-1, 1).contiguous().cpu()
    torchaudio.save(
        str(path), wav, sample_rate=sr, encoding="PCM_S", bits_per_sample=16
    )


def read_audio_manifest(txt_path: str) -> List[Path]:
    lines = Path(txt_path).read_text(encoding="utf-8").splitlines()
    files = [
        Path(l.strip()) for l in lines if l.strip() and not l.strip().startswith("#")
    ]
    return files


def html_escape(s: str) -> str:
    return (
        s.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#39;")
    )


def make_html(
    output_dir: Path,
    audio_files: List[Path],
    specs: List[WavVaeSpec],
    sr_by_model: Dict[str, int],
    wavvae_cfg: Dict[str, Dict[str, Any]],
) -> str:
    def player(src_rel: str) -> str:
        return f'<audio controls preload="none" src="{html_escape(src_rel)}"></audio>'

    # cards
    cards = []
    for s in specs:
        cfg = wavvae_cfg.get(s.name, {})
        cfg_short = json.dumps(cfg if cfg else {"_": "no JSON config found"}, indent=2)[
            :1200
        ]
        card = f"""
        <div class="model-card">
          <h3>{html_escape(s.name)}</h3>
          <p><b>Sample rate</b>: {sr_by_model.get(s.name, "N/A")} Hz</p>
          <details><summary>WavVAE config</summary><pre>{html_escape(cfg_short)}</pre></details>
        </div>
        """
        cards.append(card)

    css = """
    body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; padding: 20px; }
    .cards { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 12px; margin-bottom: 18px; }
    .model-card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #eee; padding: 8px; vertical-align: top; }
    th { background: #fafafa; position: sticky; top: 0; }
    audio { width: 260px; }
    """

    th = "<th>Input</th><th>Original</th>" + "".join(
        f"<th>{html_escape(s.name)}</th>" for s in specs
    )
    rows = []
    for af in audio_files:
        base = af.stem
        orig_rel = f"original/{html_escape(af.name)}"
        tds = [f"<td>{html_escape(base)}</td>", f"<td>{player(orig_rel)}</td>"]
        for s in specs:
            rec_rel = f"recon/{html_escape(s.name)}/{html_escape(base)}.wav"
            tds.append(f"<td>{player(rec_rel)}</td>")
        rows.append("<tr>" + "".join(tds) + "</tr>")

    html = f"""<!doctype html>
<html>
<head><meta charset="utf-8"/><title>WavVAE Comparison</title><style>{css}</style></head>
<body>
<h1>WavVAE Comparison</h1>
<div class="cards">{"".join(cards)}</div>
<table>
<thead><tr>{th}</tr></thead>
<tbody>{"".join(rows)}</tbody>
</table>
</body>
</html>
"""
    out = output_dir / "index.html"
    out.write_text(html, encoding="utf-8")
    return str(out)


# -------------------------
# Core
# -------------------------


@torch.inference_mode()
def reconstruct_wavvae(
    wav_mono: torch.Tensor, wavvae: WavVAE, device: str
) -> torch.Tensor:
    x = wav_mono.to(device)  # (1,T)
    z = wavvae.encode(x)
    wav_hat = wavvae.decode(z)  # (1,1,T)
    return wav_hat.squeeze(0).squeeze(0).detach()


def parse_models_manifest(path: str) -> List[WavVaeSpec]:
    """
    JSON list of:
      {"name": "...", "wavvae": "/path/to/WavVAE_dir"}
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    specs = []
    for it in raw:
        specs.append(WavVaeSpec(name=it["name"], wavvae_dir=it["wavvae"]))
    return specs


def main():
    ap = argparse.ArgumentParser(
        description="Compare WavVAE checkpoints and generate a static HTML page."
    )
    ap.add_argument("--models", required=True, help="JSON manifest of WavVAE models.")
    ap.add_argument(
        "--audio_manifest", required=True, help="TXT file: one audio path per line."
    )
    ap.add_argument("--out", default="compare_wavvae_out")
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--force", action="store_true")
    args = ap.parse_args()

    device = "cuda" if args.device == "cuda" and torch.cuda.is_available() else "cpu"
    out_dir = Path(args.out)
    (out_dir / "original").mkdir(parents=True, exist_ok=True)
    recon_dir = out_dir / "recon"
    recon_dir.mkdir(parents=True, exist_ok=True)

    specs = parse_models_manifest(args.models)
    if not specs:
        print("No models.", file=sys.stderr)
        sys.exit(1)

    # load models
    wavvae_by_name: Dict[str, WavVAE] = {}
    sr_by_model: Dict[str, int] = {}
    wavvae_cfg: Dict[str, Dict[str, Any]] = {}
    for s in specs:
        print(f"[Load] {s.name}")
        w = WavVAE.from_pretrained_local(s.wavvae_dir).to(device)
        wavvae_by_name[s.name] = w
        sr_by_model[s.name] = int(getattr(w, "sampling_rate", 24000))
        wavvae_cfg[s.name] = read_config_any(s.wavvae_dir)

    audio_paths = read_audio_manifest(args.audio_manifest)
    # normalize originals to wav+mono (browser-friendly); keep native sr for original column
    actual_audio = []
    for ap in audio_paths:
        if not ap.exists():
            print(f"[Skip missing] {ap}", file=sys.stderr)
            continue
        wav, sr = ta_load(str(ap))
        wav_mono, sr = ensure_mono_and_resample(wav, sr, sr)
        out_orig = out_dir / "original" / (ap.stem + ".wav")
        if args.force or not out_orig.exists():
            save_wav(out_orig, wav_mono, sr)
        actual_audio.append(out_orig)

    # recon per model
    for out_orig in actual_audio:
        wav0, sr0 = ta_load(str(out_orig))
        if wav0.size(0) > 1:
            wav0 = wav0.mean(dim=0, keepdim=True)
        for s in specs:
            target_sr = sr_by_model[s.name]
            wav_in = ta_resample(wav0, sr0, target_sr) if sr0 != target_sr else wav0
            out_path = recon_dir / s.name / f"{sanitize_name(out_orig.stem)}.wav"
            if args.force or not out_path.exists():
                print(f"[Reconstruct] {s.name} ← {out_orig.name}")
                wav_hat = reconstruct_wavvae(wav_in, wavvae_by_name[s.name], device)
                save_wav(out_path, wav_hat, target_sr)

    html_path = make_html(out_dir, actual_audio, specs, sr_by_model, wavvae_cfg)
    print(f"Done. Open: {html_path}")


if __name__ == "__main__":
    main()
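Note: compare_wavvae.py is the single-model variant: it skips the ZFlowAutoEncoder stage and only round-trips audio through each WavVAE checkpoint. A sketch of the inputs it expects (file names and paths are placeholders):

# wavvae_models.json — JSON list as documented in parse_models_manifest():
#   [{"name": "zdim32", "wavvae": "/path/to/WavVAE_dir"}]
# audio_list.txt — one audio path per line; lines starting with '#' are skipped
python codec/scripts/compare_wavvae.py \
    --models wavvae_models.json \
    --audio_manifest audio_list.txt \
    --out compare_wavvae_out \
    --device cuda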
codec/scripts/compare_zcodec.py
ADDED
|
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import torch
import torchaudio
from torchaudio import load as ta_load
from torchaudio.functional import resample as ta_resample
from zcodec.models import WavVAE, ZFlowAutoEncoder

# -------------------------
# Data structures
# -------------------------


@dataclass
class DecodeParams:
    num_steps: int = 10
    cfg: float = 2.0


@dataclass
class StackSpec:
    name: str
    wavvae_dir: str
    zflowae_dir: str
    decode: DecodeParams


# -------------------------
# Utilities (same helpers)
# -------------------------


def load_json_if_exists(path: Path):
    if path.is_file():
        try:
            return json.load(path.open("r", encoding="utf-8"))
        except Exception:
            return None
    return None


def read_config_any(checkpoint_dir: str) -> Dict[str, Any]:
    cand = [
        Path(checkpoint_dir) / "config.json",
        Path(checkpoint_dir) / "model_config.json",
        Path(checkpoint_dir) / "config.yaml",
    ]
    for p in cand:
        if p.exists():
            if p.suffix == ".json":
                j = load_json_if_exists(p)
                if j is not None:
                    return j
            else:
                return {"_config_file": str(p)}
    return {}


def sanitize_name(s: str) -> str:
    return "".join(c if c.isalnum() or c in "-_." else "_" for c in s)


def ensure_mono_and_resample(
    wav: torch.Tensor, sr: int, target_sr: int
) -> Tuple[torch.Tensor, int]:
    if wav.ndim != 2:
        raise ValueError(f"Expected (C,T), got {tuple(wav.shape)}")
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != target_sr:
        wav = ta_resample(wav, sr, target_sr)
        sr = target_sr
    return wav.to(torch.float32), sr


def save_wav(path: Path, wav: torch.Tensor, sr: int):
    path.parent.mkdir(parents=True, exist_ok=True)
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    wav = wav.clamp(-1, 1).contiguous().cpu()
    torchaudio.save(
        str(path), wav, sample_rate=sr, encoding="PCM_S", bits_per_sample=16
    )


def read_audio_manifest(txt_path: str) -> List[Path]:
    lines = Path(txt_path).read_text(encoding="utf-8").splitlines()
    return [
        Path(l.strip()) for l in lines if l.strip() and not l.strip().startswith("#")
    ]


def html_escape(s: str) -> str:
    return (
        s.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#39;")
    )


def make_html(
    output_dir: Path,
    audio_files: List[Path],
    specs: List[StackSpec],
    sr_by_model: Dict[str, int],
    wavvae_cfg: Dict[str, Dict[str, Any]],
    zflow_cfg: Dict[str, Dict[str, Any]],
) -> str:
    def player(src_rel: str) -> str:
        return f'<audio controls preload="none" src="{html_escape(src_rel)}"></audio>'

    cards = []
    for s in specs:
        wcfg = wavvae_cfg.get(s.name, {})
        zcfg = zflow_cfg.get(s.name, {})
        w_short = json.dumps(wcfg if wcfg else {"_": "no JSON config found"}, indent=2)[
            :1200
        ]
        z_short = json.dumps(zcfg if zcfg else {"_": "no JSON config found"}, indent=2)[
            :1200
        ]
        card = f"""
        <div class="model-card">
          <h3>{html_escape(s.name)}</h3>
          <p><b>Sample rate</b>: {sr_by_model.get(s.name, "N/A")} Hz</p>
          <p><b>Decode</b>: steps={s.decode.num_steps}, cfg={s.decode.cfg}</p>
          <details><summary>WavVAE config</summary><pre>{html_escape(w_short)}</pre></details>
          <details><summary>ZFlowAE config</summary><pre>{html_escape(z_short)}</pre></details>
        </div>
        """
        cards.append(card)

    css = """
    body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; padding: 20px; }
    .cards { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 12px; margin-bottom: 18px; }
    .model-card { border: 1px solid #ddd; border-radius: 12px; padding: 12px; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #eee; padding: 8px; vertical-align: top; }
    th { background: #fafafa; position: sticky; top: 0; }
    audio { width: 260px; }
    """

    th = "<th>Input</th><th>Original</th>" + "".join(
        f"<th>{html_escape(s.name)}</th>" for s in specs
    )
    rows = []
    for af in audio_files:
        base = af.stem
        orig_rel = f"original/{html_escape(af.name)}"
        tds = [f"<td>{html_escape(base)}</td>", f"<td>{player(orig_rel)}</td>"]
        for s in specs:
            rec_rel = f"recon/{html_escape(s.name)}/{html_escape(base)}.wav"
            tds.append(f"<td>{player(rec_rel)}</td>")
        rows.append("<tr>" + "".join(tds) + "</tr>")

    html = f"""<!doctype html>
<html>
<head><meta charset="utf-8"/><title>Stacked Codec Comparison</title><style>{css}</style></head>
<body>
<h1>WavVAE + ZFlowAE Comparison</h1>
<div class="cards">{"".join(cards)}</div>
<table>
<thead><tr>{th}</tr></thead>
<tbody>{"".join(rows)}</tbody>
</table>
</body>
</html>
"""
    out = output_dir / "index.html"
    out.write_text(html, encoding="utf-8")
    return str(out)


# -------------------------
# Core
# -------------------------


@torch.inference_mode()
def reconstruct_stack(
    wav_mono: torch.Tensor,
    wavvae: WavVAE,
    zflow: ZFlowAutoEncoder,
    steps: int,
    cfg: float,
    device: str,
) -> torch.Tensor:
    x = wav_mono.to(device)  # (1,T)
    z = wavvae.encode(x)  # high-framerate latents
    y, _ = zflow.encode(z)  # compressed latents
    z_hat = zflow.decode(y, num_steps=steps, cfg=cfg)
    wav_hat = wavvae.decode(z_hat)  # (1,1,T)
    return wav_hat.squeeze(0).squeeze(0).detach()


def parse_models_manifest(path: str) -> List[StackSpec]:
    """
    JSON list of:
      {
        "name": "...",
        "wavvae": "/path/to/WavVAE_dir",
        "zflowae": "/path/to/ZFlowAE_dir",
        "decode": {"num_steps": 10, "cfg": 2.0}
      }
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    specs = []
    for it in raw:
        d = it.get("decode", {})
        specs.append(
            StackSpec(
                name=it["name"],
                wavvae_dir=it["wavvae"],
                zflowae_dir=it["zflowae"],
                decode=DecodeParams(
                    num_steps=int(d.get("num_steps", 10)), cfg=float(d.get("cfg", 2.0))
                ),
            )
        )
    return specs


def main():
    ap = argparse.ArgumentParser(
        description="Compare WavVAE+ZFlowAE stacks and generate a static HTML page."
    )
    ap.add_argument("--models", required=True, help="JSON manifest of stacks.")
| 236 |
+
ap.add_argument(
|
| 237 |
+
"--audio_manifest", required=True, help="TXT file: one audio path per line."
|
| 238 |
+
)
|
| 239 |
+
ap.add_argument("--out", default="compare_stack_out")
|
| 240 |
+
ap.add_argument("--device", default="cuda")
|
| 241 |
+
ap.add_argument("--force", action="store_true")
|
| 242 |
+
args = ap.parse_args()
|
| 243 |
+
|
| 244 |
+
device = "cuda" if args.device == "cuda" and torch.cuda.is_available() else "cpu"
|
| 245 |
+
out_dir = Path(args.out)
|
| 246 |
+
(out_dir / "original").mkdir(parents=True, exist_ok=True)
|
| 247 |
+
recon_dir = out_dir / "recon"
|
| 248 |
+
recon_dir.mkdir(parents=True, exist_ok=True)
|
| 249 |
+
|
| 250 |
+
specs = parse_models_manifest(args.models)
|
| 251 |
+
if not specs:
|
| 252 |
+
print("No models.", file=sys.stderr)
|
| 253 |
+
sys.exit(1)
|
| 254 |
+
|
| 255 |
+
# load models
|
| 256 |
+
wavvae_by_name: Dict[str, WavVAE] = {}
|
| 257 |
+
zflow_by_name: Dict[str, ZFlowAutoEncoder] = {}
|
| 258 |
+
sr_by_model: Dict[str, int] = {}
|
| 259 |
+
wavvae_cfg: Dict[str, Dict[str, Any]] = {}
|
| 260 |
+
zflow_cfg: Dict[str, Dict[str, Any]] = {}
|
| 261 |
+
for s in specs:
|
| 262 |
+
print(f"[Load] {s.name}")
|
| 263 |
+
w = WavVAE.from_pretrained_local(s.wavvae_dir).to(device)
|
| 264 |
+
z = ZFlowAutoEncoder.from_pretrained_local(s.zflowae_dir).to(device)
|
| 265 |
+
wavvae_by_name[s.name] = w
|
| 266 |
+
zflow_by_name[s.name] = z
|
| 267 |
+
sr_by_model[s.name] = int(getattr(w, "sampling_rate", 24000))
|
| 268 |
+
wavvae_cfg[s.name] = read_config_any(s.wavvae_dir)
|
| 269 |
+
zflow_cfg[s.name] = read_config_any(s.zflowae_dir)
|
| 270 |
+
|
| 271 |
+
audio_paths = read_audio_manifest(args.audio_manifest)
|
| 272 |
+
|
| 273 |
+
actual_audio = []
|
| 274 |
+
for ap in audio_paths:
|
| 275 |
+
if not ap.exists():
|
| 276 |
+
print(f"[Skip missing] {ap}", file=sys.stderr)
|
| 277 |
+
continue
|
| 278 |
+
wav, sr = ta_load(str(ap))
|
| 279 |
+
wav_mono, sr = ensure_mono_and_resample(wav, sr, sr)
|
| 280 |
+
out_orig = out_dir / "original" / (ap.stem + ".wav")
|
| 281 |
+
if args.force or not out_orig.exists():
|
| 282 |
+
save_wav(out_orig, wav_mono, sr)
|
| 283 |
+
actual_audio.append(out_orig)
|
| 284 |
+
|
| 285 |
+
for out_orig in actual_audio:
|
| 286 |
+
wav0, sr0 = ta_load(str(out_orig))
|
| 287 |
+
if wav0.size(0) > 1:
|
| 288 |
+
wav0 = wav0.mean(dim=0, keepdim=True)
|
| 289 |
+
for s in specs:
|
| 290 |
+
target_sr = sr_by_model[s.name]
|
| 291 |
+
wav_in = ta_resample(wav0, sr0, target_sr) if sr0 != target_sr else wav0
|
| 292 |
+
out_path = recon_dir / s.name / f"{sanitize_name(out_orig.stem)}.wav"
|
| 293 |
+
if args.force or not out_path.exists():
|
| 294 |
+
print(f"[Reconstruct] {s.name} ← {out_orig.name}")
|
| 295 |
+
wav_hat = reconstruct_stack(
|
| 296 |
+
wav_in,
|
| 297 |
+
wavvae_by_name[s.name],
|
| 298 |
+
zflow_by_name[s.name],
|
| 299 |
+
s.decode.num_steps,
|
| 300 |
+
s.decode.cfg,
|
| 301 |
+
device,
|
| 302 |
+
)
|
| 303 |
+
save_wav(out_path, wav_hat, target_sr)
|
| 304 |
+
|
| 305 |
+
html_path = make_html(
|
| 306 |
+
out_dir, actual_audio, specs, sr_by_model, wavvae_cfg, zflow_cfg
|
| 307 |
+
)
|
| 308 |
+
print(f"Done. Open: {html_path}")
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
if __name__ == "__main__":
|
| 312 |
+
main()
|
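
Usage sketch for the script above (stack names and checkpoint paths are illustrative, not taken from this repo): --models points to a JSON manifest shaped like the parse_models_manifest docstring, and --audio_manifest is a plain-text list of audio paths, one per line.

    models.json (illustrative):
    [
      {
        "name": "wavvae24k_zflow",
        "wavvae": "/checkpoints/wavvae",
        "zflowae": "/checkpoints/zflowae",
        "decode": {"num_steps": 10, "cfg": 2.0}
      }
    ]

    python codec/scripts/compare_zcodec.py --models models.json --audio_manifest audio_list.txt --out compare_stack_out --device cuda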
codec/scripts/compute_stats.py
ADDED
@@ -0,0 +1,76 @@
import argparse
import random

import torch
from safetensors.torch import safe_open, save_file
from tqdm import tqdm


def load_tensor(path: str, key: str = "embedding") -> torch.Tensor:
    with safe_open(path, framework="pt", device="cpu") as f:
        return f.get_tensor(key)


def compute_global_stats(file_list, key="embedding", length_weighted=True):
    sum_all = None
    sum_sq_all = None
    count_all = 0

    for path in tqdm(file_list, desc="Computing stats"):
        tensor = load_tensor(path, key)  # shape: [B, T, D]
        flat = tensor.reshape(-1, tensor.shape[-1])  # [B*T, D]

        sum_ = flat.sum(dim=0)  # [D]
        sum_sq = (flat**2).sum(dim=0)  # [D]
        count = flat.shape[0]  # B*T

        if sum_all is None:
            sum_all = sum_
            sum_sq_all = sum_sq
        else:
            sum_all += sum_
            sum_sq_all += sum_sq

        count_all += count

    mean = sum_all / count_all
    var = sum_sq_all / count_all - mean**2
    std = torch.sqrt(torch.clamp(var, min=1e-8))

    return mean, std


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "filelist", type=str, help="Text file with list of safetensors paths"
    )
    parser.add_argument("output", type=str, help="Path to output stats.safetensors")
    parser.add_argument(
        "--key", type=str, default="audio_z", help="Key of tensor in safetensors file"
    )
    parser.add_argument(
        "--max-files", type=int, default=None, help="Max number of files to process"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for shuffling"
    )

    args = parser.parse_args()

    with open(args.filelist) as f:
        files = [line.strip() for line in f if line.strip()]

    if args.max_files:
        random.seed(args.seed)
        files = random.sample(files, k=min(args.max_files, len(files)))

    mean, std = compute_global_stats(files, key=args.key)

    save_file({"mean": mean, "std": std}, args.output)
    print(f"✅ Saved to {args.output}")
    print("Example mean/std:", mean[:5], std[:5])


if __name__ == "__main__":
    main()
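
A possible invocation of the script above, assuming a text file listing .safetensors latent files (file names are illustrative); the output holds per-dimension mean and std tensors:

    python codec/scripts/compute_stats.py latents_filelist.txt stats.safetensors --key audio_z --max-files 5000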
codec/scripts/compute_wer.py
ADDED
@@ -0,0 +1,48 @@
import argparse
import json
import string

from jiwer import wer


def normalize_text(text: str) -> str:
    """
    Lowercase and remove punctuation from a string.

    Args:
        text (str): Input string

    Returns:
        str: Normalized string
    """
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def load_transcripts(jsonl_path):
    originals = []
    reconstructions = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            originals.append(data["original_text"])
            reconstructions.append(data["reconstructed_text"])
    return originals, reconstructions


def main(args):
    originals, reconstructions = load_transcripts(args.jsonl)
    # Normalize each transcript (lowercase, no punctuation) before scoring.
    originals = [normalize_text(t) for t in originals]
    reconstructions = [normalize_text(t) for t in reconstructions]
    score = wer(originals, reconstructions)
    print(f"WER: {score:.3%}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--jsonl", type=str, required=True, help="Path to the transcript JSONL file"
    )
    args = parser.parse_args()
    main(args)
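
The script above expects the JSONL written by eval_asr.py further down, one object per line with original_text and reconstructed_text fields; an illustrative line and invocation (file names are hypothetical):

    {"file": "sample_0001.wav", "original_text": "hello world", "reconstructed_text": "hello world"}

    python codec/scripts/compute_wer.py --jsonl transcripts.jsonl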
codec/scripts/compute_wer_from_refs.py
ADDED
@@ -0,0 +1,64 @@
import argparse
import json
import string
from pathlib import Path

from jiwer import cer, wer


def normalize_text(text: str) -> str:
    """
    Lowercase and remove punctuation from a string.

    Args:
        text (str): Input string

    Returns:
        str: Normalized string
    """
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def load_jsonl_dict(path):
    transcripts = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            transcripts[Path(data["file"]).name] = data["transcript"]
    return transcripts


def main(args):
    ref_dict = load_jsonl_dict(args.reference)
    hyp_dict = load_jsonl_dict(args.hypothesis)

    common_files = set(ref_dict.keys()) & set(hyp_dict.keys())

    if not common_files:
        print("No common files between reference and hypothesis.")
        return

    refs = [normalize_text(ref_dict[f]) for f in sorted(common_files)]
    hyps = [normalize_text(hyp_dict[f]) for f in sorted(common_files)]

    cer_score = cer(refs, hyps)
    wer_score = wer(refs, hyps)
    print(f"CER: {cer_score:.3%}")
    print(f"WER: {wer_score:.3%}")
    print(f"Evaluated on {len(common_files)} files.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--reference", type=str, required=True, help="Path to reference JSONL"
    )
    parser.add_argument(
        "--hypothesis", type=str, required=True, help="Path to hypothesis JSONL"
    )
    args = parser.parse_args()
    main(args)
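
Both inputs to the script above follow the {"file": ..., "transcript": ...} JSONL format produced by eval_asr_from_filelist.py below, matched on basename; an illustrative invocation (file names are hypothetical):

    python codec/scripts/compute_wer_from_refs.py --reference ref_transcripts.jsonl --hypothesis recon_transcripts.jsonl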
codec/scripts/download_expresso.py
ADDED
@@ -0,0 +1,10 @@
import soundfile as sf

from datasets import load_dataset

dataset = load_dataset("ylacombe/expresso", split="train")
print(dataset)
for i, x in enumerate(dataset):
    audio = x["audio"]
    wav, sr = audio["array"], audio["sampling_rate"]
    sf.write(f"expresso/org/{i}.wav", wav, sr)
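
Note that this script (and the sibling download scripts below, apart from download_ltts.py, which calls mkdir itself) writes into a directory that must already exist; a minimal guard, if one wanted it, would be:

    from pathlib import Path
    # Create the output directory before the sf.write loop (illustrative path).
    Path("expresso/org").mkdir(parents=True, exist_ok=True)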
codec/scripts/download_gigaspeech.py
ADDED
@@ -0,0 +1,14 @@
from random import sample

import soundfile as sf
from datasets import load_dataset

# dataset = load_dataset("keithito/lj_speech", split="train")
# dataset = load_dataset("parler-tts/mls_eng", split="train")
dataset = load_dataset("speechcolab/gigaspeech", "xl", split="train", token=True)
Is = sample(list(range(len(dataset))), k=100000)
print(dataset)
for i, I in enumerate(Is):
    audio = dataset[I]["audio"]
    wav, sr = audio["array"], audio["sampling_rate"]
    sf.write(f"gigaspeech/{I}.wav", wav, sr)
codec/scripts/download_lj.py
ADDED
@@ -0,0 +1,9 @@
import soundfile as sf
from datasets import load_dataset

dataset = load_dataset("keithito/lj_speech", split="train")
print(dataset)
for i, x in enumerate(dataset):
    audio = x["audio"]
    wav, sr = audio["array"], audio["sampling_rate"]
    sf.write(f"ljspeech/{i}.wav", wav, sr)
codec/scripts/download_ltts.py
ADDED
@@ -0,0 +1,16 @@
from pathlib import Path

import soundfile as sf

from datasets import load_dataset

dataset = load_dataset("mythicinfinity/libritts", "clean")
for split in dataset.keys():
    Path(f"libritts/{split}").mkdir(exist_ok=True)
    for i, x in enumerate(dataset[split]):
        # audio = x["audio"]
        text = x["text_normalized"]
        # wav, sr = audio["array"], audio["sampling_rate"]
        # sf.write(f"libritts/{split}/{i}.wav", wav, sr)
        with open(f"libritts/{split}/{i}.txt", "w") as f:
            f.write(text)
codec/scripts/download_mlseng10k.py
ADDED
@@ -0,0 +1,13 @@
from random import sample

import soundfile as sf
from datasets import load_dataset

# dataset = load_dataset("keithito/lj_speech", split="train")
dataset = load_dataset("parler-tts/mls_eng", split="train")
Is = sample(list(range(len(dataset))), k=100000)
print(dataset)
for i, I in enumerate(Is):
    audio = dataset[I]["audio"]
    wav, sr = audio["array"], audio["sampling_rate"]
    sf.write(f"mls10keng/{i}.wav", wav, sr)
codec/scripts/eval_asr.py
ADDED
@@ -0,0 +1,100 @@
import argparse
import json
from pathlib import Path

import nemo.collections.asr as nemo_asr
import torch
import yaml
from jiwer import wer
from torchaudio import load
from torchaudio.functional import resample
from tqdm import tqdm

from zcodec.models import WavVAE, ZFlowAutoEncoder


def load_config(config_path):
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


def transcribe(audio: torch.Tensor, asr_model) -> str:
    audio = audio.cpu().numpy(force=True)
    with torch.inference_mode():
        return asr_model.transcribe([audio[0]])[0].text


def main(args):
    config = load_config(args.config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load models
    wavvae = WavVAE.from_pretrained_local(config["wavvae_ckpt"]).to(device).eval()
    zflowae = (
        ZFlowAutoEncoder.from_pretrained_local(config["zflowae_ckpt"]).to(device).eval()
    )

    # Load ASR model
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_name=config.get("asr_model", "nvidia/parakeet-tdt-0.6b-v2")
    )

    # Read file list
    with open(config["file_list"], "r") as f:
        wav_files = [line.strip() for line in f if line.strip()]

    results = []

    for wav_path in tqdm(wav_files, desc="Processing files"):
        wav, sr = load(wav_path)
        wav = resample(wav, orig_freq=sr, new_freq=wavvae.sampling_rate).to(device)

        with torch.inference_mode():
            # Transcribe original
            original_text = transcribe(wav, asr_model)

            # Compress and decompress
            z = wavvae.encode(wav)
            zz, _ = zflowae.encode(z)
            z_hat = zflowae.decode(
                zz, num_steps=config.get("num_steps", 10), cfg=config.get("cfg", 2.0)
            )
            wav_hat = wavvae.decode(z_hat)

            # Transcribe reconstructed
            reconstructed_text = transcribe(wav_hat, asr_model)

        results.append(
            {
                "file": wav_path,
                "original_text": original_text,
                "reconstructed_text": reconstructed_text,
            }
        )

    # Save output
    out_path = Path(config.get("output_jsonl", "transcripts.jsonl"))
    with out_path.open("w") as f:
        for entry in results:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\nSaved {len(results)} transcript pairs to {out_path}")

    # Optionally compute WER
    if args.compute_wer:
        original_texts = [r["original_text"] for r in results]
        reconstructed_texts = [r["reconstructed_text"] for r in results]
        score = wer(original_texts, reconstructed_texts)
        print(f"WER: {score:.3%}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True, help="Path to YAML config")
    parser.add_argument(
        "--compute_wer", action="store_true", help="Compute WER after decoding"
    )
    args = parser.parse_args()

    main(args)
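
A sketch of the YAML config the script above reads, using only the keys referenced in the code (checkpoint paths and file names are illustrative):

    wavvae_ckpt: /checkpoints/wavvae
    zflowae_ckpt: /checkpoints/zflowae
    asr_model: nvidia/parakeet-tdt-0.6b-v2
    file_list: eval_wavs.txt
    num_steps: 10
    cfg: 2.0
    output_jsonl: transcripts.jsonl

    python codec/scripts/eval_asr.py --config eval_asr.yaml --compute_wer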
codec/scripts/eval_asr_from_filelist.py
ADDED
@@ -0,0 +1,60 @@
import argparse
import json
from pathlib import Path

import nemo.collections.asr as nemo_asr
import torch
import yaml
from torchaudio import load
from torchaudio.functional import resample
from tqdm import tqdm


def load_config(config_path):
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


def transcribe(audio: torch.Tensor, asr_model) -> str:
    audio = audio.cpu().numpy(force=True)
    with torch.inference_mode():
        return asr_model.transcribe([audio[0]])[0].text


def main(args):
    config = load_config(args.config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load ASR model
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_name=config.get("asr_model", "nvidia/parakeet-tdt-0.6b-v2")
    )

    # Read file list
    with open(config["file_list"], "r") as f:
        wav_files = [line.strip() for line in f if line.strip()]

    results = []

    for wav_path in tqdm(wav_files, desc="Transcribing"):
        wav, sr = load(wav_path)
        wav = resample(wav, orig_freq=sr, new_freq=16000).to(device)

        transcript = transcribe(wav, asr_model)
        results.append({"file": wav_path, "transcript": transcript})

    # Save output
    out_path = Path(config.get("output_jsonl", "asr_transcripts.jsonl"))
    with out_path.open("w") as f:
        for entry in results:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\nSaved {len(results)} transcripts to {out_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True, help="Path to YAML config")
    args = parser.parse_args()
    main(args)
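
This variant needs only a subset of the config keys above (values are illustrative) and writes the {"file", "transcript"} JSONL consumed by compute_wer_from_refs.py:

    asr_model: nvidia/parakeet-tdt-0.6b-v2
    file_list: recon_wavs.txt
    output_jsonl: recon_transcripts.jsonl

    python codec/scripts/eval_asr_from_filelist.py --config asr_filelist.yaml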