Text-to-Audio
Audiocraft
English
audiogen
styletts2
shift-tts
sound
audio-generation
text-to-speech
mimic3
Instructions to use dkounadis/artificial-styletts2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Audiocraft
How to use dkounadis/artificial-styletts2 with Audiocraft:
```python
from audiocraft.models import AudioGen

model = AudioGen.get_pretrained("dkounadis/artificial-styletts2")
model.set_generation_params(duration=5)  # generate 5 seconds.
descriptions = ['dog barking', 'sirene of an emergency vehicle', 'footsteps in a corridor']
wav = model.generate(descriptions)  # generates 3 samples.
```
- Notebooks
- Google Colab
- Kaggle
| # coding:utf-8 | |
| import os | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.nn.utils import spectral_norm | |
| from torch.nn.utils.parametrizations import weight_norm | |
| # from Utils.ASR.models import ASRCNN | |
| # from Utils.JDC.model import JDCNet | |
| from Modules.hifigan import _tile, AdainResBlk1d | |
| import math | |
class MelSpec(torch.nn.Module):
    """Power mel-spectrogram computed with ``torch.stft`` only (no torchaudio).

    The constructor builds a triangular filterbank on the HTK mel scale
    (``2595 * log10(1 + f / 700)``) spanning 0 Hz to ``sample_rate // 2`` and
    stores it as the buffer ``fb`` of shape ``(n_fft // 2 + 1, n_mels)``.
    A Hann window of ``win_length`` samples is stored as the buffer ``window``.
    """

    def __init__(self,
                 sample_rate=17402,  # https://github.com/fakerybakery/styletts2-cli/blob/main/msinference.py = Default 16000. However 17400 vocalises better also "en_US/vctk_p274"
                 n_fft=2048,
                 win_length=1200,
                 hop_length=300,
                 n_mels=80
                 ):
        """Precompute the mel filterbank and the analysis window.

        Args:
            sample_rate: sampling rate in Hz; only ``sample_rate // 2`` is used,
                as the upper edge of the filterbank.
            n_fft: FFT size passed to ``torch.stft``.
            win_length: window length; falls back to ``n_fft`` when ``None``.
            hop_length: hop size; falls back to ``win_length // 2`` when ``None``.
            n_mels: number of triangular mel filters.
        """
        super().__init__()
        self.n_fft = n_fft
        self.win_length = n_fft if win_length is None else win_length
        self.hop_length = self.win_length // 2 if hop_length is None else hop_length

        # Centre frequency (Hz) of every one-sided FFT bin.
        nyquist = sample_rate // 2
        bin_hz = torch.linspace(0, nyquist, n_fft // 2 + 1)

        # n_mels + 2 band edges, equally spaced on the mel axis, mapped back to Hz.
        f_min = 0.0
        mel_lo = 2595.0 * math.log10(1.0 + (f_min / 700.0))
        mel_hi = 2595.0 * math.log10(1.0 + (float(nyquist) / 700.0))
        edges_mel = torch.linspace(mel_lo, mel_hi, n_mels + 2)
        edges_hz = 700.0 * (10 ** (edges_mel / 2595.0) - 1.0)

        # Each triangle is the minimum of its two linear flanks, floored at zero.
        band_width = edges_hz[1:] - edges_hz[:-1]                # (n_mels + 1)
        offset = edges_hz.unsqueeze(0) - bin_hz.unsqueeze(1)     # (n_freqs, n_mels + 2)
        left_flank = (-1.0 * offset[:, :-2]) / band_width[:-1]   # (n_freqs, n_mels)
        right_flank = offset[:, 2:] / band_width[1:]             # (n_freqs, n_mels)
        filterbank = torch.clamp(torch.minimum(left_flank, right_flank), min=0.0)

        self.register_buffer('fb', filterbank)
        self.register_buffer('window', torch.hann_window(self.win_length))

    def forward(self, x):
        """Return the power mel-spectrogram of ``x``.

        Args:
            x: batched waveform; the transposes below require the STFT output
                to be 3-D, i.e. ``x`` of shape ``[bs, samples]``.

        Returns:
            Tensor of shape ``[bs, 1, n_mels, frames]``.
        """
        stft = torch.stft(x,
                          n_fft=self.n_fft,
                          hop_length=self.hop_length,
                          win_length=self.win_length,
                          window=self.window,
                          center=True,
                          pad_mode="reflect",
                          normalized=False,
                          onesided=True,
                          return_complex=True)  # [bs, n_fft // 2 + 1, frames]
        power = stft.abs().pow(2)
        # Project the frequency axis onto the mel filters, then restore [bs, mel, frames].
        mel = torch.matmul(power.transpose(1, 2), self.fb).transpose(1, 2)
        return mel.unsqueeze(1)  # [bs, 1, n_mels, frames]
class LearnedDownSample(nn.Module):
    """Stride-2 downsampling with a learned depthwise 3x3 convolution.

    The convolution uses ``groups=dim_in`` (one filter per channel), padding 1
    and stride 2 on both spatial axes, and is wrapped in spectral norm.
    """

    def __init__(self, dim_in):
        """Create the downsampling convolution for ``dim_in`` channels."""
        super().__init__()
        depthwise = nn.Conv2d(in_channels=dim_in,
                              out_channels=dim_in,
                              kernel_size=(3, 3),
                              stride=(2, 2),
                              padding=1,
                              groups=dim_in)
        self.conv = spectral_norm(depthwise)

    def forward(self, x):
        """Apply the strided convolution to ``x`` (``[bs, dim_in, H, W]``)."""
        downsampled = self.conv(x)
        return downsampled
class ResBlk(nn.Module):
    """Downsampling residual block operating on ``[bs, C, Freq, Time]`` maps.

    The residual branch is ``actv -> conv1 -> LearnedDownSample -> actv -> conv2``;
    the shortcut branch is an optional 1x1 projection followed by a x0.5
    nearest-exact interpolation. Both branches are summed and divided by
    ``sqrt(2)``.
    """

    def __init__(self,
                 dim_in, dim_out):
        """Build the block.

        Args:
            dim_in: number of input channels.
            dim_out: number of output channels; a bias-free 1x1 convolution is
                added to the shortcut only when it differs from ``dim_in``.
        """
        super().__init__()
        self.actv = nn.LeakyReLU(0.2)  # .07 also nice
        self.downsample_res = LearnedDownSample(dim_in)
        self.learned_sc = dim_in != dim_out
        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
        if self.learned_sc:
            self.conv1x1 = spectral_norm(
                nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        """Project (if needed) and halve both spatial axes of ``x``."""
        if self.learned_sc:
            x = self.conv1x1(x)
        # The stride-2 residual conv (kernel 3, padding 1) yields ceil(n / 2)
        # along each axis, whereas interpolating by 0.5 yields floor(n / 2).
        # Replicating the last row / column of an odd-sized axis makes both
        # branches agree. Previously only the time axis was padded, so an odd
        # frequency height raised a shape mismatch in forward().
        if x.shape[2] % 2 != 0:  # [bs, C, Freq, Time] - odd Freq
            x = torch.cat([x, x[:, :, -1:, :]], dim=2)
        if x.shape[3] % 2 != 0:  # [bs, C, Freq, Time] - odd Time
            x = torch.cat([x, x[:, :, :, -1:]], dim=3)
        return F.interpolate(x, scale_factor=.5, mode='nearest-exact')  # F.avg_pool2d(x, 2)

    def _residual(self, x):
        """Residual branch: two spectral-normed 3x3 convs around a learned downsample."""
        x = self.actv(x)
        x = self.conv1(x)
        x = self.downsample_res(x)
        x = self.actv(x)
        x = self.conv2(x)
        return x

    def forward(self, x):
        """Return ``(shortcut(x) + residual(x)) / sqrt(2)``."""
        x = self._shortcut(x) + self._residual(x)
        return x / math.sqrt(2)  # unit variance
class StyleEncoder(nn.Module):
    """Convolutional encoder mapping a 1-channel spectrogram to a style vector.

    Used for both acoustic & prosodic ref_s/p. The trunk is a 3x3 stem
    convolution, four ``ResBlk`` stages that double the channel count up to
    ``max_conv_dim``, and an unpadded 5x5 convolution between two LeakyReLUs.
    The result is averaged over the last (time) axis and projected to
    ``style_dim`` by a linear layer.
    """

    def __init__(self,
                 dim_in=64,
                 style_dim=128,
                 max_conv_dim=512):
        """Build the encoder.

        Args:
            dim_in: channel count produced by the stem convolution.
            style_dim: size of the returned style embedding.
            max_conv_dim: upper bound on the channel count of the ResBlk stages.
        """
        super().__init__()
        width = dim_in
        layers = [spectral_norm(nn.Conv2d(1, width, 3, stride=1, padding=1))]
        for _ in range(4):
            wider = min(width * 2, max_conv_dim)
            layers.append(ResBlk(width, wider))
            width = wider
        layers.extend([
            nn.LeakyReLU(0.24),  # w/o this activation - produces no speech
            spectral_norm(nn.Conv2d(width, width, 5, stride=1, padding=0)),
            nn.LeakyReLU(0.2),  # 0.3 sounds nice
        ])
        self.shared = nn.Sequential(*layers)
        self.unshared = nn.Linear(width, style_dim)

    def forward(self, x):
        """Encode ``x`` (``[bs, 1, Freq, Time]``) into a style embedding.

        Returns:
            Tensor of shape ``[bs, 1, Freq', style_dim]`` where ``Freq'`` is the
            frequency height left after the trunk.
        """
        features = self.shared(x)
        # comment this line for time varying style vector
        pooled = features.mean(3, keepdims=True)
        # Move channels last so the linear layer acts on them.
        return self.unshared(pooled.transpose(1, 3))
class LinearNorm(torch.nn.Module):
    """Thin wrapper holding a single ``torch.nn.Linear`` as ``linear_layer``."""

    def __init__(self, in_dim, out_dim, bias=True):
        """Create the ``in_dim -> out_dim`` projection, with optional bias."""
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_features=in_dim,
                                            out_features=out_dim,
                                            bias=bias)

    def forward(self, x):
        """Apply the linear projection to the last axis of ``x``."""
        projected = self.linear_layer(x)
        return projected
class LayerNorm(nn.Module):
    """Layer normalisation over axis 1 (channels) of a channels-first tensor.

    The input is transposed so that axis 1 becomes the last axis, normalised
    with the learnable scale ``gamma`` and shift ``beta``, and transposed back.
    """

    def __init__(self, channels, eps=1e-5):
        """Create the affine parameters for ``channels`` features.

        Args:
            channels: size of axis 1 of the expected input.
            eps: constant passed to ``F.layer_norm`` for numerical stability.
        """
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` along axis 1 and return a tensor of the same shape."""
        channels_last = torch.transpose(x, 1, -1)
        normalised = F.layer_norm(channels_last,
                                  normalized_shape=(self.channels,),
                                  weight=self.gamma,
                                  bias=self.beta,
                                  eps=self.eps)
        return torch.transpose(normalised, 1, -1)
class TextEncoder(nn.Module):
    """Symbol encoder: embedding -> stacked Conv1d blocks -> bidirectional LSTM.

    Each convolutional block is a weight-normed ``Conv1d`` with padding
    ``(kernel_size - 1) // 2``, followed by a channel ``LayerNorm`` and a
    ``LeakyReLU(0.24)``.
    """

    def __init__(self, channels, kernel_size, depth, n_symbols):
        """Build the encoder.

        Args:
            channels: embedding size and width of every layer.
            kernel_size: kernel size of the 1-D convolutions.
            depth: number of convolutional blocks.
            n_symbols: vocabulary size of the embedding table.
        """
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)
        pad = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList([
            nn.Sequential(
                weight_norm(nn.Conv1d(channels, channels,
                                      kernel_size=kernel_size, padding=pad)),
                LayerNorm(channels),
                nn.LeakyReLU(0.24),
            )
            for _ in range(depth)
        ])
        # Two directions of channels // 2 units each.
        self.lstm = nn.LSTM(channels, channels // 2, 1,
                            batch_first=True, bidirectional=True)

    def forward(self, x):
        """Encode symbol ids ``x`` (``[B, T]``) into the LSTM output sequence."""
        hidden = self.embedding(x).transpose(1, 2)  # [B, T, emb] -> [B, emb, T]
        for conv_block in self.cnn:
            hidden = conv_block(hidden)
        encoded, _ = self.lstm(hidden.transpose(1, 2))  # LSTM is batch_first
        return encoded
class AdaLayerNorm(nn.Module):
    """Layer normalisation whose scale and shift are predicted from a style vector.

    A linear layer maps the style to ``2 * channels`` values; the first half is
    the scale offset ``gamma`` and the second half the shift ``beta``. The
    output is ``(1 + gamma) * layer_norm(x) + beta``.
    """

    def __init__(self, style_dim, channels=None, eps=1e-5):
        """Build the style projection.

        Args:
            style_dim: size of the last axis of the style input ``s``.
            channels: size of the last axis of ``x``. ``None`` keeps the
                historical width of 512 (the argument used to be ignored and
                512 was hard-coded).
            eps: constant passed to ``F.layer_norm``.
        """
        super().__init__()
        self.eps = eps
        width = 512 if channels is None else channels
        self.fc = nn.Linear(style_dim, 2 * width)

    def forward(self, x, s):
        """Normalise ``x`` over its last axis and modulate it with style ``s``.

        Args:
            x: features of shape ``[bs, time, channels]``.
            s: style of shape ``[bs, time, style_dim]``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Derive the width from the projection itself so that modules restored
        # from older pickles (which lack any extra attribute) keep working.
        width = self.fc.out_features // 2
        h = self.fc(s)
        gamma = h[:, :, :width]
        beta = h[:, :, width:2 * width]
        x = F.layer_norm(x, (width, ), eps=self.eps)
        x = (1 + gamma) * x + beta
        return x  # [bs, time, channels]
class ProsodyPredictor(nn.Module):
    """Predicts per-token durations and frame-level F0 / N curves.

    ``forward`` encodes the input with a style-conditioned ``DurationEncoder``,
    derives an integer duration per token, expands the encoding to frame
    resolution with a hard alignment matrix and feeds it to ``F0Ntrain``.
    """

    def __init__(self, style_dim, d_hid, nlayers, max_dur=50):
        """Build the predictor.

        Args:
            style_dim: size of the style vector.
            d_hid: hidden width of the encoder, LSTMs and first AdaIN blocks.
            nlayers: number of LSTM/AdaLayerNorm pairs in the duration encoder.
            max_dur: number of duration logits per token; a token's duration is
                the rounded sum of their sigmoids, so it is at most ``max_dur``.
        """
        super().__init__()
        self.text_encoder = DurationEncoder(sty_dim=style_dim,
                                            d_model=d_hid,
                                            nlayers=nlayers)  # called outside forward
        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2,
                            1, batch_first=True, bidirectional=True)
        self.duration_proj = LinearNorm(d_hid, max_dur)
        self.shared = nn.LSTM(d_hid + style_dim, d_hid //
                              2, 1, batch_first=True, bidirectional=True)
        self.F0 = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim),
            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim),
        ])
        self.N = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim),
            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim)
        ])
        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)

    def F0Ntrain(self, x, s):
        """Run the shared LSTM and the two AdaIN stacks.

        Args:
            x: frame-level features, ``[bs, time, d_hid + style_dim]``.
            s: style vector handed to every ``AdainResBlk1d``
                (presumably ``[bs, style_dim]`` - confirm against the caller).

        Returns:
            ``(F0, N)``, each the single-channel output of a 1x1 ``Conv1d``.
        """
        x, _ = self.shared(x)  # [bs, time, ch] LSTM
        x = x.transpose(1, 2)  # [bs, ch, time] - AdainResBlk1d expects conv1d layout
        F0 = x
        for block in self.F0:
            F0 = block(F0, s)
        F0 = self.F0_proj(F0)
        N = x
        for block in self.N:
            N = block(N, s)
        N = self.N_proj(N)
        return F0, N

    def forward(self, d_en=None, s=None):
        """Predict the alignment and the F0 / N curves for one utterance.

        Durations are taken from batch element 0 only and the alignment has a
        leading dimension of 1, so the ``torch.bmm`` below only accepts a
        batch size of 1.

        Args:
            d_en: encoder features passed to ``DurationEncoder``
                (``[bs, d_hid, time]`` according to that module's comments).
            s: style vector; also determines the device of the alignment.

        Returns:
            ``(aln_trg, F0_pred, N_pred)`` where ``aln_trg`` has shape
            ``[1, total_frames, input_length]`` and contains a single 1 per
            frame, in the column of the token that frame belongs to.
        """
        blend = self.text_encoder(d_en, s)
        x, _ = self.lstm(blend)
        dur = self.duration_proj(x)  # [bs, input_length, max_dur]
        input_length = dur.shape[1]
        # Duration of a token = rounded sum of its sigmoid logits, at least 1 frame.
        dur = torch.sigmoid(dur[0, :, :]).sum(1)
        dur = dur.round().clamp(min=1).to(torch.int64)

        # Hard monotonic alignment: token i owns dur[i] consecutive frames.
        # repeat_interleave gives the owning token of every frame in one call
        # instead of a Python loop that slices with tensor scalars per token.
        total_frames = int(dur.sum())
        aln_trg = torch.zeros(1,
                              total_frames,
                              input_length,
                              device=s.device)
        token_of_frame = torch.repeat_interleave(
            torch.arange(input_length, device=dur.device), dur).to(aln_trg.device)
        frame_index = torch.arange(total_frames, device=aln_trg.device)
        aln_trg[0, frame_index, token_of_frame] = 1

        en = torch.bmm(aln_trg, blend)  # expand token features to frame resolution
        F0_pred, N_pred = self.F0Ntrain(en, s)
        return aln_trg, F0_pred, N_pred
class DurationEncoder(nn.Module):
    """Stack of bidirectional LSTMs interleaved with style-adaptive layer norms.

    ``self.lstms`` alternates ``nn.LSTM`` and ``AdaLayerNorm`` modules. The
    style is concatenated to the features before every LSTM and once more on
    the final output.
    """

    def __init__(self, sty_dim=128, d_model=512, nlayers=3):
        """Build ``nlayers`` LSTM + AdaLayerNorm pairs.

        Args:
            sty_dim: size of the style vector.
            d_model: feature width; each LSTM direction has ``d_model // 2`` units.
            nlayers: number of LSTM/AdaLayerNorm pairs.
        """
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
            recurrent = nn.LSTM(d_model + sty_dim,
                                d_model // 2,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=True)
            self.lstms.append(recurrent)
            self.lstms.append(AdaLayerNorm(sty_dim, d_model))

    def forward(self, x, style):
        """Encode ``x`` conditioned on ``style``.

        Args:
            x: features of shape ``[bs, d_model, time]``.
            style: style vector, tiled along time by ``_tile``.

        Returns:
            Tensor ``[bs, time, d_model + sty_dim]`` - the input layout of
            ``predictor.lstm()``.
        """
        _, _, n_steps = x.shape  # [bs, d_model, time]
        style = _tile(style, length=n_steps).transpose(1, 2)
        x = x.transpose(1, 2)  # time-major features for the batch_first LSTMs
        for layer in self.lstms:
            if not isinstance(layer, AdaLayerNorm):
                # LSTM: expects [bs, time, chan]; bidirectional halves add up to d_model
                x, _ = layer(torch.cat([x, style], dim=2))
            else:
                x = layer(x, style)
        return torch.cat([x, style], dim=2)  # predictor.lstm()