Text-to-Speech
ZONOS2
File size: 2,056 Bytes
9fcd0e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
{
  "model_type": "zonos2",
  "dtype": "bfloat16",
  "n_layers": 28,
  "dim": 2048,
  "head_dim": 128,
  "n_heads": null,
  "n_kv_heads": 4,
  "ffn_dim_multiplier": 1.5,
  "multiple_of": 256,
  "norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "max_seqlen": 6144,
  "n_codebooks": 9,
  "codebook_size": 1024,
  "eoa_id": 1024,
  "audio_pad_id": 1025,
  "text_vocab": 519,
  "loss_softcap": 15.0,
  "speaker_enabled": true,
  "speaker_embedding_dim": 2048,
  "speaker_lda_dim": 1024,
  "speaker_background_token_enabled": true,
  "accurate_mode_token_enabled": true,
  "speaking_rate_num_buckets": 8,
  "speaking_rate_buckets": ["0-8", "8-11", "11-14", "14-17", "17-21", "21-28", "28-40", "40+"],
  "quality_num_buckets": 60,
  "quality_features": [
    "lufs",
    "estimated_snr",
    "max_pause",
    "estimated_bandlimit_hz",
    "leading_silence_s",
    "trailing_silence_s"
  ],
  "quality_buckets": {
    "lufs": ["-1000--50", "-50--45.5", "-45.5--41", "-41--36.5", "-36.5--32", "-32--27.5", "-27.5--23", "-23--18.5", "-18.5--14", "-14--9.5", "-9.5--5", "-5+"],
    "estimated_snr": ["-1000-0", "0-6", "6-12", "12-18", "18-24", "24-30", "30-36", "36-42", "42-48", "48-54", "54-60", "60+"],
    "max_pause": ["0-0.5", "0.5-1", "1-1.5", "1.5-2", "2-2.5", "2.5-3", "3-3.5", "3.5-4", "4-4.5", "4.5-5", "5-5.5", "5.5-6"],
    "estimated_bandlimit_hz": ["495.3-3433", "3433-6371", "6371-9310", "9310-12248", "12248-15186", "15186-18124", "18124-21062", "21062-24000"],
    "leading_silence_s": ["0-0.05", "0.05-0.1", "0.1-0.25", "0.25-0.5", "0.5-1", "1-2", "2-4", "4+"],
    "trailing_silence_s": ["0-0.05", "0.05-0.1", "0.1-0.25", "0.25-0.5", "0.5-1", "1-2", "2-4", "4+"]
  },
  "quality_dropout": {
    "lufs": 0.25,
    "estimated_snr": 0.25,
    "max_pause": 0.25,
    "estimated_bandlimit_hz": 0.25,
    "leading_silence_s": 0.25,
    "trailing_silence_s": 0.25
  },
  "moe_impl": "sonic",
  "moe_n_experts": 16,
  "moe_router_topk": 1,
  "special_topk_layers": {"26": 2},
  "moe_router_dim": 128,
  "moe_start_from_layer": 3,
  "moe_end_from_layer": 1
}