ViZipvoice / config.json
dinhthuan's picture
Update latest checkpoint 1860k: config.json
6b63e62 verified
{
"architectures": [
"ZipVoice"
],
"model_type": "zipvoice",
"library_name": "pytorch",
"pipeline_tag": "text-to-speech",
"checkpoint": "checkpoint-1860000.pt",
"dataset_hours": 7000,
"tokenizer": {
"type": "SimpleTokenizer",
"level": "character",
"vocab_size": 244,
"token_file": "tokens.txt"
},
"text_normalizer": {
"package": "soe-vinorm",
"enabled_by_default": true,
"postprocess": "remove extra spaces around punctuation"
},
"reference_audio": {
"directory": "audio",
"count": 30,
"text_format": "sidecar_txt_same_basename"
},
"demo": {
"directory": "demo",
"sample_count": 3
},
"model": {
"fm_decoder_downsampling_factor": [
1,
2,
4,
2,
1
],
"fm_decoder_num_layers": [
2,
2,
4,
4,
4
],
"fm_decoder_cnn_module_kernel": [
31,
15,
7,
15,
31
],
"fm_decoder_feedforward_dim": 1536,
"fm_decoder_num_heads": 4,
"fm_decoder_dim": 512,
"text_encoder_num_layers": 4,
"text_encoder_feedforward_dim": 512,
"text_encoder_cnn_module_kernel": 9,
"text_encoder_num_heads": 4,
"text_encoder_dim": 192,
"query_head_dim": 32,
"value_head_dim": 12,
"pos_head_dim": 4,
"pos_dim": 48,
"time_embed_dim": 192,
"text_embed_dim": 192,
"feat_dim": 100
},
"feature": {
"sampling_rate": 24000,
"type": "vocos"
}
}