{
  "architectures": ["ZipVoice"],
  "model_type": "zipvoice",
  "library_name": "pytorch",
  "pipeline_tag": "text-to-speech",
  "checkpoint": "checkpoint-1860000.pt",
  "dataset_hours": 7000,
  "tokenizer": {
    "type": "SimpleTokenizer",
    "level": "character",
    "vocab_size": 244,
    "token_file": "tokens.txt"
  },
  "text_normalizer": {
    "package": "soe-vinorm",
    "enabled_by_default": true,
    "postprocess": "remove extra spaces around punctuation"
  },
  "reference_audio": {
    "directory": "audio",
    "count": 30,
    "text_format": "sidecar_txt_same_basename"
  },
  "demo": {
    "directory": "demo",
    "sample_count": 3
  },
  "model": {
    "fm_decoder_downsampling_factor": [1, 2, 4, 2, 1],
    "fm_decoder_num_layers": [2, 2, 4, 4, 4],
    "fm_decoder_cnn_module_kernel": [31, 15, 7, 15, 31],
    "fm_decoder_feedforward_dim": 1536,
    "fm_decoder_num_heads": 4,
    "fm_decoder_dim": 512,
    "text_encoder_num_layers": 4,
    "text_encoder_feedforward_dim": 512,
    "text_encoder_cnn_module_kernel": 9,
    "text_encoder_num_heads": 4,
    "text_encoder_dim": 192,
    "query_head_dim": 32,
    "value_head_dim": 12,
    "pos_head_dim": 4,
    "pos_dim": 48,
    "time_embed_dim": 192,
    "text_embed_dim": 192,
    "feat_dim": 100
  },
  "feature": {
    "sampling_rate": 24000,
    "type": "vocos"
  }
}