contextboxai
/

ViZipvoice

Model card Files Files and versions

ViZipvoice / config.json

dinhthuan's picture

Update latest checkpoint 1860k: config.json

6b63e62 verified 1 day ago

history blame contribute delete

1.46 kB

	{
	"architectures": [
	"ZipVoice"
	],
	"model_type": "zipvoice",
	"library_name": "pytorch",
	"pipeline_tag": "text-to-speech",
	"checkpoint": "checkpoint-1860000.pt",
	"dataset_hours": 7000,
	"tokenizer": {
	"type": "SimpleTokenizer",
	"level": "character",
	"vocab_size": 244,
	"token_file": "tokens.txt"
	},
	"text_normalizer": {
	"package": "soe-vinorm",
	"enabled_by_default": true,
	"postprocess": "remove extra spaces around punctuation"
	},
	"reference_audio": {
	"directory": "audio",
	"count": 30,
	"text_format": "sidecar_txt_same_basename"
	},
	"demo": {
	"directory": "demo",
	"sample_count": 3
	},
	"model": {
	"fm_decoder_downsampling_factor": [
	1,
	2,
	4,
	2,
	1
	],
	"fm_decoder_num_layers": [
	2,
	2,
	4,
	4,
	4
	],
	"fm_decoder_cnn_module_kernel": [
	31,
	15,
	7,
	15,
	31
	],
	"fm_decoder_feedforward_dim": 1536,
	"fm_decoder_num_heads": 4,
	"fm_decoder_dim": 512,
	"text_encoder_num_layers": 4,
	"text_encoder_feedforward_dim": 512,
	"text_encoder_cnn_module_kernel": 9,
	"text_encoder_num_heads": 4,
	"text_encoder_dim": 192,
	"query_head_dim": 32,
	"value_head_dim": 12,
	"pos_head_dim": 4,
	"pos_dim": 48,
	"time_embed_dim": 192,
	"text_embed_dim": 192,
	"feat_dim": 100
	},
	"feature": {
	"sampling_rate": 24000,
	"type": "vocos"
	}
	}