chatterbox-hf / config.json
Manmay's picture
Upload folder using huggingface_hub
9bf7cd5 verified
{
"hiftnet_config": {
"audio_limit": 0.99,
"base_channels": 512,
"f0_predictor_cond_channels": 512,
"f0_predictor_in_channels": 80,
"hidden_size": 512,
"in_channels": 80,
"istft_hop_len": 4,
"istft_n_fft": 16,
"lrelu_slope": 0.1,
"model_type": "hiftnet",
"nb_harmonics": 8,
"nsf_alpha": 0.1,
"nsf_sigma": 0.003,
"nsf_voiced_threshold": 10.0,
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_kernel_sizes": [
3,
7,
11
],
"sampling_rate": 22050,
"source_resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
]
],
"source_resblock_kernel_sizes": [
7,
11
],
"upsample_kernel_sizes": [
16,
16
],
"upsample_rates": [
8,
8
]
},
"is_multilingual": false,
"model_type": "chatterbox",
"s3gen_config": {
"cfm_inference_cfg_rate": 0.7,
"cfm_sigma_min": 1e-06,
"cfm_solver": "euler",
"cfm_t_scheduler": "cosine",
"decoder_act_fn": "gelu",
"decoder_attention_head_dim": 64,
"decoder_channels": [
256
],
"decoder_in_channels": 320,
"decoder_n_blocks": 4,
"decoder_num_heads": 8,
"decoder_num_mid_blocks": 12,
"decoder_out_channels": 80,
"encoder_attention_heads": 8,
"encoder_dropout_rate": 0.1,
"encoder_linear_units": 2048,
"encoder_num_blocks": 6,
"encoder_output_size": 512,
"fmax": 8000,
"fmin": 0,
"hop_length": 480,
"input_frame_rate": 25,
"mel_bins": 80,
"model_type": "s3gen",
"n_fft": 1920,
"pre_lookahead_len": 3,
"sampling_rate": 24000,
"speaker_embed_dim": 192,
"speaker_feat_dim": 80,
"token_embed_dim": 512,
"token_mel_ratio": 2,
"vocab_size": 6561,
"win_size": 1920
},
"t3_config": {
"alignment_layer_idx": 9,
"emotion_adv": true,
"encoder_type": "voice_encoder",
"hidden_size": 1024,
"input_pos_emb": "learned",
"llama_config_dict": {
"attention_bias": false,
"attention_dropout": 0.0,
"attn_implementation": "sdpa",
"head_dim": 64,
"hidden_act": "silu",
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 16,
"num_hidden_layers": 30,
"num_key_value_heads": 16,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"use_cache": true,
"vocab_size": 8
},
"llama_config_name": "Llama_520M",
"max_speech_tokens": 4096,
"max_text_tokens": 2048,
"model_type": "t3",
"perceiver_latent_dim": 1024,
"perceiver_num_heads": 4,
"perceiver_num_latents": 32,
"speaker_embed_size": 256,
"speech_cond_prompt_len": 150,
"speech_tokens_dict_size": 8194,
"start_speech_token": 6561,
"start_text_token": 255,
"stop_speech_token": 6562,
"stop_text_token": 0,
"text_tokens_dict_size": 704,
"use_alignment_analyzer": false,
"use_perceiver_resampler": true
},
"transformers_version": "5.0.0.dev0"
}