# DeepLatent Base Model
This is a DeepLatent model checkpoint from the base training stage.
## Model Details
- Stage: base
- Model Tag: deeplatent_200m
- Step: latest (step 30, per the `"step"` field in the training metadata below)
- Architecture: DeepLatent (Continuous Thought Machine)
## Usage
from models.DeepLatent.DeepLatent import DeepLatent
from models.DeepLatent.tokenizer import get_tokenizer
# Load the pretrained checkpoint identified by "almaghrabima/deeplatent-base"
model = DeepLatent.from_pretrained("almaghrabima/deeplatent-base")
# Build the tokenizer with the project's get_tokenizer() helper
tokenizer = get_tokenizer() # or load from the tokenizer directory in the repo
# Tokenize a sample string and pass the result straight to the model.
# NOTE(review): this assumes the tokenizer is callable on a raw string and that
# its return value is accepted as-is by the model's call — neither is shown
# here; confirm whether a tensor conversion or batch dimension is required.
inputs = tokenizer("Hello, world!")
outputs = model(inputs)
## Training Details
Training metadata:
{
"step": 30,
"val_bpb": 3.078201573447804,
"model_config": {
"sequence_len": 1024,
"vocab_size": 64663,
"iterations": 16,
"d_model": 384,
"d_input": 1024,
"heads": 8,
"n_synch_out": 192,
"n_synch_action": 192,
"synapse_depth": 8,
"memory_length": 8,
"deep_nlms": true,
"memory_hidden_dims": 64,
"do_layernorm_nlm": false,
"predict_n_tokens": 1,
"dropout": 0.05,
"dropout_nlm": null,
"nlm_chunk_size": 1024,
"neuron_select_type": "random-pairing",
"n_random_pairing_self": 0,
"sub_task": "plasticDeepLatent",
"attention": {
"vocab_size": 64663,
"emb_dim": 1024,
"hidden_dim": 3072,
"n_layers": 12,
"n_heads": 8,
"n_kv_groups": 4,
"head_dim": 128,
"context_length": 1024,
"rope_base": 1000000.0,
"qk_norm": true,
"attention_dropout": 0.0,
"attention_bias": false,
"attn_implementation": "sdpa"
},
"prediction_iterations": [
4,
8,
12,
16
],
"ensemble_weights": [
0.1,
0.2,
0.3,
0.4
],
"iteration_residual_interval": 4,
"iteration_residual_scale": 0.3,
"iteration_checkpoint": true,
"iteration_checkpoint_interval": 8,
"exit_gate_enabled": true,
"exit_gate_hidden_size": 0,
"exit_gate_min_iterations": 1,
"exit_gate_entropy_lambda": 0.01,
"exit_gate_stage2": false,
"exit_gate_stage2_threshold": 0.05,
"exit_gate_stage2_steepness": 50.0,
"no_ctm": false,
"standard_decoder": false,
"causal_ctm": true,
"use_spectral_norm": true,
"plastic_weight_clip": 1.0,
"iteration_grad_scale": true,
"o1_memory": true,
"plastic_ode": true,
"nlm_variant": "gated",
"chunked_ctm": true,
"chunked_ctm_num_chunks": 4,
"chunked_ctm_sync_interval": 8
},
"user_config": {
"run": "dummy",
"device_type": "",
"depth": 12,
"max_seq_len": 1024,
"head_dim": 128,
"rope_base": 1000000.0,
"qk_norm": true,
"attention_dropout": 0.0,
"attention_bias": false,
"iterations": 16,
"d_model": 384,
"d_input": 1024,
"heads": 8,
"n_kv_heads": 4,
"hidden_dim": 3072,
"n_synch_out": 192,
"n_synch_action": 192,
"synapse_depth": 8,
"memory_length": 8,
"deep_nlms": true,
"memory_hidden_dims": 64,
"do_layernorm_nlm": false,
"dropout": 0.05,
"dropout_nlm": null,
"nlm_chunk_size": 1024,
"neuron_select_type": "random-pairing",
"n_random_pairing_self": 0,
"sub_task": "plasticDeepLatent",
"num_iterations": 30,
"target_flops": -1.0,
"target_param_data_ratio": 20.0,
"device_batch_size": 8,
"total_batch_size": -1,
"embedding_lr": 0.2,
"unembedding_lr": 0.004,
"weight_decay": 0.0,
"matrix_lr": 0.02,
"grad_clip": 1.0,
"grad_norm_lr_backoff_threshold": 3.0,
"grad_norm_lr_backoff_min_multiplier": 0.25,
"warmup_ratio": 0.1,
"warmdown_ratio": 0.3,
"final_lr_frac": 0.0,
"resume_from_step": -1,
"eval_every": 500,
"eval_tokens": 524288,
"core_metric_every": 4000,
"core_metric_max_per_task": 500,
"sample_every": 1000,
"save_every": 2000,
"model_tag": "deeplatent_200m",
"predict_n_tokens": 1,
"curriculum_lookahead": 5,
"use_most_certain": true,
"ultra_ctm": true,
"ultra_ctm_prediction_iterations": "4,8,12,16",
"ultra_ctm_ensemble_weights": "0.1,0.2,0.3,0.4",
"ultra_ctm_residual_interval": 4,
"ultra_ctm_residual_scale": 0.3,
"ultra_ctm_checkpoint": true,
"ultra_ctm_checkpoint_interval": 8,
"drope": true,
"drope_start_ratio": 0.875,
"use_enhanced_loss": true,
"use_focal_loss": true,
"focal_gamma": 2.0,
"focal_alpha": 1.0,
"sync_entropy_weight": 0.01,
"sync_smoothness_weight": 0.01,
"optimizer": "sophia",
"sophia_rho": 0.03,
"sophia_betas": "0.965,0.99",
"sophia_hessian_freq": 5,
"layer_wise_lr_decay": 0.9,
"lr_scaling_mode": "disabled",
"lr_scaling_ref_budget": 0.0,
"lr_scaling_floor": 0.5,
"lr_scaling_ceiling": 2.0,
"scale_grad_clip": false,
"grad_clip_scale_ceiling": 2.0,
"lr_scaling_dry_run": false,
"no_compile": false,
"use_1cycle_lr": false,
"use_safetensors": false,
"use_spectral_norm": true,
"plastic_weight_clip": 1.0,
"iteration_grad_scale": true
},
"device_batch_size": 8,
"max_seq_len": 1024,
"dataloader_state_dict": {
"pq_idx": 0,
"rg_idx": 0
},
"resolved_lrs": {
"stage": "pretraining",
"mode": "disabled",
"total_tokens": 15728640,
"num_scaling_params": 151220224,
"matrix_lr": 0.02,
"embedding_lr": 0.2,
"unembedding_lr": 0.004,
"grad_clip": 1.0,
"scale": 1.0,
"lr_scale": 1.0,
"clip_scale": 1.0,
"scale_floor": 0.5,
"scale_ceiling": 2.0,
"scale_grad_clip": false,
"grad_clip_scale_ceiling": 2.0,
"budget_tokens_per_param": null,
"ref_budget_tokens_per_param": null
},
"loop_state": {
"min_val_bpb": 3.078201573447804,
"smooth_train_loss": 22.523515959332077,
"total_training_time": 539.9748866558075,
"drope_activated": true,
"grad_norm_ema": 0.7607746757361753,
"grad_norm_ema_step": 19
}
}
- Downloads last month: 4
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support