DeepLatent Base Model

This is a DeepLatent model checkpoint from the base training stage.

Model Details

Stage: base
Model Tag: deeplatent_200m
Step: latest (the training metadata below records step 30)
Architecture: DeepLatent (Continuous Thought Machine)

Usage

from models.DeepLatent.DeepLatent import DeepLatent
from models.DeepLatent.tokenizer import get_tokenizer

# Load model
model = DeepLatent.from_pretrained("almaghrabima/deeplatent-base")

# Load tokenizer
tokenizer = get_tokenizer()  # or load from the tokenizer directory in the repo

# Use model
inputs = tokenizer("Hello, world!")
outputs = model(inputs)

Training Details

Training metadata:

{
  "step": 30,
  "val_bpb": 3.078201573447804,
  "model_config": {
    "sequence_len": 1024,
    "vocab_size": 64663,
    "iterations": 16,
    "d_model": 384,
    "d_input": 1024,
    "heads": 8,
    "n_synch_out": 192,
    "n_synch_action": 192,
    "synapse_depth": 8,
    "memory_length": 8,
    "deep_nlms": true,
    "memory_hidden_dims": 64,
    "do_layernorm_nlm": false,
    "predict_n_tokens": 1,
    "dropout": 0.05,
    "dropout_nlm": null,
    "nlm_chunk_size": 1024,
    "neuron_select_type": "random-pairing",
    "n_random_pairing_self": 0,
    "sub_task": "plasticDeepLatent",
    "attention": {
      "vocab_size": 64663,
      "emb_dim": 1024,
      "hidden_dim": 3072,
      "n_layers": 12,
      "n_heads": 8,
      "n_kv_groups": 4,
      "head_dim": 128,
      "context_length": 1024,
      "rope_base": 1000000.0,
      "qk_norm": true,
      "attention_dropout": 0.0,
      "attention_bias": false,
      "attn_implementation": "sdpa"
    },
    "prediction_iterations": [
      4,
      8,
      12,
      16
    ],
    "ensemble_weights": [
      0.1,
      0.2,
      0.3,
      0.4
    ],
    "iteration_residual_interval": 4,
    "iteration_residual_scale": 0.3,
    "iteration_checkpoint": true,
    "iteration_checkpoint_interval": 8,
    "exit_gate_enabled": true,
    "exit_gate_hidden_size": 0,
    "exit_gate_min_iterations": 1,
    "exit_gate_entropy_lambda": 0.01,
    "exit_gate_stage2": false,
    "exit_gate_stage2_threshold": 0.05,
    "exit_gate_stage2_steepness": 50.0,
    "no_ctm": false,
    "standard_decoder": false,
    "causal_ctm": true,
    "use_spectral_norm": true,
    "plastic_weight_clip": 1.0,
    "iteration_grad_scale": true,
    "o1_memory": true,
    "plastic_ode": true,
    "nlm_variant": "gated",
    "chunked_ctm": true,
    "chunked_ctm_num_chunks": 4,
    "chunked_ctm_sync_interval": 8
  },
  "user_config": {
    "run": "dummy",
    "device_type": "",
    "depth": 12,
    "max_seq_len": 1024,
    "head_dim": 128,
    "rope_base": 1000000.0,
    "qk_norm": true,
    "attention_dropout": 0.0,
    "attention_bias": false,
    "iterations": 16,
    "d_model": 384,
    "d_input": 1024,
    "heads": 8,
    "n_kv_heads": 4,
    "hidden_dim": 3072,
    "n_synch_out": 192,
    "n_synch_action": 192,
    "synapse_depth": 8,
    "memory_length": 8,
    "deep_nlms": true,
    "memory_hidden_dims": 64,
    "do_layernorm_nlm": false,
    "dropout": 0.05,
    "dropout_nlm": null,
    "nlm_chunk_size": 1024,
    "neuron_select_type": "random-pairing",
    "n_random_pairing_self": 0,
    "sub_task": "plasticDeepLatent",
    "num_iterations": 30,
    "target_flops": -1.0,
    "target_param_data_ratio": 20.0,
    "device_batch_size": 8,
    "total_batch_size": -1,
    "embedding_lr": 0.2,
    "unembedding_lr": 0.004,
    "weight_decay": 0.0,
    "matrix_lr": 0.02,
    "grad_clip": 1.0,
    "grad_norm_lr_backoff_threshold": 3.0,
    "grad_norm_lr_backoff_min_multiplier": 0.25,
    "warmup_ratio": 0.1,
    "warmdown_ratio": 0.3,
    "final_lr_frac": 0.0,
    "resume_from_step": -1,
    "eval_every": 500,
    "eval_tokens": 524288,
    "core_metric_every": 4000,
    "core_metric_max_per_task": 500,
    "sample_every": 1000,
    "save_every": 2000,
    "model_tag": "deeplatent_200m",
    "predict_n_tokens": 1,
    "curriculum_lookahead": 5,
    "use_most_certain": true,
    "ultra_ctm": true,
    "ultra_ctm_prediction_iterations": "4,8,12,16",
    "ultra_ctm_ensemble_weights": "0.1,0.2,0.3,0.4",
    "ultra_ctm_residual_interval": 4,
    "ultra_ctm_residual_scale": 0.3,
    "ultra_ctm_checkpoint": true,
    "ultra_ctm_checkpoint_interval": 8,
    "drope": true,
    "drope_start_ratio": 0.875,
    "use_enhanced_loss": true,
    "use_focal_loss": true,
    "focal_gamma": 2.0,
    "focal_alpha": 1.0,
    "sync_entropy_weight": 0.01,
    "sync_smoothness_weight": 0.01,
    "optimizer": "sophia",
    "sophia_rho": 0.03,
    "sophia_betas": "0.965,0.99",
    "sophia_hessian_freq": 5,
    "layer_wise_lr_decay": 0.9,
    "lr_scaling_mode": "disabled",
    "lr_scaling_ref_budget": 0.0,
    "lr_scaling_floor": 0.5,
    "lr_scaling_ceiling": 2.0,
    "scale_grad_clip": false,
    "grad_clip_scale_ceiling": 2.0,
    "lr_scaling_dry_run": false,
    "no_compile": false,
    "use_1cycle_lr": false,
    "use_safetensors": false,
    "use_spectral_norm": true,
    "plastic_weight_clip": 1.0,
    "iteration_grad_scale": true
  },
  "device_batch_size": 8,
  "max_seq_len": 1024,
  "dataloader_state_dict": {
    "pq_idx": 0,
    "rg_idx": 0
  },
  "resolved_lrs": {
    "stage": "pretraining",
    "mode": "disabled",
    "total_tokens": 15728640,
    "num_scaling_params": 151220224,
    "matrix_lr": 0.02,
    "embedding_lr": 0.2,
    "unembedding_lr": 0.004,
    "grad_clip": 1.0,
    "scale": 1.0,
    "lr_scale": 1.0,
    "clip_scale": 1.0,
    "scale_floor": 0.5,
    "scale_ceiling": 2.0,
    "scale_grad_clip": false,
    "grad_clip_scale_ceiling": 2.0,
    "budget_tokens_per_param": null,
    "ref_budget_tokens_per_param": null
  },
  "loop_state": {
    "min_val_bpb": 3.078201573447804,
    "smooth_train_loss": 22.523515959332077,
    "total_training_time": 539.9748866558075,
    "drope_activated": true,
    "grad_norm_ema": 0.7607746757361753,
    "grad_norm_ema_step": 19
  }
}

Downloads last month: 2

Safetensors

Model size: 0.2B params

Tensor type: I64, F32

Inference Providers NEW

This model isn't deployed by any Inference Provider. 🙋 Ask for provider support