# Training configuration for the "basic_reference_200m" run.
# NOTE(review): original file had every key/value pair on one physical line,
# which is invalid TOML 1.0 (pairs must be newline-separated). Reflowed only;
# all keys, values, and their order are unchanged.

# --- Model architecture ---
model_name = "basic_reference_200m"
n_layers = 2
d_model = 512
d_mlp = 2048
d_head = 64
n_heads = 8
attn_only = false
layer_norm_eps = 1e-05
init_range = 0.02
n_ctx = 1024
d_vocab = 48262

# --- Data & tokenization ---
dataset_name = "eoinf/unprocessed-c4-code-test"
tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits"

# --- Runtime / hardware ---
seed = 10
device = "cuda"
use_bfloat16_matmul = false
batch_size_per_device = 32
n_devices = 1
batches_per_step = 1  # presumably gradient-accumulation steps per optimizer step — confirm in trainer

# --- Optimization ---
max_tokens = 200_000_000      # total training budget, in tokens (presumably)
lr_hidden = 0.002
lr_vector = 0.001
lr_schedule = "constant_with_warmup"
warmup_tokens = 30_000_000    # warmup measured in tokens, matching max_tokens
weight_decay = 0.05
grad_norm_clip = 1.0
train_loss_moving_average_beta = 0.99

# --- Logging & checkpointing ---
log_interval = 25
save_checkpoints = true
checkpoint_interval = 1000
checkpoint_interval_ratio = 1.10  # NOTE(review): interaction with checkpoint_interval not visible here — verify against trainer
save_log_checkpoints = true