| { |
| "adamw_betas_for_muon_others": [ |
| 0.9, |
| 0.95 |
| ], |
| "adamw_eps_for_muon_others": 1e-10, |
| "adamw_lr": 0.00064, |
| "adamw_lr_for_muon_others": 0.000267, |
| "adamw_max_grad_norm": 0.5, |
| "adamw_weight_decay": 0.01, |
| "adamw_weight_decay_for_muon_others": 0.05, |
| "adjust_learning_rate_for_accumulation": true, |
| "architectures": [ |
| "ViralBERTForSequenceClassification" |
| ], |
| "attention_head_size": 64, |
| "attention_probs_dropout_prob": 0.0, |
| "batch_size": 256, |
| "class_weights": null, |
| "classifier_dropout_prob": 0.1, |
| "cls_token_id": 1, |
| "compile_backend": "inductor", |
| "compile_fullgraph": true, |
| "compile_mode": "default", |
| "data_dir": "", |
| "dtype": "float32", |
| "fasta_file": "", |
| "feed_forward_activation": "swiglu", |
| "filter_n": false, |
| "fp16": true, |
| "freeze_bert_layers": 8, |
| "global_attn_every_n_layers": 0, |
| "global_max_grad_norm": 1.0, |
| "gradient_accumulation_steps": 8, |
| "hidden_dropout_prob": 0.0, |
| "hidden_size": 768, |
| "high_lr_multiplier": 1.0, |
| "high_lr_steps_ratio": 0.0, |
| "id2label": { |
| "0": "bac", |
| "1": "virus" |
| }, |
| "initializer_range": 0.02, |
| "intermediate_size": 2048, |
| "label2id": { |
| "bac": 0, |
| "virus": 1 |
| }, |
| "label_smoothing_factor": 0.1, |
| "layer_norm_eps": 1e-12, |
| "logging_steps": 1000, |
| "loss_type": "ce", |
| "lr_scheduler_type": "cosine", |
| "mask_token_id": 3, |
| "masking_strategy": "structural", |
| "max_eval_samples": 2048, |
| "max_steps_for_sweep": null, |
| "min_lr_ratio": 0.05, |
| "mlm_probability": 0.15, |
| "model_type": "viralbert_for_sequence_classification", |
| "muon_lr": 0.0015, |
| "muon_max_grad_norm": 1.0, |
| "muon_momentum": 0.95, |
| "muon_weight_decay": 0.05, |
| "n_token_id": 9, |
| "norm_layer_type": "rmsnorm", |
| "num_attention_heads": 12, |
| "num_hidden_layers": 12, |
| "num_train_epochs": 3, |
| "num_workers": 4, |
| "optimizer_type": "muon_adamw", |
| "p_codon": 0.5, |
| "pad_token_id": 0, |
| "pos_weight": null, |
| "position_embedding_type": "rope", |
| "resume_from_checkpoint": null, |
| "resume_mode": null, |
| "reverse_complement_prob": 0.5, |
| "rope_interpolation_factor": 1.0, |
| "run_name": "", |
| "save_steps": 10000, |
| "save_total_limit": 5, |
| "scale_loss_for_accumulation": true, |
| "seed": 42, |
| "sep_token_id": 2, |
| "seq_length": 512, |
| "seq_mask_prob": 0.5, |
| "seq_mask_ratio": 0.15, |
| "sliding_window_size": 0, |
| "stride": 256, |
| "sweep_early_stopping_patience_steps": 1000, |
| "sweep_early_stopping_threshold": 50.0, |
| "tie_word_embeddings": false, |
| "transformers_version": "4.56.1", |
| "use_compile": true, |
| "use_per_group_clipping": false, |
| "use_qk_norm": true, |
| "use_seq_augment": true, |
| "use_xpos": false, |
| "vocab_size": 14, |
| "wandb_enabled": true, |
| "wandb_group": "", |
| "wandb_name": "", |
| "wandb_notes": "", |
| "wandb_project": "", |
| "wandb_tags": [], |
| "wandb_watch_freq": null, |
| "wandb_watch_model": false, |
| "warmup_steps": 4000, |
| "warmup_steps_ratio": 0.1 |
| } |
|
|