Upload folder using huggingface_hub
- README.md +60 -1
- model.safetensors +2 -2

README.md CHANGED
@@ -28,6 +28,63 @@ pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, trust_remot
 print(pipe('Write an article about Artificial Intelligence.'))
 ```
 
+### Printing the model:
+
+```text
+MiniMaxM1ForCausalLM(
+  (model): MiniMaxM1Model(
+    (embed_tokens): Embedding(200064, 64)
+    (layers): ModuleList(
+      (0): MiniMaxM1DecoderLayer(
+        (self_attn): MiniMaxM1LightningAttention(
+          (out_proj): Linear(in_features=64, out_features=64, bias=False)
+          (norm): MiniMaxM1RMSNorm()
+          (qkv_proj): Linear(in_features=64, out_features=192, bias=False)
+          (output_gate): Linear(in_features=64, out_features=64, bias=False)
+        )
+        (block_sparse_moe): MiniMaxM1SparseMoeBlock(
+          (gate): Linear(in_features=64, out_features=8, bias=False)
+          (experts): ModuleList(
+            (0-7): 8 x MiniMaxM1BlockSparseTop2MLP(
+              (w1): Linear(in_features=64, out_features=128, bias=False)
+              (w2): Linear(in_features=128, out_features=64, bias=False)
+              (w3): Linear(in_features=64, out_features=128, bias=False)
+              (act_fn): SiLU()
+            )
+          )
+        )
+        (input_layernorm): MiniMaxM1RMSNorm()
+        (post_attention_layernorm): MiniMaxM1RMSNorm()
+      )
+      (1): MiniMaxM1DecoderLayer(
+        (self_attn): MiniMaxM1FlashAttention2(
+          (q_proj): Linear(in_features=64, out_features=64, bias=False)
+          (k_proj): Linear(in_features=64, out_features=32, bias=False)
+          (v_proj): Linear(in_features=64, out_features=32, bias=False)
+          (o_proj): Linear(in_features=64, out_features=64, bias=False)
+          (rotary_emb): MiniMaxM1RotaryEmbedding()
+        )
+        (block_sparse_moe): MiniMaxM1SparseMoeBlock(
+          (gate): Linear(in_features=64, out_features=8, bias=False)
+          (experts): ModuleList(
+            (0-7): 8 x MiniMaxM1BlockSparseTop2MLP(
+              (w1): Linear(in_features=64, out_features=128, bias=False)
+              (w2): Linear(in_features=128, out_features=64, bias=False)
+              (w3): Linear(in_features=64, out_features=128, bias=False)
+              (act_fn): SiLU()
+            )
+          )
+        )
+        (input_layernorm): MiniMaxM1RMSNorm()
+        (post_attention_layernorm): MiniMaxM1RMSNorm()
+      )
+    )
+    (norm): MiniMaxM1RMSNorm()
+  )
+  (lm_head): Linear(in_features=64, out_features=200064, bias=False)
+)
+```
+
 ### Codes to create this repo:
 
 ```python
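The printout above shows the hybrid layout of this tiny two-layer model: layer 0 uses MiniMaxM1LightningAttention (linear attention) while layer 1 uses MiniMaxM1FlashAttention2 (softmax attention), and both layers carry an 8-expert top-2 sparse MoE block. The tree can be regenerated with a short snippet (a minimal sketch; the repo id is a placeholder for wherever this repo is hosted):

```python
from transformers import AutoModelForCausalLM

# Placeholder repo id; substitute the actual location of this repo.
model = AutoModelForCausalLM.from_pretrained(
    "your-username/tiny-random-minimax-m1", trust_remote_code=True
)
print(model)  # emits the module tree shown in the hunk above
```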
@@ -81,7 +138,9 @@ automap = config_json['auto_map']
 torch.set_default_dtype(torch.bfloat16)
 model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
 torch.set_default_dtype(torch.float32)
-
+# according to the source model, the gate is in FP32
+for i in range(config.num_hidden_layers):
+    model.model.layers[i].block_sparse_moe.gate.float()
 if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
     model.generation_config = GenerationConfig.from_pretrained(
         source_model_id, trust_remote_code=True,
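The second hunk builds the model with bfloat16 as the default dtype, restores float32, and then casts each MoE router gate back to FP32 to match the source model. A quick way to confirm that only the gates ended up in full precision (a sketch, assuming the `model` object built by the repo-creation code):

```python
import torch

# After the per-layer cast, the only float32 parameters should be
# the MoE router gate weights.
for name, param in model.named_parameters():
    if param.dtype == torch.float32:
        print(name)  # e.g. model.layers.0.block_sparse_moe.gate.weight
```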
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9989590c725ca73f9d96e3da308207df3bab221cd046c452506bc50f9ad59770
+size 26472672
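The model.safetensors entry is a Git LFS pointer, so the diff records only the new object's SHA-256 digest and byte size (26472672 bytes, roughly 26 MB) rather than the binary itself. A downloaded copy can be checked against the pointer like this (a sketch; the local path is a placeholder):

```python
import hashlib

# Stream the file in 1 MiB chunks and compare the digest with the pointer's oid.
sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
print(sha.hexdigest())
# expected: 9989590c725ca73f9d96e3da308207df3bab221cd046c452506bc50f9ad59770
```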