P2 batch: pangu_pro_moe, iquest_coder, minicpm, step3_5, mimo_v2, llada2-uni, emu3 + fixes
scan.py CHANGED
@@ -76,9 +76,10 @@ KNOWN_BASES = {
         "model_type_patterns": ["qwen3"],
     },
     "qwen2": {
-        "name": "Qwen2.5",
-        "vocab_size": [151936, 152064],
+        "name": "Qwen2.5 (incl. VL)",
+        "vocab_size": [151936, 152064, 151680],
         "model_type_patterns": ["qwen2"],
+        # 151680 = MiMo-Embodied-7B uses Qwen2.5-VL backbone with this vocab
     },
     "llama3": {
         "name": "Llama 3.x",
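Note on how these entries are consumed downstream: a minimal sketch of the matching, assuming a detect_base(config) helper (illustrative name, not necessarily the real scan.py API). vocab_size may be an int or a list, and model_type is lowercased before substring matching, which is what the pangu_pro_moe entry below relies on.

    def detect_base(config: dict) -> str | None:
        """Return the first KNOWN_BASES key whose vocab and model_type match."""
        model_type = config.get("model_type", "").lower()  # "PanguProMoE" -> "pangupromoe"
        vocab = config.get("vocab_size")
        for key, entry in KNOWN_BASES.items():
            sizes = entry["vocab_size"]
            sizes = sizes if isinstance(sizes, list) else [sizes]  # int-or-list field
            if vocab in sizes and any(p in model_type for p in entry["model_type_patterns"]):
                return key
        return None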
@@ -134,10 +135,11 @@ KNOWN_BASES = {
     },
     "llada2": {
         "name": "inclusionAI LLaDA2 (discrete-diffusion MoE)",
-        "vocab_size": 157184,
+        "vocab_size": [157184, 173568],
         "model_type_patterns": ["llada2_moe", "llada2"],
-        #
-        #
+        # 157184 = text-only discrete diffusion (flash, base)
+        # 173568 = Uni any-to-any variant; adds ~16K image codebook tokens to vocab
+        # Non-autoregressive masked LM; separate family from Bailing-V2 by training paradigm
     },
     "kimi": {
         "name": "Moonshot Kimi (K2, Kimi-Linear)",
@@ -162,6 +164,39 @@ KNOWN_BASES = {
         "vocab_size": 153216,
         "model_type_patterns": ["interns1"],
     },
+    "pangu_pro_moe": {
+        "name": "FreedomIntelligence Pangu-R (Huawei Pangu-Pro-MoE)",
+        "vocab_size": 153600,
+        "model_type_patterns": ["pangupromoe"],
+        # model_type in config is "PanguProMoE"; lowercased to pangupromoe for matching
+        # MoE 80/8, first_k_dense_replace=4, hidden=4608, layers=50
+    },
+    "iquest_coder": {
+        "name": "IQuest-Coder",
+        "vocab_size": 76800,
+        "model_type_patterns": ["iquestcoder"],
+        # Code-specialized tokenizer (76800 = code-token-dense). Dense GQA 32→2.
+        # Same family across 7B (14 layers) and 40B (80 layers).
+    },
+    "minicpm": {
+        "name": "OpenBMB MiniCPM",
+        "vocab_size": 73448,
+        "model_type_patterns": ["minicpm"],
+        # MiniCPM family (AgentCPM-Report etc.). Heavy GQA 32→2.
+    },
+    "step3_5": {
+        "name": "StepFun Step-3.5 Flash",
+        "vocab_size": [128815, 128896],
+        "model_type_patterns": ["step3p5"],
+        # Per-layer RoPE schedule: every 4th layer gets long-context theta (1e6/5e6),
+        # others get 1e4. Sliding-window=512. First StepFun entry with multi-freq RoPE.
+    },
+    "mimo_v2": {
+        "name": "Xiaomi MiMo V2.x",
+        "vocab_size": 152576,
+        "model_type_patterns": ["mimo_v2"],
+        # V2.5: hidden=4096, 48 layers; V2.5-Pro: hidden=6144, 70 layers
+    },
     "emu3": {
         "name": "BAAI Emu3 family (unified vision+text)",
         "vocab_size": [184622, 282926],
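On the step3_5 RoPE comment: a sketch of how such a per-layer schedule could be materialized. Period 4 and the theta values follow the comment; which offset within the period gets the long theta, and whether 1e6 or 5e6 applies to a given checkpoint, are assumptions left as parameters here.

    def step3p5_rope_thetas(num_layers: int, long_theta: float = 1e6,
                            short_theta: float = 1e4, period: int = 4) -> list[float]:
        # Every `period`-th layer gets the long-context theta; offset is assumed.
        return [long_theta if (i + 1) % period == 0 else short_theta
                for i in range(num_layers)]

    # step3p5_rope_thetas(8) -> [1e4, 1e4, 1e4, 1e6, 1e4, 1e4, 1e4, 1e6]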
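The GQA 32→2 notes on iquest_coder and minicpm translate directly into KV-cache savings; back-of-envelope, assuming head_dim is unchanged:

    # KV cache scales with num_key_value_heads, not num_attention_heads.
    q_heads, kv_heads = 32, 2
    assert q_heads / kv_heads == 16.0  # 1/16 the KV cache of an MHA layout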