trohrbaugh committed on
Commit 4e5ba2e · verified · 1 Parent(s): b559a7a

P2 batch: pangu_pro_moe, iquest_coder, minicpm, step3_5, mimo_v2, llada2-uni, emu3 + fixes

Files changed (1): scan.py +40 -5
scan.py CHANGED
@@ -76,9 +76,10 @@ KNOWN_BASES = {
         "model_type_patterns": ["qwen3"],
     },
     "qwen2": {
-        "name": "Qwen2.5",
-        "vocab_size": [151936, 152064],
+        "name": "Qwen2.5 (incl. VL)",
+        "vocab_size": [151936, 152064, 151680],
         "model_type_patterns": ["qwen2"],
+        # 151680 = MiMo-Embodied-7B uses Qwen2.5-VL backbone with this vocab
     },
     "llama3": {
         "name": "Llama 3.x",
@@ -134,10 +135,11 @@ KNOWN_BASES = {
     },
     "llada2": {
         "name": "inclusionAI LLaDA2 (discrete-diffusion MoE)",
-        "vocab_size": 157184,
+        "vocab_size": [157184, 173568],
         "model_type_patterns": ["llada2_moe", "llada2"],
-        # Shares Bailing-V2 tokenizer/expert geometry (256/8, vocab 157184)
-        # but uses discrete-diffusion masked LM non-autoregressive, separate family
+        # 157184 = text-only discrete diffusion (flash, base)
+        # 173568 = Uni any-to-any variant; adds ~16K image codebook tokens to vocab
+        # Non-autoregressive masked LM; separate family from Bailing-V2 by training paradigm
     },
     "kimi": {
         "name": "Moonshot Kimi (K2, Kimi-Linear)",
@@ -162,6 +164,39 @@ KNOWN_BASES = {
         "vocab_size": 153216,
         "model_type_patterns": ["interns1"],
     },
+    "pangu_pro_moe": {
+        "name": "FreedomIntelligence Pangu-R (Huawei Pangu-Pro-MoE)",
+        "vocab_size": 153600,
+        "model_type_patterns": ["pangupromoe"],
+        # model_type in config is "PanguProMoE"; lowercased to pangupromoe for matching
+        # MoE 80/8, first_k_dense_replace=4, hidden=4608, layers=50
+    },
+    "iquest_coder": {
+        "name": "IQuest-Coder",
+        "vocab_size": 76800,
+        "model_type_patterns": ["iquestcoder"],
+        # Code-specialized tokenizer (76800 = code-token-dense). Dense GQA 32→2.
+        # Same family across 7B (14 layers) and 40B (80 layers).
+    },
+    "minicpm": {
+        "name": "OpenBMB MiniCPM",
+        "vocab_size": 73448,
+        "model_type_patterns": ["minicpm"],
+        # MiniCPM family (AgentCPM-Report etc.). Heavy GQA 32→2.
+    },
+    "step3_5": {
+        "name": "StepFun Step-3.5 Flash",
+        "vocab_size": [128815, 128896],
+        "model_type_patterns": ["step3p5"],
+        # Per-layer RoPE schedule: every 4th layer gets long-context theta (1e6/5e6),
+        # others get 1e4. Sliding-window=512. First StepFun entry with multi-freq RoPE.
+    },
+    "mimo_v2": {
+        "name": "Xiaomi MiMo V2.x",
+        "vocab_size": 152576,
+        "model_type_patterns": ["mimo_v2"],
+        # V2.5: hidden=4096, 48 layers; V2.5-Pro: hidden=6144, 70 layers
+    },
     "emu3": {
         "name": "BAAI Emu3 family (unified vision+text)",
         "vocab_size": [184622, 282926],