Spaces:
Running
Running
j_yoon.song
commited on
Commit
Β·
289ccbe
1
Parent(s):
cb27169
Add model & update visualization
Browse files- src/about.py +1 -5
- src/data/length_data.json +612 -0
- src/data/stats.csv +20 -2
- src/data/stats_lang.csv +20 -2
- vis_utils.py +1 -1
src/about.py
CHANGED
|
@@ -24,13 +24,9 @@ LINK = """
|
|
| 24 |
<h3 style="text-align: right; margin-top: 0;">
|
| 25 |
<span>β¨</span>
|
| 26 |
<a href="https://research.samsung.com/" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Samsung Research</a> |
|
| 27 |
-
<span>π</span>
|
| 28 |
-
<a href="https://github.com/samsung" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">GitHub</a> |
|
| 29 |
-
<span>π</span>
|
| 30 |
-
<a href="https://x.com/samsungresearch" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">X</a> |
|
| 31 |
<span>π </span>
|
| 32 |
<a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
|
| 33 |
-
<span>π</span> Updated: 2025-09-
|
| 34 |
</h3>
|
| 35 |
"""
|
| 36 |
|
|
|
|
| 24 |
<h3 style="text-align: right; margin-top: 0;">
|
| 25 |
<span>β¨</span>
|
| 26 |
<a href="https://research.samsung.com/" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Samsung Research</a> |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
<span>π </span>
|
| 28 |
<a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
|
| 29 |
+
<span>π</span> Updated: 2025-09-23
|
| 30 |
</h3>
|
| 31 |
"""
|
| 32 |
|
src/data/length_data.json
CHANGED
|
@@ -135,6 +135,74 @@
|
|
| 135 |
"Med Resp": 1488.0
|
| 136 |
}
|
| 137 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
"DeepSeek V3.1 (think)": {
|
| 139 |
"Overall": {
|
| 140 |
"Min": 80,
|
|
@@ -203,6 +271,74 @@
|
|
| 203 |
"Med Resp": 1545.0
|
| 204 |
}
|
| 205 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
"o4-mini": {
|
| 207 |
"Overall": {
|
| 208 |
"Min": -10,
|
|
@@ -815,6 +951,74 @@
|
|
| 815 |
"Med Resp": 1318.5
|
| 816 |
}
|
| 817 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
"GLM-4.5 FP8 (think)": {
|
| 819 |
"Overall": {
|
| 820 |
"Min": 75,
|
|
@@ -1019,6 +1223,74 @@
|
|
| 1019 |
"Med Resp": 2150.0
|
| 1020 |
}
|
| 1021 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
"Qwen3 32B (think)": {
|
| 1023 |
"Overall": {
|
| 1024 |
"Min": 164,
|
|
@@ -1087,6 +1359,74 @@
|
|
| 1087 |
"Med Resp": 1481.0
|
| 1088 |
}
|
| 1089 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
"Qwen3 235B A22B Instruct 2507": {
|
| 1091 |
"Overall": {
|
| 1092 |
"Min": 1,
|
|
@@ -1291,6 +1631,74 @@
|
|
| 1291 |
"Med Resp": -3.0
|
| 1292 |
}
|
| 1293 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1294 |
"GPT-5 mini (Reasoning: medium)": {
|
| 1295 |
"Overall": {
|
| 1296 |
"Min": -10,
|
|
@@ -1359,6 +1767,74 @@
|
|
| 1359 |
"Med Resp": -3.0
|
| 1360 |
}
|
| 1361 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1362 |
"GPT-5 nano (Reasoning: medium)": {
|
| 1363 |
"Overall": {
|
| 1364 |
"Min": -10,
|
|
@@ -1902,5 +2378,141 @@
|
|
| 1902 |
"Med": -6.0,
|
| 1903 |
"Med Resp": -3.0
|
| 1904 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1905 |
}
|
| 1906 |
}
|
|
|
|
| 135 |
"Med Resp": 1488.0
|
| 136 |
}
|
| 137 |
},
|
| 138 |
+
"Qwen3 Next 80B A3B Thinking": {
|
| 139 |
+
"Overall": {
|
| 140 |
+
"Min": 4,
|
| 141 |
+
"Max": 76399,
|
| 142 |
+
"Med": 3263.0,
|
| 143 |
+
"Med Resp": 329.0
|
| 144 |
+
},
|
| 145 |
+
"Content Generation": {
|
| 146 |
+
"Min": 651,
|
| 147 |
+
"Max": 65449,
|
| 148 |
+
"Med": 3195.0,
|
| 149 |
+
"Med Resp": 368.5
|
| 150 |
+
},
|
| 151 |
+
"Editing": {
|
| 152 |
+
"Min": 466,
|
| 153 |
+
"Max": 65112,
|
| 154 |
+
"Med": 2840.5,
|
| 155 |
+
"Med Resp": 203.5
|
| 156 |
+
},
|
| 157 |
+
"Data Analysis": {
|
| 158 |
+
"Min": 4,
|
| 159 |
+
"Max": 64756,
|
| 160 |
+
"Med": 1788.0,
|
| 161 |
+
"Med Resp": 212.0
|
| 162 |
+
},
|
| 163 |
+
"Reasoning": {
|
| 164 |
+
"Min": 582,
|
| 165 |
+
"Max": 30093,
|
| 166 |
+
"Med": 2740.5,
|
| 167 |
+
"Med Resp": 540.5
|
| 168 |
+
},
|
| 169 |
+
"Hallucination": {
|
| 170 |
+
"Min": 386,
|
| 171 |
+
"Max": 65491,
|
| 172 |
+
"Med": 2216.0,
|
| 173 |
+
"Med Resp": 586.0
|
| 174 |
+
},
|
| 175 |
+
"Safety": {
|
| 176 |
+
"Min": 320,
|
| 177 |
+
"Max": 65472,
|
| 178 |
+
"Med": 1642.0,
|
| 179 |
+
"Med Resp": 338.0
|
| 180 |
+
},
|
| 181 |
+
"Repetition": {
|
| 182 |
+
"Min": 1040,
|
| 183 |
+
"Max": 65529,
|
| 184 |
+
"Med": 6000.0,
|
| 185 |
+
"Med Resp": 251.0
|
| 186 |
+
},
|
| 187 |
+
"Summarization": {
|
| 188 |
+
"Min": 506,
|
| 189 |
+
"Max": 14800,
|
| 190 |
+
"Med": 2162.5,
|
| 191 |
+
"Med Resp": 191.0
|
| 192 |
+
},
|
| 193 |
+
"Translation": {
|
| 194 |
+
"Min": 728,
|
| 195 |
+
"Max": 65398,
|
| 196 |
+
"Med": 4754.0,
|
| 197 |
+
"Med Resp": 272.0
|
| 198 |
+
},
|
| 199 |
+
"Multi-Turn": {
|
| 200 |
+
"Min": 1070,
|
| 201 |
+
"Max": 76399,
|
| 202 |
+
"Med": 6871.5,
|
| 203 |
+
"Med Resp": 1179.0
|
| 204 |
+
}
|
| 205 |
+
},
|
| 206 |
"DeepSeek V3.1 (think)": {
|
| 207 |
"Overall": {
|
| 208 |
"Min": 80,
|
|
|
|
| 271 |
"Med Resp": 1545.0
|
| 272 |
}
|
| 273 |
},
|
| 274 |
+
"Qwen3 30B A3B Thinking 2507": {
|
| 275 |
+
"Overall": {
|
| 276 |
+
"Min": 305,
|
| 277 |
+
"Max": 32743,
|
| 278 |
+
"Med": 2830.0,
|
| 279 |
+
"Med Resp": 351.0
|
| 280 |
+
},
|
| 281 |
+
"Content Generation": {
|
| 282 |
+
"Min": 335,
|
| 283 |
+
"Max": 10914,
|
| 284 |
+
"Med": 2775.5,
|
| 285 |
+
"Med Resp": 403.5
|
| 286 |
+
},
|
| 287 |
+
"Editing": {
|
| 288 |
+
"Min": 371,
|
| 289 |
+
"Max": 7617,
|
| 290 |
+
"Med": 2358.5,
|
| 291 |
+
"Med Resp": 220.0
|
| 292 |
+
},
|
| 293 |
+
"Data Analysis": {
|
| 294 |
+
"Min": 305,
|
| 295 |
+
"Max": 19749,
|
| 296 |
+
"Med": 1702.0,
|
| 297 |
+
"Med Resp": 227.0
|
| 298 |
+
},
|
| 299 |
+
"Reasoning": {
|
| 300 |
+
"Min": 485,
|
| 301 |
+
"Max": 19485,
|
| 302 |
+
"Med": 2504.0,
|
| 303 |
+
"Med Resp": 505.0
|
| 304 |
+
},
|
| 305 |
+
"Hallucination": {
|
| 306 |
+
"Min": 360,
|
| 307 |
+
"Max": 6054,
|
| 308 |
+
"Med": 2123.5,
|
| 309 |
+
"Med Resp": 668.0
|
| 310 |
+
},
|
| 311 |
+
"Safety": {
|
| 312 |
+
"Min": 306,
|
| 313 |
+
"Max": 32688,
|
| 314 |
+
"Med": 1667.0,
|
| 315 |
+
"Med Resp": 447.0
|
| 316 |
+
},
|
| 317 |
+
"Repetition": {
|
| 318 |
+
"Min": 1070,
|
| 319 |
+
"Max": 32743,
|
| 320 |
+
"Med": 3719.0,
|
| 321 |
+
"Med Resp": 368.5
|
| 322 |
+
},
|
| 323 |
+
"Summarization": {
|
| 324 |
+
"Min": 435,
|
| 325 |
+
"Max": 14462,
|
| 326 |
+
"Med": 2108.0,
|
| 327 |
+
"Med Resp": 204.0
|
| 328 |
+
},
|
| 329 |
+
"Translation": {
|
| 330 |
+
"Min": 513,
|
| 331 |
+
"Max": 11340,
|
| 332 |
+
"Med": 3869.5,
|
| 333 |
+
"Med Resp": 276.0
|
| 334 |
+
},
|
| 335 |
+
"Multi-Turn": {
|
| 336 |
+
"Min": 536,
|
| 337 |
+
"Max": 14557,
|
| 338 |
+
"Med": 5822.5,
|
| 339 |
+
"Med Resp": 1237.0
|
| 340 |
+
}
|
| 341 |
+
},
|
| 342 |
"o4-mini": {
|
| 343 |
"Overall": {
|
| 344 |
"Min": -10,
|
|
|
|
| 951 |
"Med Resp": 1318.5
|
| 952 |
}
|
| 953 |
},
|
| 954 |
+
"Mistral-Samll-3-2 24B-Instruct-2506": {
|
| 955 |
+
"Overall": {
|
| 956 |
+
"Min": 1,
|
| 957 |
+
"Max": 65516,
|
| 958 |
+
"Med": 369.0,
|
| 959 |
+
"Med Resp": 369.0
|
| 960 |
+
},
|
| 961 |
+
"Content Generation": {
|
| 962 |
+
"Min": 7,
|
| 963 |
+
"Max": 2684,
|
| 964 |
+
"Med": 389.5,
|
| 965 |
+
"Med Resp": 389.5
|
| 966 |
+
},
|
| 967 |
+
"Editing": {
|
| 968 |
+
"Min": 9,
|
| 969 |
+
"Max": 1172,
|
| 970 |
+
"Med": 269.0,
|
| 971 |
+
"Med Resp": 269.0
|
| 972 |
+
},
|
| 973 |
+
"Data Analysis": {
|
| 974 |
+
"Min": 1,
|
| 975 |
+
"Max": 3973,
|
| 976 |
+
"Med": 295.0,
|
| 977 |
+
"Med Resp": 295.0
|
| 978 |
+
},
|
| 979 |
+
"Reasoning": {
|
| 980 |
+
"Min": 1,
|
| 981 |
+
"Max": 65462,
|
| 982 |
+
"Med": 484.5,
|
| 983 |
+
"Med Resp": 484.5
|
| 984 |
+
},
|
| 985 |
+
"Hallucination": {
|
| 986 |
+
"Min": 61,
|
| 987 |
+
"Max": 5920,
|
| 988 |
+
"Med": 489.0,
|
| 989 |
+
"Med Resp": 489.0
|
| 990 |
+
},
|
| 991 |
+
"Safety": {
|
| 992 |
+
"Min": 10,
|
| 993 |
+
"Max": 65465,
|
| 994 |
+
"Med": 320.0,
|
| 995 |
+
"Med Resp": 320.0
|
| 996 |
+
},
|
| 997 |
+
"Repetition": {
|
| 998 |
+
"Min": 103,
|
| 999 |
+
"Max": 65516,
|
| 1000 |
+
"Med": 376.5,
|
| 1001 |
+
"Med Resp": 376.5
|
| 1002 |
+
},
|
| 1003 |
+
"Summarization": {
|
| 1004 |
+
"Min": 28,
|
| 1005 |
+
"Max": 1266,
|
| 1006 |
+
"Med": 234.5,
|
| 1007 |
+
"Med Resp": 234.5
|
| 1008 |
+
},
|
| 1009 |
+
"Translation": {
|
| 1010 |
+
"Min": 9,
|
| 1011 |
+
"Max": 3248,
|
| 1012 |
+
"Med": 327.0,
|
| 1013 |
+
"Med Resp": 327.0
|
| 1014 |
+
},
|
| 1015 |
+
"Multi-Turn": {
|
| 1016 |
+
"Min": 4,
|
| 1017 |
+
"Max": 65494,
|
| 1018 |
+
"Med": 1279.0,
|
| 1019 |
+
"Med Resp": 1279.0
|
| 1020 |
+
}
|
| 1021 |
+
},
|
| 1022 |
"GLM-4.5 FP8 (think)": {
|
| 1023 |
"Overall": {
|
| 1024 |
"Min": 75,
|
|
|
|
| 1223 |
"Med Resp": 2150.0
|
| 1224 |
}
|
| 1225 |
},
|
| 1226 |
+
"K2-Think": {
|
| 1227 |
+
"Overall": {
|
| 1228 |
+
"Min": 27,
|
| 1229 |
+
"Max": 8178,
|
| 1230 |
+
"Med": 1835.0,
|
| 1231 |
+
"Med Resp": 486.0
|
| 1232 |
+
},
|
| 1233 |
+
"Content Generation": {
|
| 1234 |
+
"Min": 138,
|
| 1235 |
+
"Max": 2049,
|
| 1236 |
+
"Med": 1821.5,
|
| 1237 |
+
"Med Resp": 660.5
|
| 1238 |
+
},
|
| 1239 |
+
"Editing": {
|
| 1240 |
+
"Min": 169,
|
| 1241 |
+
"Max": 2054,
|
| 1242 |
+
"Med": 1433.5,
|
| 1243 |
+
"Med Resp": 283.5
|
| 1244 |
+
},
|
| 1245 |
+
"Data Analysis": {
|
| 1246 |
+
"Min": 150,
|
| 1247 |
+
"Max": 2053,
|
| 1248 |
+
"Med": 1349.0,
|
| 1249 |
+
"Med Resp": 264.0
|
| 1250 |
+
},
|
| 1251 |
+
"Reasoning": {
|
| 1252 |
+
"Min": 419,
|
| 1253 |
+
"Max": 2048,
|
| 1254 |
+
"Med": 2045.5,
|
| 1255 |
+
"Med Resp": 576.5
|
| 1256 |
+
},
|
| 1257 |
+
"Hallucination": {
|
| 1258 |
+
"Min": 174,
|
| 1259 |
+
"Max": 2054,
|
| 1260 |
+
"Med": 1890.0,
|
| 1261 |
+
"Med Resp": 522.5
|
| 1262 |
+
},
|
| 1263 |
+
"Safety": {
|
| 1264 |
+
"Min": 27,
|
| 1265 |
+
"Max": 2048,
|
| 1266 |
+
"Med": 1393.0,
|
| 1267 |
+
"Med Resp": 405.0
|
| 1268 |
+
},
|
| 1269 |
+
"Repetition": {
|
| 1270 |
+
"Min": 870,
|
| 1271 |
+
"Max": 2070,
|
| 1272 |
+
"Med": 2048.0,
|
| 1273 |
+
"Med Resp": 2048.0
|
| 1274 |
+
},
|
| 1275 |
+
"Summarization": {
|
| 1276 |
+
"Min": 252,
|
| 1277 |
+
"Max": 2053,
|
| 1278 |
+
"Med": 1011.0,
|
| 1279 |
+
"Med Resp": 262.5
|
| 1280 |
+
},
|
| 1281 |
+
"Translation": {
|
| 1282 |
+
"Min": 195,
|
| 1283 |
+
"Max": 2051,
|
| 1284 |
+
"Med": 2006.0,
|
| 1285 |
+
"Med Resp": 371.5
|
| 1286 |
+
},
|
| 1287 |
+
"Multi-Turn": {
|
| 1288 |
+
"Min": 110,
|
| 1289 |
+
"Max": 8178,
|
| 1290 |
+
"Med": 3224.0,
|
| 1291 |
+
"Med Resp": 1526.0
|
| 1292 |
+
}
|
| 1293 |
+
},
|
| 1294 |
"Qwen3 32B (think)": {
|
| 1295 |
"Overall": {
|
| 1296 |
"Min": 164,
|
|
|
|
| 1359 |
"Med Resp": 1481.0
|
| 1360 |
}
|
| 1361 |
},
|
| 1362 |
+
"ERNIE-4.5 21B A3B Thinking": {
|
| 1363 |
+
"Overall": {
|
| 1364 |
+
"Min": 186,
|
| 1365 |
+
"Max": 66114,
|
| 1366 |
+
"Med": 1637.0,
|
| 1367 |
+
"Med Resp": 541.0
|
| 1368 |
+
},
|
| 1369 |
+
"Content Generation": {
|
| 1370 |
+
"Min": 302,
|
| 1371 |
+
"Max": 12760,
|
| 1372 |
+
"Med": 1586.5,
|
| 1373 |
+
"Med Resp": 654.5
|
| 1374 |
+
},
|
| 1375 |
+
"Editing": {
|
| 1376 |
+
"Min": 186,
|
| 1377 |
+
"Max": 8703,
|
| 1378 |
+
"Med": 1119.5,
|
| 1379 |
+
"Med Resp": 336.0
|
| 1380 |
+
},
|
| 1381 |
+
"Data Analysis": {
|
| 1382 |
+
"Min": 200,
|
| 1383 |
+
"Max": 31928,
|
| 1384 |
+
"Med": 1484.0,
|
| 1385 |
+
"Med Resp": 418.0
|
| 1386 |
+
},
|
| 1387 |
+
"Reasoning": {
|
| 1388 |
+
"Min": 511,
|
| 1389 |
+
"Max": 29184,
|
| 1390 |
+
"Med": 5312.0,
|
| 1391 |
+
"Med Resp": 669.5
|
| 1392 |
+
},
|
| 1393 |
+
"Hallucination": {
|
| 1394 |
+
"Min": 313,
|
| 1395 |
+
"Max": 11452,
|
| 1396 |
+
"Med": 1716.0,
|
| 1397 |
+
"Med Resp": 797.5
|
| 1398 |
+
},
|
| 1399 |
+
"Safety": {
|
| 1400 |
+
"Min": 213,
|
| 1401 |
+
"Max": 6914,
|
| 1402 |
+
"Med": 1242.0,
|
| 1403 |
+
"Med Resp": 599.0
|
| 1404 |
+
},
|
| 1405 |
+
"Repetition": {
|
| 1406 |
+
"Min": 643,
|
| 1407 |
+
"Max": 65463,
|
| 1408 |
+
"Med": 2387.0,
|
| 1409 |
+
"Med Resp": 516.5
|
| 1410 |
+
},
|
| 1411 |
+
"Summarization": {
|
| 1412 |
+
"Min": 215,
|
| 1413 |
+
"Max": 12449,
|
| 1414 |
+
"Med": 884.0,
|
| 1415 |
+
"Med Resp": 269.5
|
| 1416 |
+
},
|
| 1417 |
+
"Translation": {
|
| 1418 |
+
"Min": 298,
|
| 1419 |
+
"Max": 19672,
|
| 1420 |
+
"Med": 1466.5,
|
| 1421 |
+
"Med Resp": 421.5
|
| 1422 |
+
},
|
| 1423 |
+
"Multi-Turn": {
|
| 1424 |
+
"Min": 705,
|
| 1425 |
+
"Max": 66114,
|
| 1426 |
+
"Med": 4404.5,
|
| 1427 |
+
"Med Resp": 1819.0
|
| 1428 |
+
}
|
| 1429 |
+
},
|
| 1430 |
"Qwen3 235B A22B Instruct 2507": {
|
| 1431 |
"Overall": {
|
| 1432 |
"Min": 1,
|
|
|
|
| 1631 |
"Med Resp": -3.0
|
| 1632 |
}
|
| 1633 |
},
|
| 1634 |
+
"Tongyi DeepResearch 30B A3B": {
|
| 1635 |
+
"Overall": {
|
| 1636 |
+
"Min": 153,
|
| 1637 |
+
"Max": 68912,
|
| 1638 |
+
"Med": 1147.0,
|
| 1639 |
+
"Med Resp": 408.0
|
| 1640 |
+
},
|
| 1641 |
+
"Content Generation": {
|
| 1642 |
+
"Min": 216,
|
| 1643 |
+
"Max": 65477,
|
| 1644 |
+
"Med": 1086.5,
|
| 1645 |
+
"Med Resp": 510.5
|
| 1646 |
+
},
|
| 1647 |
+
"Editing": {
|
| 1648 |
+
"Min": 251,
|
| 1649 |
+
"Max": 65470,
|
| 1650 |
+
"Med": 985.5,
|
| 1651 |
+
"Med Resp": 313.0
|
| 1652 |
+
},
|
| 1653 |
+
"Data Analysis": {
|
| 1654 |
+
"Min": 242,
|
| 1655 |
+
"Max": 65499,
|
| 1656 |
+
"Med": 998.0,
|
| 1657 |
+
"Med Resp": 239.0
|
| 1658 |
+
},
|
| 1659 |
+
"Reasoning": {
|
| 1660 |
+
"Min": 333,
|
| 1661 |
+
"Max": 65477,
|
| 1662 |
+
"Med": 2043.5,
|
| 1663 |
+
"Med Resp": 388.5
|
| 1664 |
+
},
|
| 1665 |
+
"Hallucination": {
|
| 1666 |
+
"Min": 194,
|
| 1667 |
+
"Max": 65501,
|
| 1668 |
+
"Med": 1344.5,
|
| 1669 |
+
"Med Resp": 593.0
|
| 1670 |
+
},
|
| 1671 |
+
"Safety": {
|
| 1672 |
+
"Min": 153,
|
| 1673 |
+
"Max": 65472,
|
| 1674 |
+
"Med": 992.0,
|
| 1675 |
+
"Med Resp": 392.0
|
| 1676 |
+
},
|
| 1677 |
+
"Repetition": {
|
| 1678 |
+
"Min": 425,
|
| 1679 |
+
"Max": 65513,
|
| 1680 |
+
"Med": 1986.5,
|
| 1681 |
+
"Med Resp": 472.5
|
| 1682 |
+
},
|
| 1683 |
+
"Summarization": {
|
| 1684 |
+
"Min": 290,
|
| 1685 |
+
"Max": 2410,
|
| 1686 |
+
"Med": 662.5,
|
| 1687 |
+
"Med Resp": 262.0
|
| 1688 |
+
},
|
| 1689 |
+
"Translation": {
|
| 1690 |
+
"Min": 360,
|
| 1691 |
+
"Max": 65406,
|
| 1692 |
+
"Med": 1107.0,
|
| 1693 |
+
"Med Resp": 317.5
|
| 1694 |
+
},
|
| 1695 |
+
"Multi-Turn": {
|
| 1696 |
+
"Min": 240,
|
| 1697 |
+
"Max": 68912,
|
| 1698 |
+
"Med": 3134.5,
|
| 1699 |
+
"Med Resp": 1349.5
|
| 1700 |
+
}
|
| 1701 |
+
},
|
| 1702 |
"GPT-5 mini (Reasoning: medium)": {
|
| 1703 |
"Overall": {
|
| 1704 |
"Min": -10,
|
|
|
|
| 1767 |
"Med Resp": -3.0
|
| 1768 |
}
|
| 1769 |
},
|
| 1770 |
+
"Gemma 3 27B it": {
|
| 1771 |
+
"Overall": {
|
| 1772 |
+
"Min": 1,
|
| 1773 |
+
"Max": 65458,
|
| 1774 |
+
"Med": 380.0,
|
| 1775 |
+
"Med Resp": 380.0
|
| 1776 |
+
},
|
| 1777 |
+
"Content Generation": {
|
| 1778 |
+
"Min": 7,
|
| 1779 |
+
"Max": 3893,
|
| 1780 |
+
"Med": 484.0,
|
| 1781 |
+
"Med Resp": 484.0
|
| 1782 |
+
},
|
| 1783 |
+
"Editing": {
|
| 1784 |
+
"Min": 6,
|
| 1785 |
+
"Max": 1776,
|
| 1786 |
+
"Med": 254.0,
|
| 1787 |
+
"Med Resp": 254.0
|
| 1788 |
+
},
|
| 1789 |
+
"Data Analysis": {
|
| 1790 |
+
"Min": 1,
|
| 1791 |
+
"Max": 63850,
|
| 1792 |
+
"Med": 180.0,
|
| 1793 |
+
"Med Resp": 180.0
|
| 1794 |
+
},
|
| 1795 |
+
"Reasoning": {
|
| 1796 |
+
"Min": 2,
|
| 1797 |
+
"Max": 1926,
|
| 1798 |
+
"Med": 485.5,
|
| 1799 |
+
"Med Resp": 485.5
|
| 1800 |
+
},
|
| 1801 |
+
"Hallucination": {
|
| 1802 |
+
"Min": 13,
|
| 1803 |
+
"Max": 2494,
|
| 1804 |
+
"Med": 534.0,
|
| 1805 |
+
"Med Resp": 534.0
|
| 1806 |
+
},
|
| 1807 |
+
"Safety": {
|
| 1808 |
+
"Min": 31,
|
| 1809 |
+
"Max": 2440,
|
| 1810 |
+
"Med": 518.0,
|
| 1811 |
+
"Med Resp": 518.0
|
| 1812 |
+
},
|
| 1813 |
+
"Repetition": {
|
| 1814 |
+
"Min": 95,
|
| 1815 |
+
"Max": 65433,
|
| 1816 |
+
"Med": 299.0,
|
| 1817 |
+
"Med Resp": 299.0
|
| 1818 |
+
},
|
| 1819 |
+
"Summarization": {
|
| 1820 |
+
"Min": 30,
|
| 1821 |
+
"Max": 1080,
|
| 1822 |
+
"Med": 202.5,
|
| 1823 |
+
"Med Resp": 202.5
|
| 1824 |
+
},
|
| 1825 |
+
"Translation": {
|
| 1826 |
+
"Min": 46,
|
| 1827 |
+
"Max": 62659,
|
| 1828 |
+
"Med": 374.0,
|
| 1829 |
+
"Med Resp": 374.0
|
| 1830 |
+
},
|
| 1831 |
+
"Multi-Turn": {
|
| 1832 |
+
"Min": 4,
|
| 1833 |
+
"Max": 65458,
|
| 1834 |
+
"Med": 1558.0,
|
| 1835 |
+
"Med Resp": 1558.0
|
| 1836 |
+
}
|
| 1837 |
+
},
|
| 1838 |
"GPT-5 nano (Reasoning: medium)": {
|
| 1839 |
"Overall": {
|
| 1840 |
"Min": -10,
|
|
|
|
| 2378 |
"Med": -6.0,
|
| 2379 |
"Med Resp": -3.0
|
| 2380 |
}
|
| 2381 |
+
},
|
| 2382 |
+
"Qwen3 Next 80B A3B Instruct": {
|
| 2383 |
+
"Overall": {
|
| 2384 |
+
"Min": 1,
|
| 2385 |
+
"Max": 65511,
|
| 2386 |
+
"Med": 477.5,
|
| 2387 |
+
"Med Resp": 477.5
|
| 2388 |
+
},
|
| 2389 |
+
"Content Generation": {
|
| 2390 |
+
"Min": 8,
|
| 2391 |
+
"Max": 5955,
|
| 2392 |
+
"Med": 553.0,
|
| 2393 |
+
"Med Resp": 553.0
|
| 2394 |
+
},
|
| 2395 |
+
"Editing": {
|
| 2396 |
+
"Min": 8,
|
| 2397 |
+
"Max": 3157,
|
| 2398 |
+
"Med": 323.0,
|
| 2399 |
+
"Med Resp": 323.0
|
| 2400 |
+
},
|
| 2401 |
+
"Data Analysis": {
|
| 2402 |
+
"Min": 1,
|
| 2403 |
+
"Max": 4108,
|
| 2404 |
+
"Med": 407.0,
|
| 2405 |
+
"Med Resp": 407.0
|
| 2406 |
+
},
|
| 2407 |
+
"Reasoning": {
|
| 2408 |
+
"Min": 1,
|
| 2409 |
+
"Max": 14630,
|
| 2410 |
+
"Med": 813.5,
|
| 2411 |
+
"Med Resp": 813.5
|
| 2412 |
+
},
|
| 2413 |
+
"Hallucination": {
|
| 2414 |
+
"Min": 73,
|
| 2415 |
+
"Max": 65493,
|
| 2416 |
+
"Med": 662.0,
|
| 2417 |
+
"Med Resp": 662.0
|
| 2418 |
+
},
|
| 2419 |
+
"Safety": {
|
| 2420 |
+
"Min": 4,
|
| 2421 |
+
"Max": 3360,
|
| 2422 |
+
"Med": 531.0,
|
| 2423 |
+
"Med Resp": 531.0
|
| 2424 |
+
},
|
| 2425 |
+
"Repetition": {
|
| 2426 |
+
"Min": 132,
|
| 2427 |
+
"Max": 65511,
|
| 2428 |
+
"Med": 739.5,
|
| 2429 |
+
"Med Resp": 739.5
|
| 2430 |
+
},
|
| 2431 |
+
"Summarization": {
|
| 2432 |
+
"Min": 27,
|
| 2433 |
+
"Max": 1849,
|
| 2434 |
+
"Med": 264.0,
|
| 2435 |
+
"Med Resp": 264.0
|
| 2436 |
+
},
|
| 2437 |
+
"Translation": {
|
| 2438 |
+
"Min": 9,
|
| 2439 |
+
"Max": 3123,
|
| 2440 |
+
"Med": 294.5,
|
| 2441 |
+
"Med Resp": 294.5
|
| 2442 |
+
},
|
| 2443 |
+
"Multi-Turn": {
|
| 2444 |
+
"Min": 3,
|
| 2445 |
+
"Max": 10283,
|
| 2446 |
+
"Med": 1913.0,
|
| 2447 |
+
"Med Resp": 1913.0
|
| 2448 |
+
}
|
| 2449 |
+
},
|
| 2450 |
+
"Qwen3 30B A3B Instruct 2507": {
|
| 2451 |
+
"Overall": {
|
| 2452 |
+
"Min": 1,
|
| 2453 |
+
"Max": 65516,
|
| 2454 |
+
"Med": 441.5,
|
| 2455 |
+
"Med Resp": 441.5
|
| 2456 |
+
},
|
| 2457 |
+
"Content Generation": {
|
| 2458 |
+
"Min": 7,
|
| 2459 |
+
"Max": 5659,
|
| 2460 |
+
"Med": 510.5,
|
| 2461 |
+
"Med Resp": 510.5
|
| 2462 |
+
},
|
| 2463 |
+
"Editing": {
|
| 2464 |
+
"Min": 7,
|
| 2465 |
+
"Max": 2231,
|
| 2466 |
+
"Med": 255.0,
|
| 2467 |
+
"Med Resp": 255.0
|
| 2468 |
+
},
|
| 2469 |
+
"Data Analysis": {
|
| 2470 |
+
"Min": 1,
|
| 2471 |
+
"Max": 8094,
|
| 2472 |
+
"Med": 381.0,
|
| 2473 |
+
"Med Resp": 381.0
|
| 2474 |
+
},
|
| 2475 |
+
"Reasoning": {
|
| 2476 |
+
"Min": 1,
|
| 2477 |
+
"Max": 9376,
|
| 2478 |
+
"Med": 753.5,
|
| 2479 |
+
"Med Resp": 753.5
|
| 2480 |
+
},
|
| 2481 |
+
"Hallucination": {
|
| 2482 |
+
"Min": 19,
|
| 2483 |
+
"Max": 65495,
|
| 2484 |
+
"Med": 689.5,
|
| 2485 |
+
"Med Resp": 689.5
|
| 2486 |
+
},
|
| 2487 |
+
"Safety": {
|
| 2488 |
+
"Min": 16,
|
| 2489 |
+
"Max": 65456,
|
| 2490 |
+
"Med": 445.0,
|
| 2491 |
+
"Med Resp": 445.0
|
| 2492 |
+
},
|
| 2493 |
+
"Repetition": {
|
| 2494 |
+
"Min": 81,
|
| 2495 |
+
"Max": 65516,
|
| 2496 |
+
"Med": 533.5,
|
| 2497 |
+
"Med Resp": 533.5
|
| 2498 |
+
},
|
| 2499 |
+
"Summarization": {
|
| 2500 |
+
"Min": 34,
|
| 2501 |
+
"Max": 1870,
|
| 2502 |
+
"Med": 251.0,
|
| 2503 |
+
"Med Resp": 251.0
|
| 2504 |
+
},
|
| 2505 |
+
"Translation": {
|
| 2506 |
+
"Min": 8,
|
| 2507 |
+
"Max": 3257,
|
| 2508 |
+
"Med": 292.5,
|
| 2509 |
+
"Med Resp": 292.5
|
| 2510 |
+
},
|
| 2511 |
+
"Multi-Turn": {
|
| 2512 |
+
"Min": 3,
|
| 2513 |
+
"Max": 6825,
|
| 2514 |
+
"Med": 1809.5,
|
| 2515 |
+
"Med Resp": 1809.5
|
| 2516 |
+
}
|
| 2517 |
}
|
| 2518 |
}
|
src/data/stats.csv
CHANGED
|
@@ -14,15 +14,21 @@ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "
|
|
| 14 |
"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
|
| 15 |
top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
|
| 16 |
"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
|
|
|
|
|
|
|
| 17 |
"GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
|
| 18 |
top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
|
| 19 |
"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
|
| 20 |
top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
|
| 21 |
"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
|
| 22 |
top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
|
|
|
|
|
|
|
| 23 |
"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
|
| 24 |
temperature: 1.0
|
| 25 |
top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
|
|
|
|
|
|
|
| 26 |
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
|
| 27 |
temperature: 0.6
|
| 28 |
top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
|
|
@@ -32,17 +38,29 @@ temperature: 1.3
|
|
| 32 |
top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
|
| 33 |
"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
|
| 34 |
top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
|
|
|
|
|
|
|
| 35 |
"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
|
| 36 |
"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
|
| 37 |
temperature: 1.0
|
| 38 |
top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
|
| 40 |
top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
|
| 41 |
"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
|
| 42 |
top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
|
|
|
|
|
|
|
| 43 |
"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
|
| 44 |
top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
|
| 45 |
"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
|
| 46 |
-
top-p: 0.7" "
|
| 47 |
"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
|
| 48 |
-
top-p: 0.95" "
|
|
|
|
| 14 |
"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
|
| 15 |
top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
|
| 16 |
"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
|
| 17 |
+
"Qwen3 Next 80B A3B Thinking" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" "temperature: 0.6
|
| 18 |
+
top-p: 0.95" "Qwen" "3263.0" "329.0" "80.0" "Open" "Think" "On" "55.07" "58.25" "51.56" "73.71" "76.03" "52.3" "38.84" "32.86" "57.14" "46.63" "43.62"
|
| 19 |
"GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
|
| 20 |
top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
|
| 21 |
"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
|
| 22 |
top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
|
| 23 |
"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
|
| 24 |
top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
|
| 25 |
+
"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.7
|
| 26 |
+
top-p: 0.8" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
|
| 27 |
"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
|
| 28 |
temperature: 1.0
|
| 29 |
top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
|
| 30 |
+
"Qwen3 Next 80B A3B Instruct" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" "temperature: 0.7
|
| 31 |
+
top-p: 0.8" "Qwen" "477.5" "477.5" "80.0" "Open" "Instruct" "Off" "48.87" "53.5" "45.31" "66.14" "71.07" "55.17" "39.67" "21.43" "48.41" "36.8" "35.23"
|
| 32 |
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
|
| 33 |
temperature: 0.6
|
| 34 |
top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
|
|
|
|
| 38 |
top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
|
| 39 |
"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
|
| 40 |
top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
|
| 41 |
+
"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
|
| 42 |
+
top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
|
| 43 |
"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
|
| 44 |
"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
|
| 45 |
temperature: 1.0
|
| 46 |
top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
|
| 47 |
+
"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
|
| 48 |
+
top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
|
| 49 |
+
"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
|
| 50 |
+
top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
|
| 51 |
+
"Mistral-Samll-3-2 24B-Instruct-2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
|
| 52 |
+
top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
|
| 53 |
+
"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
|
| 54 |
+
top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
|
| 55 |
"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
|
| 56 |
top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
|
| 57 |
"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
|
| 58 |
top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
|
| 59 |
+
"ERNIE-4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
|
| 60 |
+
top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
|
| 61 |
"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
|
| 62 |
top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
|
| 63 |
"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
|
| 64 |
+
top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
|
| 65 |
"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
|
| 66 |
+
top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
|
src/data/stats_lang.csv
CHANGED
|
@@ -14,15 +14,21 @@ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67"
|
|
| 14 |
"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
|
| 15 |
top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
|
| 16 |
"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
|
|
|
|
|
|
|
| 17 |
"GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
|
| 18 |
top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
|
| 19 |
"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
|
| 20 |
top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
|
| 21 |
"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
|
| 22 |
top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
|
|
|
|
|
|
|
| 23 |
"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
|
| 24 |
temperature: 1.0
|
| 25 |
top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
|
|
|
|
|
|
|
| 26 |
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
|
| 27 |
temperature: 0.6
|
| 28 |
top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
|
|
@@ -32,17 +38,29 @@ temperature: 1.3
|
|
| 32 |
top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
|
| 33 |
"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
|
| 34 |
top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
|
|
|
|
|
|
|
| 35 |
"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
|
| 36 |
"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
|
| 37 |
temperature: 1.0
|
| 38 |
top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
|
| 40 |
top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
|
| 41 |
"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
|
| 42 |
top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
|
|
|
|
|
|
|
| 43 |
"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
|
| 44 |
top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
|
| 45 |
"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
|
| 46 |
-
top-p: 0.7" "
|
| 47 |
"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
|
| 48 |
-
top-p: 0.95" "
|
|
|
|
| 14 |
"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
|
| 15 |
top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
|
| 16 |
"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
|
| 17 |
+
"Qwen3 Next 80B A3B Thinking" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" "temperature: 0.6
|
| 18 |
+
top-p: 0.95" "Qwen" "3263.0" "329.0" "80.0" "Open" "Think" "On" "55.07" "47.22" "55.0" "55.42" "53.66" "50.56" "55.25" "54.75" "60.0" "63.04" "62.84" "54.49" "56.1"
|
| 19 |
"GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
|
| 20 |
top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
|
| 21 |
"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
|
| 22 |
top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
|
| 23 |
"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
|
| 24 |
top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
|
| 25 |
+
"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
|
| 26 |
+
top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
|
| 27 |
"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
|
| 28 |
temperature: 1.0
|
| 29 |
top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
|
| 30 |
+
"Qwen3 Next 80B A3B Instruct" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" "temperature: 0.7
|
| 31 |
+
top-p: 0.8" "Qwen" "477.5" "477.5" "80.0" "Open" "Instruct" "Off" "48.87" "41.67" "50.83" "51.81" "55.49" "42.78" "47.51" "50.28" "51.89" "50.54" "48.09" "51.69" "50.0"
|
| 32 |
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
|
| 33 |
temperature: 0.6
|
| 34 |
top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
|
|
|
|
| 38 |
top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
|
| 39 |
"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
|
| 40 |
top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
|
| 41 |
+
"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
|
| 42 |
+
top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
|
| 43 |
"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
|
| 44 |
"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
|
| 45 |
temperature: 1.0
|
| 46 |
top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
|
| 47 |
+
"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
|
| 48 |
+
top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
|
| 49 |
+
"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
|
| 50 |
+
top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
|
| 51 |
+
"Mistral-Small-3.2 24B-Instruct-2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
|
| 52 |
+
top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
|
| 53 |
+
"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
|
| 54 |
+
top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
|
| 55 |
"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
|
| 56 |
top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
|
| 57 |
"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
|
| 58 |
top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
|
| 59 |
+
"ERNIE-4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
|
| 60 |
+
top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
|
| 61 |
"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
|
| 62 |
top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
|
| 63 |
"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
|
| 64 |
+
top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
|
| 65 |
"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
|
| 66 |
+
top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
|
vis_utils.py
CHANGED
|
@@ -82,7 +82,7 @@ def create_empty_radar_chart(message: str) -> Figure:
|
|
| 82 |
def create_len_overall_scatter(
|
| 83 |
df: pd.DataFrame,
|
| 84 |
selected_models: Optional[List[str]] = None,
|
| 85 |
-
max_models: int =
|
| 86 |
y_col: str = "Overall",
|
| 87 |
length_data: Optional[dict] = None,
|
| 88 |
theme: str = "light",
|
|
|
|
| 82 |
def create_len_overall_scatter(
|
| 83 |
df: pd.DataFrame,
|
| 84 |
selected_models: Optional[List[str]] = None,
|
| 85 |
+
max_models: int = 50,
|
| 86 |
y_col: str = "Overall",
|
| 87 |
length_data: Optional[dict] = None,
|
| 88 |
theme: str = "light",
|