j_yoon.song committed
Commit 289ccbe · 1 Parent(s): cb27169

Add model & update visualization

src/about.py CHANGED
@@ -24,13 +24,9 @@ LINK = """
 <h3 style="text-align: right; margin-top: 0;">
 <span>✨</span>
 <a href="https://research.samsung.com/" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Samsung Research</a> |
-<span>🌕</span>
-<a href="https://github.com/samsung" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">GitHub</a> |
-<span>🌎</span>
-<a href="https://x.com/samsungresearch" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">X</a> |
 <span>🌠</span>
 <a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
-<span>🔭</span> Updated: 2025-09-16
+<span>🔭</span> Updated: 2025-09-23
 </h3>
 """
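The footer's Updated stamp above is bumped by hand on each data refresh. A minimal sketch of generating it at render time instead; `updated_stamp` is a hypothetical helper, not part of the Space:

```python
from datetime import date, datetime, timezone
from typing import Optional

def updated_stamp(today: Optional[date] = None) -> str:
    """Build the 'Updated' footer line from a date instead of a hardcoded string."""
    today = today or datetime.now(timezone.utc).date()
    return f"<span>🔭</span> Updated: {today.isoformat()}"

print(updated_stamp(date(2025, 9, 23)))  # <span>🔭</span> Updated: 2025-09-23
```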
src/data/length_data.json CHANGED
@@ -135,6 +135,74 @@
       "Med Resp": 1488.0
     }
   },
+  "Qwen3 Next 80B A3B Thinking": {
+    "Overall": {
+      "Min": 4,
+      "Max": 76399,
+      "Med": 3263.0,
+      "Med Resp": 329.0
+    },
+    "Content Generation": {
+      "Min": 651,
+      "Max": 65449,
+      "Med": 3195.0,
+      "Med Resp": 368.5
+    },
+    "Editing": {
+      "Min": 466,
+      "Max": 65112,
+      "Med": 2840.5,
+      "Med Resp": 203.5
+    },
+    "Data Analysis": {
+      "Min": 4,
+      "Max": 64756,
+      "Med": 1788.0,
+      "Med Resp": 212.0
+    },
+    "Reasoning": {
+      "Min": 582,
+      "Max": 30093,
+      "Med": 2740.5,
+      "Med Resp": 540.5
+    },
+    "Hallucination": {
+      "Min": 386,
+      "Max": 65491,
+      "Med": 2216.0,
+      "Med Resp": 586.0
+    },
+    "Safety": {
+      "Min": 320,
+      "Max": 65472,
+      "Med": 1642.0,
+      "Med Resp": 338.0
+    },
+    "Repetition": {
+      "Min": 1040,
+      "Max": 65529,
+      "Med": 6000.0,
+      "Med Resp": 251.0
+    },
+    "Summarization": {
+      "Min": 506,
+      "Max": 14800,
+      "Med": 2162.5,
+      "Med Resp": 191.0
+    },
+    "Translation": {
+      "Min": 728,
+      "Max": 65398,
+      "Med": 4754.0,
+      "Med Resp": 272.0
+    },
+    "Multi-Turn": {
+      "Min": 1070,
+      "Max": 76399,
+      "Med": 6871.5,
+      "Med Resp": 1179.0
+    }
+  },
   "DeepSeek V3.1 (think)": {
     "Overall": {
       "Min": 80,
 
@@ -203,6 +271,74 @@
       "Med Resp": 1545.0
     }
   },
+  "Qwen3 30B A3B Thinking 2507": {
+    "Overall": {
+      "Min": 305,
+      "Max": 32743,
+      "Med": 2830.0,
+      "Med Resp": 351.0
+    },
+    "Content Generation": {
+      "Min": 335,
+      "Max": 10914,
+      "Med": 2775.5,
+      "Med Resp": 403.5
+    },
+    "Editing": {
+      "Min": 371,
+      "Max": 7617,
+      "Med": 2358.5,
+      "Med Resp": 220.0
+    },
+    "Data Analysis": {
+      "Min": 305,
+      "Max": 19749,
+      "Med": 1702.0,
+      "Med Resp": 227.0
+    },
+    "Reasoning": {
+      "Min": 485,
+      "Max": 19485,
+      "Med": 2504.0,
+      "Med Resp": 505.0
+    },
+    "Hallucination": {
+      "Min": 360,
+      "Max": 6054,
+      "Med": 2123.5,
+      "Med Resp": 668.0
+    },
+    "Safety": {
+      "Min": 306,
+      "Max": 32688,
+      "Med": 1667.0,
+      "Med Resp": 447.0
+    },
+    "Repetition": {
+      "Min": 1070,
+      "Max": 32743,
+      "Med": 3719.0,
+      "Med Resp": 368.5
+    },
+    "Summarization": {
+      "Min": 435,
+      "Max": 14462,
+      "Med": 2108.0,
+      "Med Resp": 204.0
+    },
+    "Translation": {
+      "Min": 513,
+      "Max": 11340,
+      "Med": 3869.5,
+      "Med Resp": 276.0
+    },
+    "Multi-Turn": {
+      "Min": 536,
+      "Max": 14557,
+      "Med": 5822.5,
+      "Med Resp": 1237.0
+    }
+  },
   "o4-mini": {
     "Overall": {
       "Min": -10,
 
@@ -815,6 +951,74 @@
       "Med Resp": 1318.5
     }
   },
+  "Mistral-Small-3.2 24B-Instruct-2506": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65516,
+      "Med": 369.0,
+      "Med Resp": 369.0
+    },
+    "Content Generation": {
+      "Min": 7,
+      "Max": 2684,
+      "Med": 389.5,
+      "Med Resp": 389.5
+    },
+    "Editing": {
+      "Min": 9,
+      "Max": 1172,
+      "Med": 269.0,
+      "Med Resp": 269.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 3973,
+      "Med": 295.0,
+      "Med Resp": 295.0
+    },
+    "Reasoning": {
+      "Min": 1,
+      "Max": 65462,
+      "Med": 484.5,
+      "Med Resp": 484.5
+    },
+    "Hallucination": {
+      "Min": 61,
+      "Max": 5920,
+      "Med": 489.0,
+      "Med Resp": 489.0
+    },
+    "Safety": {
+      "Min": 10,
+      "Max": 65465,
+      "Med": 320.0,
+      "Med Resp": 320.0
+    },
+    "Repetition": {
+      "Min": 103,
+      "Max": 65516,
+      "Med": 376.5,
+      "Med Resp": 376.5
+    },
+    "Summarization": {
+      "Min": 28,
+      "Max": 1266,
+      "Med": 234.5,
+      "Med Resp": 234.5
+    },
+    "Translation": {
+      "Min": 9,
+      "Max": 3248,
+      "Med": 327.0,
+      "Med Resp": 327.0
+    },
+    "Multi-Turn": {
+      "Min": 4,
+      "Max": 65494,
+      "Med": 1279.0,
+      "Med Resp": 1279.0
+    }
+  },
   "GLM-4.5 FP8 (think)": {
     "Overall": {
       "Min": 75,
 
@@ -1019,6 +1223,74 @@
       "Med Resp": 2150.0
     }
   },
+  "K2-Think": {
+    "Overall": {
+      "Min": 27,
+      "Max": 8178,
+      "Med": 1835.0,
+      "Med Resp": 486.0
+    },
+    "Content Generation": {
+      "Min": 138,
+      "Max": 2049,
+      "Med": 1821.5,
+      "Med Resp": 660.5
+    },
+    "Editing": {
+      "Min": 169,
+      "Max": 2054,
+      "Med": 1433.5,
+      "Med Resp": 283.5
+    },
+    "Data Analysis": {
+      "Min": 150,
+      "Max": 2053,
+      "Med": 1349.0,
+      "Med Resp": 264.0
+    },
+    "Reasoning": {
+      "Min": 419,
+      "Max": 2048,
+      "Med": 2045.5,
+      "Med Resp": 576.5
+    },
+    "Hallucination": {
+      "Min": 174,
+      "Max": 2054,
+      "Med": 1890.0,
+      "Med Resp": 522.5
+    },
+    "Safety": {
+      "Min": 27,
+      "Max": 2048,
+      "Med": 1393.0,
+      "Med Resp": 405.0
+    },
+    "Repetition": {
+      "Min": 870,
+      "Max": 2070,
+      "Med": 2048.0,
+      "Med Resp": 2048.0
+    },
+    "Summarization": {
+      "Min": 252,
+      "Max": 2053,
+      "Med": 1011.0,
+      "Med Resp": 262.5
+    },
+    "Translation": {
+      "Min": 195,
+      "Max": 2051,
+      "Med": 2006.0,
+      "Med Resp": 371.5
+    },
+    "Multi-Turn": {
+      "Min": 110,
+      "Max": 8178,
+      "Med": 3224.0,
+      "Med Resp": 1526.0
+    }
+  },
   "Qwen3 32B (think)": {
     "Overall": {
       "Min": 164,
 
@@ -1087,6 +1359,74 @@
       "Med Resp": 1481.0
     }
   },
+  "ERNIE-4.5 21B A3B Thinking": {
+    "Overall": {
+      "Min": 186,
+      "Max": 66114,
+      "Med": 1637.0,
+      "Med Resp": 541.0
+    },
+    "Content Generation": {
+      "Min": 302,
+      "Max": 12760,
+      "Med": 1586.5,
+      "Med Resp": 654.5
+    },
+    "Editing": {
+      "Min": 186,
+      "Max": 8703,
+      "Med": 1119.5,
+      "Med Resp": 336.0
+    },
+    "Data Analysis": {
+      "Min": 200,
+      "Max": 31928,
+      "Med": 1484.0,
+      "Med Resp": 418.0
+    },
+    "Reasoning": {
+      "Min": 511,
+      "Max": 29184,
+      "Med": 5312.0,
+      "Med Resp": 669.5
+    },
+    "Hallucination": {
+      "Min": 313,
+      "Max": 11452,
+      "Med": 1716.0,
+      "Med Resp": 797.5
+    },
+    "Safety": {
+      "Min": 213,
+      "Max": 6914,
+      "Med": 1242.0,
+      "Med Resp": 599.0
+    },
+    "Repetition": {
+      "Min": 643,
+      "Max": 65463,
+      "Med": 2387.0,
+      "Med Resp": 516.5
+    },
+    "Summarization": {
+      "Min": 215,
+      "Max": 12449,
+      "Med": 884.0,
+      "Med Resp": 269.5
+    },
+    "Translation": {
+      "Min": 298,
+      "Max": 19672,
+      "Med": 1466.5,
+      "Med Resp": 421.5
+    },
+    "Multi-Turn": {
+      "Min": 705,
+      "Max": 66114,
+      "Med": 4404.5,
+      "Med Resp": 1819.0
+    }
+  },
   "Qwen3 235B A22B Instruct 2507": {
     "Overall": {
       "Min": 1,
 
@@ -1291,6 +1631,74 @@
       "Med Resp": -3.0
     }
   },
+  "Tongyi DeepResearch 30B A3B": {
+    "Overall": {
+      "Min": 153,
+      "Max": 68912,
+      "Med": 1147.0,
+      "Med Resp": 408.0
+    },
+    "Content Generation": {
+      "Min": 216,
+      "Max": 65477,
+      "Med": 1086.5,
+      "Med Resp": 510.5
+    },
+    "Editing": {
+      "Min": 251,
+      "Max": 65470,
+      "Med": 985.5,
+      "Med Resp": 313.0
+    },
+    "Data Analysis": {
+      "Min": 242,
+      "Max": 65499,
+      "Med": 998.0,
+      "Med Resp": 239.0
+    },
+    "Reasoning": {
+      "Min": 333,
+      "Max": 65477,
+      "Med": 2043.5,
+      "Med Resp": 388.5
+    },
+    "Hallucination": {
+      "Min": 194,
+      "Max": 65501,
+      "Med": 1344.5,
+      "Med Resp": 593.0
+    },
+    "Safety": {
+      "Min": 153,
+      "Max": 65472,
+      "Med": 992.0,
+      "Med Resp": 392.0
+    },
+    "Repetition": {
+      "Min": 425,
+      "Max": 65513,
+      "Med": 1986.5,
+      "Med Resp": 472.5
+    },
+    "Summarization": {
+      "Min": 290,
+      "Max": 2410,
+      "Med": 662.5,
+      "Med Resp": 262.0
+    },
+    "Translation": {
+      "Min": 360,
+      "Max": 65406,
+      "Med": 1107.0,
+      "Med Resp": 317.5
+    },
+    "Multi-Turn": {
+      "Min": 240,
+      "Max": 68912,
+      "Med": 3134.5,
+      "Med Resp": 1349.5
+    }
+  },
   "GPT-5 mini (Reasoning: medium)": {
     "Overall": {
       "Min": -10,
 
@@ -1359,6 +1767,74 @@
       "Med Resp": -3.0
     }
   },
+  "Gemma 3 27B it": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65458,
+      "Med": 380.0,
+      "Med Resp": 380.0
+    },
+    "Content Generation": {
+      "Min": 7,
+      "Max": 3893,
+      "Med": 484.0,
+      "Med Resp": 484.0
+    },
+    "Editing": {
+      "Min": 6,
+      "Max": 1776,
+      "Med": 254.0,
+      "Med Resp": 254.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 63850,
+      "Med": 180.0,
+      "Med Resp": 180.0
+    },
+    "Reasoning": {
+      "Min": 2,
+      "Max": 1926,
+      "Med": 485.5,
+      "Med Resp": 485.5
+    },
+    "Hallucination": {
+      "Min": 13,
+      "Max": 2494,
+      "Med": 534.0,
+      "Med Resp": 534.0
+    },
+    "Safety": {
+      "Min": 31,
+      "Max": 2440,
+      "Med": 518.0,
+      "Med Resp": 518.0
+    },
+    "Repetition": {
+      "Min": 95,
+      "Max": 65433,
+      "Med": 299.0,
+      "Med Resp": 299.0
+    },
+    "Summarization": {
+      "Min": 30,
+      "Max": 1080,
+      "Med": 202.5,
+      "Med Resp": 202.5
+    },
+    "Translation": {
+      "Min": 46,
+      "Max": 62659,
+      "Med": 374.0,
+      "Med Resp": 374.0
+    },
+    "Multi-Turn": {
+      "Min": 4,
+      "Max": 65458,
+      "Med": 1558.0,
+      "Med Resp": 1558.0
+    }
+  },
   "GPT-5 nano (Reasoning: medium)": {
     "Overall": {
       "Min": -10,
 
@@ -1902,5 +2378,141 @@
       "Med": -6.0,
       "Med Resp": -3.0
     }
+  },
+  "Qwen3 Next 80B A3B Instruct": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65511,
+      "Med": 477.5,
+      "Med Resp": 477.5
+    },
+    "Content Generation": {
+      "Min": 8,
+      "Max": 5955,
+      "Med": 553.0,
+      "Med Resp": 553.0
+    },
+    "Editing": {
+      "Min": 8,
+      "Max": 3157,
+      "Med": 323.0,
+      "Med Resp": 323.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 4108,
+      "Med": 407.0,
+      "Med Resp": 407.0
+    },
+    "Reasoning": {
+      "Min": 1,
+      "Max": 14630,
+      "Med": 813.5,
+      "Med Resp": 813.5
+    },
+    "Hallucination": {
+      "Min": 73,
+      "Max": 65493,
+      "Med": 662.0,
+      "Med Resp": 662.0
+    },
+    "Safety": {
+      "Min": 4,
+      "Max": 3360,
+      "Med": 531.0,
+      "Med Resp": 531.0
+    },
+    "Repetition": {
+      "Min": 132,
+      "Max": 65511,
+      "Med": 739.5,
+      "Med Resp": 739.5
+    },
+    "Summarization": {
+      "Min": 27,
+      "Max": 1849,
+      "Med": 264.0,
+      "Med Resp": 264.0
+    },
+    "Translation": {
+      "Min": 9,
+      "Max": 3123,
+      "Med": 294.5,
+      "Med Resp": 294.5
+    },
+    "Multi-Turn": {
+      "Min": 3,
+      "Max": 10283,
+      "Med": 1913.0,
+      "Med Resp": 1913.0
+    }
+  },
+  "Qwen3 30B A3B Instruct 2507": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65516,
+      "Med": 441.5,
+      "Med Resp": 441.5
+    },
+    "Content Generation": {
+      "Min": 7,
+      "Max": 5659,
+      "Med": 510.5,
+      "Med Resp": 510.5
+    },
+    "Editing": {
+      "Min": 7,
+      "Max": 2231,
+      "Med": 255.0,
+      "Med Resp": 255.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 8094,
+      "Med": 381.0,
+      "Med Resp": 381.0
+    },
+    "Reasoning": {
+      "Min": 1,
+      "Max": 9376,
+      "Med": 753.5,
+      "Med Resp": 753.5
+    },
+    "Hallucination": {
+      "Min": 19,
+      "Max": 65495,
+      "Med": 689.5,
+      "Med Resp": 689.5
+    },
+    "Safety": {
+      "Min": 16,
+      "Max": 65456,
+      "Med": 445.0,
+      "Med Resp": 445.0
+    },
+    "Repetition": {
+      "Min": 81,
+      "Max": 65516,
+      "Med": 533.5,
+      "Med Resp": 533.5
+    },
+    "Summarization": {
+      "Min": 34,
+      "Max": 1870,
+      "Med": 251.0,
+      "Med Resp": 251.0
+    },
+    "Translation": {
+      "Min": 8,
+      "Max": 3257,
+      "Med": 292.5,
+      "Med Resp": 292.5
+    },
+    "Multi-Turn": {
+      "Min": 3,
+      "Max": 6825,
+      "Med": 1809.5,
+      "Med Resp": 1809.5
+    }
   }
 }
src/data/stats.csv CHANGED
@@ -14,15 +14,21 @@ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "
 "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
 top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
 "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
+"Qwen3 Next 80B A3B Thinking" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "Qwen" "3263.0" "329.0" "80.0" "Open" "Think" "On" "55.07" "58.25" "51.56" "73.71" "76.03" "52.3" "38.84" "32.86" "57.14" "46.63" "43.62"
 "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
 top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
 "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
 top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
 "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
 top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
+"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.7
+top-p: 0.8" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
 "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
+"Qwen3 Next 80B A3B Instruct" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" "temperature: 0.7
+top-p: 0.8" "Qwen" "477.5" "477.5" "80.0" "Open" "Instruct" "Off" "48.87" "53.5" "45.31" "66.14" "71.07" "55.17" "39.67" "21.43" "48.41" "36.8" "35.23"
 "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
 temperature: 0.6
 top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
@@ -32,17 +38,29 @@ temperature: 1.3
 top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
 "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
 top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
+"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
+top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
 "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
 "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
+"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
+top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
+"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
+top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
+"Mistral-Small-3.2 24B-Instruct-2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
+top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
+"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
+top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
 "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
 top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
 "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
 top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
+"ERNIE-4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
 "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
 top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
 "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
-top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
+top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
 "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
-top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
+top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
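The rows above are space-delimited with double-quoted fields, and the sampling-parameters field spans several physical lines inside its quotes. A loading sketch, assuming the file carries a header row (not visible in the diff):

```python
import pandas as pd

# Sketch of loading the leaderboard table: fields are space-separated and
# double-quoted; quoted fields may contain newlines (the sampling-params
# column), which pandas' CSV parser preserves.
df = pd.read_csv("src/data/stats.csv", sep=" ", quotechar='"')
print(df.shape)              # one row per model
print(df.iloc[:, 0].head())  # first column: model display name
```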
src/data/stats_lang.csv CHANGED
@@ -14,15 +14,21 @@ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67"
 "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
 top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
 "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
+"Qwen3 Next 80B A3B Thinking" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "Qwen" "3263.0" "329.0" "80.0" "Open" "Think" "On" "55.07" "47.22" "55.0" "55.42" "53.66" "50.56" "55.25" "54.75" "60.0" "63.04" "62.84" "54.49" "56.1"
 "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
 top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
 "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
 top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
 "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
 top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
+"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
+top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
 "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
+"Qwen3 Next 80B A3B Instruct" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" "temperature: 0.7
+top-p: 0.8" "Qwen" "477.5" "477.5" "80.0" "Open" "Instruct" "Off" "48.87" "41.67" "50.83" "51.81" "55.49" "42.78" "47.51" "50.28" "51.89" "50.54" "48.09" "51.69" "50.0"
 "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
 temperature: 0.6
 top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
@@ -32,17 +38,29 @@ temperature: 1.3
 top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
 "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
 top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
+"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
+top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
 "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
 "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
+"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
+top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
+"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
+top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
+"Mistral-Small-3.2 24B-Instruct-2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
+top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
+"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
+top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
 "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
 top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
 "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
 top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
+"ERNIE-4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
 "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
 top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
 "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
-top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
+top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
 "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
-top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
+top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
vis_utils.py CHANGED
@@ -82,7 +82,7 @@ def create_empty_radar_chart(message: str) -> Figure:
 def create_len_overall_scatter(
     df: pd.DataFrame,
     selected_models: Optional[List[str]] = None,
-    max_models: int = 30,
+    max_models: int = 50,
     y_col: str = "Overall",
     length_data: Optional[dict] = None,
     theme: str = "light",
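With nine models added in this commit, the scatter's default cap rises from 30 to 50 so new entries are not silently dropped. An illustrative sketch of what such a cap typically does; `limit_models` is hypothetical, not the Space's actual selection logic:

```python
from typing import List, Optional

def limit_models(names: List[str], selected: Optional[List[str]] = None,
                 max_models: int = 50) -> List[str]:
    """Keep at most max_models traces on the chart, honoring any user selection."""
    picked = [n for n in names if selected is None or n in selected]
    return picked[:max_models]

print(len(limit_models([f"model-{i}" for i in range(60)])))  # 50
```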