j_yoon.song committed
Commit 289ccbe · 1 Parent(s): cb27169

Add model & update visualization

src/about.py CHANGED
@@ -24,13 +24,9 @@ LINK = """
 <h3 style="text-align: right; margin-top: 0;">
 <span>✨</span>
 <a href="https://research.samsung.com/" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Samsung Research</a> |
-<span>🌕</span>
-<a href="https://github.com/samsung" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">GitHub</a> |
-<span>🌎</span>
-<a href="https://x.com/samsungresearch" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">X</a> |
 <span>🌠</span>
 <a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
-<span>🔭</span> Updated: 2025-09-16
+<span>🔭</span> Updated: 2025-09-23
 </h3>
 """
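The footer's Updated stamp above is bumped by hand on each data refresh. A minimal sketch of generating it at render time instead; `updated_stamp` is a hypothetical helper, not part of the Space:

```python
from datetime import date, datetime, timezone
from typing import Optional

def updated_stamp(today: Optional[date] = None) -> str:
    """Build the 'Updated' footer line from a date instead of a hardcoded string."""
    today = today or datetime.now(timezone.utc).date()
    return f"<span>🔭</span> Updated: {today.isoformat()}"

print(updated_stamp(date(2025, 9, 23)))  # <span>🔭</span> Updated: 2025-09-23
```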
src/data/length_data.json CHANGED
@@ -135,6 +135,74 @@
       "Med Resp": 1488.0
     }
   },
+  "Qwen3 Next 80B A3B Thinking": {
+    "Overall": {
+      "Min": 4,
+      "Max": 76399,
+      "Med": 3263.0,
+      "Med Resp": 329.0
+    },
+    "Content Generation": {
+      "Min": 651,
+      "Max": 65449,
+      "Med": 3195.0,
+      "Med Resp": 368.5
+    },
+    "Editing": {
+      "Min": 466,
+      "Max": 65112,
+      "Med": 2840.5,
+      "Med Resp": 203.5
+    },
+    "Data Analysis": {
+      "Min": 4,
+      "Max": 64756,
+      "Med": 1788.0,
+      "Med Resp": 212.0
+    },
+    "Reasoning": {
+      "Min": 582,
+      "Max": 30093,
+      "Med": 2740.5,
+      "Med Resp": 540.5
+    },
+    "Hallucination": {
+      "Min": 386,
+      "Max": 65491,
+      "Med": 2216.0,
+      "Med Resp": 586.0
+    },
+    "Safety": {
+      "Min": 320,
+      "Max": 65472,
+      "Med": 1642.0,
+      "Med Resp": 338.0
+    },
+    "Repetition": {
+      "Min": 1040,
+      "Max": 65529,
+      "Med": 6000.0,
+      "Med Resp": 251.0
+    },
+    "Summarization": {
+      "Min": 506,
+      "Max": 14800,
+      "Med": 2162.5,
+      "Med Resp": 191.0
+    },
+    "Translation": {
+      "Min": 728,
+      "Max": 65398,
+      "Med": 4754.0,
+      "Med Resp": 272.0
+    },
+    "Multi-Turn": {
+      "Min": 1070,
+      "Max": 76399,
+      "Med": 6871.5,
+      "Med Resp": 1179.0
+    }
+  },
   "DeepSeek V3.1 (think)": {
     "Overall": {
       "Min": 80,
 
@@ -203,6 +271,74 @@
       "Med Resp": 1545.0
     }
   },
+  "Qwen3 30B A3B Thinking 2507": {
+    "Overall": {
+      "Min": 305,
+      "Max": 32743,
+      "Med": 2830.0,
+      "Med Resp": 351.0
+    },
+    "Content Generation": {
+      "Min": 335,
+      "Max": 10914,
+      "Med": 2775.5,
+      "Med Resp": 403.5
+    },
+    "Editing": {
+      "Min": 371,
+      "Max": 7617,
+      "Med": 2358.5,
+      "Med Resp": 220.0
+    },
+    "Data Analysis": {
+      "Min": 305,
+      "Max": 19749,
+      "Med": 1702.0,
+      "Med Resp": 227.0
+    },
+    "Reasoning": {
+      "Min": 485,
+      "Max": 19485,
+      "Med": 2504.0,
+      "Med Resp": 505.0
+    },
+    "Hallucination": {
+      "Min": 360,
+      "Max": 6054,
+      "Med": 2123.5,
+      "Med Resp": 668.0
+    },
+    "Safety": {
+      "Min": 306,
+      "Max": 32688,
+      "Med": 1667.0,
+      "Med Resp": 447.0
+    },
+    "Repetition": {
+      "Min": 1070,
+      "Max": 32743,
+      "Med": 3719.0,
+      "Med Resp": 368.5
+    },
+    "Summarization": {
+      "Min": 435,
+      "Max": 14462,
+      "Med": 2108.0,
+      "Med Resp": 204.0
+    },
+    "Translation": {
+      "Min": 513,
+      "Max": 11340,
+      "Med": 3869.5,
+      "Med Resp": 276.0
+    },
+    "Multi-Turn": {
+      "Min": 536,
+      "Max": 14557,
+      "Med": 5822.5,
+      "Med Resp": 1237.0
+    }
+  },
   "o4-mini": {
     "Overall": {
       "Min": -10,
 
@@ -815,6 +951,74 @@
       "Med Resp": 1318.5
     }
   },
+  "Mistral-Small-3.2 24B-Instruct-2506": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65516,
+      "Med": 369.0,
+      "Med Resp": 369.0
+    },
+    "Content Generation": {
+      "Min": 7,
+      "Max": 2684,
+      "Med": 389.5,
+      "Med Resp": 389.5
+    },
+    "Editing": {
+      "Min": 9,
+      "Max": 1172,
+      "Med": 269.0,
+      "Med Resp": 269.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 3973,
+      "Med": 295.0,
+      "Med Resp": 295.0
+    },
+    "Reasoning": {
+      "Min": 1,
+      "Max": 65462,
+      "Med": 484.5,
+      "Med Resp": 484.5
+    },
+    "Hallucination": {
+      "Min": 61,
+      "Max": 5920,
+      "Med": 489.0,
+      "Med Resp": 489.0
+    },
+    "Safety": {
+      "Min": 10,
+      "Max": 65465,
+      "Med": 320.0,
+      "Med Resp": 320.0
+    },
+    "Repetition": {
+      "Min": 103,
+      "Max": 65516,
+      "Med": 376.5,
+      "Med Resp": 376.5
+    },
+    "Summarization": {
+      "Min": 28,
+      "Max": 1266,
+      "Med": 234.5,
+      "Med Resp": 234.5
+    },
+    "Translation": {
+      "Min": 9,
+      "Max": 3248,
+      "Med": 327.0,
+      "Med Resp": 327.0
+    },
+    "Multi-Turn": {
+      "Min": 4,
+      "Max": 65494,
+      "Med": 1279.0,
+      "Med Resp": 1279.0
+    }
+  },
   "GLM-4.5 FP8 (think)": {
     "Overall": {
       "Min": 75,
 
@@ -1019,6 +1223,74 @@
       "Med Resp": 2150.0
     }
   },
+  "K2-Think": {
+    "Overall": {
+      "Min": 27,
+      "Max": 8178,
+      "Med": 1835.0,
+      "Med Resp": 486.0
+    },
+    "Content Generation": {
+      "Min": 138,
+      "Max": 2049,
+      "Med": 1821.5,
+      "Med Resp": 660.5
+    },
+    "Editing": {
+      "Min": 169,
+      "Max": 2054,
+      "Med": 1433.5,
+      "Med Resp": 283.5
+    },
+    "Data Analysis": {
+      "Min": 150,
+      "Max": 2053,
+      "Med": 1349.0,
+      "Med Resp": 264.0
+    },
+    "Reasoning": {
+      "Min": 419,
+      "Max": 2048,
+      "Med": 2045.5,
+      "Med Resp": 576.5
+    },
+    "Hallucination": {
+      "Min": 174,
+      "Max": 2054,
+      "Med": 1890.0,
+      "Med Resp": 522.5
+    },
+    "Safety": {
+      "Min": 27,
+      "Max": 2048,
+      "Med": 1393.0,
+      "Med Resp": 405.0
+    },
+    "Repetition": {
+      "Min": 870,
+      "Max": 2070,
+      "Med": 2048.0,
+      "Med Resp": 2048.0
+    },
+    "Summarization": {
+      "Min": 252,
+      "Max": 2053,
+      "Med": 1011.0,
+      "Med Resp": 262.5
+    },
+    "Translation": {
+      "Min": 195,
+      "Max": 2051,
+      "Med": 2006.0,
+      "Med Resp": 371.5
+    },
+    "Multi-Turn": {
+      "Min": 110,
+      "Max": 8178,
+      "Med": 3224.0,
+      "Med Resp": 1526.0
+    }
+  },
   "Qwen3 32B (think)": {
     "Overall": {
       "Min": 164,
 
@@ -1087,6 +1359,74 @@
       "Med Resp": 1481.0
     }
   },
+  "ERNIE-4.5 21B A3B Thinking": {
+    "Overall": {
+      "Min": 186,
+      "Max": 66114,
+      "Med": 1637.0,
+      "Med Resp": 541.0
+    },
+    "Content Generation": {
+      "Min": 302,
+      "Max": 12760,
+      "Med": 1586.5,
+      "Med Resp": 654.5
+    },
+    "Editing": {
+      "Min": 186,
+      "Max": 8703,
+      "Med": 1119.5,
+      "Med Resp": 336.0
+    },
+    "Data Analysis": {
+      "Min": 200,
+      "Max": 31928,
+      "Med": 1484.0,
+      "Med Resp": 418.0
+    },
+    "Reasoning": {
+      "Min": 511,
+      "Max": 29184,
+      "Med": 5312.0,
+      "Med Resp": 669.5
+    },
+    "Hallucination": {
+      "Min": 313,
+      "Max": 11452,
+      "Med": 1716.0,
+      "Med Resp": 797.5
+    },
+    "Safety": {
+      "Min": 213,
+      "Max": 6914,
+      "Med": 1242.0,
+      "Med Resp": 599.0
+    },
+    "Repetition": {
+      "Min": 643,
+      "Max": 65463,
+      "Med": 2387.0,
+      "Med Resp": 516.5
+    },
+    "Summarization": {
+      "Min": 215,
+      "Max": 12449,
+      "Med": 884.0,
+      "Med Resp": 269.5
+    },
+    "Translation": {
+      "Min": 298,
+      "Max": 19672,
+      "Med": 1466.5,
+      "Med Resp": 421.5
+    },
+    "Multi-Turn": {
+      "Min": 705,
+      "Max": 66114,
+      "Med": 4404.5,
+      "Med Resp": 1819.0
+    }
+  },
   "Qwen3 235B A22B Instruct 2507": {
     "Overall": {
       "Min": 1,
 
@@ -1291,6 +1631,74 @@
       "Med Resp": -3.0
     }
   },
+  "Tongyi DeepResearch 30B A3B": {
+    "Overall": {
+      "Min": 153,
+      "Max": 68912,
+      "Med": 1147.0,
+      "Med Resp": 408.0
+    },
+    "Content Generation": {
+      "Min": 216,
+      "Max": 65477,
+      "Med": 1086.5,
+      "Med Resp": 510.5
+    },
+    "Editing": {
+      "Min": 251,
+      "Max": 65470,
+      "Med": 985.5,
+      "Med Resp": 313.0
+    },
+    "Data Analysis": {
+      "Min": 242,
+      "Max": 65499,
+      "Med": 998.0,
+      "Med Resp": 239.0
+    },
+    "Reasoning": {
+      "Min": 333,
+      "Max": 65477,
+      "Med": 2043.5,
+      "Med Resp": 388.5
+    },
+    "Hallucination": {
+      "Min": 194,
+      "Max": 65501,
+      "Med": 1344.5,
+      "Med Resp": 593.0
+    },
+    "Safety": {
+      "Min": 153,
+      "Max": 65472,
+      "Med": 992.0,
+      "Med Resp": 392.0
+    },
+    "Repetition": {
+      "Min": 425,
+      "Max": 65513,
+      "Med": 1986.5,
+      "Med Resp": 472.5
+    },
+    "Summarization": {
+      "Min": 290,
+      "Max": 2410,
+      "Med": 662.5,
+      "Med Resp": 262.0
+    },
+    "Translation": {
+      "Min": 360,
+      "Max": 65406,
+      "Med": 1107.0,
+      "Med Resp": 317.5
+    },
+    "Multi-Turn": {
+      "Min": 240,
+      "Max": 68912,
+      "Med": 3134.5,
+      "Med Resp": 1349.5
+    }
+  },
   "GPT-5 mini (Reasoning: medium)": {
     "Overall": {
       "Min": -10,
 
@@ -1359,6 +1767,74 @@
       "Med Resp": -3.0
     }
   },
+  "Gemma 3 27B it": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65458,
+      "Med": 380.0,
+      "Med Resp": 380.0
+    },
+    "Content Generation": {
+      "Min": 7,
+      "Max": 3893,
+      "Med": 484.0,
+      "Med Resp": 484.0
+    },
+    "Editing": {
+      "Min": 6,
+      "Max": 1776,
+      "Med": 254.0,
+      "Med Resp": 254.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 63850,
+      "Med": 180.0,
+      "Med Resp": 180.0
+    },
+    "Reasoning": {
+      "Min": 2,
+      "Max": 1926,
+      "Med": 485.5,
+      "Med Resp": 485.5
+    },
+    "Hallucination": {
+      "Min": 13,
+      "Max": 2494,
+      "Med": 534.0,
+      "Med Resp": 534.0
+    },
+    "Safety": {
+      "Min": 31,
+      "Max": 2440,
+      "Med": 518.0,
+      "Med Resp": 518.0
+    },
+    "Repetition": {
+      "Min": 95,
+      "Max": 65433,
+      "Med": 299.0,
+      "Med Resp": 299.0
+    },
+    "Summarization": {
+      "Min": 30,
+      "Max": 1080,
+      "Med": 202.5,
+      "Med Resp": 202.5
+    },
+    "Translation": {
+      "Min": 46,
+      "Max": 62659,
+      "Med": 374.0,
+      "Med Resp": 374.0
+    },
+    "Multi-Turn": {
+      "Min": 4,
+      "Max": 65458,
+      "Med": 1558.0,
+      "Med Resp": 1558.0
+    }
+  },
   "GPT-5 nano (Reasoning: medium)": {
     "Overall": {
       "Min": -10,
 
@@ -1902,5 +2378,141 @@
       "Med": -6.0,
       "Med Resp": -3.0
     }
+  },
+  "Qwen3 Next 80B A3B Instruct": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65511,
+      "Med": 477.5,
+      "Med Resp": 477.5
+    },
+    "Content Generation": {
+      "Min": 8,
+      "Max": 5955,
+      "Med": 553.0,
+      "Med Resp": 553.0
+    },
+    "Editing": {
+      "Min": 8,
+      "Max": 3157,
+      "Med": 323.0,
+      "Med Resp": 323.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 4108,
+      "Med": 407.0,
+      "Med Resp": 407.0
+    },
+    "Reasoning": {
+      "Min": 1,
+      "Max": 14630,
+      "Med": 813.5,
+      "Med Resp": 813.5
+    },
+    "Hallucination": {
+      "Min": 73,
+      "Max": 65493,
+      "Med": 662.0,
+      "Med Resp": 662.0
+    },
+    "Safety": {
+      "Min": 4,
+      "Max": 3360,
+      "Med": 531.0,
+      "Med Resp": 531.0
+    },
+    "Repetition": {
+      "Min": 132,
+      "Max": 65511,
+      "Med": 739.5,
+      "Med Resp": 739.5
+    },
+    "Summarization": {
+      "Min": 27,
+      "Max": 1849,
+      "Med": 264.0,
+      "Med Resp": 264.0
+    },
+    "Translation": {
+      "Min": 9,
+      "Max": 3123,
+      "Med": 294.5,
+      "Med Resp": 294.5
+    },
+    "Multi-Turn": {
+      "Min": 3,
+      "Max": 10283,
+      "Med": 1913.0,
+      "Med Resp": 1913.0
+    }
+  },
+  "Qwen3 30B A3B Instruct 2507": {
+    "Overall": {
+      "Min": 1,
+      "Max": 65516,
+      "Med": 441.5,
+      "Med Resp": 441.5
+    },
+    "Content Generation": {
+      "Min": 7,
+      "Max": 5659,
+      "Med": 510.5,
+      "Med Resp": 510.5
+    },
+    "Editing": {
+      "Min": 7,
+      "Max": 2231,
+      "Med": 255.0,
+      "Med Resp": 255.0
+    },
+    "Data Analysis": {
+      "Min": 1,
+      "Max": 8094,
+      "Med": 381.0,
+      "Med Resp": 381.0
+    },
+    "Reasoning": {
+      "Min": 1,
+      "Max": 9376,
+      "Med": 753.5,
+      "Med Resp": 753.5
+    },
+    "Hallucination": {
+      "Min": 19,
+      "Max": 65495,
+      "Med": 689.5,
+      "Med Resp": 689.5
+    },
+    "Safety": {
+      "Min": 16,
+      "Max": 65456,
+      "Med": 445.0,
+      "Med Resp": 445.0
+    },
+    "Repetition": {
+      "Min": 81,
+      "Max": 65516,
+      "Med": 533.5,
+      "Med Resp": 533.5
+    },
+    "Summarization": {
+      "Min": 34,
+      "Max": 1870,
+      "Med": 251.0,
+      "Med Resp": 251.0
+    },
+    "Translation": {
+      "Min": 8,
+      "Max": 3257,
+      "Med": 292.5,
+      "Med Resp": 292.5
+    },
+    "Multi-Turn": {
+      "Min": 3,
+      "Max": 6825,
+      "Med": 1809.5,
+      "Med Resp": 1809.5
+    }
   }
 }
src/data/stats.csv CHANGED
@@ -14,15 +14,21 @@ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "
 "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
 top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
 "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
+"Qwen3 Next 80B A3B Thinking" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "Qwen" "3263.0" "329.0" "80.0" "Open" "Think" "On" "55.07" "58.25" "51.56" "73.71" "76.03" "52.3" "38.84" "32.86" "57.14" "46.63" "43.62"
 "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
 top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
 "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
 top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
 "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
 top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
+"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.7
+top-p: 0.8" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
 "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
+"Qwen3 Next 80B A3B Instruct" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" "temperature: 0.7
+top-p: 0.8" "Qwen" "477.5" "477.5" "80.0" "Open" "Instruct" "Off" "48.87" "53.5" "45.31" "66.14" "71.07" "55.17" "39.67" "21.43" "48.41" "36.8" "35.23"
 "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
 temperature: 0.6
 top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
@@ -32,17 +38,29 @@ temperature: 1.3
 top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
 "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
 top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
+"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
+top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
 "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
 "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
+"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
+top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
+"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
+top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
+"Mistral-Small-3.2 24B-Instruct-2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
+top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
+"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
+top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
 "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
 top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
 "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
 top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
+"ERNIE-4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
 "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
 top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
 "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
-top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
+top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
 "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
-top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
+top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
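The rows above are space-delimited with double-quoted fields, and the sampling-parameters field spans several physical lines inside its quotes. A loading sketch, assuming the file carries a header row (not visible in the diff):

```python
import pandas as pd

# Sketch of loading the leaderboard table: fields are space-separated and
# double-quoted; quoted fields may contain newlines (the sampling-params
# column), which pandas' CSV parser preserves.
df = pd.read_csv("src/data/stats.csv", sep=" ", quotechar='"')
print(df.shape)              # one row per model
print(df.iloc[:, 0].head())  # first column: model display name
```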
src/data/stats_lang.csv CHANGED
@@ -14,15 +14,21 @@ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67"
 "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
 top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
 "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
+"Qwen3 Next 80B A3B Thinking" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "Qwen" "3263.0" "329.0" "80.0" "Open" "Think" "On" "55.07" "47.22" "55.0" "55.42" "53.66" "50.56" "55.25" "54.75" "60.0" "63.04" "62.84" "54.49" "56.1"
 "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
 top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
 "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
 top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
 "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
 top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
+"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
+top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "44.17" "49.17" "50.0" "57.32" "42.22" "49.72" "53.07" "50.27" "54.89" "56.83" "47.75" "58.54"
 "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
+"Qwen3 Next 80B A3B Instruct" "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct" "temperature: 0.7
+top-p: 0.8" "Qwen" "477.5" "477.5" "80.0" "Open" "Instruct" "Off" "48.87" "41.67" "50.83" "51.81" "55.49" "42.78" "47.51" "50.28" "51.89" "50.54" "48.09" "51.69" "50.0"
 "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
 temperature: 0.6
 top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
@@ -32,17 +38,29 @@ temperature: 1.3
 top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
 "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
 top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
+"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
+top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "34.44" "43.89" "40.96" "48.78" "38.89" "41.99" "46.93" "44.32" "42.93" "48.09" "43.26" "46.95"
 "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
 "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
 temperature: 1.0
 top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
+"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
+top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "34.44" "35.0" "37.35" "43.9" "42.22" "43.65" "47.49" "41.08" "44.02" "53.55" "39.33" "40.24"
+"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
+top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "36.11" "40.83" "43.37" "44.51" "32.78" "37.02" "44.69" "38.92" "43.48" "46.45" "37.08" "39.63"
+"Mistral-Small-3.2 24B-Instruct-2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
+top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "31.39" "40.0" "36.75" "42.07" "34.44" "44.2" "41.9" "42.16" "45.65" "40.98" "37.64" "38.41"
+"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
+top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "29.17" "36.11" "30.12" "44.51" "26.67" "33.15" "38.55" "37.84" "41.85" "37.7" "33.71" "36.59"
 "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
 top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
 "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
 top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
+"ERNIE-4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
+top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
 "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
 top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
 "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
-top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
+top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
 "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
-top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
+top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
vis_utils.py CHANGED
@@ -82,7 +82,7 @@ def create_empty_radar_chart(message: str) -> Figure:
 def create_len_overall_scatter(
     df: pd.DataFrame,
     selected_models: Optional[List[str]] = None,
-    max_models: int = 30,
+    max_models: int = 50,
     y_col: str = "Overall",
     length_data: Optional[dict] = None,
     theme: str = "light",
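With nine models added in this commit, the scatter's default cap rises from 30 to 50 so new entries are not silently dropped. An illustrative sketch of what such a cap typically does; `limit_models` is hypothetical, not the Space's actual selection logic:

```python
from typing import List, Optional

def limit_models(names: List[str], selected: Optional[List[str]] = None,
                 max_models: int = 50) -> List[str]:
    """Keep at most max_models traces on the chart, honoring any user selection."""
    picked = [n for n in names if selected is None or n in selected]
    return picked[:max_models]

print(len(limit_models([f"model-{i}" for i in range(60)])))  # 50
```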