Spaces:
Running
Running
송종윤/AI Productivity팀(SR)/삼성전자
add models, add speed and time results, change scatter plot design
a452b10
import gradio as gr
from gradio_rangeslider import RangeSlider
import pandas as pd
import argparse
from apscheduler.schedulers.background import BackgroundScheduler  # NOTE(review): not used in this file's visible code — confirm it is needed.
from huggingface_hub import snapshot_download
from src.data_utils import get_dataframe_category, get_dataframe_language
import src.config as configs
from src.display.formatting import get_display_model_name
from utils import start_watchdog_in_background
import time

# Parse command line arguments at the top level
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to launch the app on")
parser.add_argument("--port", type=int, default=7860, help="Port to launch the app on")
# NOTE(review): only "open" is an accepted choice, yet later code branches on
# other modes (public/private tabs, launch path) — confirm whether additional
# choices were meant to be listed here.
parser.add_argument("--mode", default="open", choices=["open"])
args = parser.parse_args()

# These imports run after parse_args, presumably so `--help` exits before the
# heavier modules execute their import-time work — TODO confirm this ordering
# is deliberate.
from utils import get_profile_and_organizations, download_with_restart
from vis_utils import load_leaderboard_data, create_domain_radar_chart, create_len_overall_scatter, load_leaderboard_language_data, create_language_radar_chart
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_OPTION1,
    INTRODUCTION_TEXT,
    BANNER,
    TITLE,
    LINK,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.submission.submit import add_new_eval_option
from ui import create_leaderboard_tab
# Runs at import time: sync the evaluation queue and results datasets from the
# Hub before the UI is built. (With the current argparse choices this guard is
# always true — see the NOTE at the --mode argument.)
if args.mode == "open":
    def restart_space():
        # Restart this Space via the Hub API; passed below as the recovery
        # hook invoked when a snapshot download fails.
        API.restart_space(repo_id=REPO_ID)
    ### Space initialisation
    # Download the pending-request queue dataset locally.
    download_with_restart(
        snapshot_download,
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space
    )
    # Download the evaluation results dataset locally.
    download_with_restart(
        snapshot_download,
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space
    )
# Grayscale base theme; the leaderboard's custom CSS supplies all accent colors.
theme = gr.themes.Default(primary_hue="gray", neutral_hue="gray")
def create_benchmark_tab_content(data_prefix: str):
    """Render one benchmark tab: category analysis, speed/length plots, language analysis.

    Creates the Gradio components in display order and wires every event
    handler between the leaderboard tables, the radar charts, the
    speed-per-GPU bar plot and the length-vs-score scatter plot.

    Args:
        data_prefix: sub-directory under ``src/data/`` (e.g. ``"open/"``)
            selecting which result set to load.

    NOTE: must be called inside a live ``gr.Blocks``/``gr.TabItem`` context.
    Component creation order defines the page layout — do not reorder
    statements without checking the rendered UI.
    """
    gr.HTML(INTRODUCTION_TEXT)
    # Opens the "Category Analysis" section container; the matching closing
    # tags are emitted piecemeal by later gr.HTML calls.
    gr.HTML("""
        <div class="dark-container" style="margin-bottom: 24px;">
            <div class="section-header">
                <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
                    Category Analysis
                </h3>
            </div>
            <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">TRUEBench consists of 10 categories and 46 sub-categories which highly related to productivity assistants.</p>
    """)
    # --- Category Explanation Box (2x5 grid, emoji, desc from about.py) ---
    from src.about import CATEGORY_DESCRIPTIONS
    # f-string: CSS braces are doubled ({{ }}) so they survive formatting.
    # NOTE(review): .category-title and .category-desc are each declared twice
    # with conflicting colors; the later rule wins in the CSS cascade — confirm
    # the earlier declarations are intentional leftovers.
    gr.HTML(f"""
        <style>
        .category-box-grid {{
            display: flex;
            flex-direction: column;
            gap: 18px;
            margin: 18px 0;
        }}
        .category-box-row {{
            display: flex;
            gap: 18px;
        }}
        .category-box {{
            background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
            border-radius: 26px;
            box-shadow: 0 0 16px #6c63ff44, 0 2px 8px rgba(0,0,0,0.08);
            color: #222 !important;
            min-height: 140px;
            flex: 1 1 0;
            display: flex;
            flex-direction: column;
            align-items: flex-start;
            padding: 18px 16px 12px 16px;
            box-shadow: 0 0 16px #6c63ff44, 0 2px 8px rgba(0,0,0,0.08);
            font-size: 1.08rem;
            color: #222 !important;
            transition: box-shadow 0.2s;
            position: relative;
            overflow: hidden;
            opacity: 1;
        }}
        .category-title {{
            font-weight: 700;
            font-size: 1.18rem;
            margin-left: 8px;
            vertical-align: middle;
            color: #222 !important;
        }}
        .category-desc {{
            margin-top: 12px;
            font-size: 0.98rem;
            color: #fff !important;
            font-weight: 400;
            min-height: 24px;
            width: 100%;
            line-height: 1.5;
            letter-spacing: 0.01em;
        }}
        .category-box:hover {{
            box-shadow: 0 0 24px #a5a1ff55, 0 4px 16px rgba(0,0,0,0.18);
        }}
        .category-title {{
            font-weight: 700;
            font-size: 1.18rem;
            margin-left: 8px;
            vertical-align: middle;
        }}
        .category-desc {{
            margin-top: 12px;
            font-size: 0.98rem;
            color: #222 !important;
            font-weight: 400;
            min-height: 24px;
            width: 100%;
            line-height: 1.5;
            letter-spacing: 0.01em;
        }}
        @media (prefers-color-scheme: dark) {{
            .category-box .category-title {{
                color: #f5f6f7 !important;
            }}
        }}
        </style>
        <div class='category-box-grid'>
            <div class='category-box-row'>
                <div class='category-box'><span class='category-title'>📝 Content Generation</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Content Generation"]}</div></div>
                <div class='category-box'><span class='category-title'>✂️ Editing</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Editing"]}</div></div>
                <div class='category-box'><span class='category-title'>📊 Data Analysis</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Data Analysis"]}</div></div>
                <div class='category-box'><span class='category-title'>🧠 Reasoning</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Reasoning"]}</div></div>
                <div class='category-box'><span class='category-title'>🦄 Hallucination</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Hallucination"]}</div></div>
            </div>
            <div class='category-box-row'>
                <div class='category-box'><span class='category-title'>🛡️ Safety</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Safety"]}</div></div>
                <div class='category-box'><span class='category-title'>🔁 Repetition</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Repetition"]}</div></div>
                <div class='category-box'><span class='category-title'>📝 Summarization</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Summarization"]}</div></div>
                <div class='category-box'><span class='category-title'>🌐 Translation</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Translation"]}</div></div>
                <div class='category-box'><span class='category-title'>💬 Multi-Turn</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Multi-Turn"]}</div></div>
            </div>
        </div>
    """)
    # Per-category score table.
    df_cat = get_dataframe_category(data_prefix=data_prefix)
    gr.HTML("""
        <style>
        .leaderboard-container {
            background: #fff;
        }
        @media (prefers-color-scheme: dark) {
            .leaderboard-container {
                background: #121212;
            }
        }
        </style>
        <div class="leaderboard-container">
    """)
    # Category leaderboard; returns a dict of components/callbacks used for
    # cross-widget wiring below (df_state, selectors, unified_filter, ...).
    leaderboard_tab_cat = create_leaderboard_tab(
        df_cat,
        "Category",
        mode=args.mode
    )
    gr.HTML("</div>")
    # --- Category Radar Chart Section ---
    # Re-imports names already imported at module level; harmless local rebinding.
    from vis_utils import load_leaderboard_data, create_domain_radar_chart
    initial_df_cat = load_leaderboard_data(data_prefix=data_prefix)
    # Top 5 models based on leaderboard (Average Accuracy)
    if "Overall" in initial_df_cat.columns:
        top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
    else:
        top5_models_cat = initial_df_cat['Model Name'].tolist()[:5]
    gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center; width: 100%; max-width: 100%; margin: 0 auto; padding: 0;">')
    # Radar chart model selector (up to 5)
    display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist()
    original_names_cat = initial_df_cat['Model Name'].tolist()
    # Maps pretty display names back to the raw model names used in the data.
    display_to_original_cat = dict(zip(display_names_cat, original_names_cat))
    top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat]
    model_selector_cat = gr.Dropdown(
        choices=display_names_cat,
        value=top5_display_names_cat,
        multiselect=True,
        label="🎯 Select Models for Radar Chart",
        info="Choose up to 5 models to visualize",
        elem_classes=["dropdown", "custom-dropdown"],
        interactive=True,
        filterable=True,
        allow_custom_value=False
    )
    # NOTE(review): <script> tags injected through gr.HTML are typically not
    # executed by the browser in Gradio apps — confirm this 5-model cap
    # actually fires; the server side does not enforce it.
    gr.HTML("""
        <script>
        document.querySelector('.custom-dropdown').addEventListener('change', function(e) {
            if (this.value.length > 5) {
                alert('You can select up to 5 models only');
                this.value = this.value.slice(0, 5);
            }
        });
        </script>
    """)
    radar_chart_cat = gr.Plot(
        label="",
        value=create_domain_radar_chart(
            initial_df_cat,
            "Average Accuracy",
            top5_models_cat,
            mode=args.mode
        ),
        elem_classes=["radar-chart", "plot-container"]
    )
    gr.HTML('</div>')
    # --- Speed Med Bar Plot Section (NEW) ---
    import json
    # Per-model timing measurements for the speed-per-GPU plot.
    with open(f"src/data/{data_prefix}/time_data.json", "r") as f:
        time_data = json.load(f)
    # Held in a gr.State so event handlers receive it without re-reading disk.
    time_data_state = gr.State(value=time_data)
    gr.HTML("""
        <div class="dark-container" style="margin-bottom: 24px; margin-top: 24px;">
            <div class="section-header">
                <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
                    Speed per GPU
                </h3>
            </div>
            <p style="color: var(--text-secondary); margin-bottom: 8px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
                Speed per GPU represents the number of tokens generated per second divided by the number of GPUs during the inference.<br>
            </p>
            <p style="font-size:0.95em; color:var(--text-secondary); margin-top:0.5px;">
                <b>Setting</b>: We measured the speed in an H100 GPU environment consisting of 4 nodes with 8 GPUs each, using vLLM and Ray to set the tensor parallel size between 1 and 32 (In the plot, <i>GPU</i> refers to the tensor parallel size).<br>
                We performed inference by sending an asynchronous request to the served model, and we set the concurrency to 32. <br>
                <b>Note</b>: We measured the speed by directly serving open-source models, and proprietary models were excluded from the plot.
            </p>
    """)
    # --- Speed Bar Plot UI: Row with left (category selector) and right (min/max dials) ---
    category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
    default_category = "Overall"
    default_x_axis_sort_by = "Overall Score"
    with gr.Row():
        with gr.Column(scale=1):
            x_axis_sort_by = gr.Radio(
                choices=["Overall Score", "Speed per GPU"],
                value="Overall Score",
                label="Sort X-Axis by",
                elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}",  # Make elem_id unique
                elem_classes=["x-axis-btn-radio"],
                interactive=True,
                show_label=True
            )
        with gr.Column(scale=1):
            min_max_score_slider = RangeSlider(
                minimum=0,
                maximum=100,
                value=(0, 100),
                step=1,
                label="Minimum and Maximum Overall Score",
                interactive=True
            )
        with gr.Column(scale=1):
            min_max_param_size_slider = RangeSlider(
                minimum=0,
                maximum=1000,
                value=(0, 1000),
                step=1,
                label="Minimum and Maximum Parameter Size (B)",
                interactive=True
            )
    # Speed Bar Plot
    from vis_utils import create_speed_med_bar_plot
    speed_med_bar_plot = gr.Plot(
        label="",
        value=create_speed_med_bar_plot(
            initial_df_cat,
            time_data,
            min_size=0,
            max_size=1000,
            min_score=0,
            max_score=100,
            category=default_category,
            theme="light",
            x_axis_sort_by=default_x_axis_sort_by,
            mode=args.mode
        ),
        elem_classes=["speed-med-bar-plot", "plot-container"]
    )
    gr.HTML("</div>")
    # --- Event handler: update Speed bar plot and dials when category or dials change ---
    def update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, current_time_data_state, current_leaderboard_df=None):
        # Re-renders the speed bar plot from the current slider/radio values.
        # Falls back to the initially loaded DataFrame (captured by closure)
        # when no filtered leaderboard DataFrame is supplied.
        # NOTE(review): `category` is not forwarded here, unlike the initial
        # plot above — presumably the helper's default matches; confirm.
        df = current_leaderboard_df if current_leaderboard_df is not None else initial_df_cat
        return create_speed_med_bar_plot(
            df,
            current_time_data_state,
            min_size=min_max_size[0],
            max_size=min_max_size[1],
            min_score=min_max_score[0],
            max_score=min_max_score[1],
            theme="light",
            x_axis_sort_by=x_axis_sort_by,
            mode=args.mode
        )
    # Connect category selector to dials and plot
    x_axis_sort_by.change(
        fn=update_speed_med_bar_plot,
        inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
        outputs=speed_med_bar_plot
    )
    min_max_param_size_slider.change(
        fn=update_speed_med_bar_plot,
        inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
        outputs=speed_med_bar_plot
    )
    min_max_score_slider.change(
        fn=update_speed_med_bar_plot,
        inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
        outputs=speed_med_bar_plot
    )
    # Connect leaderboard filters to dials and plot (if leaderboard_tab_cat provides a filtered DataFrame state)
    if "df_state" in leaderboard_tab_cat:
        leaderboard_tab_cat["df_state"].change(
            fn=lambda df, x_axis_sort_by, min_max_size, min_max_score, time_data: update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, time_data, df),
            inputs=[leaderboard_tab_cat["df_state"], x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
            outputs=speed_med_bar_plot
        )
    # Update radar chart when model_selector_cat selection changes
    def update_radar_chart_cat(selected_display_names):
        # If no selection, fallback to top-5
        if not selected_display_names or len(selected_display_names) == 0:
            df = load_leaderboard_data(data_prefix=data_prefix)
            selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
        # Translate display names back to raw model names, silently dropping unknowns.
        selected_models = [display_to_original_cat[name] for name in selected_display_names if name in display_to_original_cat]
        return create_domain_radar_chart(
            load_leaderboard_data(data_prefix=data_prefix),
            "Average Accuracy",
            selected_models,
            mode=args.mode
        )
    model_selector_cat.change(
        fn=update_radar_chart_cat,
        inputs=model_selector_cat,
        outputs=radar_chart_cat
    )
    # --- Med. Len. vs Overall Scatter Plot Section ---
    from vis_utils import create_len_overall_scatter
    import json
    # Per-model output-length measurements for the scatter plot.
    with open(f"src/data/{data_prefix}/length_data.json", "r") as f:
        length_data = json.load(f)
    # --- Create a Gradio State component to hold length_data ---
    length_data_state = gr.State(value=length_data)
    gr.HTML("""
        <div class="dark-container" style="margin-bottom: 24px; margin-top: 24px;">
            <div class="section-header">
                <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
                    Output Length vs. Category Score
                </h3>
            </div>
            <p style="color: var(--text-secondary); margin-bottom: 8px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
                Explore the relationship between median output length and model performance by category
            </p>
            <p style="font-size:0.95em; color:var(--text-secondary); margin-top:0.5px;">
                Median Length: Median number of tokens including both Think and Answer<br>
                Median Response Length: Median number of answer tokens, excluding Think<br>
                <b>Note</b>: We measured the token length of open-source models only and proprietary models were excluded from the plot.
            </p>
        </div>
    """)
    # Category selection buttons (HTML + Gradio Radio for event)
    # NOTE(review): recomputes the same list as in the speed section above.
    category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
    # (cat-btn-radio related style block removed, now handled in custom_css)
    category_selector = gr.Radio(
        choices=category_columns,
        value="Overall",
        label="Select Category for Y-Axis",
        elem_id=f"cat-btn-radio-{data_prefix.replace('/', '')}",  # Make elem_id unique
        elem_classes=["cat-btn-radio"],
        interactive=True,
        show_label=True
    )
    x_axis_selector = gr.Radio(
        choices=["Median Length", "Median Response Length"],
        value="Median Length",
        label="Select X-Axis Data",
        # NOTE(review): this elem_id duplicates the one used by x_axis_sort_by
        # earlier in the same tab — IDs should be unique; confirm and rename.
        elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}",  # Make elem_id unique
        elem_classes=["x-axis-btn-radio"],
        interactive=True,
        show_label=True
    )
    gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center;">')
    scatter_plot_cat = gr.Plot(
        label="",
        value=create_len_overall_scatter(
            load_leaderboard_data(data_prefix=data_prefix),
            y_col="Overall",
            length_data=length_data,
            x_axis_data_source=x_axis_selector.value
        ),
        elem_classes=["efficiency-chart", "plot-container"]
    )
    gr.HTML('</div>')
    gr.HTML("</div>")
    # Update plot when category or x-axis selection changes
    def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
        # Rebuild the scatter plot for the chosen Y category and X data source.
        return create_len_overall_scatter(
            load_leaderboard_data(data_prefix=data_prefix),
            y_col=selected_category,
            length_data=current_length_data_state,
            x_axis_data_source=selected_x_source
        )
    category_selector.change(
        fn=update_scatter_plot_cat,
        inputs=[category_selector, x_axis_selector, length_data_state],
        outputs=scatter_plot_cat
    )
    x_axis_selector.change(
        fn=update_scatter_plot_cat,
        inputs=[category_selector, x_axis_selector, length_data_state],
        outputs=scatter_plot_cat
    )
    # When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to top-5
    def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
        # unified_filter returns (..., ..., ranked model names); only the
        # ranking is used here to pick the new top-5 for the radar chart.
        _, _, top5_models = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
        top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
        return gr.update(value=top5_display_names), create_domain_radar_chart(
            load_leaderboard_data(data_prefix=data_prefix),
            "Average Accuracy",
            top5_models[:5],
            mode=args.mode
        )
    leaderboard_selectors_cat = [
        leaderboard_tab_cat["type_selector"],
        leaderboard_tab_cat["model_type_selector"],
        leaderboard_tab_cat["think_selector"],
        leaderboard_tab_cat["df_state"],
        leaderboard_tab_cat["sort_col_dropdown"]
    ]
    # Every filter change re-syncs the dropdown and radar chart.
    for selector in leaderboard_selectors_cat:
        selector.change(
            fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
            inputs=leaderboard_selectors_cat,
            outputs=[model_selector_cat, radar_chart_cat]
        )
    # --- Language Analysis section (mirrors the category section above) ---
    gr.HTML("""
        <div class="dark-container" style="margin-bottom: 24px;">
            <div class="section-header">
                <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
                    Language Analysis
                </h3>
            </div>
            <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).</p>
    """)
    df_lang = get_dataframe_language(data_prefix=data_prefix)
    leaderboard_tab_lang = create_leaderboard_tab(
        df_lang,
        "Language",
        mode=args.mode
    )
    # --- Language Radar Chart Section ---
    from vis_utils import load_leaderboard_language_data, create_language_radar_chart
    initial_df_lang = load_leaderboard_language_data(data_prefix=data_prefix)
    # Top 5 models based on leaderboard (Overall)
    if "Overall" in initial_df_lang.columns:
        top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
    else:
        top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
    gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center;">')
    # Add model selector
    display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
    original_names_lang = initial_df_lang['Model Name'].tolist()
    display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
    top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
    model_selector_lang = gr.Dropdown(
        choices=display_names_lang,
        value=top5_display_names_lang,
        multiselect=True,
        label="🎯 Select Models for Radar Chart",
        info="Choose up to 5 models to visualize",
        elem_classes=["dropdown", "custom-dropdown"],
        interactive=True,
        filterable=True,
        allow_custom_value=False
    )
    # NOTE(review): same caveat as the category-section <script> — likely not
    # executed when injected via gr.HTML; confirm.
    gr.HTML("""
        <script>
        document.querySelectorAll('.custom-dropdown')[1].addEventListener('change', function(e) {
            if (this.value.length > 5) {
                alert('You can select up to 5 models only');
                this.value = this.value.slice(0, 5);
            }
        });
        </script>
    """)
    radar_chart_lang = gr.Plot(
        label="",
        value=create_language_radar_chart(
            initial_df_lang,
            "Average Accuracy",
            top5_models_lang,
            mode=args.mode
        ),
        elem_classes=["radar-chart", "plot-container"]
    )
    gr.HTML('</div>')
    # Update radar chart when model_selector_lang selection changes
    def update_radar_chart_lang(selected_display_names):
        # Empty selection falls back to the current top-5.
        if not selected_display_names or len(selected_display_names) == 0:
            df = load_leaderboard_language_data(data_prefix=data_prefix)
            selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
        selected_models = [display_to_original_lang[name] for name in selected_display_names if name in display_to_original_lang]
        return create_language_radar_chart(
            load_leaderboard_language_data(data_prefix=data_prefix),
            "Average Accuracy",
            selected_models,
            mode=args.mode
        )
    model_selector_lang.change(
        fn=update_radar_chart_lang,
        inputs=model_selector_lang,
        outputs=radar_chart_lang
    )
    # When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to top-5
    def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
        _, _, top5_models = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
        top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
        return gr.update(value=top5_display_names), create_language_radar_chart(
            load_leaderboard_language_data(data_prefix=data_prefix),
            "Average Accuracy",
            top5_models[:5],
            mode=args.mode
        )
    leaderboard_selectors_lang = [
        leaderboard_tab_lang["type_selector"],
        leaderboard_tab_lang["model_type_selector"],
        leaderboard_tab_lang["think_selector"],
        leaderboard_tab_lang["df_state"],
        leaderboard_tab_lang["sort_col_dropdown"]
    ]
    for selector in leaderboard_selectors_lang:
        selector.change(
            fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
            inputs=leaderboard_selectors_lang,
            outputs=[model_selector_lang, radar_chart_lang]
        )
# Top-level UI assembly: benchmark tab(s), submission tab, and citation box.
demo = gr.Blocks(css=custom_css, theme=theme)
with demo:
    gr.HTML(BANNER + TITLE + LINK)
    # Filled by get_profile_and_organizations on submit (HF login identity).
    user_state = gr.State()
    organization_state = gr.State()
    with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
        if args.mode == "open":
            tab_configurations = [
                {"data_prefix": "open/", "tab_name": "TRUEBench", "tab_id": 2}
            ]
        else:
            # NOTE(review): unreachable with the current argparse
            # choices=["open"] — confirm whether other modes were intended.
            tab_configurations = [
                {"data_prefix": f"{args.mode}-public/", "tab_name": "TRUEBench (public set)", "tab_id": 2},
                {"data_prefix": f"{args.mode}-full/", "tab_name": "TRUEBench (private set)", "tab_id": 3}
            ]
        for config in tab_configurations:
            with gr.TabItem(config["tab_name"], elem_id="llm-benchmark-tab-table", id=config["tab_id"]):
                create_benchmark_tab_content(data_prefix=config["data_prefix"])
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text")
                if args.mode == "open":
                    login_button = gr.LoginButton()
                with gr.Row():
                    with gr.Column():
                        contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True)
                        model_name_textbox = gr.Textbox(label="Model Name")
                        model_type_dropdown = gr.Dropdown(
                            choices=["Instruct", "Think", "Hybrid"],
                            label="Model Type (Instruct, Think, or Hybrid)",
                            multiselect=False,
                            value="Instruct",
                            interactive=True,
                        )
                        # Starts disabled because the default model type
                        # (Instruct) forces think mode Off — see update_think_type.
                        think_type_dropdown = gr.Dropdown(
                            choices=["On", "Off"],
                            label="Think Mode (On/Off)",
                            multiselect=False,
                            value="Off",
                            interactive=False,
                        )
                        precision = gr.Dropdown(
                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )
                        # --- Dynamically control think_type based on model_type and connect event ---
                        def update_think_type(model_type_value):
                            # Instruct -> Off (locked); Think -> On (locked);
                            # Hybrid -> On but user-editable.
                            if model_type_value == "Instruct":
                                return gr.update(value="Off", interactive=False)
                            elif model_type_value == "Think":
                                return gr.update(value="On", interactive=False)
                            else:  # Hybrid
                                return gr.update(value="On", interactive=True)
                        model_type_dropdown.change(
                            fn=update_think_type,
                            inputs=model_type_dropdown,
                            outputs=think_type_dropdown
                        )
                        response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., </think>)")
                    with gr.Column():
                        # Pre-filled template showing the expected YAML layout
                        # (content kept flush-left so the textbox shows it verbatim).
                        yml_textbox_placeholder = """# vLLM serving parameters
# Refence: https://docs.vllm.ai/en/latest/cli/serve.html
llm_serve_args:
  max_model_len:
  tensor_parallel_size:
  dtype:
  ...
# OpenAI-compatible API (chat completion)
# Reference: https://platform.openai.com/docs/api-reference/chat
sampling_params:
  top_p:
  temperature:
  presence_penalty:
  ...
# vLLM sampling parameters
# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1
extra_body:
  chat_template_kwargs:
    enable_thinking:
  ...
  top_k:
  repetition_penalty:
  ..."""
                        yml_textbox = gr.Textbox(
                            label="Configuration (YAML format)",
                            elem_id="yml-textbox",
                            lines=7,
                            value=yml_textbox_placeholder
                        )
                        upbox = gr.File(
                            label="Upload configuration file as .yml or .yaml",
                            file_types=[".yml", ".yaml"],
                            type="filepath",
                            height=150
                        )
                        # Add Translate to JSON button below upbox
                        translate_button = gr.Button(
                            "Translate to JSON",
                            elem_id="translate-to-json-btn",
                            elem_classes=["translate-btn"],
                            scale=None
                        )
                        # Add custom style for the button
                        gr.HTML(
                            '''
                            <style>
                            #translate-to-json-btn, .translate-btn {
                                width: 100%;
                                min-height: 24px;
                                font-size: 1.1rem;
                                font-weight: 600;
                                background: linear-gradient(90deg, #6c63ff 60%, #a5a1ff 100%);
                                color: #fff;
                                border: none;
                                border-radius: 12px;
                                margin-top: 8px;
                                margin-bottom: 8px;
                                box-shadow: 0 2px 8px #6c63ff33;
                                transition: background 0.2s, box-shadow 0.2s;
                            }
                            #translate-to-json-btn:hover, .translate-btn:hover {
                                background: linear-gradient(90deg, #5a54d6 60%, #7e7bff 100%);
                                box-shadow: 0 4px 16px #6c63ff55;
                            }
                            </style>
                            '''
                        )
                    with gr.Column():
                        requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox")
                        output_dict = gr.Code(label="Translated Python Dictionary", language="json")
                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
                def parse_and_display_yaml_config(upbox_path, yml_textbox_value):
                    """Parse the uploaded YAML file (preferred) or the textbox
                    content and return it as pretty-printed JSON; on parse
                    failure, return the error as a display string instead of
                    raising (this feeds a gr.Code preview, not the submission)."""
                    import yaml, json
                    if upbox_path:
                        try:
                            with open(upbox_path, "r", encoding="utf-8") as f:
                                data = yaml.safe_load(f)
                            if data is None:
                                return "YAML file is empty."
                            return json.dumps(data, indent=4, ensure_ascii=False)
                        except Exception as e:
                            return f"Error parsing YAML file: {e}"
                    elif yml_textbox_value and yml_textbox_value.strip():
                        try:
                            data = yaml.safe_load(yml_textbox_value)
                            if data is None:
                                return "YAML textbox is empty or invalid."
                            return json.dumps(data, indent=4, ensure_ascii=False)
                        except Exception as e:
                            return f"Error parsing YAML textbox: {e}"
                    else:
                        return ""
                if args.mode == "open":
                    # Submission chain: resolve the logged-in user, then run the
                    # evaluation submission, then refresh the JSON preview.
                    event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
                    event.then(
                        add_new_eval_option,
                        [
                            contact_email,
                            model_name_textbox,
                            model_type_dropdown,
                            think_type_dropdown,
                            precision,
                            response_prefix_textbox,
                            requirements_textbox,
                            user_state,
                            organization_state,
                            yml_textbox,
                            upbox,
                        ],
                        submission_result,
                    ).then(
                        fn=parse_and_display_yaml_config,
                        inputs=[upbox, yml_textbox],
                        outputs=output_dict
                    )
                # Manual "Translate to JSON" preview, independent of submission.
                translate_button.click(
                    fn=parse_and_display_yaml_config,
                    inputs=[upbox, yml_textbox],
                    outputs=output_dict
                )
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=9,
                elem_id="citation-button",
                show_copy_button=True,
            )
| if args.mode == "open": | |
| def health_fn() -> str: | |
| try: | |
| initial_df_cat_ = load_leaderboard_data() | |
| if "Overall" in initial_df_cat_.columns: | |
| return "ok" | |
| else: | |
| return "error" | |
| except: | |
| return "error" | |
| gr.api(health_fn, api_name="health") | |
| demo.load(fn=lambda: start_watchdog_in_background(), inputs=None, outputs=None, queue=False) | |
if args.mode == "open":
    # Spaces path: launch without blocking the main thread, then keep the
    # process alive with a sleep loop (the server runs in a daemon thread).
    demo.queue(default_concurrency_limit=40).launch(prevent_thread_lock=True)
    while True:
        time.sleep(600)
else:
    # NOTE(review): unreachable with the current argparse choices=["open"] —
    # kept for local runs if more modes are re-enabled; confirm.
    if __name__ == "__main__":
        demo.queue(default_concurrency_limit=40).launch(server_name=args.ip, server_port=args.port)