kenkaneki committed
Commit 346c3c5 · 1 Parent(s): 5c87359
README.md CHANGED
@@ -29,6 +29,137 @@ models:
29
 
30
  ---
31
 
32
- # CircleGuardBench Leaderboard
33
 
34
- First-of-its-kind benchmark for evaluating the protection capabilities of large language model (LLM) guard systems. It tests how well guard models block harmful content, resist jailbreaks, avoid false positives, and operate efficiently in real-time environments.
29
 
30
  ---
31
 
32
+ # CodeReview Bench Leaderboard
33
 
34
+ A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
35
+
36
+ ## Features
37
+
38
+ - **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
39
+ - **Dual Language Comments**: Supports both Russian and English comment languages
40
+ - **Comprehensive Metrics**:
41
+ - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
42
+ - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
43
+ - **Interactive Visualization**: Compare model performance across categories with radar plots (see the sketch after this list)
44
+ - **Easy Submission**: Submit your model results via web interface
45
+
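+ The radar plots in the Visualize tab are rendered with Plotly (see `requirements.txt`). The snippet below is a minimal, self-contained sketch of that kind of chart using made-up scores on the 0-10 judge scale; it is illustrative only, not the leaderboard's actual plotting code.
+
+ ```python
+ # Hypothetical radar-plot sketch with made-up scores; the leaderboard's
+ # Visualize tab builds comparable charts with Plotly.
+ import plotly.graph_objects as go
+
+ metrics = ["Readability", "Relevance", "Actionability", "Completeness", "Brevity"]
+ scores = [8.5, 9.0, 8.7, 8.0, 7.2]  # example values, not real results
+
+ fig = go.Figure(
+     go.Scatterpolar(
+         r=scores + scores[:1],        # repeat the first point to close the polygon
+         theta=metrics + metrics[:1],
+         fill="toself",
+         name="your-model-name",
+     )
+ )
+ fig.update_layout(polar=dict(radialaxis=dict(range=[0, 10])))
+ fig.show()
+ ```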
46
+ ## Metrics
47
+
48
+ ### LLM-based Multimetric
49
+
50
+ - **Readability**: How easy the review is to understand
51
+ - **Relevance**: How relevant the review is to the code
52
+ - **Explanation Clarity**: How clear the explanations are
53
+ - **Problem Identification**: How well problems are identified
54
+ - **Actionability**: How actionable the suggestions are
55
+ - **Completeness**: How complete the review is
56
+ - **Specificity**: How specific the feedback is
57
+ - **Contextual Adequacy**: How well the review fits the context
58
+ - **Consistency**: How consistent the review style is
59
+ - **Brevity**: How concise the review is
60
+
61
+ ### Exact-Match Metrics
62
+
63
+ - **Pass@1**: Percentage of correct reviews on first attempt
64
+ - **Pass@5**: Percentage of correct reviews in top 5 attempts
65
+ - **Pass@10**: Percentage of correct reviews in top 10 attempts
66
+ - **BLEU@10**: BLEU score over the top 10 review candidates (an estimation sketch for these metrics follows below)
67
+
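+ Pass@k is estimated from several sampled reviews per example. The sketch below uses the standard unbiased pass@k estimator purely as an illustration; whether the benchmark uses this estimator or a plain top-k hit rate is an assumption here, and BLEU@10 is computed separately over the same top-10 candidates.
+
+ ```python
+ # Illustrative pass@k estimate: n sampled reviews, c of them judged correct.
+ # Assumption: the standard unbiased estimator; a plain top-k hit rate would
+ # also fit the definitions above.
+ from math import comb
+
+ def pass_at_k(n: int, c: int, k: int) -> float:
+     """P(at least one of k drawn reviews is correct | n candidates, c correct)."""
+     if n - c < k:
+         return 1.0
+     return 1.0 - comb(n - c, k) / comb(n, k)
+
+ print(round(pass_at_k(10, 3, 1), 3))   # 0.3
+ print(round(pass_at_k(10, 3, 5), 3))   # 0.917
+ print(round(pass_at_k(10, 3, 10), 3))  # 1.0
+ ```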
68
+ ## Programming Languages Supported
69
+
70
+ - Python
71
+ - JavaScript
72
+ - Java
73
+ - C++
74
+ - C#
75
+ - TypeScript
76
+ - Go
77
+ - Rust
78
+ - Swift
79
+ - Kotlin
80
+ - Ruby
81
+ - PHP
82
+ - C
83
+ - Scala
84
+ - R
85
+ - Dart
86
+ - Other
87
+
88
+ ## Comment Languages
89
+
90
+ - Russian (ru)
91
+ - English (en)
92
+
93
+ ## Example Categories
94
+
95
+ - Bug Fix
96
+ - Code Style
97
+ - Performance
98
+ - Security
99
+ - Refactoring
100
+ - Documentation
101
+ - Testing
102
+ - Architecture
103
+ - Other
104
+
105
+ ## Installation
106
+
107
+ ```bash
108
+ pip install -r requirements.txt
109
+ ```
110
+
111
+ ## Usage
112
+
113
+ ```bash
114
+ python app.py
115
+ ```
116
+
117
+ ## Submission Format
118
+
119
+ Submit your results as a JSONL file where each line contains:
120
+
121
+ ```json
122
+ {
123
+ "model_name": "your-model-name",
124
+ "programming_language": "python",
125
+ "comment_language": "en",
126
+ "readability": 8.5,
127
+ "relevance": 9.0,
128
+ "explanation_clarity": 7.8,
129
+ "problem_identification": 8.2,
130
+ "actionability": 8.7,
131
+ "completeness": 8.0,
132
+ "specificity": 7.5,
133
+ "contextual_adequacy": 8.3,
134
+ "consistency": 8.8,
135
+ "brevity": 7.2,
136
+ "pass_at_1": 0.75,
137
+ "pass_at_5": 0.88,
138
+ "pass_at_10": 0.92,
139
+ "bleu_at_10": 0.65,
140
+ "total_evaluations": 100
141
+ }
142
+ ```
143
+
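+ Before an entry is accepted, the three metadata fields and at least one metric are checked, and an overall score is derived from the per-metric values. The sketch below condenses that flow, mirroring `process_jsonl_submission` and `calculate_overall_score` in `src/leaderboard/processor.py`; the 0.7/0.3 weighting comes from that file.
+
+ ```python
+ # Condensed sketch of the submission checks and score aggregation,
+ # mirroring src/leaderboard/processor.py; field names match the example above.
+ import json
+ import numpy as np
+
+ REQUIRED = ["model_name", "programming_language", "comment_language"]
+ MULTIMETRIC = ["readability", "relevance", "explanation_clarity",
+                "problem_identification", "actionability", "completeness",
+                "specificity", "contextual_adequacy", "consistency", "brevity"]
+ EXACT_MATCH = ["pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10"]
+
+ def check_and_score(path: str) -> list:
+     scored = []
+     with open(path, encoding="utf-8") as f:
+         for line_num, raw in enumerate(f, 1):
+             if not raw.strip():
+                 continue
+             entry = json.loads(raw)
+             missing = [k for k in REQUIRED if k not in entry]
+             if missing or not any(m in entry for m in MULTIMETRIC + EXACT_MATCH):
+                 raise ValueError(f"line {line_num}: missing {missing or 'metrics'}")
+             multi = [entry[m] for m in MULTIMETRIC if m in entry]
+             exact = [entry[m] for m in EXACT_MATCH if m in entry]
+             multi_avg = float(np.mean(multi)) if multi else 0.0
+             exact_avg = float(np.mean(exact)) if exact else 0.0
+             scored.append((entry["model_name"], round(0.7 * multi_avg + 0.3 * exact_avg, 3)))
+     return scored
+
+ print(check_and_score("example_submission.jsonl"))
+ ```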
144
+ ## Environment Variables
145
+
146
+ Set the following environment variables:
147
+
148
+ ```bash
149
+ HF_TOKEN=your_huggingface_token
150
+ OWNER=your-organization
151
+ RESULTS_DATASET_ID=your-org/codereview-bench-results
152
+ ```
153
+
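+ These values are read at startup in `src/envs.py` via `os.environ.get`, with `python-dotenv` loading a local `.env` file first. A minimal sketch of how they are consumed:
+
+ ```python
+ # Minimal sketch of how the configuration is read (see src/envs.py).
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()  # picks up HF_TOKEN, OWNER, RESULTS_DATASET_ID from a local .env
+
+ TOKEN = os.environ.get("HF_TOKEN")                   # read/write token for your org
+ OWNER = os.environ.get("OWNER", "codereview-bench")  # default matches src/envs.py
+ RESULTS_DATASET_ID = os.environ.get(
+     "RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results"
+ )
+ ```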
154
+ ## Citation
155
+
156
+ ```bibtex
157
+ @misc{codereviewbench2025,
158
+ author = {CodeReview Bench Team},
159
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
160
+ year = {2025},
161
+ publisher = {GitHub},
162
+ journal = {GitHub repository},
163
+ howpublished = {\url{https://github.com/your-org/codereview-bench}}
164
+ }
165
+ ```
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- GuardBench Leaderboard Application
3
  """
4
 
5
  import os
@@ -25,18 +25,19 @@ from src.about import (
25
  )
26
  from src.display.css_html_js import custom_css
27
  from src.display.utils import (
28
- GUARDBENCH_COLUMN,
29
  DISPLAY_COLS,
30
  METRIC_COLS,
31
  HIDDEN_COLS,
32
  NEVER_HIDDEN_COLS,
33
  CATEGORIES,
34
- TEST_TYPES,
 
35
  ModelType,
36
  Mode,
37
  Precision,
38
  WeightType,
39
- GuardModelType,
40
  get_all_column_choices,
41
  get_default_visible_columns,
42
  )
@@ -221,7 +222,7 @@ def init_leaderboard(dataframe, visible_columns=None):
221
  """
222
  if dataframe is None or dataframe.empty:
223
  # Create an empty dataframe with the right columns
224
- columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
225
  dataframe = pd.DataFrame(columns=columns)
226
  logger.warning("Initializing empty leaderboard")
227
 
@@ -234,20 +235,20 @@ def init_leaderboard(dataframe, visible_columns=None):
234
  dataframe = dataframe.copy()
235
  dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
236
 
237
- if "guard_model_type" in dataframe.columns:
238
  dataframe = dataframe.copy()
239
- dataframe["guard_model_type"] = dataframe["guard_model_type"].str.replace("wc_guard", "whitecircle_guard")
240
 
241
  # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
242
 
243
  # Determine which columns to display
244
  display_column_names = [
245
- getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS
246
  ]
247
- hidden_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS]
248
 
249
  # Columns that should always be shown
250
- always_visible = [getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
251
 
252
  # Use provided visible columns if specified, otherwise use default
253
  if visible_columns is None:
@@ -279,11 +280,11 @@ def init_leaderboard(dataframe, visible_columns=None):
279
  # Create a list of datatypes in the format Gradio expects
280
  datatypes = []
281
  for col in visible_columns:
282
- # Find the corresponding GUARDBENCH_COLUMN entry
283
  col_type = None
284
  for display_col in DISPLAY_COLS:
285
- if getattr(GUARDBENCH_COLUMN, display_col).name == col:
286
- orig_type = getattr(GUARDBENCH_COLUMN, display_col).type
287
  # Map to Gradio's expected types
288
  col_type = type_mapping.get(orig_type, "str")
289
  break
@@ -322,7 +323,7 @@ def init_leaderboard(dataframe, visible_columns=None):
322
  )
323
 
324
  column_info_map = {
325
- f.name: getattr(GUARDBENCH_COLUMN, f.name) for f in fields(GUARDBENCH_COLUMN)
326
  }
327
  column_mapping = {
328
  col: column_info_map.get(col, ColumnInfo(col, col)).display_name
@@ -500,7 +501,7 @@ def submit_results(
500
  mode: str,
501
  submission_file: tempfile._TemporaryFileWrapper,
502
  version: str,
503
- guard_model_type: GuardModelType,
504
  ):
505
  """
506
  Handle submission of results with model metadata.
@@ -530,7 +531,7 @@ def submit_results(
530
  "model_type": model_type,
531
  "mode": mode,
532
  "version": version,
533
- "guard_model_type": guard_model_type,
534
  }
535
 
536
  # Process the submission
@@ -689,24 +690,23 @@ def update_visualization(selected_models, selected_category, selected_metric, ve
689
  demo = gr.Blocks(css=custom_css, theme=custom_theme)
690
 
691
  CATEGORY_DISPLAY_MAP = {
692
- "Political Corruption and Legal Evasion": "Corruption & Legal Evasion",
693
- "Financial Fraud and Unethical Business": "Financial Fraud",
694
- "AI Manipulation and Jailbreaking": "AI Jailbreaking",
695
- "Child Exploitation and Abuse": "Child Exploitation",
696
- "Hate Speech, Extremism, and Discrimination": "Hate Speech",
697
- "Labor Exploitation and Human Trafficking": "Labor Exploitation",
698
- "Manipulation, Deception, and Misinformation": "Misinformation",
699
- "Environmental and Industrial Harm": "Environmental Harm",
700
- "Academic Dishonesty and Cheating": "Academic Dishonesty",
701
- "Self–Harm and Suicidal Ideation": "Self-Harm",
702
- "Animal Cruelty and Exploitation": "Animal Harm",
703
- "Criminal, Violent, and Terrorist Activity": "Crime & Violence",
704
- "Drug– and Substance–Related Activities": "Drug Use",
705
- "Sexual Content and Violence": "Sexual Content",
706
- "Weapon, Explosives, and Hazardous Materials": "Weapons & Harmful Materials",
707
- "Cybercrime, Hacking, and Digital Exploits": "Cybercrime",
708
- "Creative Content Involving Illicit Themes": "Illicit Creative",
709
- "Safe Prompts": "Safe Prompts",
710
  }
711
  # Create reverse mapping for lookups
712
  CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
@@ -720,7 +720,7 @@ with demo:
720
  tabs = gr.Tabs(elem_classes="tab-buttons")
721
 
722
  with tabs:
723
- with gr.TabItem("Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
724
  with gr.Row():
725
  version_selector = gr.Dropdown(
726
  choices=BENCHMARK_VERSIONS,
@@ -963,7 +963,7 @@ with demo:
963
  ],
964
  )
965
 
966
- with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
967
  with gr.Row():
968
  with gr.Column():
969
  viz_version_selector = gr.Dropdown(
@@ -1128,10 +1128,10 @@ with demo:
1128
  outputs=[model_mode_selector],
1129
  )
1130
 
1131
- # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
1132
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
1133
 
1134
- with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3):
1135
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1136
 
1137
  with gr.Row():
@@ -1172,11 +1172,11 @@ with demo:
1172
  value=None,
1173
  interactive=True,
1174
  )
1175
- guard_model_type = gr.Dropdown(
1176
- choices=[t.name for t in GuardModelType],
1177
- label="Guard model type",
1178
  multiselect=False,
1179
- value=GuardModelType.LLM_REGEXP.name,
1180
  interactive=True,
1181
  )
1182
 
@@ -1221,7 +1221,7 @@ with demo:
1221
  mode_selector,
1222
  file_input,
1223
  submission_version_selector,
1224
- guard_model_type,
1225
  ],
1226
  outputs=result_output,
1227
  )
 
1
  """
2
+ CodeReview Bench Leaderboard Application
3
  """
4
 
5
  import os
 
25
  )
26
  from src.display.css_html_js import custom_css
27
  from src.display.utils import (
28
+ CODEREVIEW_COLUMN,
29
  DISPLAY_COLS,
30
  METRIC_COLS,
31
  HIDDEN_COLS,
32
  NEVER_HIDDEN_COLS,
33
  CATEGORIES,
34
+ COMMENT_LANGUAGES,
35
+ EXAMPLE_CATEGORIES,
36
  ModelType,
37
  Mode,
38
  Precision,
39
  WeightType,
40
+ ReviewModelType,
41
  get_all_column_choices,
42
  get_default_visible_columns,
43
  )
 
222
  """
223
  if dataframe is None or dataframe.empty:
224
  # Create an empty dataframe with the right columns
225
+ columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
226
  dataframe = pd.DataFrame(columns=columns)
227
  logger.warning("Initializing empty leaderboard")
228
 
 
235
  dataframe = dataframe.copy()
236
  dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
237
 
238
+ if "review_model_type" in dataframe.columns:
239
  dataframe = dataframe.copy()
240
+ # review_model_type values are already canonical; no remapping needed
241
 
242
  # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
243
 
244
  # Determine which columns to display
245
  display_column_names = [
246
+ getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
247
  ]
248
+ hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
249
 
250
  # Columns that should always be shown
251
+ always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
252
 
253
  # Use provided visible columns if specified, otherwise use default
254
  if visible_columns is None:
 
280
  # Create a list of datatypes in the format Gradio expects
281
  datatypes = []
282
  for col in visible_columns:
283
+ # Find the corresponding CODEREVIEW_COLUMN entry
284
  col_type = None
285
  for display_col in DISPLAY_COLS:
286
+ if getattr(CODEREVIEW_COLUMN, display_col).name == col:
287
+ orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
288
  # Map to Gradio's expected types
289
  col_type = type_mapping.get(orig_type, "str")
290
  break
 
323
  )
324
 
325
  column_info_map = {
326
+ f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
327
  }
328
  column_mapping = {
329
  col: column_info_map.get(col, ColumnInfo(col, col)).display_name
 
501
  mode: str,
502
  submission_file: tempfile._TemporaryFileWrapper,
503
  version: str,
504
+ review_model_type: ReviewModelType,
505
  ):
506
  """
507
  Handle submission of results with model metadata.
 
531
  "model_type": model_type,
532
  "mode": mode,
533
  "version": version,
534
+ "review_model_type": review_model_type,
535
  }
536
 
537
  # Process the submission
 
690
  demo = gr.Blocks(css=custom_css, theme=custom_theme)
691
 
692
  CATEGORY_DISPLAY_MAP = {
693
+ "Python": "Python",
694
+ "JavaScript": "JavaScript",
695
+ "Java": "Java",
696
+ "C++": "C++",
697
+ "C#": "C#",
698
+ "TypeScript": "TypeScript",
699
+ "Go": "Go",
700
+ "Rust": "Rust",
701
+ "Swift": "Swift",
702
+ "Kotlin": "Kotlin",
703
+ "Ruby": "Ruby",
704
+ "PHP": "PHP",
705
+ "C": "C",
706
+ "Scala": "Scala",
707
+ "R": "R",
708
+ "Dart": "Dart",
709
+ "Other": "Other"
 
710
  }
711
  # Create reverse mapping for lookups
712
  CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
 
720
  tabs = gr.Tabs(elem_classes="tab-buttons")
721
 
722
  with tabs:
723
+ with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
724
  with gr.Row():
725
  version_selector = gr.Dropdown(
726
  choices=BENCHMARK_VERSIONS,
 
963
  ],
964
  )
965
 
966
+ with gr.TabItem("Visualize", elem_id="codereview-viz-tab", id=1):
967
  with gr.Row():
968
  with gr.Column():
969
  viz_version_selector = gr.Dropdown(
 
1128
  outputs=[model_mode_selector],
1129
  )
1130
 
1131
+ # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
1132
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
1133
 
1134
+ with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=3):
1135
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1136
 
1137
  with gr.Row():
 
1172
  value=None,
1173
  interactive=True,
1174
  )
1175
+ review_model_type = gr.Dropdown(
1176
+ choices=[t.name for t in ReviewModelType],
1177
+ label="Review model type",
1178
  multiselect=False,
1179
+ value=ReviewModelType.CUSTOM.name,
1180
  interactive=True,
1181
  )
1182
 
 
1221
  mode_selector,
1222
  file_input,
1223
  submission_version_selector,
1224
+ review_model_type,
1225
  ],
1226
  outputs=result_output,
1227
  )
example_submission.jsonl ADDED
@@ -0,0 +1,4 @@
1
+ {"model_name": "GPT-4-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
2
+ {"model_name": "GPT-4-CodeReview", "programming_language": "javascript", "comment_language": "en", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
3
+ {"model_name": "Claude-3-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 8.0, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
4
+ {"model_name": "Llama-CodeReview", "programming_language": "java", "comment_language": "en", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
requirements.txt CHANGED
@@ -6,4 +6,3 @@ apscheduler>=3.10.0
6
  python-dotenv>=1.0.0
7
  plotly>=5.18.0
8
  pydantic==2.10.6
9
- circleguardbench @ git+https://github.com/whitecircle-ai/circle-guard-bench.git
 
6
  python-dotenv>=1.0.0
7
  plotly>=5.18.0
8
  pydantic==2.10.6
 
src/about.py CHANGED
@@ -1,54 +1,60 @@
1
  """
2
- Text content for the GuardBench Leaderboard.
3
  """
4
 
5
  TITLE = """
6
  <div style="text-align: center; margin-bottom: 1rem">
7
- <h1>CircleGuardBench Leaderboard</h1>
8
  </div>
9
  """
10
 
11
  INTRODUCTION_TEXT = """
12
  ## Introduction
13
 
14
- CircleGuardBench is a comprehensive benchmark for evaluating the protection capabilities of large language model (LLM) guard systems.
15
- This leaderboard tracks model performance across various safety categories, including harmful content detection,
16
- jailbreak resistance, and more.
17
 
18
- Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
19
- across multiple categories and test scenarios.
20
  """
21
 
22
  LLM_BENCHMARKS_TEXT = """
23
- CircleGuardBench is the first-of-its-kind benchmark for evaluating the protection capabilities of large language model (LLM) guard systems.
24
 
25
- It tests how well guard models block harmful content, resist jailbreaks, avoid false positives, and operate efficiently in real-time environments on a taxonomy close to real-world data.
26
 
27
- Learn more about us at [whitecircle.ai](https://whitecircle.ai?utm_source=huggingface&utm_medium=organic&utm_campaign=circleguardbench_launch&utm_content=space)
28
- """
29
 
 
 
30
 
31
  EVALUATION_QUEUE_TEXT = """
32
  ## Submit Your Model
33
 
34
- To add your model to the CircleGuardBench leaderboard:
35
 
36
- 1. Run your evaluation using the CircleGuardBench framework at https://github.com/whitecircle-ai/circle-guard-bench
37
- 2. Upload your run results in .jsonl format using this form.
38
  3. Once validated, your model will appear on the leaderboard.
39
 
 
 
 
 
 
40
  ### ✉️✨ Ready? Upload your results below!
41
  """
42
 
43
- CITATION_BUTTON_LABEL = "Cite CircleGuardBench"
44
 
45
  CITATION_BUTTON_TEXT = """
46
- @misc{circleguardbench2025,
47
- author = {whitecircle-ai},
48
- title = {CircleGuardBench: Comprehensive Benchmark for LLM Safety Guardrails. Learn more about us at whitecircle.ai},
49
  year = {2025},
50
  publisher = {GitHub},
51
  journal = {GitHub repository},
52
- howpublished = {\\url{https://github.com/whitecircle-ai/circle-guard-bench}}
53
  }
54
  """
 
1
  """
2
+ Text content for the CodeReview Bench Leaderboard.
3
  """
4
 
5
  TITLE = """
6
  <div style="text-align: center; margin-bottom: 1rem">
7
+ <h1>CodeReview Bench Leaderboard</h1>
8
  </div>
9
  """
10
 
11
  INTRODUCTION_TEXT = """
12
  ## Introduction
13
 
14
+ CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
15
+ This leaderboard tracks model performance across various programming languages and review criteria,
16
+ including readability, relevance, explanation clarity, and actionability.
17
 
18
+ Models are evaluated on their ability to provide high-quality code reviews that are helpful,
19
+ accurate, and actionable across multiple programming languages and review categories.
20
  """
21
 
22
  LLM_BENCHMARKS_TEXT = """
23
+ CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
24
 
25
+ It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
26
 
27
+ The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
 
28
 
29
+ See the repository README for the full metric definitions and the submission format.
30
+ """
31
 
32
  EVALUATION_QUEUE_TEXT = """
33
  ## Submit Your Model
34
 
35
+ To add your model to the CodeReview Bench leaderboard:
36
 
37
+ 1. Run your evaluation using the CodeReview Bench framework.
38
+ 2. Upload your results in .jsonl format using this form.
39
  3. Once validated, your model will appear on the leaderboard.
40
 
41
+ ### Requirements:
42
+ - Results must include all required metrics: LLM-based multimetric scores and exact-match metrics
43
+ - Submissions should cover multiple programming languages where applicable
44
+ - Both Russian and English comment languages are supported
45
+
46
  ### ✉️✨ Ready? Upload your results below!
47
  """
48
 
49
+ CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
50
 
51
  CITATION_BUTTON_TEXT = """
52
+ @misc{codereviewbench2025,
53
+ author = {CodeReview Bench Team},
54
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
55
  year = {2025},
56
  publisher = {GitHub},
57
  journal = {GitHub repository},
58
+ howpublished = {\\url{https://github.com/your-org/codereview-bench}}
59
  }
60
  """
src/display/css_html_js.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- CSS and styling for the GuardBench Leaderboard.
3
  """
4
 
5
  custom_css = """
 
1
  """
2
+ CSS and styling for the CodeReview Bench Leaderboard.
3
  """
4
 
5
  custom_css = """
src/display/utils.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Utility classes and functions for the GuardBench Leaderboard display.
3
  """
4
 
5
  from dataclasses import dataclass, field, fields
@@ -8,7 +8,7 @@ from typing import List, Optional
8
 
9
 
10
  class Mode(Enum):
11
- """Inference mode for the guard model."""
12
  CoT = auto() # Chain of Thought
13
  Strict = auto()
14
 
@@ -36,20 +36,19 @@ class ModelType(Enum):
36
  return "API"
37
  return "Unknown"
38
 
39
- class GuardModelType(str, Enum):
40
- """Guard model types for the leaderboard."""
41
- LLAMA_GUARD = "llama_guard"
42
- CLASSIFIER = "classifier"
43
- ATLA_SELENE = "atla_selene"
44
- OPENAI_MODERATION = "openai_moderation"
45
- LLM_REGEXP = "llm_regexp"
46
- LLM_SO = "llm_so"
47
- WHITECIRCLE_GUARD = "whitecircle_guard"
48
 
49
- def __str__(self):
50
- """String representation of the guard model type."""
51
- return self.name
 
 
 
 
 
52
 
 
 
 
53
 
54
 
55
  class Precision(Enum):
@@ -72,6 +71,7 @@ class WeightType(Enum):
72
  Original = auto()
73
  Delta = auto()
74
  Adapter = auto()
 
75
  def __str__(self):
76
  """String representation of the weight type."""
77
  return self.name
@@ -89,8 +89,8 @@ class ColumnInfo:
89
 
90
 
91
  @dataclass
92
- class GuardBenchColumn:
93
- """Columns for the GuardBench leaderboard."""
94
  # Core metadata
95
  model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
  name="model_name",
@@ -118,8 +118,8 @@ class GuardBenchColumn:
118
  display_name="Version",
119
  displayed_by_default=False
120
  ))
121
- guard_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
- name="guard_model_type",
123
  display_name="Type",
124
  displayed_by_default=False
125
  ))
@@ -144,212 +144,168 @@ class GuardBenchColumn:
144
  displayed_by_default=False
145
  ))
146
 
147
- # Default prompts metrics
148
- default_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
149
- name="default_prompts_f1_binary",
150
- display_name="Default_Prompts_F1_Binary",
151
  type="number",
152
- displayed_by_default=False
153
  ))
154
- default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
155
- name="default_prompts_f1",
156
- display_name="Default_Prompts_F1",
157
  type="number",
158
- displayed_by_default=False
159
  ))
160
- default_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
161
- name="default_prompts_recall_binary",
162
- display_name="Default_Prompts_Recall",
163
  type="number",
164
- displayed_by_default=False
165
  ))
166
- default_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
167
- name="default_prompts_precision_binary",
168
- display_name="Default_Prompts_Precision",
169
  type="number",
170
- displayed_by_default=False
171
  ))
172
- default_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
173
- name="default_prompts_error_ratio",
174
- display_name="Default_Prompts_Error_Ratio",
175
  type="number",
176
- displayed_by_default=False
177
  ))
178
- default_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
179
- name="default_prompts_avg_runtime_ms",
180
- display_name="Default_Prompts_Avg_Runtime_ms",
181
  type="number",
182
- displayed_by_default=False
183
  ))
184
-
185
- # Jailbreaked prompts metrics
186
- jailbreaked_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
187
- name="jailbreaked_prompts_f1_binary",
188
- display_name="Jailbreaked_Prompts_F1_Binary",
189
  type="number",
190
- displayed_by_default=False
191
  ))
192
- jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
193
- name="jailbreaked_prompts_f1",
194
- display_name="Jailbreaked_Prompts_F1",
195
  type="number",
196
- displayed_by_default=False
197
  ))
198
- jailbreaked_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
199
- name="jailbreaked_prompts_recall_binary",
200
- display_name="Jailbreaked_Prompts_Recall",
201
  type="number",
202
- displayed_by_default=False
203
  ))
204
- jailbreaked_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
205
- name="jailbreaked_prompts_precision_binary",
206
- display_name="Jailbreaked_Prompts_Precision",
207
  type="number",
208
- displayed_by_default=False
209
  ))
210
- jailbreaked_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
211
- name="jailbreaked_prompts_error_ratio",
212
- display_name="Jailbreaked_Prompts_Error_Ratio",
 
 
213
  type="number",
214
- displayed_by_default=False
215
  ))
216
- jailbreaked_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
217
- name="jailbreaked_prompts_avg_runtime_ms",
218
- display_name="Jailbreaked_Prompts_Avg_Runtime_ms",
219
  type="number",
220
- displayed_by_default=False
221
  ))
222
-
223
- # Default answers metrics
224
- default_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
225
- name="default_answers_f1_binary",
226
- display_name="Default_Answers_F1_Binary",
227
  type="number",
228
- displayed_by_default=False
229
  ))
230
- default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
231
- name="default_answers_f1",
232
- display_name="Default_Answers_F1",
233
  type="number",
234
- displayed_by_default=False
235
  ))
236
- default_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
237
- name="default_answers_recall_binary",
238
- display_name="Default_Answers_Recall",
 
 
239
  type="number",
240
- displayed_by_default=False
241
  ))
242
- default_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
243
- name="default_answers_precision_binary",
244
- display_name="Default_Answers_Precision",
245
  type="number",
246
- displayed_by_default=False
247
  ))
248
- default_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
249
- name="default_answers_error_ratio",
250
- display_name="Default_Answers_Error_Ratio",
251
  type="number",
252
- displayed_by_default=False
253
  ))
254
- default_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
255
- name="default_answers_avg_runtime_ms",
256
- display_name="Default_Answers_Avg_Runtime_ms",
257
  type="number",
258
- displayed_by_default=False
259
  ))
260
 
261
- # Jailbreaked answers metrics
262
- jailbreaked_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
263
- name="jailbreaked_answers_f1_binary",
264
- display_name="Jailbreaked_Answers_F1_Binary",
265
  type="number",
266
  displayed_by_default=False
267
  ))
268
- jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
269
- name="jailbreaked_answers_f1",
270
- display_name="Jailbreaked_Answers_F1",
271
  type="number",
272
  displayed_by_default=False
273
  ))
274
- jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
275
- name="jailbreaked_answers_recall_binary",
276
- display_name="Jailbreaked_Answers_Recall",
277
  type="number",
278
  displayed_by_default=False
279
  ))
280
- jailbreaked_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
281
- name="jailbreaked_answers_precision_binary",
282
- display_name="Jailbreaked_Answers_Precision",
 
 
283
  type="number",
284
  displayed_by_default=False
285
  ))
286
- jailbreaked_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
287
- name="jailbreaked_answers_error_ratio",
288
- display_name="Jailbreaked_Answers_Error_Ratio",
289
  type="number",
290
  displayed_by_default=False
291
  ))
292
- jailbreaked_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
293
- name="jailbreaked_answers_avg_runtime_ms",
294
- display_name="Jailbreaked_Answers_Avg_Runtime_ms",
295
  type="number",
296
  displayed_by_default=False
297
  ))
298
- integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
299
- name="integral_score",
300
- display_name="Integral_Score",
301
- type="number",
302
- displayed_by_default=True
303
- ))
304
-
305
- # Calculated overall metrics (renamed)
306
- macro_accuracy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
307
- name="macro_accuracy",
308
- display_name="Macro_Accuracy",
309
- type="number",
310
- displayed_by_default=True
311
- ))
312
- macro_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
313
- name="macro_recall",
314
- display_name="Macro_Recall",
315
- type="number",
316
- displayed_by_default=True
317
- ))
318
- macro_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
319
- name="macro_precision",
320
- display_name="Macro Precision",
321
- type="number",
322
- displayed_by_default=False
323
- ))
324
-
325
- # NEW Summary Metrics
326
- micro_avg_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
327
- name="micro_avg_error_ratio",
328
- display_name="Micro_Error",
329
- type="number",
330
- displayed_by_default=True
331
- ))
332
- micro_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
333
- name="micro_avg_runtime_ms",
334
- display_name="Micro_Avg_time_ms",
335
- type="number",
336
- displayed_by_default=True
337
- ))
338
- total_evals_count: ColumnInfo = field(default_factory=lambda: ColumnInfo(
339
- name="total_evals_count",
340
- display_name="Total_Count",
341
- type="number",
342
- displayed_by_default=True
343
- ))
344
 
345
 
346
  # Create instances for easy access
347
- GUARDBENCH_COLUMN = GuardBenchColumn()
348
 
349
  # Extract column lists for different views
350
- COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
351
- DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
352
- if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
353
 
354
  # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
355
  def reorder_display_cols():
@@ -361,51 +317,72 @@ def reorder_display_cols():
361
  return cols
362
  DISPLAY_COLS = reorder_display_cols()
363
 
364
- METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
365
- if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
366
- HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
367
- if getattr(GUARDBENCH_COLUMN, f.name).hidden]
368
- NEVER_HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
369
- if getattr(GUARDBENCH_COLUMN, f.name).never_hidden]
370
 
371
- # Categories in GuardBench
372
  CATEGORIES = [
373
- 'Political Corruption and Legal Evasion',
374
- 'Financial Fraud and Unethical Business',
375
- 'AI Manipulation and Jailbreaking',
376
- 'Child Exploitation and Abuse',
377
- 'Hate Speech, Extremism, and Discrimination',
378
- 'Labor Exploitation and Human Trafficking',
379
- 'Manipulation, Deception, and Misinformation',
380
- 'Environmental and Industrial Harm',
381
- 'Academic Dishonesty and Cheating',
382
- 'Self–Harm and Suicidal Ideation',
383
- 'Animal Cruelty and Exploitation',
384
- 'Criminal, Violent, and Terrorist Activity',
385
- 'Drug– and Substance–Related Activities',
386
- 'Sexual Content and Violence',
387
- 'Weapon, Explosives, and Hazardous Materials',
388
- 'Cybercrime, Hacking, and Digital Exploits',
389
- 'Creative Content Involving Illicit Themes',
390
- 'Safe Prompts'
 
 
391
  ]
392
 
393
- # Test types in GuardBench
394
- TEST_TYPES = [
395
- "default_prompts",
396
- "jailbreaked_prompts",
397
- "default_answers",
398
- "jailbreaked_answers"
 
 
 
 
 
 
399
  ]
400
 
401
- # Metrics in GuardBench
402
- METRICS = [
403
- "f1_binary",
404
- "recall_binary",
405
- "precision_binary",
406
- "error_ratio",
407
- "avg_runtime_ms",
408
- "accuracy"
409
  ]
410
 
411
  def get_all_column_choices():
@@ -419,8 +396,8 @@ def get_all_column_choices():
419
 
420
  default_visible_columns = get_default_visible_columns()
421
 
422
- for f in fields(GUARDBENCH_COLUMN):
423
- column_info = getattr(GUARDBENCH_COLUMN, f.name)
424
  # Create a tuple with both the internal name and display name
425
  if column_info.name not in default_visible_columns:
426
  column_choices.append((column_info.name, column_info.display_name))
@@ -434,5 +411,5 @@ def get_default_visible_columns():
434
  Returns:
435
  List of column names that are displayed by default.
436
  """
437
- return [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
438
- if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
 
1
  """
2
+ Utility classes and functions for the CodeReview Bench Leaderboard display.
3
  """
4
 
5
  from dataclasses import dataclass, field, fields
 
8
 
9
 
10
  class Mode(Enum):
11
+ """Inference mode for the review model."""
12
  CoT = auto() # Chain of Thought
13
  Strict = auto()
14
 
 
36
  return "API"
37
  return "Unknown"
38
 
 
 
 
 
 
 
 
 
 
39
 
40
+ class ReviewModelType(str, Enum):
41
+ """Review model types for the leaderboard."""
42
+ GPT_4 = "gpt-4"
43
+ GPT_3_5 = "gpt-3.5-turbo"
44
+ CLAUDE = "claude"
45
+ LLAMA = "llama"
46
+ GEMINI = "gemini"
47
+ CUSTOM = "custom"
48
 
49
+ def __str__(self):
50
+ """String representation of the review model type."""
51
+ return self.value
52
 
53
 
54
  class Precision(Enum):
 
71
  Original = auto()
72
  Delta = auto()
73
  Adapter = auto()
74
+
75
  def __str__(self):
76
  """String representation of the weight type."""
77
  return self.name
 
89
 
90
 
91
  @dataclass
92
+ class CodeReviewBenchColumn:
93
+ """Columns for the CodeReview Bench leaderboard."""
94
  # Core metadata
95
  model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
  name="model_name",
 
118
  display_name="Version",
119
  displayed_by_default=False
120
  ))
121
+ review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
+ name="review_model_type",
123
  display_name="Type",
124
  displayed_by_default=False
125
  ))
 
144
  displayed_by_default=False
145
  ))
146
 
147
+ # LLM-based multimetric scores
148
+ readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
149
+ name="readability",
150
+ display_name="Readability",
151
  type="number",
152
+ displayed_by_default=True
153
  ))
154
+ relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
155
+ name="relevance",
156
+ display_name="Relevance",
157
  type="number",
158
+ displayed_by_default=True
159
  ))
160
+ explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
161
+ name="explanation_clarity",
162
+ display_name="Explanation_Clarity",
163
  type="number",
164
+ displayed_by_default=True
165
  ))
166
+ problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
167
+ name="problem_identification",
168
+ display_name="Problem_Identification",
169
  type="number",
170
+ displayed_by_default=True
171
  ))
172
+ actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
173
+ name="actionability",
174
+ display_name="Actionability",
175
  type="number",
176
+ displayed_by_default=True
177
  ))
178
+ completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
179
+ name="completeness",
180
+ display_name="Completeness",
181
  type="number",
182
+ displayed_by_default=True
183
  ))
184
+ specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
185
+ name="specificity",
186
+ display_name="Specificity",
 
 
187
  type="number",
188
+ displayed_by_default=True
189
  ))
190
+ contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
191
+ name="contextual_adequacy",
192
+ display_name="Contextual_Adequacy",
193
  type="number",
194
+ displayed_by_default=True
195
  ))
196
+ consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
197
+ name="consistency",
198
+ display_name="Consistency",
199
  type="number",
200
+ displayed_by_default=True
201
  ))
202
+ brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
203
+ name="brevity",
204
+ display_name="Brevity",
205
  type="number",
206
+ displayed_by_default=True
207
  ))
208
+
209
+ # Exact-match metrics
210
+ pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
211
+ name="pass_at_1",
212
+ display_name="Pass@1",
213
  type="number",
214
+ displayed_by_default=True
215
  ))
216
+ pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
217
+ name="pass_at_5",
218
+ display_name="Pass@5",
219
  type="number",
220
+ displayed_by_default=True
221
  ))
222
+ pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
223
+ name="pass_at_10",
224
+ display_name="Pass@10",
 
 
225
  type="number",
226
+ displayed_by_default=True
227
  ))
228
+ bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
229
+ name="bleu_at_10",
230
+ display_name="BLEU@10",
231
  type="number",
232
+ displayed_by_default=True
233
  ))
234
+
235
+ # Overall aggregated metrics
236
+ overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
237
+ name="overall_score",
238
+ display_name="Overall_Score",
239
  type="number",
240
+ displayed_by_default=True
241
  ))
242
+ multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
243
+ name="multimetric_average",
244
+ display_name="Multimetric_Average",
245
  type="number",
246
+ displayed_by_default=True
247
  ))
248
+ exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
249
+ name="exact_match_average",
250
+ display_name="Exact_Match_Average",
251
  type="number",
252
+ displayed_by_default=True
253
  ))
254
+ total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
255
+ name="total_evaluations",
256
+ display_name="Total_Evaluations",
257
  type="number",
258
+ displayed_by_default=True
259
  ))
260
 
261
+ # Language-specific metrics (Russian)
262
+ ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
263
+ name="ru_readability",
264
+ display_name="RU_Readability",
265
  type="number",
266
  displayed_by_default=False
267
  ))
268
+ ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
269
+ name="ru_relevance",
270
+ display_name="RU_Relevance",
271
  type="number",
272
  displayed_by_default=False
273
  ))
274
+ ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
275
+ name="ru_overall_score",
276
+ display_name="RU_Overall_Score",
277
  type="number",
278
  displayed_by_default=False
279
  ))
280
+
281
+ # Language-specific metrics (English)
282
+ en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
283
+ name="en_readability",
284
+ display_name="EN_Readability",
285
  type="number",
286
  displayed_by_default=False
287
  ))
288
+ en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
289
+ name="en_relevance",
290
+ display_name="EN_Relevance",
291
  type="number",
292
  displayed_by_default=False
293
  ))
294
+ en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
295
+ name="en_overall_score",
296
+ display_name="EN_Overall_Score",
297
  type="number",
298
  displayed_by_default=False
299
  ))
 
300
 
301
 
302
  # Create instances for easy access
303
+ CODEREVIEW_COLUMN = CodeReviewBenchColumn()
304
 
305
  # Extract column lists for different views
306
+ COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
307
+ DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
308
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
309
 
310
  # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
311
  def reorder_display_cols():
 
317
  return cols
318
  DISPLAY_COLS = reorder_display_cols()
319
 
320
+ METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
321
+ if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
322
+ HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
323
+ if getattr(CODEREVIEW_COLUMN, f.name).hidden]
324
+ NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
325
+ if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
326
 
327
+ # Categories for CodeReview Bench (Programming Languages)
328
  CATEGORIES = [
329
+ 'Python',
330
+ 'JavaScript',
331
+ 'Java',
332
+ 'C++',
333
+ 'C#',
334
+ 'TypeScript',
335
+ 'Go',
336
+ 'Rust',
337
+ 'Swift',
338
+ 'Kotlin',
339
+ 'Ruby',
340
+ 'PHP',
341
+ 'C',
342
+ 'Scala',
343
+ 'R',
344
+ 'Dart',
345
+ 'Other'
346
+ ]
347
+
348
+ # Language taxonomies for CodeReview Bench
349
+ COMMENT_LANGUAGES = [
350
+ 'ru', # Russian
351
+ 'en' # English
352
+ ]
353
+
354
+ # Example categories
355
+ EXAMPLE_CATEGORIES = [
356
+ 'Bug_Fix',
357
+ 'Code_Style',
358
+ 'Performance',
359
+ 'Security',
360
+ 'Refactoring',
361
+ 'Documentation',
362
+ 'Testing',
363
+ 'Architecture',
364
+ 'Other'
365
  ]
366
 
367
+ # Metrics for CodeReview Bench
368
+ MULTIMETRIC_METRICS = [
369
+ "readability",
370
+ "relevance",
371
+ "explanation_clarity",
372
+ "problem_identification",
373
+ "actionability",
374
+ "completeness",
375
+ "specificity",
376
+ "contextual_adequacy",
377
+ "consistency",
378
+ "brevity"
379
  ]
380
 
381
+ EXACT_MATCH_METRICS = [
382
+ "pass_at_1",
383
+ "pass_at_5",
384
+ "pass_at_10",
385
+ "bleu_at_10"
 
 
 
386
  ]
387
 
388
  def get_all_column_choices():
 
396
 
397
  default_visible_columns = get_default_visible_columns()
398
 
399
+ for f in fields(CODEREVIEW_COLUMN):
400
+ column_info = getattr(CODEREVIEW_COLUMN, f.name)
401
  # Create a tuple with both the internal name and display name
402
  if column_info.name not in default_visible_columns:
403
  column_choices.append((column_info.name, column_info.display_name))
 
411
  Returns:
412
  List of column names that are displayed by default.
413
  """
414
+ return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
415
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
src/envs.py CHANGED
@@ -7,14 +7,14 @@ load_dotenv()
7
 
8
  # Hugging Face configuration
9
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
- OWNER = os.environ.get("OWNER", "whitecircle-ai") # Change to your org
11
  SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
  ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
  ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
14
 
15
  # Repository IDs
16
- REPO_ID = f"{OWNER}/circle-guard-bench"
17
- RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/guardbench-results")
18
 
19
  # Cache paths
20
  CACHE_PATH = os.getenv("HF_HOME", ".")
 
7
 
8
  # Hugging Face configuration
9
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
+ OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
11
  SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
  ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
  ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
14
 
15
  # Repository IDs
16
+ REPO_ID = f"{OWNER}/codereview-bench"
17
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
18
 
19
  # Cache paths
20
  CACHE_PATH = os.getenv("HF_HOME", ".")
src/leaderboard/processor.py CHANGED
@@ -1,81 +1,98 @@
1
  """
2
- Process and transform GuardBench leaderboard data.
3
  """
4
 
5
  import json
6
  import os
7
  import pandas as pd
8
  from datetime import datetime
9
- from typing import Dict, List, Any, Tuple
10
  import numpy as np
11
 
12
- from src.display.utils import CATEGORIES, TEST_TYPES, METRICS
 
 
 
13
 
14
- # Constants for Integral Score calculation (mirrors guardbench library)
15
- MAX_PUNISHABLE_RUNTIME_MS = 6000.0
16
- MIN_PUNISHABLE_RUNTIME_MS = 200.0
17
- MAX_RUNTIME_PENALTY = 0.75 # Corresponds to 1.0 - MIN_TIME_FACTOR, library used 0.75
18
 
19
- def calculate_integral_score(row: pd.Series) -> float:
20
  """
21
- Calculate the integral score for a given model entry row.
22
- Uses accuracy as the primary metric, micro error ratio, and micro runtime penalty.
23
- Falls back to macro accuracy and averaged per-test-type errors/runtimes if micro values are missing.
 
 
 
 
24
  """
25
- integral_score = 1.0
26
- metric_count = 0
27
-
28
- # Primary metric (using accuracy)
29
- for test_type in TEST_TYPES:
30
- metric_col = f"{test_type}_accuracy"
31
- if metric_col in row and pd.notna(row[metric_col]):
32
- # print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
33
- integral_score *= row[metric_col]
34
- metric_count += 1
35
- # print(f"Metric count: {metric_count}")
36
-
37
- # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
38
- # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
39
- # Let's add that check back before applying penalties.
40
- if metric_count == 0:
41
- return 0.0
42
-
43
- # Error Penalty
44
- micro_error_col = "micro_avg_error_ratio"
45
- if micro_error_col in row and pd.notna(row[micro_error_col]):
46
- # Micro error is stored as %, convert back to ratio
47
- micro_error_ratio = row[micro_error_col] / 100.0
48
- integral_score *= (1.0 - micro_error_ratio)
49
-
50
- # Runtime Penalty
51
- avg_runtime_ms = None # Initialize
52
- micro_runtime_col = "micro_avg_runtime_ms"
53
- if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
54
- avg_runtime_ms = row[micro_runtime_col]
55
-
56
- if avg_runtime_ms is not None:
57
- # Apply penalty based on runtime (only if micro avg runtime was found)
58
- runtime = max(
59
- min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
60
- MIN_PUNISHABLE_RUNTIME_MS,
61
- )
62
-
63
- if MAX_PUNISHABLE_RUNTIME_MS > MIN_PUNISHABLE_RUNTIME_MS:
64
- normalized_time = (runtime - MIN_PUNISHABLE_RUNTIME_MS) / (
65
- MAX_PUNISHABLE_RUNTIME_MS - MIN_PUNISHABLE_RUNTIME_MS
66
- )
67
- # Match reference library formula 1
68
- time_factor = 1.0 - (1.0 - MAX_RUNTIME_PENALTY) * normalized_time
69
- else:
70
- # Match reference library formula (though less critical when max==min)
71
- time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY)
72
 
73
- # Match reference library formula 2 (enforce minimum factor)
74
- time_factor = max(MAX_RUNTIME_PENALTY, time_factor)
75
- integral_score *= time_factor
76
 
77
- # Rooting is not done in the reference library's summary table calculation
78
- return integral_score
 
 
 
79
 
80
 
81
  def load_leaderboard_data(file_path: str) -> Dict:
@@ -122,40 +139,6 @@ def save_leaderboard_data(data: Dict, file_path: str) -> None:
122
  json.dump(data, f, indent=2)
123
 
124
 
125
- def process_submission(submission_data: List[Dict]) -> List[Dict]:
126
- """
127
- Process submission data and convert it to leaderboard entries.
128
- """
129
- entries = []
130
-
131
- for item in submission_data:
132
- # Create a new entry for the leaderboard
133
- entry = {
134
- "model_name": item.get("model_name", "Unknown Model"),
135
- "per_category_metrics": {},
136
- "avg_metrics": {},
137
- "submission_date": datetime.now().isoformat(),
138
- "version": item.get("version", "v0")
139
- }
140
-
141
- # Copy model metadata
142
- for key in ["model_type", "base_model", "revision", "precision", "weight_type"]:
143
- if key in item:
144
- entry[key] = item[key]
145
-
146
- # Process per-category metrics
147
- if "per_category_metrics" in item:
148
- entry["per_category_metrics"] = item["per_category_metrics"]
149
-
150
- # Process average metrics
151
- if "avg_metrics" in item:
152
- entry["avg_metrics"] = item["avg_metrics"]
153
-
154
- entries.append(entry)
155
-
156
- return entries
157
-
158
-
159
  def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
160
  """
161
  Convert leaderboard data to a pandas DataFrame for display.
@@ -165,14 +148,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
165
  for entry in leaderboard_data.get("entries", []):
166
  model_name = entry.get("model_name", "Unknown Model")
167
 
168
- # Extract average metrics for main display
169
  row = {
170
  "model_name": model_name,
171
  "model_type": entry.get("model_type", "Unknown"),
172
  "mode": entry.get("mode", "Strict"),
173
  "submission_date": entry.get("submission_date", ""),
174
  "version": entry.get("version", "v0"),
175
- "guard_model_type": entry.get("guard_model_type", "llm_regexp").lower()
176
  }
177
 
178
  # Add additional metadata fields if present
@@ -180,117 +163,69 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
180
  if key in entry:
181
  row[key] = entry[key]
182
 
183
- # CASE 1: Metrics are flat in the root
184
- for key, value in entry.items():
185
- if any(test_type in key for test_type in TEST_TYPES) or \
186
- key in ["average_f1", "average_recall", "average_precision",
187
- "macro_accuracy", "macro_recall", "total_evals_count"]:
188
- row[key] = value
189
-
190
- # CASE 2: Metrics are in avg_metrics structure
191
- avg_metrics = entry.get("avg_metrics", {})
192
- if avg_metrics:
193
- for test_type in TEST_TYPES:
194
- if test_type in avg_metrics:
195
- metrics = avg_metrics[test_type]
196
- for metric in METRICS:
197
- if metric in metrics:
198
- col_name = f"{test_type}_{metric}"
199
- row[col_name] = metrics[metric]
200
-
201
- # Also add non-binary version for F1 scores
202
- if metric == "f1_binary":
203
- row[f"{test_type}_f1"] = metrics[metric]
204
-
205
- # Calculate averages if not present
206
- # Use accuracy for macro_accuracy
207
- if "macro_accuracy" not in row:
208
- accuracy_values = []
209
- for test_type in TEST_TYPES:
210
- # Check avg_metrics structure first
211
- accuracy_val = None
212
- if test_type in avg_metrics and "accuracy" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["accuracy"]):
213
- accuracy_val = avg_metrics[test_type]["accuracy"]
214
- # Check flat structure as fallback (might be redundant but safer)
215
- elif f"{test_type}_accuracy" in row and pd.notna(row[f"{test_type}_accuracy"]):
216
- accuracy_val = row[f"{test_type}_accuracy"]
217
-
218
- if accuracy_val is not None:
219
- accuracy_values.append(accuracy_val)
220
-
221
- if accuracy_values:
222
- row["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
223
-
224
- # Use recall_binary for macro_recall
225
- if "macro_recall" not in row:
226
- recall_values = []
227
- for test_type in TEST_TYPES:
228
- if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["recall_binary"]):
229
- recall_values.append(avg_metrics[test_type]["recall_binary"])
230
- if recall_values:
231
- row["macro_recall"] = sum(recall_values) / len(recall_values)
232
-
233
- if "total_evals_count" not in row:
234
- total_samples = 0
235
- found_samples = False
236
- for test_type in TEST_TYPES:
237
- if test_type in avg_metrics and "sample_count" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["sample_count"]):
238
- total_samples += avg_metrics[test_type]["sample_count"]
239
- found_samples = True
240
- if found_samples:
241
- row["total_evals_count"] = total_samples
242
-
243
- # Extract micro averages directly from entry if they exist (like in guardbench library)
244
- row["micro_avg_error_ratio"] = entry.get("micro_avg_error_ratio", pd.NA)
245
- row["micro_avg_runtime_ms"] = entry.get("micro_avg_runtime_ms", pd.NA)
246
-
247
- # Convert error ratio to percentage for consistency with display name
248
- if pd.notna(row["micro_avg_error_ratio"]):
249
- row["micro_avg_error_ratio"] *= 100
250
 
251
  rows.append(row)
252
 
253
- # Create DataFrame and sort by average F1 score
254
  df = pd.DataFrame(rows)
255
 
256
  # Ensure all expected columns exist
257
- for test_type in TEST_TYPES:
258
- for metric in METRICS:
259
- col_name = f"{test_type}_{metric}"
260
- if col_name not in df.columns:
261
- df[col_name] = pd.NA # Use pd.NA for missing numeric data
262
-
263
- # Add non-binary F1 if binary exists and f1 is missing
264
- if metric == "f1_binary" and f"{test_type}_f1" not in df.columns:
265
- # Check if the binary column has data before copying
266
- if col_name in df.columns:
267
- df[f"{test_type}_f1"] = df[col_name]
268
- else:
269
- df[f"{test_type}_f1"] = pd.NA
270
 
271
- # Calculate Integral Score
272
  if not df.empty:
273
- df["integral_score"] = df.apply(calculate_integral_score, axis=1)
274
- # Sort by Integral Score instead of average_f1
275
- df = df.sort_values(by="integral_score", ascending=False, na_position='last')
276
- else:
277
- # Add the column even if empty
278
- df["integral_score"] = pd.NA
279
 
280
  # Ensure summary columns exist
281
- summary_cols = ["macro_accuracy", "macro_recall", "micro_avg_error_ratio", "micro_avg_runtime_ms", "total_evals_count"]
282
  for col in summary_cols:
283
  if col not in df.columns:
284
  df[col] = pd.NA
285
 
286
- # Remove old average columns if they somehow snuck in
287
- old_avg_cols = ["average_f1", "average_recall", "average_precision"]
288
- for col in old_avg_cols:
289
- if col in df.columns:
290
- df = df.drop(columns=[col])
291
- # print("--- DataFrame before returning from leaderboard_to_dataframe ---")
292
- # print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
293
- # print("-------------------------------------------------------------")
294
  return df
295
 
296
 
@@ -309,6 +244,18 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
309
  model_name = new_entry.get("model_name")
310
  version = new_entry.get("version", "v0")
311
 
 
 
312
  if (model_name, version) in existing_entries:
313
  # Replace existing entry
314
  leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
@@ -322,25 +269,3 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
322
  leaderboard_data["last_updated"] = datetime.now().isoformat()
323
 
324
  return leaderboard_data
325
-
326
-
327
- def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
328
- """
329
- Process a JSONL submission file and extract entries.
330
- """
331
- entries = []
332
- try:
333
- with open(file_path, 'r') as f:
334
- for line in f:
335
- try:
336
- entry = json.loads(line)
337
- entries.append(entry)
338
- except json.JSONDecodeError as e:
339
- return [], f"Invalid JSON in submission file: {e}"
340
-
341
- if not entries:
342
- return [], "Submission file is empty"
343
-
344
- return entries, "Successfully processed submission"
345
- except Exception as e:
346
- return [], f"Error processing submission file: {e}"
 
1
  """
2
+ Process CodeReview Bench leaderboard data and submissions.
3
  """
4
 
5
  import json
6
  import os
7
  import pandas as pd
8
  from datetime import datetime
9
+ from typing import Dict, List, Tuple, Optional
10
  import numpy as np
11
 
12
+ from src.display.utils import (
13
+ CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
14
+ MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
15
+ )
16
 
 
 
 
 
17
 
18
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
19
  """
20
+ Process a JSONL submission file for CodeReview Bench.
21
+
22
+ Args:
23
+ file_path: Path to the JSONL submission file
24
+
25
+ Returns:
26
+ Tuple of (entries_list, message)
27
  """
28
+ try:
29
+ entries = []
30
+ with open(file_path, 'r', encoding='utf-8') as f:
31
+ for line_num, line in enumerate(f, 1):
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+
36
+ try:
37
+ entry = json.loads(line)
38
+
39
+ # Validate required fields
40
+ required_fields = ['model_name', 'programming_language', 'comment_language']
41
+ missing_fields = [field for field in required_fields if field not in entry]
42
+ if missing_fields:
43
+ return [], f"Missing required fields {missing_fields} in line {line_num}"
44
+
45
+ # Validate metrics exist
46
+ has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
47
+ has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
48
+
49
+ if not has_multimetric and not has_exact_match:
50
+ return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
51
+
52
+ entries.append(entry)
53
+
54
+ except json.JSONDecodeError as e:
55
+ return [], f"Invalid JSON in line {line_num}: {e}"
56
+
57
+ if not entries:
58
+ return [], "No valid entries found in submission file"
59
+
60
+ return entries, f"Successfully processed {len(entries)} entries"
61
+
62
+ except Exception as e:
63
+ return [], f"Error processing submission: {e}"
64
 
 
 
 
65
 
66
+ def calculate_overall_score(entry: Dict) -> float:
67
+ """
68
+ Calculate overall score for a CodeReview Bench entry.
69
+
70
+ Args:
71
+ entry: Dictionary containing model evaluation results
72
+
73
+ Returns:
74
+ Overall score as float
75
+ """
76
+ # Calculate multimetric average
77
+ multimetric_scores = []
78
+ for metric in MULTIMETRIC_METRICS:
79
+ if metric in entry and isinstance(entry[metric], (int, float)):
80
+ multimetric_scores.append(entry[metric])
81
+
82
+ multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
83
+
84
+ # Calculate exact match average
85
+ exact_match_scores = []
86
+ for metric in EXACT_MATCH_METRICS:
87
+ if metric in entry and isinstance(entry[metric], (int, float)):
88
+ exact_match_scores.append(entry[metric])
89
+
90
+ exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
91
+
92
+ # Weighted combination (can be adjusted based on requirements)
93
+ overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
94
+
95
+ return overall_score
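As a worked example of the 0.7/0.3 weighting: if the multimetric scores present in an entry average 8.0 and the exact-match scores average 0.5, the overall score is 0.7 * 8.0 + 0.3 * 0.5 = 5.75. A minimal sketch, assuming `readability`/`relevance` are multimetric keys and `pass@1`/`pass@10` are exact-match keys:

```python
# Hypothetical entry; the key names are assumptions about the metric lists in src/display/utils.py.
entry = {
    "readability": 9.0,
    "relevance": 7.0,    # multimetric mean = 8.0
    "pass@1": 0.4,
    "pass@10": 0.6,      # exact-match mean = 0.5
}
print(calculate_overall_score(entry))  # 0.7 * 8.0 + 0.3 * 0.5 = 5.75
```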
96
 
97
 
98
  def load_leaderboard_data(file_path: str) -> Dict:
 
139
  json.dump(data, f, indent=2)
140
 
141
 
142
  def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
143
  """
144
  Convert leaderboard data to a pandas DataFrame for display.
 
148
  for entry in leaderboard_data.get("entries", []):
149
  model_name = entry.get("model_name", "Unknown Model")
150
 
151
+ # Extract basic metadata
152
  row = {
153
  "model_name": model_name,
154
  "model_type": entry.get("model_type", "Unknown"),
155
  "mode": entry.get("mode", "Strict"),
156
  "submission_date": entry.get("submission_date", ""),
157
  "version": entry.get("version", "v0"),
158
+ "review_model_type": entry.get("review_model_type", "custom").lower()
159
  }
160
 
161
  # Add additional metadata fields if present
 
163
  if key in entry:
164
  row[key] = entry[key]
165
 
166
+ # Add multimetric scores
167
+ for metric in MULTIMETRIC_METRICS:
168
+ if metric in entry:
169
+ row[metric] = entry[metric]
170
+ else:
171
+ row[metric] = pd.NA
172
+
173
+ # Add exact match metrics
174
+ for metric in EXACT_MATCH_METRICS:
175
+ if metric in entry:
176
+ row[metric] = entry[metric]
177
+ else:
178
+ row[metric] = pd.NA
179
+
180
+ # Calculate aggregated metrics
181
+ multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
182
+ exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
183
+
184
+ if multimetric_scores:
185
+ row["multimetric_average"] = np.mean(multimetric_scores)
186
+ else:
187
+ row["multimetric_average"] = pd.NA
188
+
189
+ if exact_match_scores:
190
+ row["exact_match_average"] = np.mean(exact_match_scores)
191
+ else:
192
+ row["exact_match_average"] = pd.NA
193
+
194
+ # Calculate overall score
195
+ row["overall_score"] = calculate_overall_score(entry)
196
+
197
+ # Add language-specific metrics if available
198
+ for lang in COMMENT_LANGUAGES:
199
+ for metric in ["readability", "relevance", "overall_score"]:
200
+ lang_key = f"{lang}_{metric}"
201
+ if lang_key in entry:
202
+ row[lang_key] = entry[lang_key]
203
+ else:
204
+ row[lang_key] = pd.NA
205
+
206
+ # Add evaluation count
207
+ row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
208
 
209
  rows.append(row)
210
 
211
+ # Create DataFrame and sort by overall score
212
  df = pd.DataFrame(rows)
213
 
214
  # Ensure all expected columns exist
215
+ for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
216
+ if metric not in df.columns:
217
+ df[metric] = pd.NA
218
 
219
+ # Sort by overall score (descending)
220
  if not df.empty:
221
+ df = df.sort_values(by="overall_score", ascending=False, na_position='last')
 
 
 
 
 
222
 
223
  # Ensure summary columns exist
224
+ summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
225
  for col in summary_cols:
226
  if col not in df.columns:
227
  df[col] = pd.NA
228
 
229
  return df
230
 
231
 
 
244
  model_name = new_entry.get("model_name")
245
  version = new_entry.get("version", "v0")
246
 
247
+ # Add calculated metrics
248
+ new_entry["overall_score"] = calculate_overall_score(new_entry)
249
+
250
+ # Calculate averages
251
+ multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
252
+ exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
253
+
254
+ if multimetric_scores:
255
+ new_entry["multimetric_average"] = np.mean(multimetric_scores)
256
+ if exact_match_scores:
257
+ new_entry["exact_match_average"] = np.mean(exact_match_scores)
258
+
259
  if (model_name, version) in existing_entries:
260
  # Replace existing entry
261
  leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
 
269
  leaderboard_data["last_updated"] = datetime.now().isoformat()
270
 
271
  return leaderboard_data
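Taken together, an offline round trip through this module might look like the following sketch; the file paths are placeholders and `load_leaderboard_data` is the loader defined earlier in this file.

```python
# Illustrative round trip: parse a submission, merge it, and build the display table.
leaderboard = load_leaderboard_data("leaderboard_v0.json")       # path is illustrative
new_entries, msg = process_jsonl_submission("submission.jsonl")  # path is illustrative
if new_entries:
    leaderboard = add_entries_to_leaderboard(leaderboard, new_entries)
    df = leaderboard_to_dataframe(leaderboard)
    print(df[["model_name", "overall_score", "multimetric_average"]].head())
```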
 
src/populate.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Populate the GuardBench leaderboard from HuggingFace datasets.
3
  """
4
 
5
  import json
@@ -13,7 +13,7 @@ import numpy as np
13
  from huggingface_hub import hf_hub_download, HfApi
14
  from datasets import load_dataset
15
 
16
- from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
17
  from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
  from src.leaderboard.processor import leaderboard_to_dataframe
19
 
@@ -58,19 +58,16 @@ def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
58
  return None
59
 
60
 
61
- def get_all_entries(version="v0", mode: str = None) -> List[Dict]:
62
  """
63
- Get all model entries from the entries folder. If mode is provided, only return entries matching that mode.
64
  """
65
  try:
66
  api = HfApi(token=TOKEN)
67
  files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
68
- if mode is not None:
69
- mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
70
- entry_files = [f for f in files if f.startswith("entries/") and f"_{mode_safe}_" in f and f.endswith(f"_{version}.json")]
71
- else:
72
- entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
73
- entries = []
74
  for entry_file in entry_files:
75
  try:
76
  entry_path = hf_hub_download(
@@ -81,12 +78,13 @@ def get_all_entries(version="v0", mode: str = None) -> List[Dict]:
81
  )
82
  with open(entry_path, 'r') as f:
83
  entry_data = json.load(f)
84
- entries.append(entry_data)
85
  except Exception as e:
86
  print(f"Error loading entry {entry_file}: {e}")
87
- return entries
 
88
  except Exception as e:
89
- print(f"Error listing entries: {e}")
90
  return []
91
 
92
 
@@ -116,7 +114,7 @@ def get_leaderboard_df(version="v0") -> pd.DataFrame:
116
 
117
  def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
118
  """
119
- Get the leaderboard data filtered by a specific category.
120
  """
121
  # Get latest leaderboard data
122
  leaderboard_data = get_latest_leaderboard(version)
@@ -134,90 +132,18 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
134
  # Return empty DataFrame if no data available
135
  return pd.DataFrame(columns=DISPLAY_COLS)
136
 
137
- # Filter entries to only include those with data for the specified category
138
  filtered_entries = []
139
-
140
  for entry in leaderboard_data.get("entries", []):
141
- # Copy all base fields
142
- filtered_entry = {
143
- "model_name": entry.get("model_name", "Unknown Model"),
144
- "model_type": entry.get("model_type", "Unknown"),
145
- "guard_model_type": entry.get("guard_model_type", "Unknown"),
146
- "mode": entry.get("mode", "Strict"),
147
- "submission_date": entry.get("submission_date", ""),
148
- "version": entry.get("version", version),
149
- "base_model": entry.get("base_model", ""),
150
- "revision": entry.get("revision", ""),
151
- "precision": entry.get("precision", ""),
152
- "weight_type": entry.get("weight_type", "")
153
- }
154
-
155
- if "per_category_metrics" in entry and category in entry["per_category_metrics"]:
156
- category_metrics = entry["per_category_metrics"][category]
157
-
158
- # Add all metrics for each test type
159
- for test_type, metrics in category_metrics.items():
160
- if isinstance(metrics, dict):
161
- for metric, value in metrics.items():
162
- col_name = f"{test_type}_{metric}"
163
- filtered_entry[col_name] = value
164
-
165
- # Also add the non-binary version for F1 scores
166
- if metric == "f1_binary":
167
- filtered_entry[f"{test_type}_f1"] = value
168
-
169
- # Calculate averages
170
- f1_values = []
171
- recall_values = []
172
- precision_values = []
173
- accuracy_values = []
174
- category_recall_values = []
175
- total_samples = 0
176
-
177
- for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
178
- if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
179
- test_metrics = category_metrics[test_type]
180
- if "f1_binary" in test_metrics and pd.notna(test_metrics["f1_binary"]):
181
- f1_values.append(test_metrics["f1_binary"])
182
- if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
183
- recall_values.append(test_metrics["recall_binary"])
184
- category_recall_values.append(test_metrics["recall_binary"])
185
- if "precision_binary" in test_metrics and pd.notna(test_metrics["precision_binary"]):
186
- precision_values.append(test_metrics["precision_binary"])
187
- if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
188
- accuracy_values.append(test_metrics["accuracy"])
189
- if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
190
- total_samples += test_metrics["sample_count"]
191
-
192
- # print(f"F1 values: {f1_values}")
193
- # print(f1_values, recall_values, precision_values, accuracy_values, total_samples)
194
-
195
-
196
- # Add overall averages
197
- if f1_values:
198
- filtered_entry["average_f1"] = sum(f1_values) / len(f1_values)
199
- if recall_values:
200
- filtered_entry["average_recall"] = sum(recall_values) / len(recall_values)
201
- if precision_values:
202
- filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)
203
-
204
- # Add category-specific values to standard macro metric keys
205
- if accuracy_values:
206
- filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
207
  else:
208
- filtered_entry["macro_accuracy"] = np.nan
209
-
210
- if category_recall_values:
211
- filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values)
212
- else:
213
- filtered_entry["macro_recall"] = np.nan
214
-
215
- if total_samples > 0:
216
- filtered_entry["total_evals_count"] = total_samples
217
- else:
218
- filtered_entry["total_evals_count"] = np.nan
219
-
220
- filtered_entries.append(filtered_entry)
221
 
222
  # Create a new leaderboard data structure with the filtered entries
223
  filtered_leaderboard = {
@@ -225,7 +151,6 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
225
  "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
226
  "version": version
227
  }
228
- # print(filtered_leaderboard)
229
 
230
  # Convert to DataFrame
231
  return leaderboard_to_dataframe(filtered_leaderboard)
 
1
  """
2
+ Populate the CodeReview Bench leaderboard from HuggingFace datasets.
3
  """
4
 
5
  import json
 
13
  from huggingface_hub import hf_hub_download, HfApi
14
  from datasets import load_dataset
15
 
16
+ from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
17
  from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
  from src.leaderboard.processor import leaderboard_to_dataframe
19
 
 
58
  return None
59
 
60
 
61
+ def get_all_entries(version="v0") -> List[Dict]:
62
  """
63
+ Get all entries from the HuggingFace dataset.
64
  """
65
  try:
66
  api = HfApi(token=TOKEN)
67
  files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
68
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
69
+
70
+ all_entries = []
 
 
 
71
  for entry_file in entry_files:
72
  try:
73
  entry_path = hf_hub_download(
 
78
  )
79
  with open(entry_path, 'r') as f:
80
  entry_data = json.load(f)
81
+ all_entries.append(entry_data)
82
  except Exception as e:
83
  print(f"Error loading entry {entry_file}: {e}")
84
+
85
+ return all_entries
86
  except Exception as e:
87
+ print(f"Error getting all entries: {e}")
88
  return []
89
 
90
 
 
114
 
115
  def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
116
  """
117
+ Get the leaderboard data filtered by a specific programming language category.
118
  """
119
  # Get latest leaderboard data
120
  leaderboard_data = get_latest_leaderboard(version)
 
132
  # Return empty DataFrame if no data available
133
  return pd.DataFrame(columns=DISPLAY_COLS)
134
 
135
+ # Filter entries to only include those with data for the specified programming language
136
  filtered_entries = []
 
137
  for entry in leaderboard_data.get("entries", []):
138
+ # Check if entry has data for this programming language
139
+ programming_language = entry.get("programming_language", "").lower()
140
+ if programming_language == category.lower() or category.lower() == "other":
141
+ # For "other" category, include entries that don't match any specific language
142
+ if category.lower() == "other":
143
+ if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]: # Exclude "Other" from check
144
+ filtered_entries.append(entry)
 
145
  else:
146
+ filtered_entries.append(entry)
147
 
148
  # Create a new leaderboard data structure with the filtered entries
149
  filtered_leaderboard = {
 
151
  "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
152
  "version": version
153
  }
 
154
 
155
  # Convert to DataFrame
156
  return leaderboard_to_dataframe(filtered_leaderboard)
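For the per-language tabs, the populate helpers would typically be called once per category. A hedged usage sketch, with category names taken from the README's language list:

```python
# Build one DataFrame per programming-language tab; the language names mirror the README.
for language in ["Python", "Go", "Rust", "Other"]:
    df = get_category_leaderboard_df(language, version="v0")
    print(f"{language}: {len(df)} entries")
```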
src/submission/submit.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Handle submissions to the GuardBench leaderboard.
3
  """
4
 
5
  import json
@@ -7,20 +7,13 @@ import os
7
  import tempfile
8
  from datetime import datetime
9
  from typing import Dict, List, Tuple
10
- import shutil
11
- import threading
12
- import time
13
 
14
  from huggingface_hub import HfApi
15
  from datasets import load_dataset
16
- import subprocess
17
 
18
  from src.display.formatting import styled_error, styled_message
19
  from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
20
- from src.leaderboard.processor import process_jsonl_submission
21
- from circleguardbench.evaluator import Evaluator
22
- from circleguardbench.context import GuardbenchContext
23
- from circleguardbench.models_config import ModelType
24
 
25
 
26
  def validate_submission(file_path: str) -> Tuple[bool, str]:
@@ -102,27 +95,9 @@ def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool,
102
  return False, f"Error updating leaderboard: {e}"
103
 
104
 
105
- def restart_space_after_delay(delay_seconds: int = 2) -> None:
106
- """
107
- Restart the Hugging Face Space after a delay.
108
- """
109
- def _restart_space():
110
- time.sleep(delay_seconds)
111
- try:
112
- api = HfApi(token=TOKEN)
113
- api.restart_space(repo_id=REPO_ID)
114
- except Exception as e:
115
- print(f"Error restarting space: {e}")
116
-
117
- # Start the restart in a separate thread
118
- thread = threading.Thread(target=_restart_space)
119
- thread.daemon = True
120
- thread.start()
121
-
122
-
123
  def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
124
  """
125
- Process a submission to the GuardBench leaderboard.
126
  """
127
  try:
128
  # Validate submission
@@ -130,18 +105,15 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
130
  if not is_valid:
131
  return styled_error(validation_message)
132
 
133
- # Get GuardBench results directory path
134
- guardbench_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "guard-bench-submodule")
135
- results_dir = os.path.join(guardbench_dir, "results")
136
- os.makedirs(results_dir, exist_ok=True)
137
 
138
- # Copy submission to GuardBench results directory
139
  model_name = metadata.get("model_name", "unknown")
140
  model_name_safe = model_name.replace("/", "_").replace(" ", "_")
141
- guard_model_type = metadata.get("guard_model_type", "unknown")
142
- target_file = os.path.join(results_dir + "/circleguardbench_public", f"{model_name_safe}.jsonl")
143
-
144
- # Upload raw submission file
145
  api = HfApi(token=TOKEN)
146
  submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
147
  api.upload_file(
@@ -151,51 +123,15 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
151
  repo_type="dataset",
152
  commit_message=f"Add raw submission for {model_name}"
153
  )
154
- os.makedirs(results_dir + "/circleguardbench_public", exist_ok=True)
155
-
156
- # (f"Submission path: {submission_path}")
157
- # print(f"Target file: {target_file}")
158
- # printprint(f"Results dir: {results_dir}")
159
-
160
-
161
- shutil.copy2(file_path, target_file)
162
- # print(f"Copied file to target file: {target_file}")
163
- # print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")
164
-
165
- try:
166
- # Initialize GuardBench context
167
- ctx = GuardbenchContext()
168
- # Set results directory
169
- ctx.results_dir = results_dir
170
- # Set bench name from the results directory
171
- ctx.bench_name = "circleguardbench_public"
172
- # Load dataset
173
- ctx.load_dataset("whitecircle-ai/circleguardbench_public")
174
- # Mark as initialized
175
- ctx.is_initialized = True
176
-
177
- evaluator = Evaluator(ctx, force=True, using_cached=True)
178
-
179
- # Run evaluation and get entry
180
- evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())
181
-
182
- # Get the entry from results
183
- with open(os.path.join(results_dir + "/" + ctx.bench_name, "leaderboard.json"), 'r') as f:
184
- results_data = json.load(f)
185
- model_entry = next(
186
- (entry for entry in results_data.get("entries", [])
187
- if entry.get("model_name") == model_name_safe),
188
- None
189
- )
190
-
191
- if not model_entry:
192
- return styled_error("No evaluation results found")
193
 
 
 
 
194
  # Add metadata to entry
195
- model_entry.update({
196
- "model_name": metadata.get("model_name"), # Use original model name
197
  "model_type": metadata.get("model_type"),
198
- "guard_model_type": str(metadata.get("guard_model_type")).lower(),
199
  "mode": metadata.get("mode"),
200
  "base_model": metadata.get("base_model"),
201
  "revision": metadata.get("revision"),
@@ -204,51 +140,45 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
204
  "version": version,
205
  "submission_date": datetime.now().isoformat()
206
  })
 
207
 
208
- # Submit entry to entries folder
209
- success, message = submit_entry_to_hub(model_entry, model_name, metadata.get("mode"), version)
 
210
  if not success:
211
  return styled_error(message)
212
 
213
- # Get all entries from HF dataset
214
- api = HfApi(token=TOKEN)
215
- files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
216
- entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
217
-
218
- all_entries = []
219
- for entry_file in entry_files:
220
- try:
221
- entry_path = api.hf_hub_download(
222
- repo_id=RESULTS_DATASET_ID,
223
- filename=entry_file,
224
- repo_type="dataset",
225
- )
226
- with open(entry_path, 'r') as f:
227
- entry_data = json.load(f)
228
- all_entries.append(entry_data)
229
- except Exception as e:
230
- print(f"Error loading entry {entry_file}: {e}")
231
-
232
- # Update leaderboard with all entries
233
- success, message = submit_leaderboard_to_hub(all_entries, version)
234
- if not success:
235
- return styled_error(message)
236
-
237
- restart_space_after_delay(5)
238
 
239
- return styled_message("Submission successful! Model evaluated and leaderboard updated.")
 
 
 
240
 
241
- except Exception as eval_error:
242
- return styled_error(f"Error during evaluation: {eval_error}")
243
 
244
  except Exception as e:
245
  return styled_error(f"Error processing submission: {e}")
246
  finally:
247
- # Clean up temporary files
248
  try:
249
  if os.path.exists(file_path):
250
  os.remove(file_path)
251
- if os.path.exists(target_file):
252
- os.remove(target_file)
253
  except:
254
  pass
 
1
  """
2
+ Handle submissions to the CodeReview Bench leaderboard.
3
  """
4
 
5
  import json
 
7
  import tempfile
8
  from datetime import datetime
9
  from typing import Dict, List, Tuple
 
 
 
10
 
11
  from huggingface_hub import HfApi
12
  from datasets import load_dataset
 
13
 
14
  from src.display.formatting import styled_error, styled_message
15
  from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
16
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
 
 
 
17
 
18
 
19
  def validate_submission(file_path: str) -> Tuple[bool, str]:
 
95
  return False, f"Error updating leaderboard: {e}"
96
 
97
 
98
  def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
99
  """
100
+ Process a submission to the CodeReview Bench leaderboard.
101
  """
102
  try:
103
  # Validate submission
 
105
  if not is_valid:
106
  return styled_error(validation_message)
107
 
108
+ # Process the submission entries
109
+ entries, message = process_jsonl_submission(file_path)
110
+ if not entries:
111
+ return styled_error(f"Failed to process submission: {message}")
112
 
113
+ # Upload raw submission file
114
  model_name = metadata.get("model_name", "unknown")
115
  model_name_safe = model_name.replace("/", "_").replace(" ", "_")
116
+
 
 
 
117
  api = HfApi(token=TOKEN)
118
  submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
119
  api.upload_file(
 
123
  repo_type="dataset",
124
  commit_message=f"Add raw submission for {model_name}"
125
  )
126
 
127
+ # Process entries and add metadata
128
+ processed_entries = []
129
+ for entry in entries:
130
  # Add metadata to entry
131
+ entry.update({
132
+ "model_name": metadata.get("model_name"),
133
  "model_type": metadata.get("model_type"),
134
+ "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
135
  "mode": metadata.get("mode"),
136
  "base_model": metadata.get("base_model"),
137
  "revision": metadata.get("revision"),
 
140
  "version": version,
141
  "submission_date": datetime.now().isoformat()
142
  })
143
+ processed_entries.append(entry)
144
 
145
+ # Submit entries to entries folder
146
+ for entry in processed_entries:
147
+ success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
148
  if not success:
149
  return styled_error(message)
150
 
151
+ # Get all entries from HF dataset and update leaderboard
152
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
153
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
154
+
155
+ all_entries = []
156
+ for entry_file in entry_files:
157
+ try:
158
+ entry_path = api.hf_hub_download(
159
+ repo_id=RESULTS_DATASET_ID,
160
+ filename=entry_file,
161
+ repo_type="dataset",
162
+ )
163
+ with open(entry_path, 'r') as f:
164
+ entry_data = json.load(f)
165
+ all_entries.append(entry_data)
166
+ except Exception as e:
167
+ print(f"Error loading entry {entry_file}: {e}")
168
 
169
+ # Update leaderboard with all entries
170
+ success, message = submit_leaderboard_to_hub(all_entries, version)
171
+ if not success:
172
+ return styled_error(message)
173
 
174
+ return styled_message("Submission successful! Results added and leaderboard updated.")
 
175
 
176
  except Exception as e:
177
  return styled_error(f"Error processing submission: {e}")
178
  finally:
179
+ # Clean up temporary files if they exist
180
  try:
181
  if os.path.exists(file_path):
182
  os.remove(file_path)
 
 
183
  except:
184
  pass
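End to end, the web UI's submit handler would call into this module with something like the sketch below; the metadata keys mirror the fields `process_submission` reads above, and every value is a placeholder.

```python
# Hypothetical invocation; all values are placeholders.
metadata = {
    "model_name": "org/my-review-model",
    "model_type": "Open Source",
    "review_model_type": "custom",
    "mode": "Strict",
    "base_model": "",
    "revision": "main",
}
html = process_submission("/tmp/upload.jsonl", metadata, version="v0")
print(html)  # styled success or error message
```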