kenkaneki committed
Commit 346c3c5 · 1 Parent(s): 5c87359
README.md CHANGED
@@ -29,6 +29,137 @@ models:
29
 
30
  ---
31
 
32
- # CircleGuardBench Leaderboard
33
 
34
- First-of-its-kind benchmark for evaluating the protection capabilities of large language model (LLM) guard systems. It tests how well guard models block harmful content, resist jailbreaks, avoid false positives, and operate efficiently in real-time environments.
29
 
30
  ---
31
 
32
+ # CodeReview Bench Leaderboard
33
 
34
+ A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
35
+
36
+ ## Features
37
+
38
+ - **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
39
+ - **Dual Language Comments**: Supports both Russian and English comment languages
40
+ - **Comprehensive Metrics**:
41
+ - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
42
+ - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
43
+ - **Interactive Visualization**: Compare model performance across categories with radar plots (see the sketch after this list)
44
+ - **Easy Submission**: Submit your model results via web interface
45
+
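+ The radar plots in the Visualize tab are rendered with Plotly (see `requirements.txt`). The snippet below is a minimal, self-contained sketch of that kind of chart using made-up scores on the 0-10 judge scale; it is illustrative only, not the leaderboard's actual plotting code.
+
+ ```python
+ # Hypothetical radar-plot sketch with made-up scores; the leaderboard's
+ # Visualize tab builds comparable charts with Plotly.
+ import plotly.graph_objects as go
+
+ metrics = ["Readability", "Relevance", "Actionability", "Completeness", "Brevity"]
+ scores = [8.5, 9.0, 8.7, 8.0, 7.2]  # example values, not real results
+
+ fig = go.Figure(
+     go.Scatterpolar(
+         r=scores + scores[:1],        # repeat the first point to close the polygon
+         theta=metrics + metrics[:1],
+         fill="toself",
+         name="your-model-name",
+     )
+ )
+ fig.update_layout(polar=dict(radialaxis=dict(range=[0, 10])))
+ fig.show()
+ ```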
46
+ ## Metrics
47
+
48
+ ### LLM-based Multimetric
49
+
50
+ - **Readability**: How easy the review is to understand
51
+ - **Relevance**: How relevant the review is to the code
52
+ - **Explanation Clarity**: How clear the explanations are
53
+ - **Problem Identification**: How well problems are identified
54
+ - **Actionability**: How actionable the suggestions are
55
+ - **Completeness**: How complete the review is
56
+ - **Specificity**: How specific the feedback is
57
+ - **Contextual Adequacy**: How well the review fits the context
58
+ - **Consistency**: How consistent the review style is
59
+ - **Brevity**: How concise the review is
60
+
61
+ ### Exact-Match Metrics
62
+
63
+ - **Pass@1**: Percentage of correct reviews on first attempt
64
+ - **Pass@5**: Percentage of correct reviews in top 5 attempts
65
+ - **Pass@10**: Percentage of correct reviews in top 10 attempts
66
+ - **BLEU@10**: BLEU score over the top 10 review candidates (an estimation sketch for these metrics follows below)
67
+
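+ Pass@k is estimated from several sampled reviews per example. The sketch below uses the standard unbiased pass@k estimator purely as an illustration; whether the benchmark uses this estimator or a plain top-k hit rate is an assumption here, and BLEU@10 is computed separately over the same top-10 candidates.
+
+ ```python
+ # Illustrative pass@k estimate: n sampled reviews, c of them judged correct.
+ # Assumption: the standard unbiased estimator; a plain top-k hit rate would
+ # also fit the definitions above.
+ from math import comb
+
+ def pass_at_k(n: int, c: int, k: int) -> float:
+     """P(at least one of k drawn reviews is correct | n candidates, c correct)."""
+     if n - c < k:
+         return 1.0
+     return 1.0 - comb(n - c, k) / comb(n, k)
+
+ print(round(pass_at_k(10, 3, 1), 3))   # 0.3
+ print(round(pass_at_k(10, 3, 5), 3))   # 0.917
+ print(round(pass_at_k(10, 3, 10), 3))  # 1.0
+ ```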
68
+ ## Programming Languages Supported
69
+
70
+ - Python
71
+ - JavaScript
72
+ - Java
73
+ - C++
74
+ - C#
75
+ - TypeScript
76
+ - Go
77
+ - Rust
78
+ - Swift
79
+ - Kotlin
80
+ - Ruby
81
+ - PHP
82
+ - C
83
+ - Scala
84
+ - R
85
+ - Dart
86
+ - Other
87
+
88
+ ## Comment Languages
89
+
90
+ - Russian (ru)
91
+ - English (en)
92
+
93
+ ## Example Categories
94
+
95
+ - Bug Fix
96
+ - Code Style
97
+ - Performance
98
+ - Security
99
+ - Refactoring
100
+ - Documentation
101
+ - Testing
102
+ - Architecture
103
+ - Other
104
+
105
+ ## Installation
106
+
107
+ ```bash
108
+ pip install -r requirements.txt
109
+ ```
110
+
111
+ ## Usage
112
+
113
+ ```bash
114
+ python app.py
115
+ ```
116
+
117
+ ## Submission Format
118
+
119
+ Submit your results as a JSONL file where each line contains:
120
+
121
+ ```json
122
+ {
123
+ "model_name": "your-model-name",
124
+ "programming_language": "python",
125
+ "comment_language": "en",
126
+ "readability": 8.5,
127
+ "relevance": 9.0,
128
+ "explanation_clarity": 7.8,
129
+ "problem_identification": 8.2,
130
+ "actionability": 8.7,
131
+ "completeness": 8.0,
132
+ "specificity": 7.5,
133
+ "contextual_adequacy": 8.3,
134
+ "consistency": 8.8,
135
+ "brevity": 7.2,
136
+ "pass_at_1": 0.75,
137
+ "pass_at_5": 0.88,
138
+ "pass_at_10": 0.92,
139
+ "bleu_at_10": 0.65,
140
+ "total_evaluations": 100
141
+ }
142
+ ```
143
+
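+ Before an entry is accepted, the three metadata fields and at least one metric are checked, and an overall score is derived from the per-metric values. The sketch below condenses that flow, mirroring `process_jsonl_submission` and `calculate_overall_score` in `src/leaderboard/processor.py`; the 0.7/0.3 weighting comes from that file.
+
+ ```python
+ # Condensed sketch of the submission checks and score aggregation,
+ # mirroring src/leaderboard/processor.py; field names match the example above.
+ import json
+ import numpy as np
+
+ REQUIRED = ["model_name", "programming_language", "comment_language"]
+ MULTIMETRIC = ["readability", "relevance", "explanation_clarity",
+                "problem_identification", "actionability", "completeness",
+                "specificity", "contextual_adequacy", "consistency", "brevity"]
+ EXACT_MATCH = ["pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10"]
+
+ def check_and_score(path: str) -> list:
+     scored = []
+     with open(path, encoding="utf-8") as f:
+         for line_num, raw in enumerate(f, 1):
+             if not raw.strip():
+                 continue
+             entry = json.loads(raw)
+             missing = [k for k in REQUIRED if k not in entry]
+             if missing or not any(m in entry for m in MULTIMETRIC + EXACT_MATCH):
+                 raise ValueError(f"line {line_num}: missing {missing or 'metrics'}")
+             multi = [entry[m] for m in MULTIMETRIC if m in entry]
+             exact = [entry[m] for m in EXACT_MATCH if m in entry]
+             multi_avg = float(np.mean(multi)) if multi else 0.0
+             exact_avg = float(np.mean(exact)) if exact else 0.0
+             scored.append((entry["model_name"], round(0.7 * multi_avg + 0.3 * exact_avg, 3)))
+     return scored
+
+ print(check_and_score("example_submission.jsonl"))
+ ```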
144
+ ## Environment Variables
145
+
146
+ Set the following environment variables:
147
+
148
+ ```bash
149
+ HF_TOKEN=your_huggingface_token
150
+ OWNER=your-organization
151
+ RESULTS_DATASET_ID=your-org/codereview-bench-results
152
+ ```
153
+
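+ These values are read at startup in `src/envs.py` via `os.environ.get`, with `python-dotenv` loading a local `.env` file first. A minimal sketch of how they are consumed:
+
+ ```python
+ # Minimal sketch of how the configuration is read (see src/envs.py).
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()  # picks up HF_TOKEN, OWNER, RESULTS_DATASET_ID from a local .env
+
+ TOKEN = os.environ.get("HF_TOKEN")                   # read/write token for your org
+ OWNER = os.environ.get("OWNER", "codereview-bench")  # default matches src/envs.py
+ RESULTS_DATASET_ID = os.environ.get(
+     "RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results"
+ )
+ ```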
154
+ ## Citation
155
+
156
+ ```bibtex
157
+ @misc{codereviewbench2025,
158
+ author = {CodeReview Bench Team},
159
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
160
+ year = {2025},
161
+ publisher = {GitHub},
162
+ journal = {GitHub repository},
163
+ howpublished = {\url{https://github.com/your-org/codereview-bench}}
164
+ }
165
+ ```
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- GuardBench Leaderboard Application
3
  """
4
 
5
  import os
@@ -25,18 +25,19 @@ from src.about import (
25
  )
26
  from src.display.css_html_js import custom_css
27
  from src.display.utils import (
28
- GUARDBENCH_COLUMN,
29
  DISPLAY_COLS,
30
  METRIC_COLS,
31
  HIDDEN_COLS,
32
  NEVER_HIDDEN_COLS,
33
  CATEGORIES,
34
- TEST_TYPES,
 
35
  ModelType,
36
  Mode,
37
  Precision,
38
  WeightType,
39
- GuardModelType,
40
  get_all_column_choices,
41
  get_default_visible_columns,
42
  )
@@ -221,7 +222,7 @@ def init_leaderboard(dataframe, visible_columns=None):
221
  """
222
  if dataframe is None or dataframe.empty:
223
  # Create an empty dataframe with the right columns
224
- columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
225
  dataframe = pd.DataFrame(columns=columns)
226
  logger.warning("Initializing empty leaderboard")
227
 
@@ -234,20 +235,20 @@ def init_leaderboard(dataframe, visible_columns=None):
234
  dataframe = dataframe.copy()
235
  dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
236
 
237
- if "guard_model_type" in dataframe.columns:
238
  dataframe = dataframe.copy()
239
- dataframe["guard_model_type"] = dataframe["guard_model_type"].str.replace("wc_guard", "whitecircle_guard")
240
 
241
  # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
242
 
243
  # Determine which columns to display
244
  display_column_names = [
245
- getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS
246
  ]
247
- hidden_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS]
248
 
249
  # Columns that should always be shown
250
- always_visible = [getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
251
 
252
  # Use provided visible columns if specified, otherwise use default
253
  if visible_columns is None:
@@ -279,11 +280,11 @@ def init_leaderboard(dataframe, visible_columns=None):
279
  # Create a list of datatypes in the format Gradio expects
280
  datatypes = []
281
  for col in visible_columns:
282
- # Find the corresponding GUARDBENCH_COLUMN entry
283
  col_type = None
284
  for display_col in DISPLAY_COLS:
285
- if getattr(GUARDBENCH_COLUMN, display_col).name == col:
286
- orig_type = getattr(GUARDBENCH_COLUMN, display_col).type
287
  # Map to Gradio's expected types
288
  col_type = type_mapping.get(orig_type, "str")
289
  break
@@ -322,7 +323,7 @@ def init_leaderboard(dataframe, visible_columns=None):
322
  )
323
 
324
  column_info_map = {
325
- f.name: getattr(GUARDBENCH_COLUMN, f.name) for f in fields(GUARDBENCH_COLUMN)
326
  }
327
  column_mapping = {
328
  col: column_info_map.get(col, ColumnInfo(col, col)).display_name
@@ -500,7 +501,7 @@ def submit_results(
500
  mode: str,
501
  submission_file: tempfile._TemporaryFileWrapper,
502
  version: str,
503
- guard_model_type: GuardModelType,
504
  ):
505
  """
506
  Handle submission of results with model metadata.
@@ -530,7 +531,7 @@ def submit_results(
530
  "model_type": model_type,
531
  "mode": mode,
532
  "version": version,
533
- "guard_model_type": guard_model_type,
534
  }
535
 
536
  # Process the submission
@@ -689,24 +690,23 @@ def update_visualization(selected_models, selected_category, selected_metric, ve
689
  demo = gr.Blocks(css=custom_css, theme=custom_theme)
690
 
691
  CATEGORY_DISPLAY_MAP = {
692
- "Political Corruption and Legal Evasion": "Corruption & Legal Evasion",
693
- "Financial Fraud and Unethical Business": "Financial Fraud",
694
- "AI Manipulation and Jailbreaking": "AI Jailbreaking",
695
- "Child Exploitation and Abuse": "Child Exploitation",
696
- "Hate Speech, Extremism, and Discrimination": "Hate Speech",
697
- "Labor Exploitation and Human Trafficking": "Labor Exploitation",
698
- "Manipulation, Deception, and Misinformation": "Misinformation",
699
- "Environmental and Industrial Harm": "Environmental Harm",
700
- "Academic Dishonesty and Cheating": "Academic Dishonesty",
701
- "Self–Harm and Suicidal Ideation": "Self-Harm",
702
- "Animal Cruelty and Exploitation": "Animal Harm",
703
- "Criminal, Violent, and Terrorist Activity": "Crime & Violence",
704
- "Drug– and Substance–Related Activities": "Drug Use",
705
- "Sexual Content and Violence": "Sexual Content",
706
- "Weapon, Explosives, and Hazardous Materials": "Weapons & Harmful Materials",
707
- "Cybercrime, Hacking, and Digital Exploits": "Cybercrime",
708
- "Creative Content Involving Illicit Themes": "Illicit Creative",
709
- "Safe Prompts": "Safe Prompts",
710
  }
711
  # Create reverse mapping for lookups
712
  CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
@@ -720,7 +720,7 @@ with demo:
720
  tabs = gr.Tabs(elem_classes="tab-buttons")
721
 
722
  with tabs:
723
- with gr.TabItem("Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
724
  with gr.Row():
725
  version_selector = gr.Dropdown(
726
  choices=BENCHMARK_VERSIONS,
@@ -963,7 +963,7 @@ with demo:
963
  ],
964
  )
965
 
966
- with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
967
  with gr.Row():
968
  with gr.Column():
969
  viz_version_selector = gr.Dropdown(
@@ -1128,10 +1128,10 @@ with demo:
1128
  outputs=[model_mode_selector],
1129
  )
1130
 
1131
- # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
1132
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
1133
 
1134
- with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3):
1135
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1136
 
1137
  with gr.Row():
@@ -1172,11 +1172,11 @@ with demo:
1172
  value=None,
1173
  interactive=True,
1174
  )
1175
- guard_model_type = gr.Dropdown(
1176
- choices=[t.name for t in GuardModelType],
1177
- label="Guard model type",
1178
  multiselect=False,
1179
- value=GuardModelType.LLM_REGEXP.name,
1180
  interactive=True,
1181
  )
1182
 
@@ -1221,7 +1221,7 @@ with demo:
1221
  mode_selector,
1222
  file_input,
1223
  submission_version_selector,
1224
- guard_model_type,
1225
  ],
1226
  outputs=result_output,
1227
  )
 
1
  """
2
+ CodeReview Bench Leaderboard Application
3
  """
4
 
5
  import os
 
25
  )
26
  from src.display.css_html_js import custom_css
27
  from src.display.utils import (
28
+ CODEREVIEW_COLUMN,
29
  DISPLAY_COLS,
30
  METRIC_COLS,
31
  HIDDEN_COLS,
32
  NEVER_HIDDEN_COLS,
33
  CATEGORIES,
34
+ COMMENT_LANGUAGES,
35
+ EXAMPLE_CATEGORIES,
36
  ModelType,
37
  Mode,
38
  Precision,
39
  WeightType,
40
+ ReviewModelType,
41
  get_all_column_choices,
42
  get_default_visible_columns,
43
  )
 
222
  """
223
  if dataframe is None or dataframe.empty:
224
  # Create an empty dataframe with the right columns
225
+ columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
226
  dataframe = pd.DataFrame(columns=columns)
227
  logger.warning("Initializing empty leaderboard")
228
 
 
235
  dataframe = dataframe.copy()
236
  dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
237
 
238
+ if "review_model_type" in dataframe.columns:
239
  dataframe = dataframe.copy()
240
+ # review_model_type values are already canonical; no remapping needed
241
 
242
  # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
243
 
244
  # Determine which columns to display
245
  display_column_names = [
246
+ getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
247
  ]
248
+ hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
249
 
250
  # Columns that should always be shown
251
+ always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
252
 
253
  # Use provided visible columns if specified, otherwise use default
254
  if visible_columns is None:
 
280
  # Create a list of datatypes in the format Gradio expects
281
  datatypes = []
282
  for col in visible_columns:
283
+ # Find the corresponding CODEREVIEW_COLUMN entry
284
  col_type = None
285
  for display_col in DISPLAY_COLS:
286
+ if getattr(CODEREVIEW_COLUMN, display_col).name == col:
287
+ orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
288
  # Map to Gradio's expected types
289
  col_type = type_mapping.get(orig_type, "str")
290
  break
 
323
  )
324
 
325
  column_info_map = {
326
+ f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
327
  }
328
  column_mapping = {
329
  col: column_info_map.get(col, ColumnInfo(col, col)).display_name
 
501
  mode: str,
502
  submission_file: tempfile._TemporaryFileWrapper,
503
  version: str,
504
+ review_model_type: ReviewModelType,
505
  ):
506
  """
507
  Handle submission of results with model metadata.
 
531
  "model_type": model_type,
532
  "mode": mode,
533
  "version": version,
534
+ "review_model_type": review_model_type,
535
  }
536
 
537
  # Process the submission
 
690
  demo = gr.Blocks(css=custom_css, theme=custom_theme)
691
 
692
  CATEGORY_DISPLAY_MAP = {
693
+ "Python": "Python",
694
+ "JavaScript": "JavaScript",
695
+ "Java": "Java",
696
+ "C++": "C++",
697
+ "C#": "C#",
698
+ "TypeScript": "TypeScript",
699
+ "Go": "Go",
700
+ "Rust": "Rust",
701
+ "Swift": "Swift",
702
+ "Kotlin": "Kotlin",
703
+ "Ruby": "Ruby",
704
+ "PHP": "PHP",
705
+ "C": "C",
706
+ "Scala": "Scala",
707
+ "R": "R",
708
+ "Dart": "Dart",
709
+ "Other": "Other"
 
710
  }
711
  # Create reverse mapping for lookups
712
  CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
 
720
  tabs = gr.Tabs(elem_classes="tab-buttons")
721
 
722
  with tabs:
723
+ with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
724
  with gr.Row():
725
  version_selector = gr.Dropdown(
726
  choices=BENCHMARK_VERSIONS,
 
963
  ],
964
  )
965
 
966
+ with gr.TabItem("Visualize", elem_id="codereview-viz-tab", id=1):
967
  with gr.Row():
968
  with gr.Column():
969
  viz_version_selector = gr.Dropdown(
 
1128
  outputs=[model_mode_selector],
1129
  )
1130
 
1131
+ # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
1132
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
1133
 
1134
+ with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=3):
1135
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1136
 
1137
  with gr.Row():
 
1172
  value=None,
1173
  interactive=True,
1174
  )
1175
+ review_model_type = gr.Dropdown(
1176
+ choices=[t.name for t in ReviewModelType],
1177
+ label="Review model type",
1178
  multiselect=False,
1179
+ value=ReviewModelType.CUSTOM.name,
1180
  interactive=True,
1181
  )
1182
 
 
1221
  mode_selector,
1222
  file_input,
1223
  submission_version_selector,
1224
+ review_model_type,
1225
  ],
1226
  outputs=result_output,
1227
  )
example_submission.jsonl ADDED
@@ -0,0 +1,4 @@
1
+ {"model_name": "GPT-4-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
2
+ {"model_name": "GPT-4-CodeReview", "programming_language": "javascript", "comment_language": "en", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
3
+ {"model_name": "Claude-3-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 8.0, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
4
+ {"model_name": "Llama-CodeReview", "programming_language": "java", "comment_language": "en", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
requirements.txt CHANGED
@@ -6,4 +6,3 @@ apscheduler>=3.10.0
6
  python-dotenv>=1.0.0
7
  plotly>=5.18.0
8
  pydantic==2.10.6
9
- circleguardbench @ git+https://github.com/whitecircle-ai/circle-guard-bench.git
 
6
  python-dotenv>=1.0.0
7
  plotly>=5.18.0
8
  pydantic==2.10.6
 
src/about.py CHANGED
@@ -1,54 +1,60 @@
1
  """
2
- Text content for the GuardBench Leaderboard.
3
  """
4
 
5
  TITLE = """
6
  <div style="text-align: center; margin-bottom: 1rem">
7
- <h1>CircleGuardBench Leaderboard</h1>
8
  </div>
9
  """
10
 
11
  INTRODUCTION_TEXT = """
12
  ## Introduction
13
 
14
- CircleGuardBench is a comprehensive benchmark for evaluating the protection capabilities of large language model (LLM) guard systems.
15
- This leaderboard tracks model performance across various safety categories, including harmful content detection,
16
- jailbreak resistance, and more.
17
 
18
- Models are evaluated on their ability to properly refuse harmful requests and detect problematic content
19
- across multiple categories and test scenarios.
20
  """
21
 
22
  LLM_BENCHMARKS_TEXT = """
23
- CircleGuardBench is the first-of-its-kind benchmark for evaluating the protection capabilities of large language model (LLM) guard systems.
24
 
25
- It tests how well guard models block harmful content, resist jailbreaks, avoid false positives, and operate efficiently in real-time environments on a taxonomy close to real-world data.
26
 
27
- Learn more about us at [whitecircle.ai](https://whitecircle.ai?utm_source=huggingface&utm_medium=organic&utm_campaign=circleguardbench_launch&utm_content=space)
28
- """
29
 
 
 
30
 
31
  EVALUATION_QUEUE_TEXT = """
32
  ## Submit Your Model
33
 
34
- To add your model to the CircleGuardBench leaderboard:
35
 
36
- 1. Run your evaluation using the CircleGuardBench framework at https://github.com/whitecircle-ai/circle-guard-bench
37
- 2. Upload your run results in .jsonl format using this form.
38
  3. Once validated, your model will appear on the leaderboard.
39
 
 
 
 
 
 
40
  ### ✉️✨ Ready? Upload your results below!
41
  """
42
 
43
- CITATION_BUTTON_LABEL = "Cite CircleGuardBench"
44
 
45
  CITATION_BUTTON_TEXT = """
46
- @misc{circleguardbench2025,
47
- author = {whitecircle-ai},
48
- title = {CircleGuardBench: Comprehensive Benchmark for LLM Safety Guardrails. Learn more about us at whitecircle.ai},
49
  year = {2025},
50
  publisher = {GitHub},
51
  journal = {GitHub repository},
52
- howpublished = {\\url{https://github.com/whitecircle-ai/circle-guard-bench}}
53
  }
54
  """
 
1
  """
2
+ Text content for the CodeReview Bench Leaderboard.
3
  """
4
 
5
  TITLE = """
6
  <div style="text-align: center; margin-bottom: 1rem">
7
+ <h1>CodeReview Bench Leaderboard</h1>
8
  </div>
9
  """
10
 
11
  INTRODUCTION_TEXT = """
12
  ## Introduction
13
 
14
+ CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
15
+ This leaderboard tracks model performance across various programming languages and review criteria,
16
+ including readability, relevance, explanation clarity, and actionability.
17
 
18
+ Models are evaluated on their ability to provide high-quality code reviews that are helpful,
19
+ accurate, and actionable across multiple programming languages and review categories.
20
  """
21
 
22
  LLM_BENCHMARKS_TEXT = """
23
+ CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
24
 
25
+ It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
26
 
27
+ The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
 
28
 
29
+ See the repository README for the full metric definitions and the submission format.
30
+ """
31
 
32
  EVALUATION_QUEUE_TEXT = """
33
  ## Submit Your Model
34
 
35
+ To add your model to the CodeReview Bench leaderboard:
36
 
37
+ 1. Run your evaluation using the CodeReview Bench framework.
38
+ 2. Upload your results in .jsonl format using this form.
39
  3. Once validated, your model will appear on the leaderboard.
40
 
41
+ ### Requirements:
42
+ - Results must include all required metrics: LLM-based multimetric scores and exact-match metrics
43
+ - Submissions should cover multiple programming languages where applicable
44
+ - Both Russian and English comment languages are supported
45
+
46
  ### ✉️✨ Ready? Upload your results below!
47
  """
48
 
49
+ CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
50
 
51
  CITATION_BUTTON_TEXT = """
52
+ @misc{codereviewbench2025,
53
+ author = {CodeReview Bench Team},
54
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
55
  year = {2025},
56
  publisher = {GitHub},
57
  journal = {GitHub repository},
58
+ howpublished = {\\url{https://github.com/your-org/codereview-bench}}
59
  }
60
  """
src/display/css_html_js.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- CSS and styling for the GuardBench Leaderboard.
3
  """
4
 
5
  custom_css = """
 
1
  """
2
+ CSS and styling for the CodeReview Bench Leaderboard.
3
  """
4
 
5
  custom_css = """
src/display/utils.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Utility classes and functions for the GuardBench Leaderboard display.
3
  """
4
 
5
  from dataclasses import dataclass, field, fields
@@ -8,7 +8,7 @@ from typing import List, Optional
8
 
9
 
10
  class Mode(Enum):
11
- """Inference mode for the guard model."""
12
  CoT = auto() # Chain of Thought
13
  Strict = auto()
14
 
@@ -36,20 +36,19 @@ class ModelType(Enum):
36
  return "API"
37
  return "Unknown"
38
 
39
- class GuardModelType(str, Enum):
40
- """Guard model types for the leaderboard."""
41
- LLAMA_GUARD = "llama_guard"
42
- CLASSIFIER = "classifier"
43
- ATLA_SELENE = "atla_selene"
44
- OPENAI_MODERATION = "openai_moderation"
45
- LLM_REGEXP = "llm_regexp"
46
- LLM_SO = "llm_so"
47
- WHITECIRCLE_GUARD = "whitecircle_guard"
48
 
49
- def __str__(self):
50
- """String representation of the guard model type."""
51
- return self.name
 
 
 
 
 
52
 
 
 
 
53
 
54
 
55
  class Precision(Enum):
@@ -72,6 +71,7 @@ class WeightType(Enum):
72
  Original = auto()
73
  Delta = auto()
74
  Adapter = auto()
 
75
  def __str__(self):
76
  """String representation of the weight type."""
77
  return self.name
@@ -89,8 +89,8 @@ class ColumnInfo:
89
 
90
 
91
  @dataclass
92
- class GuardBenchColumn:
93
- """Columns for the GuardBench leaderboard."""
94
  # Core metadata
95
  model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
  name="model_name",
@@ -118,8 +118,8 @@ class GuardBenchColumn:
118
  display_name="Version",
119
  displayed_by_default=False
120
  ))
121
- guard_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
- name="guard_model_type",
123
  display_name="Type",
124
  displayed_by_default=False
125
  ))
@@ -144,212 +144,168 @@ class GuardBenchColumn:
144
  displayed_by_default=False
145
  ))
146
 
147
- # Default prompts metrics
148
- default_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
149
- name="default_prompts_f1_binary",
150
- display_name="Default_Prompts_F1_Binary",
151
  type="number",
152
- displayed_by_default=False
153
  ))
154
- default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
155
- name="default_prompts_f1",
156
- display_name="Default_Prompts_F1",
157
  type="number",
158
- displayed_by_default=False
159
  ))
160
- default_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
161
- name="default_prompts_recall_binary",
162
- display_name="Default_Prompts_Recall",
163
  type="number",
164
- displayed_by_default=False
165
  ))
166
- default_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
167
- name="default_prompts_precision_binary",
168
- display_name="Default_Prompts_Precision",
169
  type="number",
170
- displayed_by_default=False
171
  ))
172
- default_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
173
- name="default_prompts_error_ratio",
174
- display_name="Default_Prompts_Error_Ratio",
175
  type="number",
176
- displayed_by_default=False
177
  ))
178
- default_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
179
- name="default_prompts_avg_runtime_ms",
180
- display_name="Default_Prompts_Avg_Runtime_ms",
181
  type="number",
182
- displayed_by_default=False
183
  ))
184
-
185
- # Jailbreaked prompts metrics
186
- jailbreaked_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
187
- name="jailbreaked_prompts_f1_binary",
188
- display_name="Jailbreaked_Prompts_F1_Binary",
189
  type="number",
190
- displayed_by_default=False
191
  ))
192
- jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
193
- name="jailbreaked_prompts_f1",
194
- display_name="Jailbreaked_Prompts_F1",
195
  type="number",
196
- displayed_by_default=False
197
  ))
198
- jailbreaked_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
199
- name="jailbreaked_prompts_recall_binary",
200
- display_name="Jailbreaked_Prompts_Recall",
201
  type="number",
202
- displayed_by_default=False
203
  ))
204
- jailbreaked_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
205
- name="jailbreaked_prompts_precision_binary",
206
- display_name="Jailbreaked_Prompts_Precision",
207
  type="number",
208
- displayed_by_default=False
209
  ))
210
- jailbreaked_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
211
- name="jailbreaked_prompts_error_ratio",
212
- display_name="Jailbreaked_Prompts_Error_Ratio",
 
 
213
  type="number",
214
- displayed_by_default=False
215
  ))
216
- jailbreaked_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
217
- name="jailbreaked_prompts_avg_runtime_ms",
218
- display_name="Jailbreaked_Prompts_Avg_Runtime_ms",
219
  type="number",
220
- displayed_by_default=False
221
  ))
222
-
223
- # Default answers metrics
224
- default_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
225
- name="default_answers_f1_binary",
226
- display_name="Default_Answers_F1_Binary",
227
  type="number",
228
- displayed_by_default=False
229
  ))
230
- default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
231
- name="default_answers_f1",
232
- display_name="Default_Answers_F1",
233
  type="number",
234
- displayed_by_default=False
235
  ))
236
- default_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
237
- name="default_answers_recall_binary",
238
- display_name="Default_Answers_Recall",
 
 
239
  type="number",
240
- displayed_by_default=False
241
  ))
242
- default_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
243
- name="default_answers_precision_binary",
244
- display_name="Default_Answers_Precision",
245
  type="number",
246
- displayed_by_default=False
247
  ))
248
- default_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
249
- name="default_answers_error_ratio",
250
- display_name="Default_Answers_Error_Ratio",
251
  type="number",
252
- displayed_by_default=False
253
  ))
254
- default_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
255
- name="default_answers_avg_runtime_ms",
256
- display_name="Default_Answers_Avg_Runtime_ms",
257
  type="number",
258
- displayed_by_default=False
259
  ))
260
 
261
- # Jailbreaked answers metrics
262
- jailbreaked_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
263
- name="jailbreaked_answers_f1_binary",
264
- display_name="Jailbreaked_Answers_F1_Binary",
265
  type="number",
266
  displayed_by_default=False
267
  ))
268
- jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
269
- name="jailbreaked_answers_f1",
270
- display_name="Jailbreaked_Answers_F1",
271
  type="number",
272
  displayed_by_default=False
273
  ))
274
- jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
275
- name="jailbreaked_answers_recall_binary",
276
- display_name="Jailbreaked_Answers_Recall",
277
  type="number",
278
  displayed_by_default=False
279
  ))
280
- jailbreaked_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
281
- name="jailbreaked_answers_precision_binary",
282
- display_name="Jailbreaked_Answers_Precision",
 
 
283
  type="number",
284
  displayed_by_default=False
285
  ))
286
- jailbreaked_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
287
- name="jailbreaked_answers_error_ratio",
288
- display_name="Jailbreaked_Answers_Error_Ratio",
289
  type="number",
290
  displayed_by_default=False
291
  ))
292
- jailbreaked_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
293
- name="jailbreaked_answers_avg_runtime_ms",
294
- display_name="Jailbreaked_Answers_Avg_Runtime_ms",
295
  type="number",
296
  displayed_by_default=False
297
  ))
298
- integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
299
- name="integral_score",
300
- display_name="Integral_Score",
301
- type="number",
302
- displayed_by_default=True
303
- ))
304
-
305
- # Calculated overall metrics (renamed)
306
- macro_accuracy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
307
- name="macro_accuracy",
308
- display_name="Macro_Accuracy",
309
- type="number",
310
- displayed_by_default=True
311
- ))
312
- macro_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
313
- name="macro_recall",
314
- display_name="Macro_Recall",
315
- type="number",
316
- displayed_by_default=True
317
- ))
318
- macro_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
319
- name="macro_precision",
320
- display_name="Macro Precision",
321
- type="number",
322
- displayed_by_default=False
323
- ))
324
-
325
- # NEW Summary Metrics
326
- micro_avg_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
327
- name="micro_avg_error_ratio",
328
- display_name="Micro_Error",
329
- type="number",
330
- displayed_by_default=True
331
- ))
332
- micro_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
333
- name="micro_avg_runtime_ms",
334
- display_name="Micro_Avg_time_ms",
335
- type="number",
336
- displayed_by_default=True
337
- ))
338
- total_evals_count: ColumnInfo = field(default_factory=lambda: ColumnInfo(
339
- name="total_evals_count",
340
- display_name="Total_Count",
341
- type="number",
342
- displayed_by_default=True
343
- ))
344
 
345
 
346
  # Create instances for easy access
347
- GUARDBENCH_COLUMN = GuardBenchColumn()
348
 
349
  # Extract column lists for different views
350
- COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
351
- DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
352
- if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
353
 
354
  # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
355
  def reorder_display_cols():
@@ -361,51 +317,72 @@ def reorder_display_cols():
361
  return cols
362
  DISPLAY_COLS = reorder_display_cols()
363
 
364
- METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
365
- if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
366
- HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
367
- if getattr(GUARDBENCH_COLUMN, f.name).hidden]
368
- NEVER_HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
369
- if getattr(GUARDBENCH_COLUMN, f.name).never_hidden]
370
 
371
- # Categories in GuardBench
372
  CATEGORIES = [
373
- 'Political Corruption and Legal Evasion',
374
- 'Financial Fraud and Unethical Business',
375
- 'AI Manipulation and Jailbreaking',
376
- 'Child Exploitation and Abuse',
377
- 'Hate Speech, Extremism, and Discrimination',
378
- 'Labor Exploitation and Human Trafficking',
379
- 'Manipulation, Deception, and Misinformation',
380
- 'Environmental and Industrial Harm',
381
- 'Academic Dishonesty and Cheating',
382
- 'Self–Harm and Suicidal Ideation',
383
- 'Animal Cruelty and Exploitation',
384
- 'Criminal, Violent, and Terrorist Activity',
385
- 'Drug– and Substance–Related Activities',
386
- 'Sexual Content and Violence',
387
- 'Weapon, Explosives, and Hazardous Materials',
388
- 'Cybercrime, Hacking, and Digital Exploits',
389
- 'Creative Content Involving Illicit Themes',
390
- 'Safe Prompts'
 
 
391
  ]
392
 
393
- # Test types in GuardBench
394
- TEST_TYPES = [
395
- "default_prompts",
396
- "jailbreaked_prompts",
397
- "default_answers",
398
- "jailbreaked_answers"
 
 
 
 
 
 
399
  ]
400
 
401
- # Metrics in GuardBench
402
- METRICS = [
403
- "f1_binary",
404
- "recall_binary",
405
- "precision_binary",
406
- "error_ratio",
407
- "avg_runtime_ms",
408
- "accuracy"
409
  ]
410
 
411
  def get_all_column_choices():
@@ -419,8 +396,8 @@ def get_all_column_choices():
419
 
420
  default_visible_columns = get_default_visible_columns()
421
 
422
- for f in fields(GUARDBENCH_COLUMN):
423
- column_info = getattr(GUARDBENCH_COLUMN, f.name)
424
  # Create a tuple with both the internal name and display name
425
  if column_info.name not in default_visible_columns:
426
  column_choices.append((column_info.name, column_info.display_name))
@@ -434,5 +411,5 @@ def get_default_visible_columns():
434
  Returns:
435
  List of column names that are displayed by default.
436
  """
437
- return [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
438
- if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
 
1
  """
2
+ Utility classes and functions for the CodeReview Bench Leaderboard display.
3
  """
4
 
5
  from dataclasses import dataclass, field, fields
 
8
 
9
 
10
  class Mode(Enum):
11
+ """Inference mode for the review model."""
12
  CoT = auto() # Chain of Thought
13
  Strict = auto()
14
 
 
36
  return "API"
37
  return "Unknown"
38
 
 
 
 
 
 
 
 
 
 
39
 
40
+ class ReviewModelType(str, Enum):
41
+ """Review model types for the leaderboard."""
42
+ GPT_4 = "gpt-4"
43
+ GPT_3_5 = "gpt-3.5-turbo"
44
+ CLAUDE = "claude"
45
+ LLAMA = "llama"
46
+ GEMINI = "gemini"
47
+ CUSTOM = "custom"
48
 
49
+ def __str__(self):
50
+ """String representation of the review model type."""
51
+ return self.value
52
 
53
 
54
  class Precision(Enum):
 
71
  Original = auto()
72
  Delta = auto()
73
  Adapter = auto()
74
+
75
  def __str__(self):
76
  """String representation of the weight type."""
77
  return self.name
 
89
 
90
 
91
  @dataclass
92
+ class CodeReviewBenchColumn:
93
+ """Columns for the CodeReview Bench leaderboard."""
94
  # Core metadata
95
  model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
  name="model_name",
 
118
  display_name="Version",
119
  displayed_by_default=False
120
  ))
121
+ review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
+ name="review_model_type",
123
  display_name="Type",
124
  displayed_by_default=False
125
  ))
 
144
  displayed_by_default=False
145
  ))
146
 
147
+ # LLM-based multimetric scores
148
+ readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
149
+ name="readability",
150
+ display_name="Readability",
151
  type="number",
152
+ displayed_by_default=True
153
  ))
154
+ relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
155
+ name="relevance",
156
+ display_name="Relevance",
157
  type="number",
158
+ displayed_by_default=True
159
  ))
160
+ explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
161
+ name="explanation_clarity",
162
+ display_name="Explanation_Clarity",
163
  type="number",
164
+ displayed_by_default=True
165
  ))
166
+ problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
167
+ name="problem_identification",
168
+ display_name="Problem_Identification",
169
  type="number",
170
+ displayed_by_default=True
171
  ))
172
+ actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
173
+ name="actionability",
174
+ display_name="Actionability",
175
  type="number",
176
+ displayed_by_default=True
177
  ))
178
+ completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
179
+ name="completeness",
180
+ display_name="Completeness",
181
  type="number",
182
+ displayed_by_default=True
183
  ))
184
+ specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
185
+ name="specificity",
186
+ display_name="Specificity",
 
 
187
  type="number",
188
+ displayed_by_default=True
189
  ))
190
+ contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
191
+ name="contextual_adequacy",
192
+ display_name="Contextual_Adequacy",
193
  type="number",
194
+ displayed_by_default=True
195
  ))
196
+ consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
197
+ name="consistency",
198
+ display_name="Consistency",
199
  type="number",
200
+ displayed_by_default=True
201
  ))
202
+ brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
203
+ name="brevity",
204
+ display_name="Brevity",
205
  type="number",
206
+ displayed_by_default=True
207
  ))
208
+
209
+ # Exact-match metrics
210
+ pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
211
+ name="pass_at_1",
212
+ display_name="Pass@1",
213
  type="number",
214
+ displayed_by_default=True
215
  ))
216
+ pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
217
+ name="pass_at_5",
218
+ display_name="Pass@5",
219
  type="number",
220
+ displayed_by_default=True
221
  ))
222
+ pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
223
+ name="pass_at_10",
224
+ display_name="Pass@10",
 
 
225
  type="number",
226
+ displayed_by_default=True
227
  ))
228
+ bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
229
+ name="bleu_at_10",
230
+ display_name="BLEU@10",
231
  type="number",
232
+ displayed_by_default=True
233
  ))
234
+
235
+ # Overall aggregated metrics
236
+ overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
237
+ name="overall_score",
238
+ display_name="Overall_Score",
239
  type="number",
240
+ displayed_by_default=True
241
  ))
242
+ multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
243
+ name="multimetric_average",
244
+ display_name="Multimetric_Average",
245
  type="number",
246
+ displayed_by_default=True
247
  ))
248
+ exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
249
+ name="exact_match_average",
250
+ display_name="Exact_Match_Average",
251
  type="number",
252
+ displayed_by_default=True
253
  ))
254
+ total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
255
+ name="total_evaluations",
256
+ display_name="Total_Evaluations",
257
  type="number",
258
+ displayed_by_default=True
259
  ))
260
 
261
+ # Language-specific metrics (Russian)
262
+ ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
263
+ name="ru_readability",
264
+ display_name="RU_Readability",
265
  type="number",
266
  displayed_by_default=False
267
  ))
268
+ ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
269
+ name="ru_relevance",
270
+ display_name="RU_Relevance",
271
  type="number",
272
  displayed_by_default=False
273
  ))
274
+ ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
275
+ name="ru_overall_score",
276
+ display_name="RU_Overall_Score",
277
  type="number",
278
  displayed_by_default=False
279
  ))
280
+
281
+ # Language-specific metrics (English)
282
+ en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
283
+ name="en_readability",
284
+ display_name="EN_Readability",
285
  type="number",
286
  displayed_by_default=False
287
  ))
288
+ en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
289
+ name="en_relevance",
290
+ display_name="EN_Relevance",
291
  type="number",
292
  displayed_by_default=False
293
  ))
294
+ en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
295
+ name="en_overall_score",
296
+ display_name="EN_Overall_Score",
297
  type="number",
298
  displayed_by_default=False
299
  ))
 
300
 
301
 
302
  # Create instances for easy access
303
+ CODEREVIEW_COLUMN = CodeReviewBenchColumn()
304
 
305
  # Extract column lists for different views
306
+ COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
307
+ DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
308
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
309
 
310
  # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
311
  def reorder_display_cols():
 
317
  return cols
318
  DISPLAY_COLS = reorder_display_cols()
319
 
320
+ METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
321
+ if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
322
+ HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
323
+ if getattr(CODEREVIEW_COLUMN, f.name).hidden]
324
+ NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
325
+ if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
326
 
327
+ # Categories for CodeReview Bench (Programming Languages)
328
  CATEGORIES = [
329
+ 'Python',
330
+ 'JavaScript',
331
+ 'Java',
332
+ 'C++',
333
+ 'C#',
334
+ 'TypeScript',
335
+ 'Go',
336
+ 'Rust',
337
+ 'Swift',
338
+ 'Kotlin',
339
+ 'Ruby',
340
+ 'PHP',
341
+ 'C',
342
+ 'Scala',
343
+ 'R',
344
+ 'Dart',
345
+ 'Other'
346
+ ]
347
+
348
+ # Language taxonomies for CodeReview Bench
349
+ COMMENT_LANGUAGES = [
350
+ 'ru', # Russian
351
+ 'en' # English
352
+ ]
353
+
354
+ # Example categories
355
+ EXAMPLE_CATEGORIES = [
356
+ 'Bug_Fix',
357
+ 'Code_Style',
358
+ 'Performance',
359
+ 'Security',
360
+ 'Refactoring',
361
+ 'Documentation',
362
+ 'Testing',
363
+ 'Architecture',
364
+ 'Other'
365
  ]
366
 
367
+ # Metrics for CodeReview Bench
368
+ MULTIMETRIC_METRICS = [
369
+ "readability",
370
+ "relevance",
371
+ "explanation_clarity",
372
+ "problem_identification",
373
+ "actionability",
374
+ "completeness",
375
+ "specificity",
376
+ "contextual_adequacy",
377
+ "consistency",
378
+ "brevity"
379
  ]
380
 
381
+ EXACT_MATCH_METRICS = [
382
+ "pass_at_1",
383
+ "pass_at_5",
384
+ "pass_at_10",
385
+ "bleu_at_10"
 
 
 
386
  ]
387
 
388
  def get_all_column_choices():
 
396
 
397
  default_visible_columns = get_default_visible_columns()
398
 
399
+ for f in fields(CODEREVIEW_COLUMN):
400
+ column_info = getattr(CODEREVIEW_COLUMN, f.name)
401
  # Create a tuple with both the internal name and display name
402
  if column_info.name not in default_visible_columns:
403
  column_choices.append((column_info.name, column_info.display_name))
 
411
  Returns:
412
  List of column names that are displayed by default.
413
  """
414
+ return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
415
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
src/envs.py CHANGED
@@ -7,14 +7,14 @@ load_dotenv()
7
 
8
  # Hugging Face configuration
9
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
- OWNER = os.environ.get("OWNER", "whitecircle-ai") # Change to your org
11
  SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
  ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
  ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
14
 
15
  # Repository IDs
16
- REPO_ID = f"{OWNER}/circle-guard-bench"
17
- RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/guardbench-results")
18
 
19
  # Cache paths
20
  CACHE_PATH = os.getenv("HF_HOME", ".")
 
7
 
8
  # Hugging Face configuration
9
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
+ OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
11
  SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
  ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
  ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
14
 
15
  # Repository IDs
16
+ REPO_ID = f"{OWNER}/codereview-bench"
17
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
18
 
19
  # Cache paths
20
  CACHE_PATH = os.getenv("HF_HOME", ".")
src/leaderboard/processor.py CHANGED
@@ -1,81 +1,98 @@
1
  """
2
- Process and transform GuardBench leaderboard data.
3
  """
4
 
5
  import json
6
  import os
7
  import pandas as pd
8
  from datetime import datetime
9
- from typing import Dict, List, Any, Tuple
10
  import numpy as np
11
 
12
- from src.display.utils import CATEGORIES, TEST_TYPES, METRICS
 
 
 
13
 
14
- # Constants for Integral Score calculation (mirrors guardbench library)
15
- MAX_PUNISHABLE_RUNTIME_MS = 6000.0
16
- MIN_PUNISHABLE_RUNTIME_MS = 200.0
17
- MAX_RUNTIME_PENALTY = 0.75 # Corresponds to 1.0 - MIN_TIME_FACTOR, library used 0.75
18
 
19
- def calculate_integral_score(row: pd.Series) -> float:
20
  """
21
- Calculate the integral score for a given model entry row.
22
- Uses accuracy as the primary metric, micro error ratio, and micro runtime penalty.
23
- Falls back to macro accuracy and averaged per-test-type errors/runtimes if micro values are missing.
 
 
 
 
24
  """
25
- integral_score = 1.0
26
- metric_count = 0
27
-
28
- # Primary metric (using accuracy)
29
- for test_type in TEST_TYPES:
30
- metric_col = f"{test_type}_accuracy"
31
- if metric_col in row and pd.notna(row[metric_col]):
32
- # print(f"Found accuracy metric for {test_type}: {row[metric_col]}")
33
- integral_score *= row[metric_col]
34
- metric_count += 1
35
- # print(f"Metric count: {metric_count}")
36
-
37
- # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
38
- # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
39
- # Let's add that check back before applying penalties.
40
- if metric_count == 0:
41
- return 0.0
42
-
43
- # Error Penalty
44
- micro_error_col = "micro_avg_error_ratio"
45
- if micro_error_col in row and pd.notna(row[micro_error_col]):
46
- # Micro error is stored as %, convert back to ratio
47
- micro_error_ratio = row[micro_error_col] / 100.0
48
- integral_score *= (1.0 - micro_error_ratio)
49
-
50
- # Runtime Penalty
51
- avg_runtime_ms = None # Initialize
52
- micro_runtime_col = "micro_avg_runtime_ms"
53
- if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
54
- avg_runtime_ms = row[micro_runtime_col]
55
-
56
- if avg_runtime_ms is not None:
57
- # Apply penalty based on runtime (only if micro avg runtime was found)
58
- runtime = max(
59
- min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
60
- MIN_PUNISHABLE_RUNTIME_MS,
61
- )
62
-
63
- if MAX_PUNISHABLE_RUNTIME_MS > MIN_PUNISHABLE_RUNTIME_MS:
64
- normalized_time = (runtime - MIN_PUNISHABLE_RUNTIME_MS) / (
65
- MAX_PUNISHABLE_RUNTIME_MS - MIN_PUNISHABLE_RUNTIME_MS
66
- )
67
- # Match reference library formula 1
68
- time_factor = 1.0 - (1.0 - MAX_RUNTIME_PENALTY) * normalized_time
69
- else:
70
- # Match reference library formula (though less critical when max==min)
71
- time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY)
72
 
73
- # Match reference library formula 2 (enforce minimum factor)
74
- time_factor = max(MAX_RUNTIME_PENALTY, time_factor)
75
- integral_score *= time_factor
76
 
77
- # Rooting is not done in the reference library's summary table calculation
78
- return integral_score
 
 
 
79
 
80
 
81
  def load_leaderboard_data(file_path: str) -> Dict:
@@ -122,40 +139,6 @@ def save_leaderboard_data(data: Dict, file_path: str) -> None:
122
  json.dump(data, f, indent=2)
123
 
124
 
125
- def process_submission(submission_data: List[Dict]) -> List[Dict]:
126
- """
127
- Process submission data and convert it to leaderboard entries.
128
- """
129
- entries = []
130
-
131
- for item in submission_data:
132
- # Create a new entry for the leaderboard
133
- entry = {
134
- "model_name": item.get("model_name", "Unknown Model"),
135
- "per_category_metrics": {},
136
- "avg_metrics": {},
137
- "submission_date": datetime.now().isoformat(),
138
- "version": item.get("version", "v0")
139
- }
140
-
141
- # Copy model metadata
142
- for key in ["model_type", "base_model", "revision", "precision", "weight_type"]:
143
- if key in item:
144
- entry[key] = item[key]
145
-
146
- # Process per-category metrics
147
- if "per_category_metrics" in item:
148
- entry["per_category_metrics"] = item["per_category_metrics"]
149
-
150
- # Process average metrics
151
- if "avg_metrics" in item:
152
- entry["avg_metrics"] = item["avg_metrics"]
153
-
154
- entries.append(entry)
155
-
156
- return entries
157
-
158
-
159
  def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
160
  """
161
  Convert leaderboard data to a pandas DataFrame for display.
@@ -165,14 +148,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
165
  for entry in leaderboard_data.get("entries", []):
166
  model_name = entry.get("model_name", "Unknown Model")
167
 
168
- # Extract average metrics for main display
169
  row = {
170
  "model_name": model_name,
171
  "model_type": entry.get("model_type", "Unknown"),
172
  "mode": entry.get("mode", "Strict"),
173
  "submission_date": entry.get("submission_date", ""),
174
  "version": entry.get("version", "v0"),
175
- "guard_model_type": entry.get("guard_model_type", "llm_regexp").lower()
176
  }
177
 
178
  # Add additional metadata fields if present
@@ -180,117 +163,69 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
180
  if key in entry:
181
  row[key] = entry[key]
182
 
183
- # CASE 1: Metrics are flat in the root
184
- for key, value in entry.items():
185
- if any(test_type in key for test_type in TEST_TYPES) or \
186
- key in ["average_f1", "average_recall", "average_precision",
187
- "macro_accuracy", "macro_recall", "total_evals_count"]:
188
- row[key] = value
189
-
190
- # CASE 2: Metrics are in avg_metrics structure
191
- avg_metrics = entry.get("avg_metrics", {})
192
- if avg_metrics:
193
- for test_type in TEST_TYPES:
194
- if test_type in avg_metrics:
195
- metrics = avg_metrics[test_type]
196
- for metric in METRICS:
197
- if metric in metrics:
198
- col_name = f"{test_type}_{metric}"
199
- row[col_name] = metrics[metric]
200
-
201
- # Also add non-binary version for F1 scores
202
- if metric == "f1_binary":
203
- row[f"{test_type}_f1"] = metrics[metric]
204
-
205
- # Calculate averages if not present
206
- # Use accuracy for macro_accuracy
207
- if "macro_accuracy" not in row:
208
- accuracy_values = []
209
- for test_type in TEST_TYPES:
210
- # Check avg_metrics structure first
211
- accuracy_val = None
212
- if test_type in avg_metrics and "accuracy" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["accuracy"]):
213
- accuracy_val = avg_metrics[test_type]["accuracy"]
214
- # Check flat structure as fallback (might be redundant but safer)
215
- elif f"{test_type}_accuracy" in row and pd.notna(row[f"{test_type}_accuracy"]):
216
- accuracy_val = row[f"{test_type}_accuracy"]
217
-
218
- if accuracy_val is not None:
219
- accuracy_values.append(accuracy_val)
220
-
221
- if accuracy_values:
222
- row["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
223
-
224
- # Use recall_binary for macro_recall
225
- if "macro_recall" not in row:
226
- recall_values = []
227
- for test_type in TEST_TYPES:
228
- if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["recall_binary"]):
229
- recall_values.append(avg_metrics[test_type]["recall_binary"])
230
- if recall_values:
231
- row["macro_recall"] = sum(recall_values) / len(recall_values)
232
-
233
- if "total_evals_count" not in row:
234
- total_samples = 0
235
- found_samples = False
236
- for test_type in TEST_TYPES:
237
- if test_type in avg_metrics and "sample_count" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["sample_count"]):
238
- total_samples += avg_metrics[test_type]["sample_count"]
239
- found_samples = True
240
- if found_samples:
241
- row["total_evals_count"] = total_samples
242
-
243
- # Extract micro averages directly from entry if they exist (like in guardbench library)
244
- row["micro_avg_error_ratio"] = entry.get("micro_avg_error_ratio", pd.NA)
245
- row["micro_avg_runtime_ms"] = entry.get("micro_avg_runtime_ms", pd.NA)
246
-
247
- # Convert error ratio to percentage for consistency with display name
248
- if pd.notna(row["micro_avg_error_ratio"]):
249
- row["micro_avg_error_ratio"] *= 100
250
 
251
  rows.append(row)
252
 
253
- # Create DataFrame and sort by average F1 score
254
  df = pd.DataFrame(rows)
255
 
256
  # Ensure all expected columns exist
257
- for test_type in TEST_TYPES:
258
- for metric in METRICS:
259
- col_name = f"{test_type}_{metric}"
260
- if col_name not in df.columns:
261
- df[col_name] = pd.NA # Use pd.NA for missing numeric data
262
-
263
- # Add non-binary F1 if binary exists and f1 is missing
264
- if metric == "f1_binary" and f"{test_type}_f1" not in df.columns:
265
- # Check if the binary column has data before copying
266
- if col_name in df.columns:
267
- df[f"{test_type}_f1"] = df[col_name]
268
- else:
269
- df[f"{test_type}_f1"] = pd.NA
270
 
271
- # Calculate Integral Score
272
  if not df.empty:
273
- df["integral_score"] = df.apply(calculate_integral_score, axis=1)
274
- # Sort by Integral Score instead of average_f1
275
- df = df.sort_values(by="integral_score", ascending=False, na_position='last')
276
- else:
277
- # Add the column even if empty
278
- df["integral_score"] = pd.NA
279
 
280
  # Ensure summary columns exist
281
- summary_cols = ["macro_accuracy", "macro_recall", "micro_avg_error_ratio", "micro_avg_runtime_ms", "total_evals_count"]
282
  for col in summary_cols:
283
  if col not in df.columns:
284
  df[col] = pd.NA
285
 
286
- # Remove old average columns if they somehow snuck in
287
- old_avg_cols = ["average_f1", "average_recall", "average_precision"]
288
- for col in old_avg_cols:
289
- if col in df.columns:
290
- df = df.drop(columns=[col])
291
- # print("--- DataFrame before returning from leaderboard_to_dataframe ---")
292
- # print(df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head())
293
- # print("-------------------------------------------------------------")
294
  return df
295
 
296
 
@@ -309,6 +244,18 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
309
  model_name = new_entry.get("model_name")
310
  version = new_entry.get("version", "v0")
311
 
 
 
312
  if (model_name, version) in existing_entries:
313
  # Replace existing entry
314
  leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
@@ -322,25 +269,3 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
322
  leaderboard_data["last_updated"] = datetime.now().isoformat()
323
 
324
  return leaderboard_data
325
-
326
-
327
- def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
328
- """
329
- Process a JSONL submission file and extract entries.
330
- """
331
- entries = []
332
- try:
333
- with open(file_path, 'r') as f:
334
- for line in f:
335
- try:
336
- entry = json.loads(line)
337
- entries.append(entry)
338
- except json.JSONDecodeError as e:
339
- return [], f"Invalid JSON in submission file: {e}"
340
-
341
- if not entries:
342
- return [], "Submission file is empty"
343
-
344
- return entries, "Successfully processed submission"
345
- except Exception as e:
346
- return [], f"Error processing submission file: {e}"
 
1
  """
2
+ Process CodeReview Bench leaderboard data and submissions.
3
  """
4
 
5
  import json
6
  import os
7
  import pandas as pd
8
  from datetime import datetime
9
+ from typing import Dict, List, Tuple, Optional
10
  import numpy as np
11
 
12
+ from src.display.utils import (
13
+ CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
14
+ MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
15
+ )
16
 
 
 
 
 
17
 
18
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
19
  """
20
+ Process a JSONL submission file for CodeReview Bench.
21
+
22
+ Args:
23
+ file_path: Path to the JSONL submission file
24
+
25
+ Returns:
26
+ Tuple of (entries_list, message)
27
  """
28
+ try:
29
+ entries = []
30
+ with open(file_path, 'r', encoding='utf-8') as f:
31
+ for line_num, line in enumerate(f, 1):
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+
36
+ try:
37
+ entry = json.loads(line)
38
+
39
+ # Validate required fields
40
+ required_fields = ['model_name', 'programming_language', 'comment_language']
41
+ missing_fields = [field for field in required_fields if field not in entry]
42
+ if missing_fields:
43
+ return [], f"Missing required fields {missing_fields} in line {line_num}"
44
+
45
+ # Validate metrics exist
46
+ has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
47
+ has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
48
+
49
+ if not has_multimetric and not has_exact_match:
50
+ return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
51
+
52
+ entries.append(entry)
53
+
54
+ except json.JSONDecodeError as e:
55
+ return [], f"Invalid JSON in line {line_num}: {e}"
56
+
57
+ if not entries:
58
+ return [], "No valid entries found in submission file"
59
+
60
+ return entries, f"Successfully processed {len(entries)} entries"
61
+
62
+ except Exception as e:
63
+ return [], f"Error processing submission: {e}"
64
 
 
 
 
65
 
66
+ def calculate_overall_score(entry: Dict) -> float:
67
+ """
68
+ Calculate overall score for a CodeReview Bench entry.
69
+
70
+ Args:
71
+ entry: Dictionary containing model evaluation results
72
+
73
+ Returns:
74
+ Overall score as float
75
+ """
76
+ # Calculate multimetric average
77
+ multimetric_scores = []
78
+ for metric in MULTIMETRIC_METRICS:
79
+ if metric in entry and isinstance(entry[metric], (int, float)):
80
+ multimetric_scores.append(entry[metric])
81
+
82
+ multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
83
+
84
+ # Calculate exact match average
85
+ exact_match_scores = []
86
+ for metric in EXACT_MATCH_METRICS:
87
+ if metric in entry and isinstance(entry[metric], (int, float)):
88
+ exact_match_scores.append(entry[metric])
89
+
90
+ exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
91
+
92
+ # Weighted combination (can be adjusted based on requirements)
93
+ overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
94
+
95
+ return overall_score
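As a worked example of the 0.7/0.3 weighting: if the multimetric scores present in an entry average 8.0 and the exact-match scores average 0.5, the overall score is 0.7 * 8.0 + 0.3 * 0.5 = 5.75. A minimal sketch, assuming `readability`/`relevance` are multimetric keys and `pass@1`/`pass@10` are exact-match keys:

```python
# Hypothetical entry; the key names are assumptions about the metric lists in src/display/utils.py.
entry = {
    "readability": 9.0,
    "relevance": 7.0,    # multimetric mean = 8.0
    "pass@1": 0.4,
    "pass@10": 0.6,      # exact-match mean = 0.5
}
print(calculate_overall_score(entry))  # 0.7 * 8.0 + 0.3 * 0.5 = 5.75
```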
96
 
97
 
98
  def load_leaderboard_data(file_path: str) -> Dict:
 
139
  json.dump(data, f, indent=2)
140
 
141
 
142
  def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
143
  """
144
  Convert leaderboard data to a pandas DataFrame for display.
 
148
  for entry in leaderboard_data.get("entries", []):
149
  model_name = entry.get("model_name", "Unknown Model")
150
 
151
+ # Extract basic metadata
152
  row = {
153
  "model_name": model_name,
154
  "model_type": entry.get("model_type", "Unknown"),
155
  "mode": entry.get("mode", "Strict"),
156
  "submission_date": entry.get("submission_date", ""),
157
  "version": entry.get("version", "v0"),
158
+ "review_model_type": entry.get("review_model_type", "custom").lower()
159
  }
160
 
161
  # Add additional metadata fields if present
 
163
  if key in entry:
164
  row[key] = entry[key]
165
 
166
+ # Add multimetric scores
167
+ for metric in MULTIMETRIC_METRICS:
168
+ if metric in entry:
169
+ row[metric] = entry[metric]
170
+ else:
171
+ row[metric] = pd.NA
172
+
173
+ # Add exact match metrics
174
+ for metric in EXACT_MATCH_METRICS:
175
+ if metric in entry:
176
+ row[metric] = entry[metric]
177
+ else:
178
+ row[metric] = pd.NA
179
+
180
+ # Calculate aggregated metrics
181
+ multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
182
+ exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
183
+
184
+ if multimetric_scores:
185
+ row["multimetric_average"] = np.mean(multimetric_scores)
186
+ else:
187
+ row["multimetric_average"] = pd.NA
188
+
189
+ if exact_match_scores:
190
+ row["exact_match_average"] = np.mean(exact_match_scores)
191
+ else:
192
+ row["exact_match_average"] = pd.NA
193
+
194
+ # Calculate overall score
195
+ row["overall_score"] = calculate_overall_score(entry)
196
+
197
+ # Add language-specific metrics if available
198
+ for lang in COMMENT_LANGUAGES:
199
+ for metric in ["readability", "relevance", "overall_score"]:
200
+ lang_key = f"{lang}_{metric}"
201
+ if lang_key in entry:
202
+ row[lang_key] = entry[lang_key]
203
+ else:
204
+ row[lang_key] = pd.NA
205
+
206
+ # Add evaluation count
207
+ row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
208
 
209
  rows.append(row)
210
 
211
+ # Create DataFrame and sort by overall score
212
  df = pd.DataFrame(rows)
213
 
214
  # Ensure all expected columns exist
215
+ for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
216
+ if metric not in df.columns:
217
+ df[metric] = pd.NA
218
 
219
+ # Sort by overall score (descending)
220
  if not df.empty:
221
+ df = df.sort_values(by="overall_score", ascending=False, na_position='last')
 
 
 
 
 
222
 
223
  # Ensure summary columns exist
224
+ summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
225
  for col in summary_cols:
226
  if col not in df.columns:
227
  df[col] = pd.NA
228
 
229
  return df
230
 
231
 
 
244
  model_name = new_entry.get("model_name")
245
  version = new_entry.get("version", "v0")
246
 
247
+ # Add calculated metrics
248
+ new_entry["overall_score"] = calculate_overall_score(new_entry)
249
+
250
+ # Calculate averages
251
+ multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
252
+ exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
253
+
254
+ if multimetric_scores:
255
+ new_entry["multimetric_average"] = np.mean(multimetric_scores)
256
+ if exact_match_scores:
257
+ new_entry["exact_match_average"] = np.mean(exact_match_scores)
258
+
259
  if (model_name, version) in existing_entries:
260
  # Replace existing entry
261
  leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
 
269
  leaderboard_data["last_updated"] = datetime.now().isoformat()
270
 
271
  return leaderboard_data
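Taken together, an offline round trip through this module might look like the following sketch; the file paths are placeholders and `load_leaderboard_data` is the loader defined earlier in this file.

```python
# Illustrative round trip: parse a submission, merge it, and build the display table.
leaderboard = load_leaderboard_data("leaderboard_v0.json")       # path is illustrative
new_entries, msg = process_jsonl_submission("submission.jsonl")  # path is illustrative
if new_entries:
    leaderboard = add_entries_to_leaderboard(leaderboard, new_entries)
    df = leaderboard_to_dataframe(leaderboard)
    print(df[["model_name", "overall_score", "multimetric_average"]].head())
```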
 
src/populate.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Populate the GuardBench leaderboard from HuggingFace datasets.
3
  """
4
 
5
  import json
@@ -13,7 +13,7 @@ import numpy as np
13
  from huggingface_hub import hf_hub_download, HfApi
14
  from datasets import load_dataset
15
 
16
- from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
17
  from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
  from src.leaderboard.processor import leaderboard_to_dataframe
19
 
@@ -58,19 +58,16 @@ def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
58
  return None
59
 
60
 
61
- def get_all_entries(version="v0", mode: str = None) -> List[Dict]:
62
  """
63
- Get all model entries from the entries folder. If mode is provided, only return entries matching that mode.
64
  """
65
  try:
66
  api = HfApi(token=TOKEN)
67
  files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
68
- if mode is not None:
69
- mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
70
- entry_files = [f for f in files if f.startswith("entries/") and f"_{mode_safe}_" in f and f.endswith(f"_{version}.json")]
71
- else:
72
- entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
73
- entries = []
74
  for entry_file in entry_files:
75
  try:
76
  entry_path = hf_hub_download(
@@ -81,12 +78,13 @@ def get_all_entries(version="v0", mode: str = None) -> List[Dict]:
81
  )
82
  with open(entry_path, 'r') as f:
83
  entry_data = json.load(f)
84
- entries.append(entry_data)
85
  except Exception as e:
86
  print(f"Error loading entry {entry_file}: {e}")
87
- return entries
 
88
  except Exception as e:
89
- print(f"Error listing entries: {e}")
90
  return []
91
 
92
 
@@ -116,7 +114,7 @@ def get_leaderboard_df(version="v0") -> pd.DataFrame:
116
 
117
  def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
118
  """
119
- Get the leaderboard data filtered by a specific category.
120
  """
121
  # Get latest leaderboard data
122
  leaderboard_data = get_latest_leaderboard(version)
@@ -134,90 +132,18 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
134
  # Return empty DataFrame if no data available
135
  return pd.DataFrame(columns=DISPLAY_COLS)
136
 
137
- # Filter entries to only include those with data for the specified category
138
  filtered_entries = []
139
-
140
  for entry in leaderboard_data.get("entries", []):
141
- # Copy all base fields
142
- filtered_entry = {
143
- "model_name": entry.get("model_name", "Unknown Model"),
144
- "model_type": entry.get("model_type", "Unknown"),
145
- "guard_model_type": entry.get("guard_model_type", "Unknown"),
146
- "mode": entry.get("mode", "Strict"),
147
- "submission_date": entry.get("submission_date", ""),
148
- "version": entry.get("version", version),
149
- "base_model": entry.get("base_model", ""),
150
- "revision": entry.get("revision", ""),
151
- "precision": entry.get("precision", ""),
152
- "weight_type": entry.get("weight_type", "")
153
- }
154
-
155
- if "per_category_metrics" in entry and category in entry["per_category_metrics"]:
156
- category_metrics = entry["per_category_metrics"][category]
157
-
158
- # Add all metrics for each test type
159
- for test_type, metrics in category_metrics.items():
160
- if isinstance(metrics, dict):
161
- for metric, value in metrics.items():
162
- col_name = f"{test_type}_{metric}"
163
- filtered_entry[col_name] = value
164
-
165
- # Also add the non-binary version for F1 scores
166
- if metric == "f1_binary":
167
- filtered_entry[f"{test_type}_f1"] = value
168
-
169
- # Calculate averages
170
- f1_values = []
171
- recall_values = []
172
- precision_values = []
173
- accuracy_values = []
174
- category_recall_values = []
175
- total_samples = 0
176
-
177
- for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
178
- if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
179
- test_metrics = category_metrics[test_type]
180
- if "f1_binary" in test_metrics and pd.notna(test_metrics["f1_binary"]):
181
- f1_values.append(test_metrics["f1_binary"])
182
- if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
183
- recall_values.append(test_metrics["recall_binary"])
184
- category_recall_values.append(test_metrics["recall_binary"])
185
- if "precision_binary" in test_metrics and pd.notna(test_metrics["precision_binary"]):
186
- precision_values.append(test_metrics["precision_binary"])
187
- if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
188
- accuracy_values.append(test_metrics["accuracy"])
189
- if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
190
- total_samples += test_metrics["sample_count"]
191
-
192
- # print(f"F1 values: {f1_values}")
193
- # print(f1_values, recall_values, precision_values, accuracy_values, total_samples)
194
-
195
-
196
- # Add overall averages
197
- if f1_values:
198
- filtered_entry["average_f1"] = sum(f1_values) / len(f1_values)
199
- if recall_values:
200
- filtered_entry["average_recall"] = sum(recall_values) / len(recall_values)
201
- if precision_values:
202
- filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)
203
-
204
- # Add category-specific values to standard macro metric keys
205
- if accuracy_values:
206
- filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
207
  else:
208
- filtered_entry["macro_accuracy"] = np.nan
209
-
210
- if category_recall_values:
211
- filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values)
212
- else:
213
- filtered_entry["macro_recall"] = np.nan
214
-
215
- if total_samples > 0:
216
- filtered_entry["total_evals_count"] = total_samples
217
- else:
218
- filtered_entry["total_evals_count"] = np.nan
219
-
220
- filtered_entries.append(filtered_entry)
221
 
222
  # Create a new leaderboard data structure with the filtered entries
223
  filtered_leaderboard = {
@@ -225,7 +151,6 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
225
  "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
226
  "version": version
227
  }
228
- # print(filtered_leaderboard)
229
 
230
  # Convert to DataFrame
231
  return leaderboard_to_dataframe(filtered_leaderboard)
 
1
  """
2
+ Populate the CodeReview Bench leaderboard from HuggingFace datasets.
3
  """
4
 
5
  import json
 
13
  from huggingface_hub import hf_hub_download, HfApi
14
  from datasets import load_dataset
15
 
16
+ from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
17
  from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
  from src.leaderboard.processor import leaderboard_to_dataframe
19
 
 
58
  return None
59
 
60
 
61
+ def get_all_entries(version="v0") -> List[Dict]:
62
  """
63
+ Get all entries from the HuggingFace dataset.
64
  """
65
  try:
66
  api = HfApi(token=TOKEN)
67
  files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
68
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
69
+
70
+ all_entries = []
 
 
 
71
  for entry_file in entry_files:
72
  try:
73
  entry_path = hf_hub_download(
 
78
  )
79
  with open(entry_path, 'r') as f:
80
  entry_data = json.load(f)
81
+ all_entries.append(entry_data)
82
  except Exception as e:
83
  print(f"Error loading entry {entry_file}: {e}")
84
+
85
+ return all_entries
86
  except Exception as e:
87
+ print(f"Error getting all entries: {e}")
88
  return []
89
 
90
 
 
114
 
115
  def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
116
  """
117
+ Get the leaderboard data filtered by a specific programming language category.
118
  """
119
  # Get latest leaderboard data
120
  leaderboard_data = get_latest_leaderboard(version)
 
132
  # Return empty DataFrame if no data available
133
  return pd.DataFrame(columns=DISPLAY_COLS)
134
 
135
+ # Filter entries to only include those with data for the specified programming language
136
  filtered_entries = []
 
137
  for entry in leaderboard_data.get("entries", []):
138
+ # Check if entry has data for this programming language
139
+ programming_language = entry.get("programming_language", "").lower()
140
+ if programming_language == category.lower() or category.lower() == "other":
141
+ # For "other" category, include entries that don't match any specific language
142
+ if category.lower() == "other":
143
+ if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]: # Exclude "Other" from check
144
+ filtered_entries.append(entry)
 
145
  else:
146
+ filtered_entries.append(entry)
147
 
148
  # Create a new leaderboard data structure with the filtered entries
149
  filtered_leaderboard = {
 
151
  "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
152
  "version": version
153
  }
 
154
 
155
  # Convert to DataFrame
156
  return leaderboard_to_dataframe(filtered_leaderboard)
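For the per-language tabs, the populate helpers would typically be called once per category. A hedged usage sketch, with category names taken from the README's language list:

```python
# Build one DataFrame per programming-language tab; the language names mirror the README.
for language in ["Python", "Go", "Rust", "Other"]:
    df = get_category_leaderboard_df(language, version="v0")
    print(f"{language}: {len(df)} entries")
```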
src/submission/submit.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Handle submissions to the GuardBench leaderboard.
3
  """
4
 
5
  import json
@@ -7,20 +7,13 @@ import os
7
  import tempfile
8
  from datetime import datetime
9
  from typing import Dict, List, Tuple
10
- import shutil
11
- import threading
12
- import time
13
 
14
  from huggingface_hub import HfApi
15
  from datasets import load_dataset
16
- import subprocess
17
 
18
  from src.display.formatting import styled_error, styled_message
19
  from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
20
- from src.leaderboard.processor import process_jsonl_submission
21
- from circleguardbench.evaluator import Evaluator
22
- from circleguardbench.context import GuardbenchContext
23
- from circleguardbench.models_config import ModelType
24
 
25
 
26
  def validate_submission(file_path: str) -> Tuple[bool, str]:
@@ -102,27 +95,9 @@ def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool,
102
  return False, f"Error updating leaderboard: {e}"
103
 
104
 
105
- def restart_space_after_delay(delay_seconds: int = 2) -> None:
106
- """
107
- Restart the Hugging Face Space after a delay.
108
- """
109
- def _restart_space():
110
- time.sleep(delay_seconds)
111
- try:
112
- api = HfApi(token=TOKEN)
113
- api.restart_space(repo_id=REPO_ID)
114
- except Exception as e:
115
- print(f"Error restarting space: {e}")
116
-
117
- # Start the restart in a separate thread
118
- thread = threading.Thread(target=_restart_space)
119
- thread.daemon = True
120
- thread.start()
121
-
122
-
123
  def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
124
  """
125
- Process a submission to the GuardBench leaderboard.
126
  """
127
  try:
128
  # Validate submission
@@ -130,18 +105,15 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
130
  if not is_valid:
131
  return styled_error(validation_message)
132
 
133
- # Get GuardBench results directory path
134
- guardbench_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "guard-bench-submodule")
135
- results_dir = os.path.join(guardbench_dir, "results")
136
- os.makedirs(results_dir, exist_ok=True)
137
 
138
- # Copy submission to GuardBench results directory
139
  model_name = metadata.get("model_name", "unknown")
140
  model_name_safe = model_name.replace("/", "_").replace(" ", "_")
141
- guard_model_type = metadata.get("guard_model_type", "unknown")
142
- target_file = os.path.join(results_dir + "/circleguardbench_public", f"{model_name_safe}.jsonl")
143
-
144
- # Upload raw submission file
145
  api = HfApi(token=TOKEN)
146
  submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
147
  api.upload_file(
@@ -151,51 +123,15 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
151
  repo_type="dataset",
152
  commit_message=f"Add raw submission for {model_name}"
153
  )
154
- os.makedirs(results_dir + "/circleguardbench_public", exist_ok=True)
155
-
156
- # (f"Submission path: {submission_path}")
157
- # print(f"Target file: {target_file}")
158
- # printprint(f"Results dir: {results_dir}")
159
-
160
-
161
- shutil.copy2(file_path, target_file)
162
- # print(f"Copied file to target file: {target_file}")
163
- # print(f" ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/: {subprocess.check_output('ls /home/user/app/guard-bench-submodule/results/guardbench_dataset_1k_public/', shell=True).decode('utf-8')}")
164
-
165
- try:
166
- # Initialize GuardBench context
167
- ctx = GuardbenchContext()
168
- # Set results directory
169
- ctx.results_dir = results_dir
170
- # Set bench name from the results directory
171
- ctx.bench_name = "circleguardbench_public"
172
- # Load dataset
173
- ctx.load_dataset("whitecircle-ai/circleguardbench_public")
174
- # Mark as initialized
175
- ctx.is_initialized = True
176
-
177
- evaluator = Evaluator(ctx, force=True, using_cached=True)
178
-
179
- # Run evaluation and get entry
180
- evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())
181
-
182
- # Get the entry from results
183
- with open(os.path.join(results_dir + "/" + ctx.bench_name, "leaderboard.json"), 'r') as f:
184
- results_data = json.load(f)
185
- model_entry = next(
186
- (entry for entry in results_data.get("entries", [])
187
- if entry.get("model_name") == model_name_safe),
188
- None
189
- )
190
-
191
- if not model_entry:
192
- return styled_error("No evaluation results found")
193
 
 
 
 
194
  # Add metadata to entry
195
- model_entry.update({
196
- "model_name": metadata.get("model_name"), # Use original model name
197
  "model_type": metadata.get("model_type"),
198
- "guard_model_type": str(metadata.get("guard_model_type")).lower(),
199
  "mode": metadata.get("mode"),
200
  "base_model": metadata.get("base_model"),
201
  "revision": metadata.get("revision"),
@@ -204,51 +140,45 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
204
  "version": version,
205
  "submission_date": datetime.now().isoformat()
206
  })
 
207
 
208
- # Submit entry to entries folder
209
- success, message = submit_entry_to_hub(model_entry, model_name, metadata.get("mode"), version)
 
210
  if not success:
211
  return styled_error(message)
212
 
213
- # Get all entries from HF dataset
214
- api = HfApi(token=TOKEN)
215
- files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
216
- entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
217
-
218
- all_entries = []
219
- for entry_file in entry_files:
220
- try:
221
- entry_path = api.hf_hub_download(
222
- repo_id=RESULTS_DATASET_ID,
223
- filename=entry_file,
224
- repo_type="dataset",
225
- )
226
- with open(entry_path, 'r') as f:
227
- entry_data = json.load(f)
228
- all_entries.append(entry_data)
229
- except Exception as e:
230
- print(f"Error loading entry {entry_file}: {e}")
231
-
232
- # Update leaderboard with all entries
233
- success, message = submit_leaderboard_to_hub(all_entries, version)
234
- if not success:
235
- return styled_error(message)
236
-
237
- restart_space_after_delay(5)
238
 
239
- return styled_message("Submission successful! Model evaluated and leaderboard updated.")
 
 
 
240
 
241
- except Exception as eval_error:
242
- return styled_error(f"Error during evaluation: {eval_error}")
243
 
244
  except Exception as e:
245
  return styled_error(f"Error processing submission: {e}")
246
  finally:
247
- # Clean up temporary files
248
  try:
249
  if os.path.exists(file_path):
250
  os.remove(file_path)
251
- if os.path.exists(target_file):
252
- os.remove(target_file)
253
  except:
254
  pass
 
1
  """
2
+ Handle submissions to the CodeReview Bench leaderboard.
3
  """
4
 
5
  import json
 
7
  import tempfile
8
  from datetime import datetime
9
  from typing import Dict, List, Tuple
 
 
 
10
 
11
  from huggingface_hub import HfApi
12
  from datasets import load_dataset
 
13
 
14
  from src.display.formatting import styled_error, styled_message
15
  from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
16
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
 
 
 
17
 
18
 
19
  def validate_submission(file_path: str) -> Tuple[bool, str]:
 
95
  return False, f"Error updating leaderboard: {e}"
96
 
97
 
98
  def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
99
  """
100
+ Process a submission to the CodeReview Bench leaderboard.
101
  """
102
  try:
103
  # Validate submission
 
105
  if not is_valid:
106
  return styled_error(validation_message)
107
 
108
+ # Process the submission entries
109
+ entries, message = process_jsonl_submission(file_path)
110
+ if not entries:
111
+ return styled_error(f"Failed to process submission: {message}")
112
 
113
+ # Upload raw submission file
114
  model_name = metadata.get("model_name", "unknown")
115
  model_name_safe = model_name.replace("/", "_").replace(" ", "_")
116
+
 
 
 
117
  api = HfApi(token=TOKEN)
118
  submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
119
  api.upload_file(
 
123
  repo_type="dataset",
124
  commit_message=f"Add raw submission for {model_name}"
125
  )
126
 
127
+ # Process entries and add metadata
128
+ processed_entries = []
129
+ for entry in entries:
130
  # Add metadata to entry
131
+ entry.update({
132
+ "model_name": metadata.get("model_name"),
133
  "model_type": metadata.get("model_type"),
134
+ "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
135
  "mode": metadata.get("mode"),
136
  "base_model": metadata.get("base_model"),
137
  "revision": metadata.get("revision"),
 
140
  "version": version,
141
  "submission_date": datetime.now().isoformat()
142
  })
143
+ processed_entries.append(entry)
144
 
145
+ # Submit entries to entries folder
146
+ for entry in processed_entries:
147
+ success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
148
  if not success:
149
  return styled_error(message)
150
 
151
+ # Get all entries from HF dataset and update leaderboard
152
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
153
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
154
+
155
+ all_entries = []
156
+ for entry_file in entry_files:
157
+ try:
158
+ entry_path = api.hf_hub_download(
159
+ repo_id=RESULTS_DATASET_ID,
160
+ filename=entry_file,
161
+ repo_type="dataset",
162
+ )
163
+ with open(entry_path, 'r') as f:
164
+ entry_data = json.load(f)
165
+ all_entries.append(entry_data)
166
+ except Exception as e:
167
+ print(f"Error loading entry {entry_file}: {e}")
168
 
169
+ # Update leaderboard with all entries
170
+ success, message = submit_leaderboard_to_hub(all_entries, version)
171
+ if not success:
172
+ return styled_error(message)
173
 
174
+ return styled_message("Submission successful! Results added and leaderboard updated.")
 
175
 
176
  except Exception as e:
177
  return styled_error(f"Error processing submission: {e}")
178
  finally:
179
+ # Clean up temporary files if they exist
180
  try:
181
  if os.path.exists(file_path):
182
  os.remove(file_path)
 
 
183
  except:
184
  pass
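End to end, the web UI's submit handler would call into this module with something like the sketch below; the metadata keys mirror the fields `process_submission` reads above, and every value is a placeholder.

```python
# Hypothetical invocation; all values are placeholders.
metadata = {
    "model_name": "org/my-review-model",
    "model_type": "Open Source",
    "review_model_type": "custom",
    "mode": "Strict",
    "base_model": "",
    "revision": "main",
}
html = process_submission("/tmp/upload.jsonl", metadata, version="v0")
print(html)  # styled success or error message
```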