SorrowTea Claude Sonnet 4.6 committed on
Commit
4e43175
·
1 Parent(s): 9c73731

Fix evaluator warnings and album breakdown display

Browse files

- evaluator: fix partial submission warning, add extraneous query tracking
- app.py: add per-album query counts in result summary
- about.py: update query counts and submission format docs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3) hide show
  1. app.py +14 -1
  2. src/about.py +27 -5
  3. src/evaluator.py +20 -2
app.py CHANGED
@@ -130,12 +130,24 @@ def handle_submission(file_obj, email, model_name, opt_in):
130
  global_metrics=result["global_metrics"],
131
  )
132
  if entry is None:
133
- leaderboard_msg = "Result saved but not eligible for leaderboard (incomplete submission). Only full submissions across all 3 albums are ranked."
 
 
 
134
  else:
135
  leaderboard_msg = "Result published to leaderboard."
136
  else:
137
  leaderboard_msg = "Result recorded privately. Not published to leaderboard."
138
 
 
 
 
 
 
 
 
 
 
139
  # Build result summary
140
  summary = {
141
  "status": "Success",
@@ -145,6 +157,7 @@ def handle_submission(file_obj, email, model_name, opt_in):
145
  "albums": albums,
146
  "evaluated_queries": result["evaluated_queries"],
147
  "total_gt_queries": result["total_gt_queries"],
 
148
  "metrics": result["global_metrics"],
149
  "leaderboard_status": leaderboard_msg,
150
  "notice": "Please download and save your results. Submission data is retained for 30 days only.",
 
130
  global_metrics=result["global_metrics"],
131
  )
132
  if entry is None:
133
+ if result["is_partial"]:
134
+ leaderboard_msg = f"Result saved but NOT eligible for leaderboard: incomplete submission ({result['evaluated_queries']}/{result['total_gt_queries']} queries). Only full submissions across all 3 albums are ranked."
135
+ else:
136
+ leaderboard_msg = "Result saved but NOT eligible for leaderboard. Only full submissions across all 3 albums are ranked."
137
  else:
138
  leaderboard_msg = "Result published to leaderboard."
139
  else:
140
  leaderboard_msg = "Result recorded privately. Not published to leaderboard."
141
 
142
+ # Build per-album breakdown
143
+ album_breakdown = {}
144
+ for a_id, alb_res in result.get("per_album", {}).items():
145
+ album_breakdown[f"album_{a_id}"] = {
146
+ "submitted": alb_res["evaluated_queries"],
147
+ "total": alb_res["total_gt_queries"],
148
+ "complete": not alb_res["is_partial"],
149
+ }
150
+
151
  # Build result summary
152
  summary = {
153
  "status": "Success",
 
157
  "albums": albums,
158
  "evaluated_queries": result["evaluated_queries"],
159
  "total_gt_queries": result["total_gt_queries"],
160
+ "album_breakdown": album_breakdown,
161
  "metrics": result["global_metrics"],
162
  "leaderboard_status": leaderboard_msg,
163
  "notice": "Please download and save your results. Submission data is retained for 30 days only.",
src/about.py CHANGED
@@ -62,7 +62,29 @@ Full dataset download: <a href="https://sbox.myoas.com/l/Be5be4053f6b43840" targ
62
  SUBMISSION_GUIDE = """
63
  ### Submission Format
64
 
65
- Upload a JSON file containing an array of prediction objects:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  ```json
68
  [
@@ -75,11 +97,11 @@ Upload a JSON file containing an array of prediction objects:
75
  ```
76
 
77
  **Required fields:**
78
- - `album_id`: Album number (1, 2, or 3)
79
- - `query_en`: The English query text (must match exactly)
80
- - `pred`: Ordered list of predicted image filenames
81
 
82
- You may submit results for any subset of albums. Partial submissions are accepted and evaluated.
83
  """
84
 
85
  EVALUATION_INFO = """
 
62
  SUBMISSION_GUIDE = """
63
  ### Submission Format
64
 
65
+ The dataset provides `test.json` per album. You must **combine all albums into a single JSON array** and add the `album_id` field to each query before submitting.
66
+
67
+ **Example transformation:**
68
+
69
+ ```python
70
+ import json
71
+
72
+ submission = []
73
+ for album_id in ["1", "2", "3"]:
74
+ with open(f"protected/album{album_id}/test.json") as f:
75
+ queries = json.load(f)
76
+ for q in queries:
77
+ submission.append({
78
+ "album_id": album_id,
79
+ "query_en": q["query_en"],
80
+ "pred": ["IMG_0001.jpg", "IMG_0002.jpg", ...] # your predictions
81
+ })
82
+
83
+ with open("submission.json", "w") as f:
84
+ json.dump(submission, f, indent=2)
85
+ ```
86
+
87
+ **Final submission format:**
88
 
89
  ```json
90
  [
 
97
  ```
98
 
99
  **Required fields:**
100
+ - `album_id`: Album number (`"1"`, `"2"`, or `"3"` — string)
101
+ - `query_en`: The English query text (must match exactly, case-sensitive)
102
+ - `pred`: Ordered list of predicted image filenames (order matters for NDCG)
103
 
104
+ You may submit results for any subset of albums. Partial submissions are accepted and evaluated, but **only full submissions** (all 3 albums, all test queries) are eligible for public leaderboard ranking.
105
  """
106
 
107
  EVALUATION_INFO = """
src/evaluator.py CHANGED
@@ -82,9 +82,11 @@ class Evaluator:
82
  source_accum = {}
83
  empty_gt_queries = 0
84
  evaluated_queries = 0
 
85
 
86
  for q, pred in album_submissions.items():
87
  if q not in gt_map:
 
88
  continue
89
 
90
  gt_item = gt_map[q]
@@ -140,6 +142,7 @@ class Evaluator:
140
  "evaluated_queries": evaluated_queries,
141
  "total_gt_queries": len(gt_data),
142
  "is_partial": evaluated_queries < len(gt_data),
 
143
  }
144
 
145
  def evaluate(self, submission_data: list) -> dict:
@@ -166,6 +169,7 @@ class Evaluator:
166
 
167
  total_evaluated = sum(alb["evaluated_queries"] for alb in per_album.values())
168
  total_gt = sum(alb["total_gt_queries"] for alb in per_album.values())
 
169
 
170
  result = {
171
  "per_album": per_album,
@@ -174,10 +178,24 @@ class Evaluator:
174
  "total_gt_queries": total_gt,
175
  "is_partial": total_evaluated < total_gt,
176
  "albums": sorted(albums.keys()),
 
177
  }
178
 
 
 
 
 
179
  if result["is_partial"]:
180
- missing = [a for a in ["1", "2", "3"] if a not in albums]
181
- result["warning"] = f"Submission incomplete. Missing albums: {', '.join(missing)}. Averaged results across submitted albums shown below."
 
 
 
 
 
 
 
 
 
182
 
183
  return result
 
82
  source_accum = {}
83
  empty_gt_queries = 0
84
  evaluated_queries = 0
85
+ extraneous_queries = 0
86
 
87
  for q, pred in album_submissions.items():
88
  if q not in gt_map:
89
+ extraneous_queries += 1
90
  continue
91
 
92
  gt_item = gt_map[q]
 
142
  "evaluated_queries": evaluated_queries,
143
  "total_gt_queries": len(gt_data),
144
  "is_partial": evaluated_queries < len(gt_data),
145
+ "extraneous_queries": extraneous_queries,
146
  }
147
 
148
  def evaluate(self, submission_data: list) -> dict:
 
169
 
170
  total_evaluated = sum(alb["evaluated_queries"] for alb in per_album.values())
171
  total_gt = sum(alb["total_gt_queries"] for alb in per_album.values())
172
+ total_extraneous = sum(alb.get("extraneous_queries", 0) for alb in per_album.values())
173
 
174
  result = {
175
  "per_album": per_album,
 
178
  "total_gt_queries": total_gt,
179
  "is_partial": total_evaluated < total_gt,
180
  "albums": sorted(albums.keys()),
181
+ "extraneous_queries": total_extraneous,
182
  }
183
 
184
+ # Build warning / notice messages
185
+ msgs = []
186
+ if total_extraneous > 0:
187
+ msgs.append(f"{total_extraneous} extraneous queries were ignored (not in current GT). This may be caused by an outdated test.json or extra queries. Valid queries: {total_evaluated}/{total_gt}.")
188
  if result["is_partial"]:
189
+ missing_albums = [a for a in ["1", "2", "3"] if a not in albums]
190
+ missing_queries = total_gt - total_evaluated
191
+ parts = []
192
+ if missing_albums:
193
+ parts.append(f"Missing albums: {', '.join(missing_albums)}")
194
+ if missing_queries > 0:
195
+ parts.append(f"Missing {missing_queries} queries ({total_evaluated}/{total_gt} submitted)")
196
+ msgs.append("Submission incomplete. " + "; ".join(parts) + ". Only full submissions are eligible for leaderboard ranking.")
197
+
198
+ if msgs:
199
+ result["warning"] = " ".join(msgs)
200
 
201
  return result