Spaces:
Running
Running
Fix evaluator warnings and album breakdown display
Browse files
- evaluator: fix partial submission warning, add extraneous query tracking
- app.py: add per-album query counts in result summary
- about.py: update query counts and submission format docs
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app.py +14 -1
- src/about.py +27 -5
- src/evaluator.py +20 -2
app.py
CHANGED
|
@@ -130,12 +130,24 @@ def handle_submission(file_obj, email, model_name, opt_in):
|
|
| 130 |
global_metrics=result["global_metrics"],
|
| 131 |
)
|
| 132 |
if entry is None:
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
| 134 |
else:
|
| 135 |
leaderboard_msg = "Result published to leaderboard."
|
| 136 |
else:
|
| 137 |
leaderboard_msg = "Result recorded privately. Not published to leaderboard."
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# Build result summary
|
| 140 |
summary = {
|
| 141 |
"status": "Success",
|
|
@@ -145,6 +157,7 @@ def handle_submission(file_obj, email, model_name, opt_in):
|
|
| 145 |
"albums": albums,
|
| 146 |
"evaluated_queries": result["evaluated_queries"],
|
| 147 |
"total_gt_queries": result["total_gt_queries"],
|
|
|
|
| 148 |
"metrics": result["global_metrics"],
|
| 149 |
"leaderboard_status": leaderboard_msg,
|
| 150 |
"notice": "Please download and save your results. Submission data is retained for 30 days only.",
|
|
|
|
| 130 |
global_metrics=result["global_metrics"],
|
| 131 |
)
|
| 132 |
if entry is None:
|
| 133 |
+
if result["is_partial"]:
|
| 134 |
+
leaderboard_msg = f"Result saved but NOT eligible for leaderboard: incomplete submission ({result['evaluated_queries']}/{result['total_gt_queries']} queries). Only full submissions across all 3 albums are ranked."
|
| 135 |
+
else:
|
| 136 |
+
leaderboard_msg = "Result saved but NOT eligible for leaderboard. Only full submissions across all 3 albums are ranked."
|
| 137 |
else:
|
| 138 |
leaderboard_msg = "Result published to leaderboard."
|
| 139 |
else:
|
| 140 |
leaderboard_msg = "Result recorded privately. Not published to leaderboard."
|
| 141 |
|
| 142 |
+
# Build per-album breakdown
|
| 143 |
+
album_breakdown = {}
|
| 144 |
+
for a_id, alb_res in result.get("per_album", {}).items():
|
| 145 |
+
album_breakdown[f"album_{a_id}"] = {
|
| 146 |
+
"submitted": alb_res["evaluated_queries"],
|
| 147 |
+
"total": alb_res["total_gt_queries"],
|
| 148 |
+
"complete": not alb_res["is_partial"],
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
# Build result summary
|
| 152 |
summary = {
|
| 153 |
"status": "Success",
|
|
|
|
| 157 |
"albums": albums,
|
| 158 |
"evaluated_queries": result["evaluated_queries"],
|
| 159 |
"total_gt_queries": result["total_gt_queries"],
|
| 160 |
+
"album_breakdown": album_breakdown,
|
| 161 |
"metrics": result["global_metrics"],
|
| 162 |
"leaderboard_status": leaderboard_msg,
|
| 163 |
"notice": "Please download and save your results. Submission data is retained for 30 days only.",
|
src/about.py
CHANGED
|
@@ -62,7 +62,29 @@ Full dataset download: <a href="https://sbox.myoas.com/l/Be5be4053f6b43840" targ
|
|
| 62 |
SUBMISSION_GUIDE = """
|
| 63 |
### Submission Format
|
| 64 |
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
```json
|
| 68 |
[
|
|
@@ -75,11 +97,11 @@ Upload a JSON file containing an array of prediction objects:
|
|
| 75 |
```
|
| 76 |
|
| 77 |
**Required fields:**
|
| 78 |
-
- `album_id`: Album number (1, 2, or 3)
|
| 79 |
-
- `query_en`: The English query text (must match exactly)
|
| 80 |
-
- `pred`: Ordered list of predicted image filenames
|
| 81 |
|
| 82 |
-
You may submit results for any subset of albums. Partial submissions are accepted and evaluated.
|
| 83 |
"""
|
| 84 |
|
| 85 |
EVALUATION_INFO = """
|
|
|
|
| 62 |
SUBMISSION_GUIDE = """
|
| 63 |
### Submission Format
|
| 64 |
|
| 65 |
+
The dataset provides `test.json` per album. You must **combine all albums into a single JSON array** and add the `album_id` field to each query before submitting.
|
| 66 |
+
|
| 67 |
+
**Example transformation:**
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
import json
|
| 71 |
+
|
| 72 |
+
submission = []
|
| 73 |
+
for album_id in ["1", "2", "3"]:
|
| 74 |
+
with open(f"protected/album{album_id}/test.json") as f:
|
| 75 |
+
queries = json.load(f)
|
| 76 |
+
for q in queries:
|
| 77 |
+
submission.append({
|
| 78 |
+
"album_id": album_id,
|
| 79 |
+
"query_en": q["query_en"],
|
| 80 |
+
"pred": ["IMG_0001.jpg", "IMG_0002.jpg", ...] # placeholder: replace the whole list with your ranked filenames (a literal ... cannot be serialized by json.dump)
|
| 81 |
+
})
|
| 82 |
+
|
| 83 |
+
with open("submission.json", "w") as f:
|
| 84 |
+
json.dump(submission, f, indent=2)
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
**Final submission format:**
|
| 88 |
|
| 89 |
```json
|
| 90 |
[
|
|
|
|
| 97 |
```
|
| 98 |
|
| 99 |
**Required fields:**
|
| 100 |
+
- `album_id`: Album identifier, given as a string (`"1"`, `"2"`, or `"3"`)
|
| 101 |
+
- `query_en`: The English query text (must match exactly, case-sensitive)
|
| 102 |
+
- `pred`: Ordered list of predicted image filenames (order matters for NDCG)
|
| 103 |
|
| 104 |
+
You may submit results for any subset of albums. Partial submissions are accepted and evaluated, but **only full submissions** (all 3 albums, all test queries) are eligible for public leaderboard ranking.
|
| 105 |
"""
|
| 106 |
|
| 107 |
EVALUATION_INFO = """
|
src/evaluator.py
CHANGED
|
@@ -82,9 +82,11 @@ class Evaluator:
|
|
| 82 |
source_accum = {}
|
| 83 |
empty_gt_queries = 0
|
| 84 |
evaluated_queries = 0
|
|
|
|
| 85 |
|
| 86 |
for q, pred in album_submissions.items():
|
| 87 |
if q not in gt_map:
|
|
|
|
| 88 |
continue
|
| 89 |
|
| 90 |
gt_item = gt_map[q]
|
|
@@ -140,6 +142,7 @@ class Evaluator:
|
|
| 140 |
"evaluated_queries": evaluated_queries,
|
| 141 |
"total_gt_queries": len(gt_data),
|
| 142 |
"is_partial": evaluated_queries < len(gt_data),
|
|
|
|
| 143 |
}
|
| 144 |
|
| 145 |
def evaluate(self, submission_data: list) -> dict:
|
|
@@ -166,6 +169,7 @@ class Evaluator:
|
|
| 166 |
|
| 167 |
total_evaluated = sum(alb["evaluated_queries"] for alb in per_album.values())
|
| 168 |
total_gt = sum(alb["total_gt_queries"] for alb in per_album.values())
|
|
|
|
| 169 |
|
| 170 |
result = {
|
| 171 |
"per_album": per_album,
|
|
@@ -174,10 +178,24 @@ class Evaluator:
|
|
| 174 |
"total_gt_queries": total_gt,
|
| 175 |
"is_partial": total_evaluated < total_gt,
|
| 176 |
"albums": sorted(albums.keys()),
|
|
|
|
| 177 |
}
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
if result["is_partial"]:
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
return result
|
|
|
|
| 82 |
source_accum = {}
|
| 83 |
empty_gt_queries = 0
|
| 84 |
evaluated_queries = 0
|
| 85 |
+
extraneous_queries = 0
|
| 86 |
|
| 87 |
for q, pred in album_submissions.items():
|
| 88 |
if q not in gt_map:
|
| 89 |
+
extraneous_queries += 1
|
| 90 |
continue
|
| 91 |
|
| 92 |
gt_item = gt_map[q]
|
|
|
|
| 142 |
"evaluated_queries": evaluated_queries,
|
| 143 |
"total_gt_queries": len(gt_data),
|
| 144 |
"is_partial": evaluated_queries < len(gt_data),
|
| 145 |
+
"extraneous_queries": extraneous_queries,
|
| 146 |
}
|
| 147 |
|
| 148 |
def evaluate(self, submission_data: list) -> dict:
|
|
|
|
| 169 |
|
| 170 |
total_evaluated = sum(alb["evaluated_queries"] for alb in per_album.values())
|
| 171 |
total_gt = sum(alb["total_gt_queries"] for alb in per_album.values())
|
| 172 |
+
total_extraneous = sum(alb.get("extraneous_queries", 0) for alb in per_album.values())
|
| 173 |
|
| 174 |
result = {
|
| 175 |
"per_album": per_album,
|
|
|
|
| 178 |
"total_gt_queries": total_gt,
|
| 179 |
"is_partial": total_evaluated < total_gt,
|
| 180 |
"albums": sorted(albums.keys()),
|
| 181 |
+
"extraneous_queries": total_extraneous,
|
| 182 |
}
|
| 183 |
|
| 184 |
+
# Build warning / notice messages
|
| 185 |
+
msgs = []
|
| 186 |
+
if total_extraneous > 0:
|
| 187 |
+
msgs.append(f"{total_extraneous} extraneous queries were ignored (not in current GT). This may be caused by an outdated test.json or extra queries. Valid queries: {total_evaluated}/{total_gt}.")
|
| 188 |
if result["is_partial"]:
|
| 189 |
+
missing_albums = [a for a in ["1", "2", "3"] if a not in albums]
|
| 190 |
+
missing_queries = total_gt - total_evaluated
|
| 191 |
+
parts = []
|
| 192 |
+
if missing_albums:
|
| 193 |
+
parts.append(f"Missing albums: {', '.join(missing_albums)}")
|
| 194 |
+
if missing_queries > 0:
|
| 195 |
+
parts.append(f"Missing {missing_queries} queries ({total_evaluated}/{total_gt} submitted)")
|
| 196 |
+
msgs.append("Submission incomplete. " + "; ".join(parts) + ". Only full submissions are eligible for leaderboard ranking.")
|
| 197 |
+
|
| 198 |
+
if msgs:
|
| 199 |
+
result["warning"] = " ".join(msgs)
|
| 200 |
|
| 201 |
return result
|