Spaces:

Intel
/

low_bit_open_llm_leaderboard

Running

App Files Files Community

low_bit_open_llm_leaderboard / src /populate.py

wenjiao

refactor repo code

b15b21e 6 days ago

raw

history blame contribute delete

18 kB

	import json
	import logging
	import os
	import re
	from datetime import datetime, timezone

	import pandas as pd

	from src.display.formatting import make_clickable_model
	from src.display.utils import eval_queue_cols
	from src.leaderboard.read_auto_pipeline_results import get_auto_pipeline_results_df as _get_auto_pipeline_results_df
	from src.queue_eta import format_eta

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)


	def _normalize_queue_entry(data: dict) -> dict:
	"""Normalize queue JSON keys to match EvalQueueColumn/QuantQueueColumn field names.

	Handles mismatches between:
	- JSON 'weight_dtype' → column 'weight_type'
	- Quant entries with 'quant_precision' but no 'precision'
	"""
	if "weight_type" not in data and "weight_dtype" in data:
	data["weight_type"] = data["weight_dtype"]
	if "precision" not in data and "quant_precision" in data:
	data["precision"] = data["quant_precision"]
	return data


	# ── Status inference from results (for quant requests without write-back) ──

	def _normalize_scheme(scheme: str) -> str:
	"""Normalize 'INT4 (W4A16)' and 'W4A16' to the same canonical key."""
	m = re.search(r"\(([^)]+)\)", scheme)
	return (m.group(1) if m else scheme).strip().upper()


	def _has_zero_accuracy_task(accuracy: dict) -> bool:
	"""Return True if any task in accuracy has acc == 0 (evaluation failure)."""
	tasks = accuracy.get("tasks")
	if not isinstance(tasks, dict):
	return False
	for task_val in tasks.values():
	acc_value = task_val if not isinstance(task_val, dict) else task_val.get("accuracy")
	try:
	if acc_value is not None and float(acc_value) == 0.0:
	return True
	except (TypeError, ValueError):
	pass
	return False


	def _derive_status_from_aggregate(agg: dict) -> str:
	"""Derive a pipeline status from an auto_pipeline aggregate result dict."""
	qs = agg.get("quant_summary") or {}
	acc = agg.get("accuracy") or {}
	qs_status = qs.get("status", "missing")
	acc_status = acc.get("status", "missing")

	if qs_status == "failed":
	return "Quant Failed"
	if acc_status == "failed":
	return "Eval Failed"
	if _has_zero_accuracy_task(acc):
	return "Eval Failed"
	if qs_status == "success" and acc_status == "success":
	return "Finished"
	if acc_status == "success":
	return "Finished"
	if qs_status == "success":
	return "Quantized"
	return "Partial"


	def _build_result_index(results_path: str) -> dict:
	"""Scan results/ for auto_pipeline aggregate files, build a lookup index.

	Returns: ``{(model_id_lower, scheme_normalized): [aggregate_dict, ...]}``
	where the list is sorted by ``generated_at`` ascending. Multiple
	aggregates per key are preserved so that a re-submission of a previously
	failed (model, scheme) does not cause the older queue entry to inherit the
	newer run's status.
	"""
	index: dict = {}
	for root, _, files in os.walk(results_path):
	for f in files:
	if not (f.startswith("results_") and f.endswith(".json")):
	continue
	fp = os.path.join(root, f)
	try:
	with open(fp) as fh:
	data = json.load(fh)
	except (json.JSONDecodeError, OSError):
	continue
	# Only auto_pipeline aggregate files have these keys
	if "run_dir" not in data or "copied_files" not in data:
	continue

	model_id = data.get("model_id", "")
	qs = data.get("quant_summary") or {}
	scheme = _normalize_scheme(qs.get("scheme", ""))
	key = (model_id.lower().strip(), scheme)

	index.setdefault(key, []).append(data)

	for key, aggs in index.items():
	aggs.sort(key=lambda a: a.get("generated_at", ""))
	return index


	def _pick_aggregate_in_window(
	aggs: list,
	submitted_time: str,
	next_submitted_time: str \| None,
	) -> dict \| None:
	"""Return the latest aggregate generated within ``[submitted, next_submitted)``.

	The window represents the lifetime of a single queue entry: the entry was
	submitted at ``submitted_time``, and any subsequent re-submission for the
	same (model, scheme) starts a new window at ``next_submitted_time``.
	Aggregates outside the window do not belong to this entry.
	"""
	if not aggs:
	return None
	chosen: dict \| None = None
	for agg in aggs:
	gen = agg.get("generated_at", "")
	if submitted_time and gen <= submitted_time:
	continue
	if next_submitted_time and gen >= next_submitted_time:
	break
	chosen = agg
	return chosen


	def _parse_iso_utc(value: str) -> datetime \| None:
	"""Parse the queue/result UTC timestamp format used in JSON files."""
	if not value:
	return None
	try:
	parsed = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
	except ValueError:
	return None
	if parsed.tzinfo is None:
	parsed = parsed.replace(tzinfo=timezone.utc)
	return parsed.astimezone(timezone.utc)


	def _result_is_newer_than_submission(entry: dict, agg: dict) -> bool:
	    """Check that an aggregate was generated strictly after the submission.

	    A failed model can be re-submitted under the same model/scheme key, so
	    stale aggregate files still match the new request; they must not
	    override the fresh ``Pending`` status in ``status/``.

	    Compares ``entry["submitted_time"]`` with ``agg["generated_at"]`` after
	    parsing both as UTC timestamps. If either one is missing or cannot be
	    parsed, ``True`` is returned so the earlier behaviour is kept.
	    """
	    submitted_at = _parse_iso_utc(entry.get("submitted_time", ""))
	    generated_at = _parse_iso_utc(agg.get("generated_at", ""))
	    if submitted_at is not None and generated_at is not None:
	        return generated_at > submitted_at
	    # No comparable pair of timestamps: do not filter the aggregate out.
	    return True


	def _infer_quant_status(entry: dict, result_index: dict) -> str:
	"""Infer the true status of a quant request from results/ aggregates.

	The caller is expected to have populated ``entry["_matched_aggregate"]``
	with the aggregate that belongs to this submission window (see
	``_attach_matched_aggregates``). ``result_index`` is kept in the signature
	for backward compatibility but no longer used directly.

	Falls back to the original status field if no matching result is found.
	"""
	original_status = entry.get("status", "Pending")
	# Already a terminal status — trust it. ``Failed`` is the value written by
	# the CI dispatcher when retries are exhausted; map it to ``Quant Failed``
	# so the UI groups it with the rest of the failed quantizations instead of
	# dropping it entirely.
	if original_status in ("Finished", "Quant Failed", "Eval Failed"):
	return original_status
	if original_status == "Failed":
	return "Quant Failed"

	agg = entry.get("_matched_aggregate")
	if agg is None:
	return original_status
	return _derive_status_from_aggregate(agg)


	def _infer_eval_status(entry: dict, result_index: dict) -> str:
	"""Infer the true status of an eval request from results/ aggregates.

	Uses the per-entry aggregate stored in ``entry["_matched_aggregate"]``
	(populated by ``_attach_matched_aggregates``) so that an older failed
	submission is not overwritten by a newer success for the same model.
	"""
	original_status = entry.get("status", "Pending")
	if original_status in ("Finished", "Eval Failed"):
	return original_status
	if original_status == "Failed":
	return "Eval Failed"

	agg = entry.get("_matched_aggregate")
	if agg is None:
	return original_status

	acc = agg.get("accuracy") or {}
	acc_status = acc.get("status", "missing")
	if acc_status == "failed" or _has_zero_accuracy_task(acc):
	return "Eval Failed"
	if acc_status == "success":
	return "Finished"
	return original_status


	def _quant_match_key(entry: dict) -> tuple[str, str]:
	    """Build the ``(model, scheme)`` lookup key for a quant queue entry.

	    The model comes from ``_model_id`` (falling back to ``model``) and is
	    lower-cased and stripped; the scheme is ``quant_scheme`` passed through
	    ``_normalize_scheme``.
	    """
	    raw_model = entry.get("_model_id", entry.get("model", ""))
	    scheme_key = _normalize_scheme(entry.get("quant_scheme", ""))
	    model_key = raw_model.lower().strip()
	    return (model_key, scheme_key)


	def _eval_match_keys(entry: dict, result_index: dict) -> list[tuple[str, str]]:
	"""Return all aggregate keys whose model matches this eval entry.

	Eval entries are submitted against an already-quantized model and don't
	carry a separate ``quant_scheme`` field for matching, so we collect every
	indexed key whose model_id matches.
	"""
	model = entry.get("_model_id", entry.get("model", "")).lower().strip()
	return [k for k in result_index if k[0] == model]


	def _attach_matched_aggregates(
	entries: list[dict],
	result_index: dict,
	request_type: str \| None,
	) -> None:
	"""Attach the per-entry aggregate to ``entry["_matched_aggregate"]``.

	Entries are grouped by their inference key and sorted by ``submitted_time``
	so that aggregates can be windowed: the i-th submission for a given key
	consumes only aggregates generated between its ``submitted_time`` and the
	(i+1)-th submission's ``submitted_time``. This prevents an older failed
	queue entry from inheriting a newer successful run's status when a model
	is re-submitted.
	"""
	# Build groups by key
	groups: dict = {}
	for entry in entries:
	if request_type == "quant":
	keys = [_quant_match_key(entry)]
	elif request_type == "eval":
	keys = _eval_match_keys(entry, result_index)
	else:
	keys = [_quant_match_key(entry)]
	# Use a single representative key per entry so that windowing is
	# well-defined. For eval we may have multiple matching keys; in that
	# case window each group independently.
	for key in keys or [None]:
	groups.setdefault(key, []).append(entry)

	for key, group_entries in groups.items():
	if key is None:
	continue
	aggs = result_index.get(key, [])
	if not aggs:
	continue
	ordered = sorted(group_entries, key=lambda e: e.get("submitted_time", ""))
	for i, entry in enumerate(ordered):
	submitted = entry.get("submitted_time", "")
	next_submitted = (
	ordered[i + 1].get("submitted_time", "") if i + 1 < len(ordered) else None
	)
	agg = _pick_aggregate_in_window(aggs, submitted, next_submitted)
	if agg is not None:
	# First match wins — for eval entries that map to multiple
	# keys, keep the earliest generated aggregate.
	existing = entry.get("_matched_aggregate")
	if existing is None or agg.get("generated_at", "") < existing.get("generated_at", ""):
	entry["_matched_aggregate"] = agg


	def _load_queue_entries(save_path: str, request_type: str = None) -> list[dict]:
	"""Load all queue JSON entries from save_path (including subdirectories).

	Args:
	save_path: Directory containing queue JSON files.
	request_type: Optional filter — ``"eval"`` keeps only ``_eval_request_``
	files, ``"quant"`` keeps only ``_quant_request_`` files.
	``None`` keeps all files (backward-compatible).

	Returns:
	List of loaded & normalized dicts.
	"""
	if not os.path.isdir(save_path):
	return []
	entries = [e for e in os.listdir(save_path) if not e.startswith(".")]
	all_evals = []

	def _process_file(file_path: str):
	fname = os.path.basename(file_path)
	if request_type == "eval" and "_quant_request_" in fname:
	return
	if request_type == "quant" and "_eval_request_" in fname:
	return

	try:
	with open(file_path) as fp:
	data = json.load(fp)
	except (json.JSONDecodeError, OSError) as e:
	logger.warning("Skipping malformed queue file %s: %s", file_path, e)
	return

	if "model" not in data:
	logger.warning("Skipping queue file without 'model' key: %s", file_path)
	return

	data["_model_id"] = data["model"] # preserve raw ID before HTML transform
	data[eval_queue_cols.model.name] = make_clickable_model(data["model"])
	data[eval_queue_cols.revision.name] = data.get("revision", "main")
	_normalize_queue_entry(data)
	all_evals.append(data)

	for entry in entries:
	full_path = os.path.join(save_path, entry)
	if entry.endswith(".json"):
	_process_file(full_path)
	elif os.path.isdir(full_path):
	sub_entries = [e for e in os.listdir(full_path) if not e.startswith(".")]
	for sub_entry in sub_entries:
	if sub_entry.endswith(".json"):
	_process_file(os.path.join(full_path, sub_entry))

	return all_evals


	def _split_by_status(all_evals: list[dict]) -> tuple[list, list, list, list]:
	"""Split entries into (pending, running, finished, failed) lists by status."""
	pending = [e for e in all_evals if e.get("status") in ("Pending", "Rerun", "Waiting", "Quantized")]
	running = [e for e in all_evals if e.get("status") in ("Running", "Triggered")]
	finished = [e for e in all_evals
	if e.get("status", "").startswith("Finished")
	or e.get("status") == "PENDING_NEW_EVAL"]
	# ``Failed`` is written by the CI dispatcher when retries are exhausted.
	# Include it (and its case variants) here so retry-exhausted entries are
	# not silently dropped from the UI.
	failed = [e for e in all_evals
	if e.get("status") in ("Quant Failed", "Eval Failed", "Partial",
	"Failed", "failed")]
	return pending, running, finished, failed


	def _build_queue_dfs(pending: list, running: list, finished: list, failed: list,
	cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
	"""Build DataFrames from split lists, keeping only the requested cols."""
	# Determine which requested cols actually exist across ALL records so we
	# never drop a column just because the first entry happens to be old/missing it.
	all_records = pending + running + finished + failed
	if all_records:
	present_keys: set[str] = set()
	for rec in all_records:
	present_keys.update(rec.keys())
	existing_cols = [c for c in cols if c in present_keys]
	else:
	existing_cols = cols

	def _to_df(records):
	if not records:
	return pd.DataFrame(columns=existing_cols)
	return pd.DataFrame.from_records(records, columns=existing_cols)

	df_finished = _to_df(finished)
	df_running = _to_df(running)
	df_pending = _to_df(pending)
	df_failed = _to_df(failed)

	return df_finished, df_running, df_pending, df_failed


	def _inject_eta(pending: list[dict], running: list[dict], concurrency: int = 2):
	    """Write an ``eta`` value into every pending entry (slot-simulation ETA).

	    Formula: ETA = running_remaining + ceil(queue_pos / concurrency) * task_hours

	    ``running_remaining`` is the mean of the estimated task hours of the
	    running entries (0.0 when nothing is running). ``queue_pos`` counts only
	    entries whose ``script`` is ``auto_quant`` or ``auto_eval``; every other
	    entry gets an empty ETA string.

	    ``pending`` is sorted in place by ``submitted_time`` (oldest first) and
	    its entries are modified in place.
	    """
	    import math
	    from src.queue_eta import estimate_task_hours, format_eta, _get_params

	    # Queue position is only meaningful once the oldest request comes first.
	    pending.sort(key=lambda e: e.get("submitted_time", ""))

	    running_remaining = 0.0
	    if running:
	        per_job_hours = [estimate_task_hours(_get_params(job)) for job in running]
	        running_remaining = sum(per_job_hours) / len(per_job_hours)

	    position = 0  # 1-based position among auto_quant/auto_eval entries only
	    for item in pending:
	        if item.get("script", "") not in ("auto_quant", "auto_eval"):
	            item["eta"] = ""
	            continue
	        position += 1
	        hours = estimate_task_hours(_get_params(item))
	        estimate = running_remaining + math.ceil(position / concurrency) * hours
	        item["eta"] = format_eta(estimate)


	def get_evaluation_queue_df(save_path: str, cols: list,
	                            request_type: str = None,
	                            results_path: str = None) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
	    """Read the queue and return (finished, running, pending, failed) DataFrames.

	    Args:
	        save_path: Directory containing queue JSON files.
	        cols: Column names to keep in the output DataFrames.
	        request_type: ``"eval"``, ``"quant"``, or ``None`` (all).
	        results_path: Path to results directory. When given, each entry is
	            cross-referenced against auto_pipeline aggregate results so that
	            completed jobs whose status was never written back are
	            classified correctly. Statuses are only rewritten for the
	            ``"quant"`` and ``"eval"`` request types.
	    """
	    queue_entries = _load_queue_entries(save_path, request_type=request_type)

	    if results_path:
	        result_index = _build_result_index(results_path)
	        _attach_matched_aggregates(queue_entries, result_index, request_type)
	        # Pick the status-inference routine for this request type, if any.
	        if request_type == "quant":
	            infer = _infer_quant_status
	        elif request_type == "eval":
	            infer = _infer_eval_status
	        else:
	            infer = None
	        if infer is not None:
	            for item in queue_entries:
	                item["status"] = infer(item, result_index)

	    pending, running, finished, failed = _split_by_status(queue_entries)

	    # Only pending entries get a computed ETA ...
	    _inject_eta(pending, running, concurrency=2)

	    # ... every other entry just needs the key to exist.
	    for bucket in (running, finished, failed):
	        for item in bucket:
	            item.setdefault("eta", "")

	    return _build_queue_dfs(pending, running, finished, failed, cols)


	def get_auto_pipeline_results_df(results_path: str) -> pd.DataFrame:
	    """Return the auto-pipeline results DataFrame for ``results_path``.

	    Thin wrapper that forwards to the implementation imported from
	    ``src.leaderboard.read_auto_pipeline_results``.
	    """
	    results_df = _get_auto_pipeline_results_df(results_path)
	    return results_df