Upload folder using huggingface_hub

0584798 verified 2 months ago

6.65 kB

	from __future__ import annotations

	import os
	from collections import Counter

	from combined_inference import classify_query
	from iab_classifier import predict_iab_content_classifier_batch
	from iab_retrieval import predict_iab_content_retrieval_batch
	from iab_taxonomy import parse_path_label


	def _include_shadow_retrieval_in_iab_views() -> bool:
	    """Report whether the shadow retrieval view is enabled through the environment.

	    Reads ``IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL`` (defaulting to ``"0"``) and
	    treats ``1``, ``true`` and ``yes`` as enabled, ignoring case and
	    surrounding whitespace. Any other value disables the view.

	    Shadow retrieval loads Alibaba-NLP/gte-Qwen2-1.5B (~7GB) when the
	    taxonomy index exists.
	    """
	    enabled_tokens = ("1", "true", "yes")
	    raw_setting = os.environ.get("IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL", "0")
	    normalized = raw_setting.strip().lower()
	    return normalized in enabled_tokens


	def path_from_content(content: dict) -> tuple[str, ...]:
	    """Collect the tier labels present in *content* into an ordered path tuple.

	    The keys ``tier1`` .. ``tier4`` are inspected in that order; each one that
	    exists contributes its ``["label"]`` value. Absent tiers are skipped
	    without stopping the scan, so a later tier can follow a missing one.
	    """
	    tier_keys = ("tier1", "tier2", "tier3", "tier4")
	    return tuple(content[key]["label"] for key in tier_keys if key in content)


	def path_from_label(label: str) -> tuple[str, ...]:
	    """Turn a path label string into a path by delegating to ``parse_path_label``."""
	    parsed_path = parse_path_label(label)
	    return parsed_path


	def is_parent_safe(true_path: tuple[str, ...], pred_path: tuple[str, ...]) -> bool:
	    """Return True when *pred_path* is a non-empty prefix of *true_path*.

	    An exact match counts as a prefix. An empty prediction, or one that is
	    deeper than the true path, is never parent-safe.
	    """
	    if pred_path and len(pred_path) <= len(true_path):
	        # Compare only the leading tiers the prediction committed to.
	        return true_path[: len(pred_path)] == pred_path
	    return False


	def error_bucket(true_path: tuple[str, ...], pred_path: tuple[str, ...]) -> str:
	    """Name the outcome category for one (true, predicted) path pair.

	    The checks run in a fixed order and the first one that applies wins:

	    1. ``exact_match`` - the paths are equal.
	    2. ``no_prediction`` - the prediction is empty.
	    3. ``wrong_tier1`` - the first tiers differ.
	    4. ``right_tier1_wrong_tier2`` - the true path has a second tier and the
	       prediction either lacks one or has a different one.
	    5. ``parent_safe_stop`` - the prediction is a prefix of the true path.
	    6. ``wrong_deep_leaf`` - anything else.

	    Because step 4 runs before step 5, a correct tier-1-only prediction for a
	    deeper true path is reported as ``right_tier1_wrong_tier2``.
	    """
	    if pred_path == true_path:
	        return "exact_match"
	    if not pred_path:
	        return "no_prediction"

	    tier1_differs = true_path[:1] != pred_path[:1]
	    if tier1_differs:
	        return "wrong_tier1"

	    if len(true_path) >= 2:
	        tier2_agrees = len(pred_path) >= 2 and true_path[:2] == pred_path[:2]
	        if not tier2_agrees:
	            return "right_tier1_wrong_tier2"

	    return "parent_safe_stop" if is_parent_safe(true_path, pred_path) else "wrong_deep_leaf"


	def compute_path_metrics(true_paths: list[tuple[str, ...]], pred_paths: list[tuple[str, ...]]) -> dict:
	    """Summarise how closely predicted taxonomy paths follow the true paths.

	    Paths are paired positionally with ``zip``. The returned dict holds:

	    * ``tier1_accuracy`` .. ``tier4_accuracy`` - among pairs whose true path
	      reaches that tier, the share whose prediction matches the true path up
	      to and including that tier.
	    * ``exact_path_accuracy`` - share of pairs with identical paths.
	    * ``parent_safe_accuracy`` - share of pairs accepted by ``is_parent_safe``.
	    * ``average_prediction_depth`` - total length of all entries in
	      *pred_paths* divided by ``len(true_paths)``.
	    * ``error_buckets`` - counts per ``error_bucket`` label, sorted by label.

	    Ratios are rounded to four decimals. When *true_paths* is empty every
	    ratio is ``0.0`` and ``error_buckets`` is empty.
	    """
	    sample_count = len(true_paths)
	    if not sample_count:
	        # Nothing to score: report zeros rather than dividing by zero.
	        return {
	            "tier1_accuracy": 0.0,
	            "tier2_accuracy": 0.0,
	            "tier3_accuracy": 0.0,
	            "tier4_accuracy": 0.0,
	            "exact_path_accuracy": 0.0,
	            "parent_safe_accuracy": 0.0,
	            "average_prediction_depth": 0.0,
	            "error_buckets": {},
	        }

	    levels = (1, 2, 3, 4)
	    hits_by_level = dict.fromkeys(levels, 0)
	    rows_by_level = dict.fromkeys(levels, 0)
	    outcome_counts = Counter()
	    exact_total = 0
	    parent_safe_total = 0

	    for expected, predicted in zip(true_paths, pred_paths):
	        if predicted == expected:
	            exact_total += 1
	        if is_parent_safe(expected, predicted):
	            parent_safe_total += 1
	        outcome_counts[error_bucket(expected, predicted)] += 1
	        # Only tiers that the true path actually reaches are scored.
	        for level in levels[: len(expected)]:
	            rows_by_level[level] += 1
	            if len(predicted) >= level and expected[:level] == predicted[:level]:
	                hits_by_level[level] += 1

	    def tier_accuracy(level: int) -> float:
	        # max(..., 1) keeps the division defined for tiers no true path reached.
	        return round(hits_by_level[level] / max(rows_by_level[level], 1), 4)

	    mean_depth = sum(map(len, pred_paths)) / sample_count
	    return {
	        "tier1_accuracy": tier_accuracy(1),
	        "tier2_accuracy": tier_accuracy(2),
	        "tier3_accuracy": tier_accuracy(3),
	        "tier4_accuracy": tier_accuracy(4),
	        "exact_path_accuracy": round(exact_total / sample_count, 4),
	        "parent_safe_accuracy": round(parent_safe_total / sample_count, 4),
	        "average_prediction_depth": round(mean_depth, 4),
	        "error_buckets": {label: outcome_counts[label] for label in sorted(outcome_counts)},
	    }


	def evaluate_iab_views(rows: list[dict], max_combined_rows: int = 500) -> dict:
	    """Score the IAB classifier, optional shadow retrieval, and combined pipeline.

	    Each row must provide ``"text"`` and ``"iab_path"``; the latter is parsed
	    into the true path via ``path_from_label``.

	    Views written to the result, in order:

	    * ``classifier`` - path metrics for the batch classifier outputs.
	    * ``shadow_embedding_retrieval`` - a "skipped" note when the environment
	      flag is off; path metrics when it is on and at least one retrieval
	      output is not ``None``; otherwise the key is not set.
	    * ``combined_path`` / ``disagreements`` - "skipped" notes when there are
	      more than *max_combined_rows* rows; otherwise path metrics plus
	      fallback statistics for ``classify_query``, and pairwise counts of
	      differing paths between the views.

	    Raises:
	        RuntimeError: when no classifier output is available (every entry is
	            ``None``, which includes an empty output list).
	    """

	    def extract_paths(outputs):
	        # A missing (None) output is scored as an empty prediction.
	        return [path_from_content(item["content"]) if item is not None else tuple() for item in outputs]

	    def count_mismatches(left_paths, right_paths):
	        return sum(1 for left, right in zip(left_paths, right_paths) if left != right)

	    texts = [row["text"] for row in rows]
	    true_paths = [path_from_label(row["iab_path"]) for row in rows]

	    classifier_outputs = predict_iab_content_classifier_batch(texts)
	    if all(item is None for item in classifier_outputs):
	        raise RuntimeError(
	            "IAB classifier artifacts are unavailable. Run `python3 training/train_iab.py` "
	            "and `python3 training/calibrate_confidence.py --head iab_content` "
	            "from the `agentic-intent-classifier` directory first."
	        )

	    classifier_paths = extract_paths(classifier_outputs)
	    views = {"classifier": compute_path_metrics(true_paths, classifier_paths)}

	    # Stays None unless shadow retrieval ran and produced at least one output.
	    retrieval_paths = None
	    if _include_shadow_retrieval_in_iab_views():
	        retrieval_outputs = predict_iab_content_retrieval_batch(texts)
	        if any(item is not None for item in retrieval_outputs):
	            retrieval_paths = extract_paths(retrieval_outputs)
	            views["shadow_embedding_retrieval"] = compute_path_metrics(true_paths, retrieval_paths)
	    else:
	        views["shadow_embedding_retrieval"] = {
	            "skipped": True,
	            "reason": "disabled_by_default",
	            "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).",
	        }

	    row_count = len(rows)
	    if row_count > max_combined_rows:
	        # The combined view calls classify_query once per row, so it is capped.
	        for view_name in ("combined_path", "disagreements"):
	            views[view_name] = {
	                "skipped": True,
	                "reason": "dataset_too_large_for_combined_view",
	                "count": row_count,
	                "max_combined_rows": max_combined_rows,
	            }
	        return views

	    combined_payloads = [classify_query(text) for text in texts]
	    model_outputs = [payload["model_output"] for payload in combined_payloads]
	    combined_contents = [output["classification"]["iab_content"] for output in model_outputs]
	    fallback_flags = [bool(output.get("fallback")) for output in model_outputs]
	    combined_paths = [path_from_content(content) for content in combined_contents]

	    fallback_total = sum(fallback_flags)
	    combined_view = dict(compute_path_metrics(true_paths, combined_paths))
	    combined_view["fallback_rate"] = round(fallback_total / max(len(fallback_flags), 1), 4)
	    combined_view["fallback_overuse_count"] = fallback_total
	    views["combined_path"] = combined_view

	    disagreements = {"classifier_vs_combined": count_mismatches(classifier_paths, combined_paths)}
	    if retrieval_paths is not None:
	        disagreements["retrieval_vs_classifier"] = count_mismatches(retrieval_paths, classifier_paths)
	        disagreements["retrieval_vs_combined"] = count_mismatches(retrieval_paths, combined_paths)
	    views["disagreements"] = disagreements
	    return views