Upload iab_taxonomy.py with huggingface_hub

ed7ac51 verified 2 months ago

7.44 kB

	from __future__ import annotations

	import csv
	import json
	import os
	from dataclasses import dataclass
	from functools import lru_cache
	from pathlib import Path

	try:
	from .config import IAB_TAXONOMY_GRAPH_PATH, IAB_TAXONOMY_PATH, IAB_TAXONOMY_VERSION # type: ignore
	except ImportError:
	from config import IAB_TAXONOMY_GRAPH_PATH, IAB_TAXONOMY_PATH, IAB_TAXONOMY_VERSION

	# Fallback Hugging Face model repo id: _resolve_taxonomy_path() uses it when
	# the ADMESH_MODEL_REPO_ID environment variable is unset or blank.
	_DEFAULT_MODEL_REPO_ID = "admesh/agentic-intent-classifier"


	@dataclass(frozen=True)
	class IabNode:
	unique_id: str
	parent_id: str \| None
	label: str
	path: tuple[str, ...]

	@property
	def level(self) -> int:
	return len(self.path)

	@property
	def path_label(self) -> str:
	return path_to_label(self.path)


	class IabTaxonomy:
	    """Lookup structure built over a flat list of taxonomy nodes.

	    Nodes are indexed three ways at construction time: by their full path,
	    by their parent path (the path minus its last label) and by level.
	    """

	    def __init__(self, nodes: list[IabNode]):
	        """Store ``nodes`` and build the path, children and level indexes."""
	        self.nodes = nodes
	        self._path_index = {entry.path: entry for entry in nodes}
	        self._children_index: dict[tuple[str, ...], list[IabNode]] = {}
	        self._level_index: dict[int, list[IabNode]] = {}
	        for entry in nodes:
	            parent_path = entry.path[:-1]
	            self._children_index.setdefault(parent_path, []).append(entry)
	            self._level_index.setdefault(entry.level, []).append(entry)
	        # Order every bucket by path so the list-returning lookups are stable.
	        for bucket in (*self._children_index.values(), *self._level_index.values()):
	            bucket.sort(key=lambda member: member.path)

	    def get_node(self, path: tuple[str, ...]) -> IabNode:
	        """Return the node stored under ``path``; raise KeyError if unknown."""
	        found = self._path_index.get(path)
	        if found is None:
	            raise KeyError(f"Unknown IAB path: {path}")
	        return found

	    def build_level(self, path: tuple[str, ...]) -> dict:
	        """Return ``{"id", "label"}`` for the node at ``path``."""
	        target = self.get_node(path)
	        return dict(id=target.unique_id, label=target.label)

	    def has_path(self, path: tuple[str, ...]) -> bool:
	        """Tell whether a node is registered under ``path``."""
	        known_paths = self._path_index
	        return path in known_paths

	    def immediate_children(self, prefix: tuple[str, ...]) -> list[IabNode]:
	        """Return a new list of the nodes whose parent path equals ``prefix``."""
	        return [*self._children_index.get(prefix, ())]

	    def siblings(self, path: tuple[str, ...]) -> list[IabNode]:
	        """Return the nodes sharing a parent path with ``path``, excluding it.

	        Raises KeyError when ``path`` itself is not a known node.
	        """
	        own_path = self.get_node(path).path
	        same_parent = self._children_index.get(path[:-1], [])
	        return [peer for peer in same_parent if peer.path != own_path]

	    def level_nodes(self, level: int) -> list[IabNode]:
	        """Return a new list of all nodes whose level equals ``level``."""
	        return [*self._level_index.get(level, ())]

	    def to_training_graph(self) -> dict:
	        """Return a dict describing every node plus summary counts.

	        Each node entry lists its own fields together with the ids and path
	        labels of its immediate children and of its siblings. The summary
	        carries per-level counts for levels 1 through 4.
	        """

	        def describe(entry):
	            # One graph record per node; key order mirrors the emitted schema.
	            children = self.immediate_children(entry.path)
	            peers = self.siblings(entry.path)
	            return {
	                "node_id": entry.unique_id,
	                "parent_id": entry.parent_id,
	                "level": entry.level,
	                "label": entry.label,
	                "path": list(entry.path),
	                "path_label": entry.path_label,
	                "child_ids": [child.unique_id for child in children],
	                "child_paths": [child.path_label for child in children],
	                "sibling_ids": [peer.unique_id for peer in peers],
	                "sibling_paths": [peer.path_label for peer in peers],
	                "canonical_surface_name": entry.label,
	            }

	        graph_nodes = [describe(entry) for entry in self.nodes]
	        tier_counts = {}
	        for tier in range(1, 5):
	            tier_counts[f"tier{tier}"] = len(self.level_nodes(tier))
	        return {
	            "taxonomy": "IAB Content Taxonomy",
	            "taxonomy_version": IAB_TAXONOMY_VERSION,
	            "node_count": len(graph_nodes),
	            "level_counts": tier_counts,
	            "nodes": graph_nodes,
	        }

	    def build_content_object(self, path: tuple[str, ...], mapping_mode: str, mapping_confidence: float) -> dict:
	        """Return the content payload for ``path``.

	        The payload always carries ``tier1`` and adds ``tier2`` .. ``tier4``
	        for as many further labels as ``path`` has (labels beyond the fourth
	        are ignored). ``mapping_confidence`` is stored as a float rounded to
	        four decimals.

	        Raises:
	            ValueError: if ``path`` is empty.
	            KeyError: if any used prefix of ``path`` is not a known node.
	        """
	        if not path:
	            raise ValueError("IAB path must not be empty")

	        payload = {
	            "taxonomy": "IAB Content Taxonomy",
	            "taxonomy_version": IAB_TAXONOMY_VERSION,
	            "tier1": self.build_level(path[:1]),
	            "mapping_mode": mapping_mode,
	            "mapping_confidence": round(float(mapping_confidence), 4),
	        }
	        deepest_tier = min(len(path), 4)
	        for depth in range(2, deepest_tier + 1):
	            payload[f"tier{depth}"] = self.build_level(path[:depth])
	        return payload

	    def build_content_object_from_label(
	        self,
	        path_label: str,
	        mapping_mode: str,
	        mapping_confidence: float,
	    ) -> dict:
	        """Parse ``path_label`` with ``parse_path_label`` and build its payload."""
	        parsed_path = parse_path_label(path_label)
	        return self.build_content_object(
	            path=parsed_path,
	            mapping_mode=mapping_mode,
	            mapping_confidence=mapping_confidence,
	        )


	def parse_path_label(path_label: str) -> tuple[str, ...]:
	    """Split a ``>``-separated label into a tuple of its trimmed parts.

	    Parts that are empty after trimming are discarded.

	    Raises:
	        ValueError: if no non-empty part remains.
	    """
	    trimmed_parts = map(str.strip, path_label.split(">"))
	    segments = tuple(filter(None, trimmed_parts))
	    if len(segments) == 0:
	        raise ValueError("IAB path label must not be empty")
	    return segments


	def path_to_label(path: tuple[str, ...]) -> str:
	    """Join the labels of ``path`` with ``" > "`` into a single string.

	    Raises:
	        ValueError: if ``path`` is empty.
	    """
	    if path:
	        separator = " > "
	        return separator.join(path)
	    raise ValueError("IAB path must not be empty")


	def _load_rows(path: Path) -> list[dict]:
	    """Read the tab-separated file at ``path`` into one dict per data row.

	    The first row of the file is skipped, the second row supplies the column
	    names, and every later row becomes a ``{column: value}`` dict. Rows
	    shorter than the header are right-padded with empty strings; cells beyond
	    the header width are dropped.

	    Raises:
	        ValueError: if the file has fewer than two rows, so there is no
	            header row to read column names from.
	    """
	    # newline="" hands line-ending handling to the csv module, as its docs
	    # require; without it, newlines inside quoted cells get rewritten.
	    with path.open("r", encoding="utf-8", newline="") as handle:
	        rows = list(csv.reader(handle, delimiter="\t"))

	    if len(rows) < 2:
	        raise ValueError(f"Taxonomy TSV at {path} has no header row (expected on line 2)")

	    header = rows[1]
	    width = len(header)
	    # A negative pad count yields an empty list, so long rows pass through
	    # unchanged and zip() truncates them to the header width.
	    return [dict(zip(header, row + [""] * (width - len(row)))) for row in rows[2:]]


	def _resolve_taxonomy_path() -> Path:
	    """Return a path to the taxonomy TSV, downloading it when not present locally.

	    ``IAB_TAXONOMY_PATH`` is returned as-is when it exists. Otherwise the file
	    is fetched with ``huggingface_hub.hf_hub_download`` from the model repo
	    named by the ``ADMESH_MODEL_REPO_ID`` environment variable (falling back
	    to ``_DEFAULT_MODEL_REPO_ID`` when unset or blank), at the revision given
	    by ``ADMESH_MODEL_REVISION`` (``None`` when unset or blank).

	    Raises:
	        FileNotFoundError: if the local file is missing and ``huggingface_hub``
	            is not installed.
	    """
	    local_copy = IAB_TAXONOMY_PATH
	    if local_copy.exists():
	        return local_copy

	    # The data file may be absent next to the code (e.g. when loaded as a
	    # Hugging Face dynamic module), so fall back to the model repo.
	    env = os.environ
	    repo_id = env.get("ADMESH_MODEL_REPO_ID", _DEFAULT_MODEL_REPO_ID).strip()
	    if not repo_id:
	        repo_id = _DEFAULT_MODEL_REPO_ID
	    revision = env.get("ADMESH_MODEL_REVISION", "").strip()
	    if not revision:
	        revision = None
	    filename = f"data/iab-content/Content Taxonomy {IAB_TAXONOMY_VERSION}.tsv"

	    try:
	        from huggingface_hub import hf_hub_download
	    except ModuleNotFoundError as exc:
	        raise FileNotFoundError(
	            f"Taxonomy TSV missing at {IAB_TAXONOMY_PATH}; install huggingface_hub or provide local taxonomy file."
	        ) from exc

	    fetched = hf_hub_download(
	        repo_id=repo_id,
	        repo_type="model",
	        filename=filename,
	        revision=revision,
	    )
	    return Path(fetched)


	@lru_cache(maxsize=1)
	def get_iab_taxonomy() -> IabTaxonomy:
	    """Load the taxonomy TSV and return it as an ``IabTaxonomy``.

	    The result is memoized by ``lru_cache``, so the file is resolved and
	    parsed on the first call only. Rows whose four tier columns are all blank
	    are skipped; blank tier cells within a row are left out of its path.
	    """
	    tier_columns = ("Tier 1", "Tier 2", "Tier 3", "Tier 4")
	    collected = []
	    for row in _load_rows(_resolve_taxonomy_path()):
	        tier_values = [row.get(column, "").strip() for column in tier_columns]
	        path = tuple(tier for tier in tier_values if tier)
	        if path:
	            collected.append(
	                IabNode(
	                    unique_id=row["Unique ID"].strip(),
	                    parent_id=row["Parent"].strip() or None,
	                    label=row["Name"].strip(),
	                    path=path,
	                )
	            )
	    return IabTaxonomy(collected)


	def write_training_graph(path: Path = IAB_TAXONOMY_GRAPH_PATH) -> Path:
	    """Write the taxonomy's training graph to ``path`` as JSON and return ``path``.

	    Missing parent directories are created. The JSON is indented by two
	    spaces, has sorted keys, ends with a newline and is encoded as UTF-8.
	    """
	    taxonomy = get_iab_taxonomy()
	    target_dir = path.parent
	    target_dir.mkdir(parents=True, exist_ok=True)
	    graph_json = json.dumps(taxonomy.to_training_graph(), indent=2, sort_keys=True)
	    path.write_text(graph_json + "\n", encoding="utf-8")
	    return path