admesh
/

agentic-intent-classifier

Text Classification

intent-classification

conversational-ai

calibrated-confidence

text-embeddings-inference

Model card Files Files and versions

agentic-intent-classifier / training /common.py

manikumargouni's picture

Upload folder using huggingface_hub

0584798 verified 2 months ago

history blame contribute delete

2.57 kB

	from __future__ import annotations

	import json
	import sys
	from pathlib import Path

	import numpy as np
	import torch
	from datasets import Dataset
	from sklearn.metrics import accuracy_score, f1_score

	BASE_DIR = Path(__file__).resolve().parent.parent
	if str(BASE_DIR) not in sys.path:
	sys.path.insert(0, str(BASE_DIR))


	def load_labeled_rows(path: Path, label_field: str, label2id: dict[str, int]) -> list[dict]:
	rows = []
	with path.open("r", encoding="utf-8") as handle:
	for line in handle:
	item = json.loads(line)
	rows.append({"text": item["text"], "label": label2id[item[label_field]]})
	return rows


	def load_labeled_rows_from_paths(paths: list[Path], label_field: str, label2id: dict[str, int]) -> list[dict]:
	rows = []
	for path in paths:
	if not path.exists():
	continue
	rows.extend(load_labeled_rows(path, label_field, label2id))
	return rows


	def prepare_dataset(rows: list[dict], tokenizer, max_length: int) -> Dataset:
	dataset = Dataset.from_list(rows)

	def tokenize(batch):
	return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_length)

	dataset = dataset.map(tokenize, batched=True)
	dataset = dataset.remove_columns(["text"])
	dataset.set_format("torch")
	return dataset


	def build_balanced_class_weights(rows: list[dict], num_labels: int) -> torch.Tensor:
	counts = np.zeros(num_labels, dtype=np.float32)
	for row in rows:
	counts[row["label"]] += 1.0

	nonzero = counts > 0
	if not np.any(nonzero):
	return torch.ones(num_labels, dtype=torch.float32)

	total = float(counts.sum())
	active_labels = float(np.count_nonzero(nonzero))
	weights = np.ones(num_labels, dtype=np.float32)
	weights[nonzero] = total / (active_labels * counts[nonzero])
	return torch.tensor(weights, dtype=torch.float32)


	def build_label_weight_tensor(labels: tuple[str, ...], weight_map: dict[str, float]) -> torch.Tensor:
	return torch.tensor(
	[float(weight_map.get(label, 1.0)) for label in labels],
	dtype=torch.float32,
	)


	def compute_classification_metrics(eval_pred):
	logits, labels = eval_pred
	preds = np.argmax(logits, axis=-1)
	return {
	"accuracy": accuracy_score(labels, preds),
	"macro_f1": f1_score(labels, preds, average="macro"),
	}


	def write_json(path: Path, payload: dict) -> None:
	path.parent.mkdir(parents=True, exist_ok=True)
	path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")