agentic-intent-classifier / training /build_decision_phase_difficulty_dataset.py

Upload folder using huggingface_hub

0584798 verified 2 months ago

13 kB

	from __future__ import annotations

	import json
	from pathlib import Path

	# Project base directory: the parent of the directory that contains this file.
	BASE_DIR = Path(__file__).resolve().parent.parent
	# Directory that receives the per-split JSONL files written by main().
	OUTPUT_DIR = BASE_DIR / "data" / "decision_phase_difficulty"
	# Single JSONL file that receives the benchmark rows written by main().
	BENCHMARK_PATH = BASE_DIR / "data" / "decision_phase_benchmark.jsonl"

	# Themes used by build_training_splits() to fill template placeholders.
	# Every theme supplies the same ten keys: topic, product, products,
	# provider_a, provider_b, domain, goal, support_object, support_detail, asset.
	TRAIN_THEMES = (
	{
	"topic": "crm software",
	"product": "CRM",
	"products": "CRM tools",
	"provider_a": "HubSpot",
	"provider_b": "Zoho",
	"domain": "a small sales team",
	"goal": "manage leads better",
	"support_object": "account",
	"support_detail": "password reset",
	"asset": "CRM onboarding guide",
	},
	{
	"topic": "analytics software",
	"product": "analytics platform",
	"products": "analytics platforms",
	"provider_a": "Mixpanel",
	"provider_b": "Amplitude",
	"domain": "a product team",
	"goal": "measure activation",
	"support_object": "dashboard",
	"support_detail": "data sync issue",
	"asset": "analytics setup guide",
	},
	{
	"topic": "laptops",
	"product": "laptop",
	"products": "laptops",
	"provider_a": "MacBook Air",
	"provider_b": "Dell XPS 13",
	"domain": "college work",
	"goal": "choose the right laptop",
	"support_object": "order",
	"support_detail": "delivery delay",
	"asset": "laptop buying checklist",
	},
	)

	# One theme per difficulty level; build_benchmark_rows() looks a theme up by
	# difficulty name. The keys match those of the TRAIN_THEMES entries, while
	# the topics and providers differ from the training themes.
	BENCHMARK_THEMES = {
	"easy": {
	"topic": "help desk software",
	"product": "help desk platform",
	"products": "help desk tools",
	"provider_a": "Zendesk",
	"provider_b": "Freshdesk",
	"domain": "a support team",
	"goal": "handle tickets faster",
	"support_object": "billing portal",
	"support_detail": "invoice issue",
	"asset": "help desk buyer guide",
	},
	"medium": {
	"topic": "cars",
	"product": "car",
	"products": "cars",
	"provider_a": "Toyota Corolla",
	"provider_b": "Honda Civic",
	"domain": "daily commuting",
	"goal": "choose the right car",
	"support_object": "reservation",
	"support_detail": "test drive booking",
	"asset": "car buying worksheet",
	},
	"hard": {
	"topic": "hosting platforms",
	"product": "hosting platform",
	"products": "hosting providers",
	"provider_a": "Vercel",
	"provider_b": "Netlify",
	"domain": "a startup launch",
	"goal": "ship a new website",
	"support_object": "deployment",
	"support_detail": "domain setup problem",
	"asset": "hosting migration guide",
	},
	}

	# Query templates keyed first by decision phase, then by difficulty
	# ("easy" / "medium" / "hard"); every difficulty holds a tuple of five
	# str.format templates. Placeholders such as {topic} or {provider_a} are
	# filled from a theme dict (TRAIN_THEMES / BENCHMARK_THEMES) via render_text().
	# Templates without any placeholder render to the same text for every theme.
	PHASE_TEMPLATES = {
	"awareness": {
	"easy": (
	"What is {topic}?",
	"Explain {topic}.",
	"How does {product} work?",
	"What does {provider_a} do?",
	"Give me the basics of {topic}.",
	),
	"medium": (
	"Help me understand what problem {provider_a} solves.",
	"What should a beginner know about {products}?",
	"Before I look at options, what is a {product}?",
	"What is the purpose of {topic} in {domain}?",
	"What does a {product} actually help with?",
	),
	"hard": (
	"I am not shopping yet, I just want to understand what {topic} is.",
	"Before I evaluate anything, what role does {topic} play in {domain}?",
	"I keep hearing about {provider_a}; what is it actually for?",
	"Can you clarify what people mean by {topic} in practice?",
	"I only need an overview of {topic} right now.",
	),
	},
	"research": {
	"easy": (
	"What {products} should I explore for {domain}?",
	"Show me options to consider for {goal}.",
	"What tools should I look at for {domain}?",
	"Help me research {products}.",
	"Where should I start with {products}?",
	),
	"medium": (
	"I am early in the process and want to explore {products}.",
	"Give me a shortlist of {products} worth researching.",
	"What directions should I investigate for {goal}?",
	"What categories should I look at before narrowing down?",
	"What are some promising {products} for {domain}?",
	),
	"hard": (
	"I am not ready to compare vendors yet, just help me scope the market.",
	"What should I research first if I am only beginning to look at {products}?",
	"I need a landscape view before I make a shortlist.",
	"What are the main options in this space before I decide anything?",
	"Help me map the market for {products} without recommending one yet.",
	),
	},
	"consideration": {
	"easy": (
	"Best {product} for {domain}.",
	"{provider_a} vs {provider_b}.",
	"Compare {products} for {goal}.",
	"Which {product} looks best for {domain}?",
	"What are some {products} worth considering?",
	),
	"medium": (
	"Compare {provider_a} and {provider_b} for {goal}.",
	"What are the pros and cons of {provider_a}?",
	"Help me evaluate the best {products} for {domain}.",
	"Which {product} seems worth considering right now?",
	"I am comparing options for {goal}; what should be on the shortlist?",
	),
	"hard": (
	"I am past basic research and now weighing tradeoffs between {provider_a} and {provider_b}.",
	"I want to compare serious options before committing to one.",
	"Help me think through the tradeoffs in the current shortlist.",
	"What looks strongest if I am narrowing down to a few options?",
	"I have done research, now help me compare the finalists.",
	),
	},
	"decision": {
	"easy": (
	"Which {product} should I choose?",
	"Should I pick {provider_a} or {provider_b}?",
	"Which option should I commit to?",
	"What is the best fit for me right now?",
	"Which plan should I choose today?",
	),
	"medium": (
	"I am ready to decide between {provider_a} and {provider_b}.",
	"Help me pick the final option for {goal}.",
	"Which {product} should I commit to this week?",
	"I need to make the call now; which option fits best?",
	"What should I choose if I need to decide today?",
	),
	"hard": (
	"I have a shortlist and need to commit to one vendor now.",
	"I am at the point of commitment and need a final recommendation.",
	"Which option should we sign off on before next week?",
	"I have enough information; tell me which one to go with.",
	"I need the final pick, not another round of comparison.",
	),
	},
	"action": {
	"easy": (
	"Start my free trial.",
	"Book a demo with {provider_a}.",
	"Create my account.",
	"Buy {provider_a} now.",
	"Download the {asset}.",
	),
	"medium": (
	"Take me to checkout for {provider_a}.",
	"Get me signed up for {provider_a}.",
	"Reserve my spot with {provider_a}.",
	"I want to purchase {provider_a} today.",
	"Send me the download link for the {asset}.",
	),
	"hard": (
	"I am ready to move forward now, where do I start the purchase?",
	"Help me complete the signup flow for {provider_a}.",
	"I want to act on this immediately and get access now.",
	"Can you help me finish the order for {provider_a}?",
	"I have decided, now let me complete the next step.",
	),
	},
	"post_purchase": {
	"easy": (
	"How do I set up my new {product}?",
	"Show me how to import contacts into {provider_a}.",
	"How do I onboard my team after purchase?",
	"What should I enable first after signup?",
	"How do I configure my account now that I signed up?",
	),
	"medium": (
	"We already subscribed; how do we get value quickly?",
	"What is the best way to roll this out after purchase?",
	"Help me configure {provider_a} now that we bought it.",
	"How do I invite teammates after signing up?",
	"What should I do first after we activate the plan?",
	),
	"hard": (
	"We already made the purchase, now I need guidance on rollout and setup.",
	"This is not a buying decision anymore; I need post-purchase onboarding help.",
	"I need adoption guidance now that the contract is signed.",
	"What is the right onboarding sequence after we commit to {provider_a}?",
	"We are past checkout and need implementation help.",
	),
	},
	"support": {
	"easy": (
	"I cannot log into my {support_object}.",
	"How do I reset my password?",
	"My invoice is wrong.",
	"The integration keeps failing.",
	"Our dashboard is not loading.",
	),
	"medium": (
	"Can you help me fix a {support_detail}?",
	"I am stuck because my {support_object} keeps breaking.",
	"My password reset link is not working.",
	"I need support with my {support_object}.",
	"Why is {provider_a} not syncing correctly?",
	),
	"hard": (
	"I am not evaluating anything, I just need this issue fixed.",
	"This is a live support problem, not a buying question.",
	"Please help me resolve a problem with my existing account.",
	"I cannot continue because something is broken in my setup.",
	"I need troubleshooting help, not recommendations.",
	),
	},
	}


	def split_for_index(index: int) -> str:
	    """Map a running example index to the name of a dataset split.

	    Indices cycle through five buckets (``index % 5``): buckets 0-2 map to
	    ``"train"``, bucket 3 to ``"val"`` and bucket 4 to ``"test"``, giving a
	    repeating 3/1/1 pattern.
	    """
	    # One entry per bucket; the position in the tuple is the bucket number.
	    bucket_to_split = ("train", "train", "train", "val", "test")
	    return bucket_to_split[index % 5]


	def write_jsonl(path: Path, rows: list[dict]) -> None:
	    """Write ``rows`` to ``path`` as JSON Lines, replacing any existing file.

	    Missing parent directories of ``path`` are created first. Each row is
	    serialised with ``json.dumps`` (keys sorted) and written as one line,
	    using UTF-8 as the file encoding.
	    """
	    path.parent.mkdir(parents=True, exist_ok=True)
	    with path.open("w", encoding="utf-8") as sink:
	        # The generator is consumed lazily, so rows are serialised one at a time.
	        sink.writelines(json.dumps(record, sort_keys=True) + "\n" for record in rows)


	def render_text(template: str, spec: dict[str, str]) -> str:
	    """Fill ``template`` with the values in ``spec`` and trim the result.

	    ``template`` is a ``str.format`` template and the entries of ``spec`` are
	    passed to it as keyword arguments, so keys without a matching placeholder
	    are ignored and a placeholder without a matching key raises ``KeyError``.
	    Leading and trailing whitespace is stripped from the rendered text.
	    """
	    filled = template.format(**spec)
	    return filled.strip()


	def build_training_splits() -> dict[str, list[dict]]:
	    """Render PHASE_TEMPLATES with the training themes and split the rows.

	    A template containing a placeholder is rendered once per TRAIN_THEMES
	    entry; a template without one is rendered a single time with an empty
	    spec. Texts are de-duplicated case-insensitively across all phases and
	    difficulties, keeping the first occurrence. Within each (phase,
	    difficulty) pair the kept rows are numbered from zero and that number is
	    passed to split_for_index() to pick the split.

	    Returns:
	        A dict with the keys ``"train"``, ``"val"`` and ``"test"``; each value
	        is a list of row dicts holding ``text``, ``decision_phase``,
	        ``difficulty`` and ``source``.
	    """
	    splits: dict[str, list[dict]] = {"train": [], "val": [], "test": []}
	    emitted: set[str] = set()

	    for phase, by_difficulty in PHASE_TEMPLATES.items():
	        for difficulty, templates in by_difficulty.items():
	            # Each (phase, difficulty) pair is visited exactly once, so a
	            # local counter is enough to number its kept rows.
	            position = 0
	            for template in templates:
	                # Placeholder-free templates would render identically for
	                # every theme, so they are rendered just once.
	                themes = TRAIN_THEMES if "{" in template else ({},)
	                for theme in themes:
	                    text = render_text(template, theme)
	                    lowered = text.lower()
	                    if lowered in emitted:
	                        continue
	                    emitted.add(lowered)
	                    row = {
	                        "text": text,
	                        "decision_phase": phase,
	                        "difficulty": difficulty,
	                        "source": "synthetic_decision_phase_difficulty",
	                    }
	                    splits[split_for_index(position)].append(row)
	                    position += 1
	    return splits


	def build_benchmark_rows() -> list[dict]:
	    """Render every template in PHASE_TEMPLATES with its benchmark theme.

	    The theme is looked up in BENCHMARK_THEMES by difficulty name, falling
	    back to an empty spec when the difficulty has no theme. Texts are
	    de-duplicated case-insensitively, keeping the first occurrence.

	    NOTE(review): templates without placeholders render to the same text here
	    as in build_training_splits(), so those benchmark rows also appear in the
	    training splits - confirm that this overlap is intended.

	    Returns:
	        A list of row dicts holding ``text``, ``decision_phase``,
	        ``difficulty`` and ``source``.
	    """
	    benchmark: list[dict] = []
	    emitted: set[str] = set()
	    for phase, by_difficulty in PHASE_TEMPLATES.items():
	        for difficulty, templates in by_difficulty.items():
	            theme = BENCHMARK_THEMES.get(difficulty, {})
	            for template in templates:
	                text = render_text(template, theme)
	                lowered = text.lower()
	                if lowered not in emitted:
	                    emitted.add(lowered)
	                    benchmark.append(
	                        {
	                            "text": text,
	                            "decision_phase": phase,
	                            "difficulty": difficulty,
	                            "source": "decision_phase_benchmark",
	                        }
	                    )
	    return benchmark


	def main() -> None:
	    """Build the training splits and the benchmark and write them to disk.

	    Each split is written to ``<split>.jsonl`` inside OUTPUT_DIR and the
	    benchmark rows to BENCHMARK_PATH; a row count is printed for every file.
	    """
	    for name, split_rows in build_training_splits().items():
	        destination = OUTPUT_DIR / f"{name}.jsonl"
	        write_jsonl(destination, split_rows)
	        print(f"{name}: {len(split_rows)} rows")

	    rows = build_benchmark_rows()
	    write_jsonl(BENCHMARK_PATH, rows)
	    print(f"benchmark: {len(rows)} rows")


	# Run the dataset build only when this file is executed as a script.
	if __name__ == "__main__":
	main()