#!/usr/bin/env python3
"""
Sample usage examples for the Mon language tokenizer.

This script demonstrates various ways to use the Mon tokenizer with the
Hugging Face Transformers library.
"""
import logging
import sys
import time

import torch  # ensures PyTorch is available for return_tensors="pt"
from transformers import AutoTokenizer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def basic_usage_example():
    """Demonstrate basic tokenizer usage."""
    print("=== Basic Usage Example ===")

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    print(f"✓ Loaded tokenizer (vocab size: {tokenizer.vocab_size:,})")

    # Example Mon texts
    texts = [
        "ဘာသာမန်",
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။"
    ]

    for i, text in enumerate(texts, 1):
        print(f"\nExample {i}:")
        print(f"Input: {text}")

        # Tokenize the text
        tokens = tokenizer(text, return_tensors="pt")
        input_ids = tokens["input_ids"][0]

        # Print results
        print(f"Token IDs: {input_ids.tolist()}")

        # Convert to token strings (convert plain Python ints, not tensor elements)
        token_strings = tokenizer.convert_ids_to_tokens(input_ids.tolist())
        print(f"Tokens: {token_strings}")

        # Decode back to text
        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
        print(f"Decoded: {decoded}")
        print(f"Round-trip success: {text == decoded}")


def batch_processing_example():
    """Demonstrate batch processing."""
    print("\n=== Batch Processing Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

    # Multiple texts for batch processing
    batch_texts = [
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
        "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ်ရ။"
    ]
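
    # Defensive sketch: some checkpoints ship without a dedicated pad token, and
    # padding=True below would then fail. Falling back to EOS is a common fix;
    # if the hub config already defines a pad token, this is a no-op.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token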

    # Batch tokenization with padding
    batch_tokens = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128
    )

    print(f"Batch shape: {batch_tokens['input_ids'].shape}")
    print(f"Attention mask shape: {batch_tokens['attention_mask'].shape}")

    # Process each item
    for i, text in enumerate(batch_texts):
        tokens_count = batch_tokens['attention_mask'][i].sum().item()
        decoded = tokenizer.decode(batch_tokens['input_ids'][i], skip_special_tokens=True)
        print(f"Text {i+1}: {tokens_count} tokens -> '{decoded}'")


def advanced_features_example():
    """Demonstrate advanced features."""
    print("\n=== Advanced Features Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
    text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"

    # Different tokenization options
    print("Special token handling:")

    # With special tokens
    with_special = tokenizer(text, add_special_tokens=True, return_tensors="pt")
    print(f" With special tokens: {with_special['input_ids'].shape[1]} tokens")

    # Without special tokens
    without_special = tokenizer(text, add_special_tokens=False, return_tensors="pt")
    print(f" Without special tokens: {without_special['input_ids'].shape[1]} tokens")

    # Special token info
    print("\nSpecial tokens:")
    print(f" BOS: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
    print(f" EOS: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
    print(f" UNK: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
    print(f" PAD: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")


def performance_example():
    """Demonstrate performance characteristics."""
    print("\n=== Performance Example ===")

    tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

    test_texts = [
        ("Short", "ဘာသာမန်"),
        ("Medium", "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။ မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"),
        ("Long", "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။ " * 10)
    ]

    for name, text in test_texts:
        char_count = len(text)

        # Measure tokenization time (perf_counter is monotonic and better
        # suited to short benchmarks than time.time)
        start_time = time.perf_counter()
        for _ in range(100):  # Average over 100 runs
            tokens = tokenizer(text, return_tensors="pt")
        avg_time = (time.perf_counter() - start_time) / 100

        token_count = tokens['input_ids'].shape[1]
        chars_per_sec = char_count / avg_time if avg_time > 0 else 0

        print(f"{name}: {char_count} chars -> {token_count} tokens")
        print(f" Time: {avg_time*1000:.2f}ms ({chars_per_sec:.0f} chars/sec)")


if __name__ == "__main__":
    print("🚀 Mon Tokenizer Usage Examples")
    print("=" * 50)

    try:
        basic_usage_example()
        batch_processing_example()
        advanced_features_example()
        performance_example()

        print("\n🎉 All examples completed successfully!")
        print("\nFor more information, visit:")
        print("https://huggingface.co/janakhpon/mon_tokenizer")
    except Exception as e:
        print(f"❌ Error running examples: {e}")
        sys.exit(1)