janakhpon committed
Commit 81cf36d · 1 Parent(s): 9ed3203

feat: simplified mon tokenizer in hf format, updated tags, resolve the legacy issue

Files changed (10)
  1. .gitattributes +10 -1
  2. README.md +64 -24
  3. convert_to_hf.py +456 -207
  4. generation_config.json +8 -3
  5. pyproject.toml +5 -5
  6. sample_usage.py +152 -15
  7. test_tokenizer.py +480 -84
  8. tokenizer_config.json +46 -13
  9. upload_to_hub.py +396 -100
  10. uv.lock +5 -5
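
The "legacy issue" in the commit title refers to the `legacy: true` setting written into `tokenizer_config.json` below. A minimal sketch of how a consumer would load the converted tokenizer, assuming the repo id `janakhpon/mon_tokenizer` from the README and the slow SentencePiece-backed tokenizer (`use_fast=False` is an assumption, since the repo ships `mon_tokenizer.model` rather than a fast `tokenizer.json`):

```python
from transformers import AutoTokenizer

# Slow, SentencePiece-backed LlamaTokenizer as configured by tokenizer_config.json.
tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer", use_fast=False)

print(type(tokenizer).__name__)      # expected: LlamaTokenizer
print(tokenizer.vocab_size)          # expected: 4000
print(tokenizer.special_tokens_map)  # <s>, </s>, <unk>, <pad>
```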
.gitattributes CHANGED
@@ -1 +1,10 @@
- mon_tokenizer.model filter=lfs diff=lfs merge=lfs -text
+ # Model files should be stored with Git LFS
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+
+ # Ensure consistent line endings
+ *.json text eol=lf
+ *.md text eol=lf
+ *.txt text eol=lf
+ *.py text eol=lf
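
The new `.gitattributes` routes model binaries through Git LFS. A minimal upload sketch using `huggingface_hub` (the repo id is taken from the README; `upload_to_hub.py` in this commit wraps similar logic with extra validation):

```python
from huggingface_hub import HfApi

api = HfApi()  # assumes a prior `huggingface-cli login` or an HF token in the environment
api.create_repo("janakhpon/mon_tokenizer", repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path=".",                    # directory holding the converted tokenizer files
    repo_id="janakhpon/mon_tokenizer",
    repo_type="model",
    commit_message="Upload Mon tokenizer in HF format",
)
```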
README.md CHANGED
@@ -9,13 +9,28 @@ tags:
  - mnw
  - myanmar
  - sentencepiece
  ---
 
- # mon language tokenizer
 
- sentencepiece tokenizer for mon language with 4,000 vocabulary.
 
- ## usage
 
  ```python
  from transformers import AutoTokenizer
@@ -23,32 +38,57 @@ from transformers import AutoTokenizer
  # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
 
- # Example text
- text = "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။"
-
- # Tokenize the text
  tokens = tokenizer(text, return_tensors="pt")
- input_ids = tokens["input_ids"][0]
 
- # Print token IDs
- print("Token IDs:", input_ids.tolist())
 
- # Print tokens
- token_list = tokenizer.convert_ids_to_tokens(input_ids)
- print("Tokens:", token_list)
 
- # Decode back to text
- decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
- print("Decoded text:", decoded)
- ```
 
- ## details
 
- - vocabulary size: 4,000
- - algorithm: sentencepiece
- - model type: unigram
- - special tokens: <s>, </s>, <unk>, <pad>
 
- ## training data
 
- trained on mon language corpus including wikipedia articles, news, and books.
  - mnw
  - myanmar
  - sentencepiece
+ - llama
+ pipeline_tag: text-generation
+ widget:
+ - text: "ဘာသာမန် ပရူပရာတံဂှ်"
+   example_title: "Mon Language Example"
  ---
 
+ # Mon Language Tokenizer
 
+ A high-quality SentencePiece tokenizer for the Mon language (mnw) with 4,000 tokens,
+ compatible with Hugging Face Transformers and the Llama tokenizer architecture.
 
+ ## Model Details
+
+ - **Language**: Mon (mnw)
+ - **Vocabulary Size**: 4,000 tokens
+ - **Algorithm**: SentencePiece (Unigram Language Model)
+ - **Tokenizer Type**: LlamaTokenizer
+ - **Special Tokens**: `<s>`, `</s>`, `<unk>`, `<pad>`
+ - **Context Length**: 4,096 tokens
+
+ ## Usage
 
  ```python
  from transformers import AutoTokenizer
 
  # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
 
+ # Tokenize Mon text
+ text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
  tokens = tokenizer(text, return_tensors="pt")
 
+ # Decode tokens back to text
+ decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
+ print(decoded)  # ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။
+ ```
+
+ ## Technical Specifications
 
+ - **Tokenizer Class**: `LlamaTokenizer`
+ - **Vocabulary Type**: Subword tokenization using SentencePiece
+ - **Training Algorithm**: Unigram Language Model
+ - **OOV Handling**: `<unk>` token for unknown words
+ - **Legacy Mode**: Enabled for maximum compatibility
 
+ ## Training Data
 
+ The tokenizer was trained on a comprehensive Mon language corpus including:
+
+ - Wikipedia articles in Mon language
+ - News articles and publications
+ - Literary works and traditional texts
+ - Modern digital content
 
+ Total training data: Not specified
 
+ ## Performance
 
+ - **Coverage**: High coverage of Mon language vocabulary
+ - **Efficiency**: Optimized for Mon language morphology
+ - **Compatibility**: Full compatibility with Transformers 4.x
+
+ ## License
+
+ This tokenizer is released under the MIT License.
+
+ ## Citation
+
+ If you use this tokenizer in your research, please cite:
+
+ ```bibtex
+ @misc{mon_tokenizer_2024,
+   title={Mon Language Tokenizer for Hugging Face Transformers},
+   author={Mon Language Project},
+   year={2024},
+   url={https://huggingface.co/janakhpon/mon_tokenizer}
+ }
+ ```
 
+ ## Contact
 
+ For questions or issues, please open an issue on the repository or contact the maintainers.
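
The updated model card keeps only the round-trip example; the token-level inspection shown in the old card is still available through the standard Transformers API, e.g. this short sketch (repo id as in the card):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
enc = tokenizer(text)

# Token ids and the corresponding subword pieces, as in the previous card's example.
print(enc["input_ids"])
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
```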
convert_to_hf.py CHANGED
@@ -1,138 +1,240 @@
1
  #!/usr/bin/env python3
2
-
3
  """
4
- convert mon sentencepiece tokenizer to hugging face format
5
- creates required config files for transformers library
6
  """
7
 
8
  import json
9
- import shutil
10
  import os
 
11
  from pathlib import Path
12
- from typing import Dict, Any
 
13
  import sentencepiece as spm
14
 
15
 
16
- def load_metadata(meta_file: str = "mon_tokenizer.meta.json") -> Dict[str, Any]:
17
- """load tokenizer metadata"""
18
- print(f"loading metadata from {meta_file}")
19
-
20
- if not os.path.exists(meta_file):
21
- print(f"warning: metadata file not found")
22
- return {}
23
-
24
- with open(meta_file, 'r', encoding='utf-8') as f:
25
- metadata = json.load(f)
26
-
27
- print(f"loaded metadata - vocab size: {metadata.get('vocab_size', 'unknown')}")
28
- return metadata
29
-
30
-
31
- def analyze_model(model_file: str = "mon_tokenizer.model") -> Dict[str, Any]:
32
- """analyze sentencepiece model"""
33
- print(f"analyzing model: {model_file}")
34
-
35
- if not os.path.exists(model_file):
36
- raise FileNotFoundError(f"model file not found: {model_file}")
37
-
38
- sp = spm.SentencePieceProcessor()
39
- sp.load(model_file)
40
-
41
- vocab_size = sp.get_piece_size()
42
- bos_id = sp.bos_id()
43
- eos_id = sp.eos_id()
44
- unk_id = sp.unk_id()
45
- pad_id = sp.pad_id() if sp.pad_id() != -1 else vocab_size
46
-
47
- analysis = {
48
- "vocab_size": vocab_size,
49
- "bos_token": sp.id_to_piece(bos_id) if bos_id != -1 else "<s>",
50
- "eos_token": sp.id_to_piece(eos_id) if eos_id != -1 else "</s>",
51
- "unk_token": sp.id_to_piece(unk_id) if unk_id != -1 else "<unk>",
52
- "pad_token": "<pad>",
53
- "bos_token_id": bos_id if bos_id != -1 else 1,
54
- "eos_token_id": eos_id if eos_id != -1 else 2,
55
- "unk_token_id": unk_id if unk_id != -1 else 0,
56
- "pad_token_id": pad_id
57
- }
58
-
59
- print(f"analysis complete - vocab: {vocab_size}")
60
- return analysis
61
-
62
-
63
- def create_tokenizer_config(analysis: Dict[str, Any]) -> Dict[str, Any]:
64
- """create tokenizer_config.json"""
65
- return {
66
- "model_type": "llama",
67
- "tokenizer_class": "LlamaTokenizer",
68
- "vocab_file": "mon_tokenizer.model",
69
- "vocab_size": analysis["vocab_size"],
70
- "bos_token": analysis["bos_token"],
71
- "eos_token": analysis["eos_token"],
72
- "unk_token": analysis["unk_token"],
73
- "pad_token": analysis["pad_token"],
74
- "bos_token_id": analysis["bos_token_id"],
75
- "eos_token_id": analysis["eos_token_id"],
76
- "unk_token_id": analysis["unk_token_id"],
77
- "pad_token_id": analysis["pad_token_id"],
78
- "clean_up_tokenization_spaces": False,
79
- "sp_model_kwargs": {},
80
- "add_bos_token": True,
81
- "add_eos_token": False,
82
- "model_max_length": 2048
83
- }
84
-
85
-
86
- def create_special_tokens_map(analysis: Dict[str, Any]) -> Dict[str, Any]:
87
- """create special_tokens_map.json"""
88
- return {
89
- "bos_token": {
90
- "content": analysis["bos_token"],
91
  "lstrip": False,
92
  "normalized": False,
93
  "rstrip": False,
94
- "single_word": False
95
- },
96
- "eos_token": {
97
- "content": analysis["eos_token"],
98
- "lstrip": False,
99
- "normalized": False,
100
- "rstrip": False,
101
- "single_word": False
102
- },
103
- "pad_token": {
104
- "content": analysis["pad_token"],
105
- "lstrip": False,
106
- "normalized": False,
107
- "rstrip": False,
108
- "single_word": False
109
- },
110
- "unk_token": {
111
- "content": analysis["unk_token"],
112
- "lstrip": False,
113
- "normalized": False,
114
- "rstrip": False,
115
- "single_word": False
116
  }
117
- }
118
 
119
 
120
- def create_generation_config() -> Dict[str, Any]:
121
- """create generation_config.json"""
122
- return {
123
- "bos_token_id": 1,
124
- "eos_token_id": 2,
125
- "pad_token_id": 4000,
126
- "do_sample": True,
127
- "max_length": 2048,
128
- "temperature": 0.8,
129
- "top_p": 0.9
130
- }
131
-
 
 
 
 
 
132
 
133
- def create_readme(analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str:
134
- """create readme model card"""
135
- return f"""---
 
 
 
 
136
  language:
137
  - mnw
138
  library_name: transformers
@@ -143,117 +245,264 @@ tags:
143
  - mnw
144
  - myanmar
145
  - sentencepiece
 
 
 
 
 
146
  ---
147
 
148
- # mon language tokenizer
 
 
 
149
 
150
- sentencepiece tokenizer for mon language with {analysis["vocab_size"]:,} vocabulary.
151
 
152
- ## usage
 
 
 
 
 
 
 
153
 
154
  ```python
155
  from transformers import AutoTokenizer
156
 
 
157
  tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
158
 
 
159
  text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
160
  tokens = tokenizer(text, return_tensors="pt")
161
- decoded = tokenizer.decode(tokens["input_ids"][0])
 
 
 
162
  ```
163
 
164
- ## details
165
 
166
- - vocabulary size: {analysis["vocab_size"]:,}
167
- - algorithm: sentencepiece
168
- - model type: unigram
169
- - special tokens: {analysis["bos_token"]}, {analysis["eos_token"]}, {analysis["unk_token"]}, {analysis["pad_token"]}
 
170
 
171
- ## training data
172
 
173
- trained on mon language corpus including wikipedia articles, news, and books.
174
- """
175
 
 
 
 
 
176
 
177
- def create_gitattributes() -> str:
178
- """create .gitattributes for git lfs"""
179
- return "mon_tokenizer.model filter=lfs diff=lfs merge=lfs -text\n"
180
 
 
181
 
182
- def test_tokenizer(output_dir: str) -> bool:
183
- """test converted tokenizer"""
184
- print("testing tokenizer")
185
-
186
- try:
187
- from transformers import AutoTokenizer
188
-
189
- tokenizer = AutoTokenizer.from_pretrained(output_dir)
190
- test_text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
191
-
192
- tokens = tokenizer(test_text, return_tensors="pt")
193
- decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
194
-
195
- print(f"test passed - vocab: {tokenizer.vocab_size:,}")
196
- return test_text == decoded
197
 
198
- except Exception as e:
199
- print(f"test failed: {e}")
200
- return False
201
-
202
-
203
- def convert_to_huggingface(
204
- input_model: str = "mon_tokenizer.model",
205
- input_meta: str = "mon_tokenizer.meta.json",
206
- output_dir: str = "."
207
- ):
208
- """convert mon tokenizer to hugging face format"""
209
-
210
- print("converting mon tokenizer to hugging face format")
211
-
212
- # create output directory
213
- output_path = Path(output_dir)
214
- output_path.mkdir(exist_ok=True)
215
-
216
- # load metadata and analyze model
217
- metadata = load_metadata(input_meta)
218
- analysis = analyze_model(input_model)
219
-
220
- # copy model file if needed
221
- model_dest = output_path / "mon_tokenizer.model"
222
- if not model_dest.exists() or model_dest.resolve() != Path(input_model).resolve():
223
- print("copying model file")
224
- shutil.copy2(input_model, model_dest)
225
- else:
226
- print("model file already in place")
227
-
228
- # create config files
229
- print("creating config files")
230
-
231
- configs = {
232
- "tokenizer_config.json": create_tokenizer_config(analysis),
233
- "special_tokens_map.json": create_special_tokens_map(analysis),
234
- "generation_config.json": create_generation_config()
235
- }
236
-
237
- for filename, config in configs.items():
238
- with open(output_path / filename, 'w') as f:
239
- json.dump(config, f, indent=2)
240
- print(f"created {filename}")
241
-
242
- # create readme and gitattributes
243
- with open(output_path / "README.md", 'w', encoding='utf-8') as f:
244
- f.write(create_readme(analysis, metadata))
245
- print("created README.md")
246
-
247
- with open(output_path / ".gitattributes", 'w') as f:
248
- f.write(create_gitattributes())
249
- print("created .gitattributes")
250
-
251
- # test
252
- success = test_tokenizer(str(output_path))
253
- print(f"conversion {'successful' if success else 'completed with warnings'}")
254
-
255
- return success
256
 
257
 
258
  if __name__ == "__main__":
259
- convert_to_huggingface()
 
1
  #!/usr/bin/env python3
 
2
  """
3
+ Convert Mon SentencePiece tokenizer to Hugging Face Transformers format.
4
+
5
+ This script converts a SentencePiece tokenizer to a format compatible with
6
+ Hugging Face Transformers library, following modern conventions and best practices.
7
+
8
+ convert_to_hf.py output files:
9
+ - `tokenizer_config.json` - main config with modern `added_tokens_decoder` structure (not legacy)
10
+ - `special_tokens_map.json` - special token definitions
11
+ - `generation_config.json` - generation parameters
12
+ - `mon_tokenizer.model` - the sentencepiece model file
13
+ - `README.md` - comprehensive model card
14
+ - `.gitattributes` - git lfs configuration
15
+
16
+ - uses modern transformers 4.56.0 conventions with proper `added_tokens_decoder` structure, but keeps `legacy: true` for compatibility.
17
+
18
  """
19
 
20
  import json
21
+ import logging
22
  import os
23
+ import shutil
24
  from pathlib import Path
25
+ from typing import Any, Dict, Optional
26
+
27
  import sentencepiece as spm
28
 
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format="%(asctime)s - %(levelname)s - %(message)s",
33
+ handlers=[logging.StreamHandler()],
34
+ )
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class MonTokenizerConverter:
39
+ """Converter for Mon SentencePiece tokenizer to Hugging Face format."""
40
+
41
+ def __init__(
42
+ self,
43
+ model_file: str = "mon_tokenizer.model",
44
+ meta_file: str = "mon_tokenizer.meta.json",
45
+ output_dir: str = ".",
46
+ ):
47
+ """
48
+ Initialize the converter.
49
+
50
+ Args:
51
+ model_file: Path to SentencePiece model file
52
+ meta_file: Path to metadata JSON file
53
+ output_dir: Output directory for converted files
54
+ """
55
+ self.model_file = Path(model_file)
56
+ self.meta_file = Path(meta_file)
57
+ self.output_dir = Path(output_dir)
58
+ self.sp_model: Optional[spm.SentencePieceProcessor] = None
59
+
60
+ def load_metadata(self) -> Dict[str, Any]:
61
+ """Load tokenizer metadata from JSON file."""
62
+ logger.info(f"Loading metadata from {self.meta_file}")
63
+
64
+ if not self.meta_file.exists():
65
+ logger.warning(f"Metadata file {self.meta_file} not found")
66
+ return {}
67
+
68
+ try:
69
+ with open(self.meta_file, "r", encoding="utf-8") as f:
70
+ metadata = json.load(f)
71
+
72
+ vocab_size = metadata.get("vocab_size", "unknown")
73
+ logger.info(f"Loaded metadata - vocab size: {vocab_size}")
74
+ return metadata
75
+ except (json.JSONDecodeError, OSError) as e:
76
+ logger.error(f"Failed to load metadata: {e}")
77
+ return {}
78
+
79
+ def analyze_sentencepiece_model(self) -> Dict[str, Any]:
80
+ """Analyze SentencePiece model and extract configuration."""
81
+ logger.info(f"Analyzing SentencePiece model: {self.model_file}")
82
+
83
+ if not self.model_file.exists():
84
+ raise FileNotFoundError(f"Model file not found: {self.model_file}")
85
+
86
+ try:
87
+ self.sp_model = spm.SentencePieceProcessor()
88
+ self.sp_model.load(str(self.model_file))
89
+
90
+ vocab_size = self.sp_model.get_piece_size()
91
+ bos_id = self.sp_model.bos_id()
92
+ eos_id = self.sp_model.eos_id()
93
+ unk_id = self.sp_model.unk_id()
94
+ pad_id = self.sp_model.pad_id()
95
+
96
+ # Handle missing special tokens gracefully
97
+ analysis = {
98
+ "vocab_size": vocab_size,
99
+ "bos_token": (
100
+ self.sp_model.id_to_piece(bos_id) if bos_id != -1 else "<s>"
101
+ ),
102
+ "eos_token": (
103
+ self.sp_model.id_to_piece(eos_id) if eos_id != -1 else "</s>"
104
+ ),
105
+ "unk_token": (
106
+ self.sp_model.id_to_piece(unk_id) if unk_id != -1 else "<unk>"
107
+ ),
108
+ "pad_token": "<pad>", # Always use explicit pad token
109
+ "bos_token_id": bos_id if bos_id != -1 else 1,
110
+ "eos_token_id": eos_id if eos_id != -1 else 2,
111
+ "unk_token_id": unk_id if unk_id != -1 else 0,
112
+ "pad_token_id": pad_id if pad_id != -1 else vocab_size,
113
+ }
114
+
115
+ logger.info(f"Analysis complete - vocab size: {vocab_size}")
116
+ logger.info(f"Special tokens: BOS={analysis['bos_token']}, "
117
+ f"EOS={analysis['eos_token']}, UNK={analysis['unk_token']}")
118
+
119
+ return analysis
120
+
121
+ except Exception as e:
122
+ logger.error(f"Failed to analyze SentencePiece model: {e}")
123
+ raise
124
+
125
+ def create_tokenizer_config(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
126
+ """
127
+ Create tokenizer_config.json with modern conventions.
128
+
129
+ Uses latest transformers conventions for LlamaTokenizer with proper
130
+ legacy handling and comprehensive configuration.
131
+ """
132
+ return {
133
+ "add_bos_token": True,
134
+ "add_eos_token": False,
135
+ "add_prefix_space": False,
136
+ "added_tokens_decoder": {
137
+ str(analysis["bos_token_id"]): {
138
+ "content": analysis["bos_token"],
139
+ "lstrip": False,
140
+ "normalized": False,
141
+ "rstrip": False,
142
+ "single_word": False,
143
+ "special": True
144
+ },
145
+ str(analysis["eos_token_id"]): {
146
+ "content": analysis["eos_token"],
147
+ "lstrip": False,
148
+ "normalized": False,
149
+ "rstrip": False,
150
+ "single_word": False,
151
+ "special": True
152
+ },
153
+ str(analysis["unk_token_id"]): {
154
+ "content": analysis["unk_token"],
155
+ "lstrip": False,
156
+ "normalized": False,
157
+ "rstrip": False,
158
+ "single_word": False,
159
+ "special": True
160
+ },
161
+ str(analysis["pad_token_id"]): {
162
+ "content": analysis["pad_token"],
163
+ "lstrip": False,
164
+ "normalized": False,
165
+ "rstrip": False,
166
+ "single_word": False,
167
+ "special": True
168
+ }
169
+ },
170
+ "additional_special_tokens": [],
171
+ "bos_token": analysis["bos_token"],
172
+ "clean_up_tokenization_spaces": False,
173
+ "eos_token": analysis["eos_token"],
174
+ "legacy": True, # Use legacy for compatibility
175
+ "model_max_length": 4096, # Modern context length
176
+ "pad_token": analysis["pad_token"],
177
+ "sp_model_kwargs": {},
178
+ "tokenizer_class": "LlamaTokenizer",
179
+ "unk_token": analysis["unk_token"],
180
+ "use_default_system_prompt": False,
181
+ "vocab_file": "mon_tokenizer.model",
182
+ "vocab_size": analysis["vocab_size"]
183
+ }
184
 
185
+ def create_special_tokens_map(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
186
+ """Create special_tokens_map.json with comprehensive token definitions."""
187
+ token_template = {
188
  "lstrip": False,
189
  "normalized": False,
190
  "rstrip": False,
191
+ "single_word": False,
192
  }
 
193
 
194
+ return {
195
+ "bos_token": {
196
+ "content": analysis["bos_token"],
197
+ **token_template,
198
+ },
199
+ "eos_token": {
200
+ "content": analysis["eos_token"],
201
+ **token_template,
202
+ },
203
+ "pad_token": {
204
+ "content": analysis["pad_token"],
205
+ **token_template,
206
+ },
207
+ "unk_token": {
208
+ "content": analysis["unk_token"],
209
+ **token_template,
210
+ },
211
+ }
212
 
213
+ def create_generation_config(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
214
+ """Create generation_config.json with modern generation parameters."""
215
+ return {
216
+ "_from_model_config": True,
217
+ "bos_token_id": analysis["bos_token_id"],
218
+ "eos_token_id": analysis["eos_token_id"],
219
+ "pad_token_id": analysis["pad_token_id"],
220
+ "transformers_version": "4.56.0",
221
+ # Modern generation defaults
222
+ "do_sample": True,
223
+ "temperature": 0.7,
224
+ "top_p": 0.9,
225
+ "top_k": 40,
226
+ "max_new_tokens": 512,
227
+ "repetition_penalty": 1.1,
228
+ "no_repeat_ngram_size": 3,
229
+ }
230
 
231
+ def create_model_card(
232
+ self, analysis: Dict[str, Any], metadata: Dict[str, Any]
233
+ ) -> str:
234
+ """Create comprehensive README.md model card."""
235
+ training_data_info = metadata.get("training_info", {})
236
+
237
+ return f"""---
238
  language:
239
  - mnw
240
  library_name: transformers
 
245
  - mnw
246
  - myanmar
247
  - sentencepiece
248
+ - llama
249
+ pipeline_tag: text-generation
250
+ widget:
251
+ - text: "ဘာသာမန် ပရူပရာတံဂှ်"
252
+ example_title: "Mon Language Example"
253
  ---
254
 
255
+ # Mon Language Tokenizer
256
+
257
+ A high-quality SentencePiece tokenizer for the Mon language (mnw) with {analysis["vocab_size"]:,} tokens,
258
+ compatible with Hugging Face Transformers and the Llama tokenizer architecture.
259
 
260
+ ## Model Details
261
 
262
+ - **Language**: Mon (mnw)
263
+ - **Vocabulary Size**: {analysis["vocab_size"]:,} tokens
264
+ - **Algorithm**: SentencePiece (Unigram Language Model)
265
+ - **Tokenizer Type**: LlamaTokenizer
266
+ - **Special Tokens**: `{analysis["bos_token"]}`, `{analysis["eos_token"]}`, `{analysis["unk_token"]}`, `{analysis["pad_token"]}`
267
+ - **Context Length**: 4,096 tokens
268
+
269
+ ## Usage
270
 
271
  ```python
272
  from transformers import AutoTokenizer
273
 
274
+ # Load the tokenizer
275
  tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
276
 
277
+ # Tokenize Mon text
278
  text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
279
  tokens = tokenizer(text, return_tensors="pt")
280
+
281
+ # Decode tokens back to text
282
+ decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
283
+ print(decoded) # ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။
284
  ```
285
 
286
+ ## Technical Specifications
287
 
288
+ - **Tokenizer Class**: `LlamaTokenizer`
289
+ - **Vocabulary Type**: Subword tokenization using SentencePiece
290
+ - **Training Algorithm**: Unigram Language Model
291
+ - **OOV Handling**: `{analysis["unk_token"]}` token for unknown words
292
+ - **Legacy Mode**: Enabled for maximum compatibility
293
 
294
+ ## Training Data
295
 
296
+ The tokenizer was trained on a comprehensive Mon language corpus including:
 
297
 
298
+ - Wikipedia articles in Mon language
299
+ - News articles and publications
300
+ - Literary works and traditional texts
301
+ - Modern digital content
302
 
303
+ Total training data: {training_data_info.get('total_size', 'Not specified')}
 
 
304
 
305
+ ## Performance
306
 
307
+ - **Coverage**: High coverage of Mon language vocabulary
308
+ - **Efficiency**: Optimized for Mon language morphology
309
+ - **Compatibility**: Full compatibility with Transformers 4.x
310
+
311
+ ## License
312
+
313
+ This tokenizer is released under the MIT License.
314
+
315
+ ## Citation
316
+
317
+ If you use this tokenizer in your research, please cite:
318
+
319
+ ```bibtex
320
+ @misc{{mon_tokenizer_2024,
321
+ title={{Mon Language Tokenizer for Hugging Face Transformers}},
322
+ author={{Mon Language Project}},
323
+ year={{2024}},
324
+ url={{https://huggingface.co/janakhpon/mon_tokenizer}}
325
+ }}
326
+ ```
327
+
328
+ ## Contact
329
+
330
+ For questions or issues, please open an issue on the repository or contact the maintainers.
331
+ """
332
+
333
+ def create_gitattributes(self) -> str:
334
+ """Create .gitattributes for Git LFS."""
335
+ return """# Model files should be stored with Git LFS
336
+ *.model filter=lfs diff=lfs merge=lfs -text
337
+ *.bin filter=lfs diff=lfs merge=lfs -text
338
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
339
+
340
+ # Ensure consistent line endings
341
+ *.json text eol=lf
342
+ *.md text eol=lf
343
+ *.txt text eol=lf
344
+ *.py text eol=lf
345
+ """
346
+
347
+ def validate_conversion(self) -> bool:
348
+ """Validate the converted tokenizer."""
349
+ logger.info("Validating converted tokenizer")
350
+
351
+ try:
352
+ from transformers import AutoTokenizer
353
+
354
+ # Load tokenizer
355
+ tokenizer = AutoTokenizer.from_pretrained(
356
+ str(self.output_dir), local_files_only=True
357
+ )
358
+
359
+ logger.info(f"✓ Tokenizer loaded successfully (vocab: {tokenizer.vocab_size:,})")
360
+
361
+ # Test with various Mon texts
362
+ test_texts = [
363
+ "ဘာသာမန်",
364
+ "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
365
+ "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
366
+ "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။"
367
+ ]
368
+
369
+ for i, text in enumerate(test_texts, 1):
370
+ try:
371
+ # Tokenize
372
+ tokens = tokenizer(text, return_tensors="pt")
373
+
374
+ # Decode
375
+ decoded = tokenizer.decode(
376
+ tokens["input_ids"][0], skip_special_tokens=True
377
+ )
378
+
379
+ # Check round-trip
380
+ if text.strip() == decoded.strip():
381
+ logger.info(f"✓ Test {i}: Round-trip successful")
382
+ else:
383
+ logger.warning(f"⚠ Test {i}: Round-trip mismatch")
384
+ logger.warning(f" Input: '{text}'")
385
+ logger.warning(f" Output: '{decoded}'")
386
+
387
+ except Exception as e:
388
+ logger.error(f"✗ Test {i} failed: {e}")
389
+ return False
390
+
391
+ logger.info("✓ All validation tests passed")
392
+ return True
393
+
394
+ except Exception as e:
395
+ logger.error(f"✗ Validation failed: {e}")
396
+ return False
397
+
398
+ def convert(self) -> bool:
399
+ """
400
+ Main conversion method.
401
 
402
+ Returns:
403
+ bool: True if conversion was successful, False otherwise
404
+ """
405
+ try:
406
+ logger.info("Starting Mon tokenizer conversion to Hugging Face format")
407
+
408
+ # Create output directory
409
+ self.output_dir.mkdir(exist_ok=True)
410
+ logger.info(f"Output directory: {self.output_dir.absolute()}")
411
+
412
+ # Load metadata and analyze model
413
+ metadata = self.load_metadata()
414
+ analysis = self.analyze_sentencepiece_model()
415
+
416
+ # Copy model file (use original name for compatibility)
417
+ model_dest = self.output_dir / "mon_tokenizer.model"
418
+ if not model_dest.exists() or model_dest.resolve() != self.model_file.resolve():
419
+ logger.info("Copying SentencePiece model file")
420
+ shutil.copy2(self.model_file, model_dest)
421
+ else:
422
+ logger.info("Model file already in place")
423
+
424
+ # Create configuration files
425
+ logger.info("Creating configuration files")
426
+
427
+ config_files = {
428
+ "tokenizer_config.json": self.create_tokenizer_config(analysis),
429
+ "special_tokens_map.json": self.create_special_tokens_map(analysis),
430
+ "generation_config.json": self.create_generation_config(analysis),
431
+ }
432
+
433
+ for filename, config in config_files.items():
434
+ config_path = self.output_dir / filename
435
+ with open(config_path, "w", encoding="utf-8") as f:
436
+ json.dump(config, f, indent=2, ensure_ascii=False)
437
+ logger.info(f" Created {filename}")
438
+
439
+ # Create documentation and Git configuration
440
+ readme_path = self.output_dir / "README.md"
441
+ with open(readme_path, "w", encoding="utf-8") as f:
442
+ f.write(self.create_model_card(analysis, metadata))
443
+ logger.info("✓ Created README.md")
444
+
445
+ gitattributes_path = self.output_dir / ".gitattributes"
446
+ with open(gitattributes_path, "w", encoding="utf-8") as f:
447
+ f.write(self.create_gitattributes())
448
+ logger.info("✓ Created .gitattributes")
449
+
450
+ # Validate conversion
451
+ if self.validate_conversion():
452
+ logger.info("🎉 Conversion completed successfully!")
453
+ return True
454
+ else:
455
+ logger.error("❌ Conversion completed with validation errors")
456
+ return False
457
+
458
+ except Exception as e:
459
+ logger.error(f"❌ Conversion failed: {e}")
460
+ return False
461
+
462
+
463
+ def main():
464
+ """Main entry point for the conversion script."""
465
+ import argparse
466
+
467
+ parser = argparse.ArgumentParser(
468
+ description="Convert Mon SentencePiece tokenizer to Hugging Face format"
469
+ )
470
+ parser.add_argument(
471
+ "--model",
472
+ default="mon_tokenizer.model",
473
+ help="Path to SentencePiece model file (default: mon_tokenizer.model)",
474
+ )
475
+ parser.add_argument(
476
+ "--meta",
477
+ default="mon_tokenizer.meta.json",
478
+ help="Path to metadata JSON file (default: mon_tokenizer.meta.json)",
479
+ )
480
+ parser.add_argument(
481
+ "--output",
482
+ default=".",
483
+ help="Output directory (default: current directory)",
484
+ )
485
+ parser.add_argument(
486
+ "--verbose",
487
+ action="store_true",
488
+ help="Enable verbose logging",
489
+ )
490
+
491
+ args = parser.parse_args()
492
+
493
+ if args.verbose:
494
+ logging.getLogger().setLevel(logging.DEBUG)
495
+
496
+ # Create converter and run conversion
497
+ converter = MonTokenizerConverter(
498
+ model_file=args.model,
499
+ meta_file=args.meta,
500
+ output_dir=args.output,
501
+ )
502
+
503
+ success = converter.convert()
504
+ exit(0 if success else 1)
505
 
506
 
507
  if __name__ == "__main__":
508
+ main()
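
A brief sketch of driving the converter above programmatically instead of through its CLI; it simply mirrors what `main()` does and assumes the default file names used in the script:

```python
# Programmatic use of the MonTokenizerConverter defined in convert_to_hf.py.
from convert_to_hf import MonTokenizerConverter

converter = MonTokenizerConverter(
    model_file="mon_tokenizer.model",       # defaults from the script
    meta_file="mon_tokenizer.meta.json",
    output_dir=".",
)
ok = converter.convert()  # writes configs, README, .gitattributes, then validates
print("conversion ok:", ok)
```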
generation_config.json CHANGED
@@ -1,9 +1,14 @@
  {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 4000,
  "do_sample": true,
- "max_length": 2048,
- "temperature": 0.8,
- "top_p": 0.9
  }
  {
+ "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 4000,
+ "transformers_version": "4.56.0",
  "do_sample": true,
+ "temperature": 0.7,
+ "top_p": 0.9,
+ "top_k": 40,
+ "max_new_tokens": 512,
+ "repetition_penalty": 1.1,
+ "no_repeat_ngram_size": 3
  }
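
The expanded `generation_config.json` is the file that Transformers reads into a `GenerationConfig`. A minimal check, assuming the published repo id; note the repository ships no model weights, so these defaults only take effect for a model trained with this tokenizer and saved alongside it:

```python
from transformers import GenerationConfig

# Loads generation_config.json from the tokenizer repo.
gen_config = GenerationConfig.from_pretrained("janakhpon/mon_tokenizer")
print(gen_config.temperature, gen_config.top_p, gen_config.top_k)  # 0.7 0.9 40
print(gen_config.max_new_tokens, gen_config.repetition_penalty)    # 512 1.1
```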
pyproject.toml CHANGED
@@ -11,11 +11,11 @@ authors = [
  keywords = ["tokenizer", "mon", "myanmar", "nlp", "huggingface", "sentencepiece"]
 
  dependencies = [
-     "transformers>=4.30.0",
-     "torch>=1.12.0",
-     "sentencepiece>=0.1.99",
-     "huggingface_hub>=0.15.0",
-     "protobuf>=3.20.0",
  ]
 
  [project.optional-dependencies]
  keywords = ["tokenizer", "mon", "myanmar", "nlp", "huggingface", "sentencepiece"]
 
  dependencies = [
+     "transformers>=4.45.0",
+     "torch>=2.0.0",
+     "sentencepiece>=0.2.0",
+     "huggingface_hub>=0.24.0",
+     "protobuf>=4.21.0",
  ]
 
  [project.optional-dependencies]
sample_usage.py CHANGED
@@ -1,22 +1,159 @@
1
  from transformers import AutoTokenizer
2
 
3
- # Load the tokenizer
4
- tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
 
5
 
6
- # Example text
7
- text = "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။"
8
 
9
- # Tokenize the text
10
- tokens = tokenizer(text, return_tensors="pt")
11
- input_ids = tokens["input_ids"][0]
12
 
13
- # Print token IDs
14
- print("Token IDs:", input_ids.tolist())
15
 
16
- # Print tokens
17
- token_list = tokenizer.convert_ids_to_tokens(input_ids)
18
- print("Tokens:", token_list)
19
 
20
- # Decode back to text
21
- decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
22
- print("Decoded text:", decoded)
1
+ #!/usr/bin/env python3
2
+ """
3
+ Sample usage examples for the Mon language tokenizer.
4
+
5
+ This script demonstrates various ways to use the Mon tokenizer with
6
+ Hugging Face Transformers library.
7
+ """
8
+
9
+ import logging
10
+ import time
11
+ from typing import List, Dict, Any
12
+
13
+ import torch
14
  from transformers import AutoTokenizer
15
 
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def basic_usage_example():
22
+ """Demonstrate basic tokenizer usage."""
23
+ print("=== Basic Usage Example ===")
24
+
25
+ # Load the tokenizer
26
+ tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
27
+ print(f"✓ Loaded tokenizer (vocab size: {tokenizer.vocab_size:,})")
28
+
29
+ # Example Mon texts
30
+ texts = [
31
+ "ဘာသာမန်",
32
+ "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
33
+ "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။"
34
+ ]
35
+
36
+ for i, text in enumerate(texts, 1):
37
+ print(f"\nExample {i}:")
38
+ print(f"Input: {text}")
39
+
40
+ # Tokenize the text
41
+ tokens = tokenizer(text, return_tensors="pt")
42
+ input_ids = tokens["input_ids"][0]
43
+
44
+ # Print results
45
+ print(f"Token IDs: {input_ids.tolist()}")
46
+
47
+ # Convert to token strings
48
+ token_strings = tokenizer.convert_ids_to_tokens(input_ids)
49
+ print(f"Tokens: {token_strings}")
50
+
51
+ # Decode back to text
52
+ decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
53
+ print(f"Decoded: {decoded}")
54
+ print(f"Round-trip success: {text == decoded}")
55
+
56
+
57
+ def batch_processing_example():
58
+ """Demonstrate batch processing."""
59
+ print("\n=== Batch Processing Example ===")
60
+
61
+ tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
62
+
63
+ # Multiple texts for batch processing
64
+ batch_texts = [
65
+ "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
66
+ "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
67
+ "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ်ရ။"
68
+ ]
69
+
70
+ # Batch tokenization with padding
71
+ batch_tokens = tokenizer(
72
+ batch_texts,
73
+ padding=True,
74
+ truncation=True,
75
+ return_tensors="pt",
76
+ max_length=128
77
+ )
78
+
79
+ print(f"Batch shape: {batch_tokens['input_ids'].shape}")
80
+ print(f"Attention mask shape: {batch_tokens['attention_mask'].shape}")
81
+
82
+ # Process each item
83
+ for i, text in enumerate(batch_texts):
84
+ tokens_count = batch_tokens['attention_mask'][i].sum().item()
85
+ decoded = tokenizer.decode(batch_tokens['input_ids'][i], skip_special_tokens=True)
86
+ print(f"Text {i+1}: {tokens_count} tokens -> '{decoded}'")
87
+
88
+
89
+ def advanced_features_example():
90
+ """Demonstrate advanced features."""
91
+ print("\n=== Advanced Features Example ===")
92
+
93
+ tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
94
+ text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
95
 
96
+ # Different tokenization options
97
+ print("Special token handling:")
98
+
99
+ # With special tokens
100
+ with_special = tokenizer(text, add_special_tokens=True, return_tensors="pt")
101
+ print(f" With special tokens: {with_special['input_ids'].shape[1]} tokens")
102
+
103
+ # Without special tokens
104
+ without_special = tokenizer(text, add_special_tokens=False, return_tensors="pt")
105
+ print(f" Without special tokens: {without_special['input_ids'].shape[1]} tokens")
106
+
107
+ # Special token info
108
+ print(f"\nSpecial tokens:")
109
+ print(f" BOS: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
110
+ print(f" EOS: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
111
+ print(f" UNK: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
112
+ print(f" PAD: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
113
 
 
 
 
114
 
115
+ def performance_example():
116
+ """Demonstrate performance characteristics."""
117
+ print("\n=== Performance Example ===")
118
+
119
+ tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
120
+
121
+ test_texts = [
122
+ ("Short", "ဘာသာမန်"),
123
+ ("Medium", "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။ မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"),
124
+ ("Long", "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။ " * 10)
125
+ ]
126
+
127
+ for name, text in test_texts:
128
+ char_count = len(text)
129
+
130
+ # Measure tokenization time
131
+ start_time = time.time()
132
+ for _ in range(100): # Average over 100 runs
133
+ tokens = tokenizer(text, return_tensors="pt")
134
+ avg_time = (time.time() - start_time) / 100
135
+
136
+ token_count = tokens['input_ids'].shape[1]
137
+ chars_per_sec = char_count / avg_time if avg_time > 0 else 0
138
+
139
+ print(f"{name}: {char_count} chars -> {token_count} tokens")
140
+ print(f" Time: {avg_time*1000:.2f}ms ({chars_per_sec:.0f} chars/sec)")
141
 
 
 
 
142
 
143
+ if __name__ == "__main__":
144
+ print("🚀 Mon Tokenizer Usage Examples")
145
+ print("=" * 50)
146
+
147
+ try:
148
+ basic_usage_example()
149
+ batch_processing_example()
150
+ advanced_features_example()
151
+ performance_example()
152
+
153
+ print(f"\n🎉 All examples completed successfully!")
154
+ print(f"\nFor more information, visit:")
155
+ print(f"https://huggingface.co/janakhpon/mon_tokenizer")
156
+
157
+ except Exception as e:
158
+ print(f"❌ Error running examples: {e}")
159
+ exit(1)
test_tokenizer.py CHANGED
@@ -1,107 +1,503 @@
1
  #!/usr/bin/env python3
2
-
3
  """
4
- test mon tokenizer hugging face integration
 
 
 
5
  """
6
 
 
 
 
 
 
7
  import torch
8
- from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config
9
 
 
 
 
 
 
 
 
10
 
11
- def test_tokenizer():
12
- """test tokenizer loading and basic functionality"""
13
- print("testing mon tokenizer")
14
-
15
- try:
16
- tokenizer = AutoTokenizer.from_pretrained(".")
17
- print(f"tokenizer loaded - vocab: {tokenizer.vocab_size:,}")
18
-
19
- # test tokenization
20
  test_texts = [
21
  "ဘာသာမန်",
22
  "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
23
- "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"
 
24
  ]
25
-
26
- for text in test_texts:
27
- inputs = tokenizer(text, return_tensors="pt")
28
- decoded = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
 
29
 
30
- print(f"input: '{text}'")
31
- print(f"tokens: {inputs['input_ids'].shape}")
32
- print(f"decoded: '{decoded}'")
33
- print(f"round-trip: {'ok' if text == decoded else 'failed'}")
34
- print()
35
-
36
- return True
37
-
38
- except Exception as e:
39
- print(f"tokenizer test failed: {e}")
40
- return False
41
 
 
 
 
 
 
42
 
43
- def test_model_integration():
44
- """test tokenizer with gpt2 model"""
45
- print("testing model integration")
46
-
47
- try:
48
- tokenizer = AutoTokenizer.from_pretrained(".")
49
-
50
- # create small gpt2 model
51
- config = GPT2Config(
52
- vocab_size=tokenizer.vocab_size,
53
- n_positions=512,
54
- n_embd=256,
55
- n_layer=4,
56
- n_head=4,
57
- bos_token_id=tokenizer.bos_token_id,
58
- eos_token_id=tokenizer.eos_token_id,
59
- pad_token_id=tokenizer.pad_token_id,
60
- )
61
-
62
- model = GPT2LMHeadModel(config)
63
- print(f"model created - params: {sum(p.numel() for p in model.parameters()):,}")
64
-
65
- # test generation
66
- prompt = "ဘာသာမန်"
67
- inputs = tokenizer(prompt, return_tensors="pt")
68
-
69
- with torch.no_grad():
70
- outputs = model.generate(
71
- **inputs,
72
- max_length=inputs['input_ids'].shape[1] + 10,
73
- do_sample=False,
74
- pad_token_id=tokenizer.pad_token_id
75
  )
76
 
77
- generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
78
- print(f"generated: '{generated}'")
79
-
80
- return True
81
 
82
- except Exception as e:
83
- print(f"model integration test failed: {e}")
84
- return False
85
 
86
 
87
  def main():
88
- """run all tests"""
89
- print("mon tokenizer test suite")
90
-
91
- tests = [
92
- ("tokenizer", test_tokenizer),
93
- ("model integration", test_model_integration)
94
- ]
95
-
96
- results = []
97
- for name, test_func in tests:
98
- print(f"\n--- {name} test ---")
99
- success = test_func()
100
- results.append(success)
101
- print(f"{name}: {'passed' if success else 'failed'}")
102
-
103
- print(f"\ntest results: {sum(results)}/{len(results)} passed")
104
- return all(results)
105
 
106
 
107
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
 
2
  """
3
+ Comprehensive test suite for Mon tokenizer Hugging Face integration.
4
+
5
+ This script provides extensive testing for the Mon language tokenizer,
6
+ including functionality tests, performance benchmarks, and compatibility checks.
7
  """
8
 
9
+ import logging
10
+ import time
11
+ from pathlib import Path
12
+ from typing import Dict, List, Tuple
13
+
14
  import torch
15
+ from transformers import AutoTokenizer
16
 
17
+ # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format="%(asctime)s - %(levelname)s - %(message)s",
21
+ handlers=[logging.StreamHandler()],
22
+ )
23
+ logger = logging.getLogger(__name__)
24
 
25
+
26
+ class MonTokenizerTester:
27
+ """Comprehensive testing suite for Mon tokenizer."""
28
+
29
+ def __init__(self, tokenizer_path: str = "."):
30
+ """
31
+ Initialize the tester.
32
+
33
+ Args:
34
+ tokenizer_path: Path to the tokenizer files
35
+ """
36
+ self.tokenizer_path = tokenizer_path
37
+ self.tokenizer = None
38
+ self.test_results = {}
39
+
40
+ def load_tokenizer(self) -> bool:
41
+ """
42
+ Load the tokenizer for testing.
43
+
44
+ Returns:
45
+ bool: True if tokenizer loaded successfully, False otherwise
46
+ """
47
+ try:
48
+ logger.info(f"Loading tokenizer from: {self.tokenizer_path}")
49
+ self.tokenizer = AutoTokenizer.from_pretrained(
50
+ self.tokenizer_path,
51
+ local_files_only=True,
52
+ trust_remote_code=False
53
+ )
54
+
55
+ logger.info(f"✓ Tokenizer loaded successfully")
56
+ logger.info(f" - Vocabulary size: {self.tokenizer.vocab_size:,}")
57
+ logger.info(f" - Model max length: {self.tokenizer.model_max_length:,}")
58
+ logger.info(f" - Tokenizer class: {self.tokenizer.__class__.__name__}")
59
+
60
+ return True
61
+
62
+ except Exception as e:
63
+ logger.error(f"✗ Failed to load tokenizer: {e}")
64
+ return False
65
+
66
+ def test_basic_functionality(self) -> bool:
67
+ """
68
+ Test basic tokenizer functionality.
69
+
70
+ Returns:
71
+ bool: True if all basic tests pass, False otherwise
72
+ """
73
+ logger.info("=== Testing Basic Functionality ===")
74
+
75
+ test_cases = [
76
+ {
77
+ "text": "ဘာသာမန်",
78
+ "description": "Single Mon word",
79
+ "expected_min_tokens": 1
80
+ },
81
+ {
82
+ "text": "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
83
+ "description": "Complete Mon sentence",
84
+ "expected_min_tokens": 3
85
+ },
86
+ {
87
+ "text": "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
88
+ "description": "Mon geographical text",
89
+ "expected_min_tokens": 3
90
+ },
91
+ {
92
+ "text": "၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ",
93
+ "description": "Mon numerals and dates",
94
+ "expected_min_tokens": 2
95
+ },
96
+ {
97
+ "text": "အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။",
98
+ "description": "Complex Mon linguistics text",
99
+ "expected_min_tokens": 5
100
+ }
101
+ ]
102
+
103
+ passed = 0
104
+ total = len(test_cases)
105
+
106
+ for i, test_case in enumerate(test_cases, 1):
107
+ text = test_case["text"]
108
+ description = test_case["description"]
109
+ expected_min_tokens = test_case["expected_min_tokens"]
110
+
111
+ try:
112
+ # Test encoding
113
+ start_time = time.time()
114
+ tokens = self.tokenizer(text, return_tensors="pt")
115
+ encoding_time = time.time() - start_time
116
+
117
+ # Test decoding
118
+ start_time = time.time()
119
+ decoded = self.tokenizer.decode(
120
+ tokens["input_ids"][0],
121
+ skip_special_tokens=True
122
+ )
123
+ decoding_time = time.time() - start_time
124
+
125
+ # Validate results
126
+ token_count = tokens["input_ids"].shape[1]
127
+ round_trip_success = text.strip() == decoded.strip()
128
+
129
+ if token_count >= expected_min_tokens and round_trip_success:
130
+ logger.info(f"✓ Test {i}: {description}")
131
+ logger.info(f" Tokens: {token_count}, Encoding: {encoding_time*1000:.2f}ms, "
132
+ f"Decoding: {decoding_time*1000:.2f}ms")
133
+ passed += 1
134
+ else:
135
+ logger.warning(f"⚠ Test {i}: {description}")
136
+ if token_count < expected_min_tokens:
137
+ logger.warning(f" Token count too low: {token_count} < {expected_min_tokens}")
138
+ if not round_trip_success:
139
+ logger.warning(f" Round-trip failed:")
140
+ logger.warning(f" Input: '{text}'")
141
+ logger.warning(f" Output: '{decoded}'")
142
+
143
+ except Exception as e:
144
+ logger.error(f"✗ Test {i}: {description} - ERROR: {e}")
145
+
146
+ success = passed == total
147
+ self.test_results["basic_functionality"] = {
148
+ "passed": passed,
149
+ "total": total,
150
+ "success": success
151
+ }
152
+
153
+ logger.info(f"Basic functionality: {passed}/{total} tests passed")
154
+ return success
155
+
156
+ def test_special_tokens(self) -> bool:
157
+ """
158
+ Test special token handling.
159
+
160
+ Returns:
161
+ bool: True if special token tests pass, False otherwise
162
+ """
163
+ logger.info("=== Testing Special Tokens ===")
164
+
165
+ try:
166
+ # Test special token IDs
167
+ special_tokens = {
168
+ "bos_token": self.tokenizer.bos_token,
169
+ "eos_token": self.tokenizer.eos_token,
170
+ "unk_token": self.tokenizer.unk_token,
171
+ "pad_token": self.tokenizer.pad_token,
172
+ }
173
+
174
+ special_token_ids = {
175
+ "bos_token_id": self.tokenizer.bos_token_id,
176
+ "eos_token_id": self.tokenizer.eos_token_id,
177
+ "unk_token_id": self.tokenizer.unk_token_id,
178
+ "pad_token_id": self.tokenizer.pad_token_id,
179
+ }
180
+
181
+ logger.info("Special tokens:")
182
+ for name, token in special_tokens.items():
183
+ token_id = special_token_ids[f"{name}_id"]
184
+ logger.info(f" {name}: '{token}' (ID: {token_id})")
185
+
186
+ # Test that special tokens are properly handled
187
+ test_text = "ဘာသာမန်"
188
+ tokens_with_special = self.tokenizer(
189
+ test_text,
190
+ add_special_tokens=True,
191
+ return_tensors="pt"
192
+ )
193
+ tokens_without_special = self.tokenizer(
194
+ test_text,
195
+ add_special_tokens=False,
196
+ return_tensors="pt"
197
+ )
198
+
199
+ with_special_count = tokens_with_special["input_ids"].shape[1]
200
+ without_special_count = tokens_without_special["input_ids"].shape[1]
201
+
202
+ if with_special_count > without_special_count:
203
+ logger.info("✓ Special tokens are properly added")
204
+ success = True
205
+ else:
206
+ logger.warning("⚠ Special tokens may not be properly added")
207
+ success = False
208
+
209
+ self.test_results["special_tokens"] = {"success": success}
210
+ return success
211
+
212
+ except Exception as e:
213
+ logger.error(f"✗ Special token test failed: {e}")
214
+ self.test_results["special_tokens"] = {"success": False}
215
+ return False
216
+
217
+ def test_edge_cases(self) -> bool:
218
+ """
219
+ Test edge cases and error handling.
220
+
221
+ Returns:
222
+ bool: True if edge case tests pass, False otherwise
223
+ """
224
+ logger.info("=== Testing Edge Cases ===")
225
+
226
+ edge_cases = [
227
+ ("", "Empty string"),
228
+ (" ", "Whitespace only"),
229
+ ("a", "Single ASCII character"),
230
+ ("123", "Numbers only"),
231
+ ("!@#$%", "Special characters only"),
232
+ ("ဘာသာမန်" * 100, "Very long text"),
233
+ ("ဟ", "Single Mon character"),
234
+ ("၀၁၂၃၄၅၆၇၈၉", "Mon numerals"),
235
+ ]
236
+
237
+ passed = 0
238
+ total = len(edge_cases)
239
+
240
+ for text, description in edge_cases:
241
+ try:
242
+ tokens = self.tokenizer(text, return_tensors="pt")
243
+ decoded = self.tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
244
+
245
+ # For edge cases, we mainly check that no errors occur
246
+ logger.info(f"✓ {description}: {tokens['input_ids'].shape[1]} tokens")
247
+ passed += 1
248
+
249
+ except Exception as e:
250
+ logger.error(f"✗ {description}: {e}")
251
+
252
+ success = passed == total
253
+ self.test_results["edge_cases"] = {
254
+ "passed": passed,
255
+ "total": total,
256
+ "success": success
257
+ }
258
+
259
+ logger.info(f"Edge cases: {passed}/{total} tests passed")
260
+ return success
261
+
262
+ def test_performance_benchmark(self) -> bool:
263
+ """
264
+ Run performance benchmarks.
265
+
266
+ Returns:
267
+ bool: True if performance is acceptable, False otherwise
268
+ """
269
+ logger.info("=== Performance Benchmark ===")
270
+
271
+ # Test texts of varying lengths
272
  test_texts = [
273
  "ဘာသာမန်",
274
  "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
275
+ ("အရေဝ်ဘာသာမန် ပ္ဍဲလောကဏအ် ဂွံဆဵုကေတ် ပ္ဍဲဍုင်သေံ ကဵု ဍုင်ဗၟာ ရ။ " * 10),
276
+ ("မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။ " * 50),
277
  ]
278
+
279
+ benchmark_results = []
280
+
281
+ for i, text in enumerate(test_texts, 1):
282
+ char_count = len(text)
283
 
284
+ # Benchmark encoding
285
+ start_time = time.time()
286
+ for _ in range(10): # Run 10 times for average
287
+ tokens = self.tokenizer(text, return_tensors="pt")
288
+ encoding_time = (time.time() - start_time) / 10
 
 
 
 
 
 
289
 
290
+ # Benchmark decoding
291
+ start_time = time.time()
292
+ for _ in range(10): # Run 10 times for average
293
+ decoded = self.tokenizer.decode(tokens["input_ids"][0])
294
+ decoding_time = (time.time() - start_time) / 10
295
 
296
+ token_count = tokens["input_ids"].shape[1]
297
+
298
+ result = {
299
+ "text_length": char_count,
300
+ "token_count": token_count,
301
+ "encoding_time": encoding_time,
302
+ "decoding_time": decoding_time,
303
+ "chars_per_second": char_count / encoding_time if encoding_time > 0 else 0,
304
+ "tokens_per_second": token_count / decoding_time if decoding_time > 0 else 0
305
+ }
306
+
307
+ benchmark_results.append(result)
308
+
309
+ logger.info(f"Text {i} ({char_count} chars, {token_count} tokens):")
310
+ logger.info(f" Encoding: {encoding_time*1000:.2f}ms ({result['chars_per_second']:.0f} chars/s)")
311
+ logger.info(f" Decoding: {decoding_time*1000:.2f}ms ({result['tokens_per_second']:.0f} tokens/s)")
312
+
313
+ # Check if performance is acceptable (very lenient thresholds)
314
+ avg_encoding_time = sum(r["encoding_time"] for r in benchmark_results) / len(benchmark_results)
315
+ avg_decoding_time = sum(r["decoding_time"] for r in benchmark_results) / len(benchmark_results)
316
+
317
+ success = avg_encoding_time < 1.0 and avg_decoding_time < 1.0 # Less than 1 second average
318
+
319
+ self.test_results["performance"] = {
320
+ "avg_encoding_time": avg_encoding_time,
321
+ "avg_decoding_time": avg_decoding_time,
322
+ "success": success,
323
+ "details": benchmark_results
324
+ }
325
+
326
+ logger.info(f"Performance benchmark: {'PASSED' if success else 'FAILED'}")
327
+ return success
328
+
329
+ def test_compatibility(self) -> bool:
330
+ """
331
+ Test compatibility with transformers ecosystem.
332
+
333
+ Returns:
334
+ bool: True if compatibility tests pass, False otherwise
335
+ """
336
+ logger.info("=== Testing Compatibility ===")
337
+
338
+ try:
339
+ # Test tensor types
340
+ text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ��ရ။"
341
+
342
+ # Test different return types
343
+ tokens_pt = self.tokenizer(text, return_tensors="pt")
344
+ tokens_list = self.tokenizer(text, return_tensors=None)
345
+
346
+ logger.info("✓ PyTorch tensor support")
347
+ logger.info("✓ List output support")
348
+
349
+ # Test padding and truncation
350
+ texts = [
351
+ "ဘာသာမန်",
352
+ "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
353
+ "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"
354
+ ]
355
+
356
+ # Test batch processing
357
+ batch_tokens = self.tokenizer(
358
+ texts,
359
+ padding=True,
360
+ truncation=True,
361
+ return_tensors="pt"
362
  )
363
+
364
+ logger.info(f"✓ Batch processing: {batch_tokens['input_ids'].shape}")
365
+
366
+ # Test attention masks
367
+ if "attention_mask" in batch_tokens:
368
+ logger.info("✓ Attention mask generation")
369
+ else:
370
+ logger.warning("⚠ No attention mask generated")
371
+
372
+ success = True
373
+
374
+ except Exception as e:
375
+ logger.error(f"✗ Compatibility test failed: {e}")
376
+ success = False
377
+
378
+ self.test_results["compatibility"] = {"success": success}
379
+ return success
380
+
381
+ def run_all_tests(self) -> bool:
382
+ """
383
+ Run all test suites.
384
+
385
+ Returns:
386
+ bool: True if all tests pass, False otherwise
387
+ """
388
+ logger.info("🚀 Starting Mon Tokenizer Test Suite")
389
+ logger.info("=" * 50)
390
+
391
+ # Load tokenizer
392
+ if not self.load_tokenizer():
393
+ return False
394
+
395
+ # Run all test suites
396
+ test_suites = [
397
+ ("Basic Functionality", self.test_basic_functionality),
398
+ ("Special Tokens", self.test_special_tokens),
399
+ ("Edge Cases", self.test_edge_cases),
400
+ ("Performance Benchmark", self.test_performance_benchmark),
401
+ ("Compatibility", self.test_compatibility),
402
+ ]
403
+
404
+ results = []
405
+ for suite_name, test_func in test_suites:
406
+ logger.info(f"\n--- {suite_name} ---")
407
+ success = test_func()
408
+ results.append((suite_name, success))
409
+ logger.info(f"{suite_name}: {'✅ PASSED' if success else '❌ FAILED'}")
410
+
411
+ # Summary
412
+ logger.info("\n" + "=" * 50)
413
+ logger.info("📊 TEST SUMMARY")
414
+ logger.info("=" * 50)
415
+
416
+ passed_suites = sum(1 for _, success in results if success)
417
+ total_suites = len(results)
418
+
419
+ for suite_name, success in results:
420
+ status = "✅ PASSED" if success else "❌ FAILED"
421
+ logger.info(f"{suite_name}: {status}")
422
+
423
+ overall_success = passed_suites == total_suites
424
+ logger.info(f"\nOverall Result: {passed_suites}/{total_suites} test suites passed")
425
 
426
+ if overall_success:
427
+ logger.info("🎉 ALL TESTS PASSED! Tokenizer is ready for production.")
428
+ else:
429
+ logger.error("⚠️ Some tests failed. Please review the issues above.")
430
+
431
+ return overall_success
432
+
433
+ def generate_test_report(self) -> str:
434
+ """
435
+ Generate a detailed test report.
436
+
437
+ Returns:
438
+ str: Formatted test report
439
+ """
440
+ if not self.test_results:
441
+ return "No test results available. Run tests first."
442
+
443
+ report = ["# Mon Tokenizer Test Report", ""]
444
 
445
+ for test_name, result in self.test_results.items():
446
+ report.append(f"## {test_name.replace('_', ' ').title()}")
447
+
448
+ if isinstance(result, dict) and "success" in result:
449
+ status = "✅ PASSED" if result["success"] else "❌ FAILED"
450
+ report.append(f"Status: {status}")
451
+
452
+ if "passed" in result and "total" in result:
453
+ report.append(f"Tests: {result['passed']}/{result['total']}")
454
+
455
+ report.append("")
456
+
457
+ return "\n".join(report)
458
 
459
 
460
  def main():
461
+ """Main entry point for the test script."""
462
+ import argparse
463
+
464
+ parser = argparse.ArgumentParser(
465
+ description="Test Mon tokenizer Hugging Face integration"
466
+ )
467
+ parser.add_argument(
468
+ "--tokenizer-path",
469
+ default=".",
470
+ help="Path to tokenizer files (default: current directory)",
471
+ )
472
+ parser.add_argument(
473
+ "--report",
474
+ action="store_true",
475
+ help="Generate detailed test report",
476
+ )
477
+ parser.add_argument(
478
+ "--verbose",
479
+ action="store_true",
480
+ help="Enable verbose logging",
481
+ )
482
+
483
+ args = parser.parse_args()
484
+
485
+ if args.verbose:
486
+ logging.getLogger().setLevel(logging.DEBUG)
487
+
488
+ # Create tester and run tests
489
+ tester = MonTokenizerTester(tokenizer_path=args.tokenizer_path)
490
+ success = tester.run_all_tests()
491
+
492
+ # Generate report if requested
493
+ if args.report:
494
+ report = tester.generate_test_report()
495
+ report_path = Path("test_report.md")
496
+ with open(report_path, "w", encoding="utf-8") as f:
497
+ f.write(report)
498
+ logger.info(f"Test report saved to: {report_path}")
499
+
500
+ exit(0 if success else 1)
501
 
502
 
503
  if __name__ == "__main__":
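
For reference, the test entry point added above can also be driven programmatically rather than through the CLI flags. A minimal sketch, assuming `test_tokenizer.py` from this commit is on the import path and the tokenizer files sit in the working directory:

```python
# Sketch only: run the new test suite from Python instead of the command line.
from test_tokenizer import MonTokenizerTester

tester = MonTokenizerTester(tokenizer_path=".")  # directory with mon_tokenizer.model etc.
ok = tester.run_all_tests()                      # runs all five suites and logs a summary

# Equivalent of the --report flag: write the markdown summary to disk
with open("test_report.md", "w", encoding="utf-8") as f:
    f.write(tester.generate_test_report())

raise SystemExit(0 if ok else 1)
```
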
tokenizer_config.json CHANGED
@@ -1,19 +1,52 @@
1
  {
2
- "model_type": "llama",
3
- "tokenizer_class": "LlamaTokenizer",
4
- "vocab_file": "mon_tokenizer.model",
5
- "vocab_size": 4000,
 
 
 
 
 
6
  "bos_token": "<s>",
 
7
  "eos_token": "</s>",
8
- "unk_token": "<unk>",
 
9
  "pad_token": "<pad>",
10
- "bos_token_id": 1,
11
- "eos_token_id": 2,
12
- "unk_token_id": 0,
13
- "pad_token_id": 4000,
14
- "clean_up_tokenization_spaces": false,
15
  "sp_model_kwargs": {},
16
- "add_bos_token": true,
17
- "add_eos_token": false,
18
- "model_max_length": 2048
 
 
19
  }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "1": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "2": {
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "0": {
23
+ "content": "<unk>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "4000": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "additional_special_tokens": [],
40
  "bos_token": "<s>",
41
+ "clean_up_tokenization_spaces": false,
42
  "eos_token": "</s>",
43
+ "legacy": true,
44
+ "model_max_length": 4096,
45
  "pad_token": "<pad>",
 
 
 
 
 
46
  "sp_model_kwargs": {},
47
+ "tokenizer_class": "LlamaTokenizer",
48
+ "unk_token": "<unk>",
49
+ "use_default_system_prompt": false,
50
+ "vocab_file": "mon_tokenizer.model",
51
+ "vocab_size": 4000
52
  }
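
A quick way to confirm the rewritten config behaves as intended is to load it locally and check the IDs it declares for the special tokens. A minimal sketch, assuming the config above and `mon_tokenizer.model` are in the current directory:

```python
# Sketch only: verify the special-token mapping declared in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", local_files_only=True)

assert tok.bos_token_id == 1      # <s>
assert tok.eos_token_id == 2      # </s>
assert tok.unk_token_id == 0      # <unk>
assert tok.pad_token_id == 4000   # <pad>, appended after the 4,000-token vocabulary
assert tok.model_max_length == 4096
print(type(tok).__name__)         # LlamaTokenizer (slow) or LlamaTokenizerFast
```
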
upload_to_hub.py CHANGED
@@ -1,127 +1,423 @@
1
  #!/usr/bin/env python3
2
-
3
  """
4
- upload mon tokenizer to hugging face hub
 
 
 
 
5
  """
6
 
 
7
  import os
8
  from pathlib import Path
9
- from huggingface_hub import HfApi, login
 
 
10
  from transformers import AutoTokenizer
11
 
 
 
 
 
 
 
 
12
 
13
- def validate_tokenizer(directory: str = ".") -> bool:
14
- """validate tokenizer before upload"""
15
- print("validating tokenizer")
16
-
17
- required_files = [
18
- "mon_tokenizer.model",
19
- "tokenizer_config.json",
20
- "special_tokens_map.json",
21
- "README.md"
22
- ]
23
-
24
- for file in required_files:
25
- if not os.path.exists(os.path.join(directory, file)):
26
- print(f"missing required file: {file}")
27
- return False
28
-
29
- try:
30
- tokenizer = AutoTokenizer.from_pretrained(directory)
31
- test_text = "ဘာသာမန်"
32
- tokens = tokenizer(test_text, return_tensors="pt")
33
- decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
 
 
 
 
 
34
 
35
- if test_text != decoded:
36
- print("tokenizer round-trip test failed")
 
 
 
 
37
  return False
38
-
39
- print("validation passed")
40
  return True
 
 
 
 
 
 
41
 
42
- except Exception as e:
43
- print(f"validation failed: {e}")
44
- return False
45
-
46
-
47
- def upload_to_hub(
48
- repo_id: str = "janakhpon/mon_tokenizer",
49
- directory: str = ".",
50
- private: bool = False,
51
- commit_message: str = "upload mon tokenizer"
52
- ):
53
- """upload tokenizer to hugging face hub"""
54
-
55
- print(f"uploading to {repo_id}")
56
-
57
- # validate first
58
- if not validate_tokenizer(directory):
59
- print("upload cancelled - validation failed")
60
- return False
61
-
62
- try:
63
- # login
64
- print("logging in to hugging face")
65
- login()
66
-
67
- # create api client
68
- api = HfApi()
69
-
70
- # create/update repository
71
- print(f"creating repository: {repo_id}")
72
- api.create_repo(
73
- repo_id=repo_id,
74
- private=private,
75
- exist_ok=True,
76
- repo_type="model"
77
- )
78
-
79
- # upload files
80
- print("uploading files")
81
- api.upload_folder(
82
- folder_path=directory,
83
- repo_id=repo_id,
84
- commit_message=commit_message,
85
- ignore_patterns=[
 
 
 
 
 
86
  "*.pyc",
87
  "__pycache__/",
88
  ".git/",
89
  ".venv/",
90
  "*.lock",
91
- "datasets/"
 
 
 
 
 
 
 
 
92
  ]
93
- )
94
-
95
- print(f"upload successful: https://huggingface.co/{repo_id}")
96
- return True
97
-
98
- except Exception as e:
99
- print(f"upload failed: {e}")
100
- return False
 
 
 
 
 
101
 
102
 
103
  def main():
104
- """main upload function"""
105
- print("mon tokenizer hub uploader")
106
-
107
- # get repo info
108
- repo_id = input("repository id (janakhpon/mon_tokenizer): ").strip()
109
- if not repo_id:
110
- repo_id = "janakhpon/mon_tokenizer"
111
-
112
- private = input("private repository? (y/n): ").strip().lower() == 'y'
113
-
114
- # upload
115
- success = upload_to_hub(
116
- repo_id=repo_id,
117
- private=private,
118
- commit_message="updated mon tokenizer"
 
 
 
 
 
119
  )
120
-
121
- if success:
122
- print("tokenizer successfully uploaded to hugging face hub")
 
 
 
 
 
123
  else:
124
- print("upload failed")
 
 
 
 
 
125
 
126
 
127
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
 
2
  """
3
+ Upload Mon tokenizer to Hugging Face Hub.
4
+
5
+ This script provides functionality to validate and upload the Mon language tokenizer
6
+ to Hugging Face Hub with comprehensive validation and modern best practices.
7
+
8
+ Required files:
9
+ - `tokenizer_config.json`
10
+ - `special_tokens_map.json`
11
+ - `generation_config.json`
12
+ - `README.md`
13
+ - `.gitattributes`
14
+ - `tokenizer.model` or `mon_tokenizer.model` (either filename is auto-detected)
15
+
16
+ Before uploading, the script verifies that every required file exists and logs each file with its size.
17
+
18
+ The script follows current Transformers conventions while staying backward compatible: the explicit `legacy: true` setting keeps the original SentencePiece behaviour and suppresses the LlamaTokenizer legacy warning.
19
+
20
  """
21
 
22
+ import logging
23
  import os
24
  from pathlib import Path
25
+ from typing import List, Optional
26
+
27
+ from huggingface_hub import HfApi, login, whoami
28
  from transformers import AutoTokenizer
29
 
30
+ # Configure logging
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format="%(asctime)s - %(levelname)s - %(message)s",
34
+ handlers=[logging.StreamHandler()],
35
+ )
36
+ logger = logging.getLogger(__name__)
37
 
38
+
39
+ class TokenizerUploader:
40
+ """Handles validation and upload of tokenizers to Hugging Face Hub."""
41
+
42
+ def __init__(self, directory: str = "."):
43
+ """
44
+ Initialize the uploader.
45
+
46
+ Args:
47
+ directory: Directory containing the tokenizer files
48
+ """
49
+ self.directory = Path(directory)
50
+ self.required_files = [
51
+ "tokenizer_config.json",
52
+ "special_tokens_map.json",
53
+ "generation_config.json",
54
+ "README.md",
55
+ ".gitattributes",
56
+ ]
57
+ # Check for either tokenizer.model or mon_tokenizer.model
58
+ self.model_files = ["tokenizer.model", "mon_tokenizer.model"]
59
+
60
+ def validate_files(self) -> bool:
61
+ """
62
+ Validate that all required files are present.
63
+
64
+ Returns:
65
+ bool: True if all files are present, False otherwise
66
+ """
67
+ logger.info(f"Validating tokenizer files in: {self.directory.absolute()}")
68
+
69
+ missing_files = []
70
+ present_files = []
71
+
72
+ # Check regular required files
73
+ for file_name in self.required_files:
74
+ file_path = self.directory / file_name
75
+ if file_path.exists():
76
+ size = file_path.stat().st_size
77
+ present_files.append((file_name, size))
78
+ logger.info(f"✓ {file_name} ({size:,} bytes)")
79
+ else:
80
+ missing_files.append(file_name)
81
+ logger.error(f"✗ {file_name} (missing)")
82
+
83
+ # Check for model file (either name is acceptable)
84
+ model_found = False
85
+ for model_name in self.model_files:
86
+ model_path = self.directory / model_name
87
+ if model_path.exists():
88
+ size = model_path.stat().st_size
89
+ present_files.append((model_name, size))
90
+ logger.info(f"✓ {model_name} ({size:,} bytes)")
91
+ model_found = True
92
+ break
93
 
94
+ if not model_found:
95
+ missing_files.append("tokenizer.model or mon_tokenizer.model")
96
+ logger.error(f"✗ Model file missing (looked for: {', '.join(self.model_files)})")
97
+
98
+ if missing_files:
99
+ logger.error(f"Missing required files: {', '.join(missing_files)}")
100
  return False
101
+
102
+ logger.info(f" All {len(self.required_files)} required files present")
103
  return True
104
+
105
+ def validate_tokenizer_functionality(self) -> bool:
106
+ """
107
+ Validate tokenizer functionality with comprehensive tests.
108
+
109
+ Returns:
110
+ bool: True if all tests pass, False otherwise
111
+ """
112
+ logger.info("Validating tokenizer functionality")
113
+
114
+ try:
115
+ # Load tokenizer with explicit local files only
116
+ abs_directory = str(self.directory.absolute())
117
+ tokenizer = AutoTokenizer.from_pretrained(
118
+ abs_directory,
119
+ local_files_only=True,
120
+ trust_remote_code=False # Security best practice
121
+ )
122
+
123
+ logger.info(f"✓ Tokenizer loaded (vocab: {tokenizer.vocab_size:,})")
124
+
125
+ # Comprehensive test cases for Mon language
126
+ test_cases = [
127
+ {
128
+ "text": "ဘာသာမန်",
129
+ "description": "Simple Mon word"
130
+ },
131
+ {
132
+ "text": "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
133
+ "description": "Complex Mon sentence with punctuation"
134
+ },
135
+ {
136
+ "text": "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
137
+ "description": "Mon text with geographical references"
138
+ },
139
+ {
140
+ "text": "၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ",
141
+ "description": "Mon numerals and dates"
142
+ },
143
+ {
144
+ "text": "",
145
+ "description": "Empty string test"
146
+ }
147
+ ]
148
+
149
+ passed_tests = 0
150
+
151
+ for i, test_case in enumerate(test_cases, 1):
152
+ text = test_case["text"]
153
+ description = test_case["description"]
154
+
155
+ try:
156
+ # Test tokenization
157
+ tokens = tokenizer(text, return_tensors="pt")
158
+
159
+ if tokens["input_ids"].numel() == 0 and text:
160
+ logger.warning(f"⚠ Test {i}: Empty tokenization for non-empty text")
161
+ continue
162
+
163
+ # Test decoding
164
+ decoded = tokenizer.decode(
165
+ tokens["input_ids"][0],
166
+ skip_special_tokens=True
167
+ )
168
+
169
+ # Check round-trip accuracy
170
+ if text.strip() == decoded.strip():
171
+ logger.info(f"✓ Test {i}: {description} - PASSED")
172
+ passed_tests += 1
173
+ else:
174
+ logger.warning(f"⚠ Test {i}: {description} - Round-trip mismatch")
175
+ logger.warning(f" Input: '{text}'")
176
+ logger.warning(f" Output: '{decoded}'")
177
+
178
+ # Non-empty mismatches are logged above but not counted as passes; only the empty-string case is tolerated
179
+ if len(text.strip()) > 0:  # skip counting for real mismatches
180
+ continue
181
+ passed_tests += 1
182
+
183
+ except Exception as e:
184
+ logger.error(f"✗ Test {i}: {description} - FAILED: {e}")
185
+ return False
186
+
187
+ # Check test results
188
+ total_tests = len([tc for tc in test_cases if tc["text"]]) # Exclude empty string
189
+ if passed_tests >= total_tests - 1: # Allow one test to fail
190
+ logger.info(f"✓ Functionality validation passed ({passed_tests}/{len(test_cases)} tests)")
191
+ return True
192
+ else:
193
+ logger.error(f"✗ Functionality validation failed ({passed_tests}/{len(test_cases)} tests passed)")
194
+ return False
195
+
196
+ except Exception as e:
197
+ logger.error(f"✗ Tokenizer validation failed: {e}")
198
+ return False
199
+
200
+ def validate_tokenizer(self) -> bool:
201
+ """
202
+ Run complete tokenizer validation.
203
+
204
+ Returns:
205
+ bool: True if validation passes, False otherwise
206
+ """
207
+ logger.info("=== Starting Tokenizer Validation ===")
208
 
209
+ # Validate files
210
+ if not self.validate_files():
211
+ return False
212
+
213
+ # Validate functionality
214
+ if not self.validate_tokenizer_functionality():
215
+ return False
216
+
217
+ logger.info("✅ Tokenizer validation completed successfully")
218
+ return True
219
+
220
+ def check_authentication(self) -> Optional[str]:
221
+ """
222
+ Check Hugging Face authentication status.
223
+
224
+ Returns:
225
+ Optional[str]: Username if authenticated, None otherwise
226
+ """
227
+ try:
228
+ user_info = whoami()
229
+ username = user_info.get("name", "unknown")
230
+ logger.info(f"✓ Authenticated as: {username}")
231
+ return username
232
+ except Exception:
233
+ logger.warning("Not authenticated with Hugging Face")
234
+ return None
235
+
236
+ def upload_to_hub(
237
+ self,
238
+ repo_id: str,
239
+ private: bool = False,
240
+ commit_message: str = "Upload Mon language tokenizer",
241
+ create_pr: bool = False,
242
+ ) -> bool:
243
+ """
244
+ Upload tokenizer to Hugging Face Hub.
245
+
246
+ Args:
247
+ repo_id: Repository ID (e.g., "username/model-name")
248
+ private: Whether to create a private repository
249
+ commit_message: Commit message for the upload
250
+ create_pr: Whether to create a pull request instead of direct push
251
+
252
+ Returns:
253
+ bool: True if upload successful, False otherwise
254
+ """
255
+ logger.info(f"=== Starting Upload to {repo_id} ===")
256
+
257
+ try:
258
+ # Validate tokenizer first
259
+ if not self.validate_tokenizer():
260
+ logger.error("❌ Upload cancelled - validation failed")
261
+ return False
262
+
263
+ # Check authentication
264
+ if not self.check_authentication():
265
+ logger.info("Attempting to log in...")
266
+ try:
267
+ login()
268
+ if not self.check_authentication():
269
+ logger.error("❌ Authentication failed")
270
+ return False
271
+ except Exception as e:
272
+ logger.error(f"❌ Login failed: {e}")
273
+ return False
274
+
275
+ # Create API client
276
+ api = HfApi()
277
+
278
+ # Create/update repository
279
+ logger.info(f"Creating/updating repository: {repo_id}")
280
+ api.create_repo(
281
+ repo_id=repo_id,
282
+ private=private,
283
+ exist_ok=True,
284
+ repo_type="model"
285
+ )
286
+ logger.info("✓ Repository ready")
287
+
288
+ # List files to upload
289
+ upload_files = []
290
+ ignore_patterns = [
291
  "*.pyc",
292
  "__pycache__/",
293
  ".git/",
294
  ".venv/",
295
  "*.lock",
296
+ "uv.lock",
297
+ "pyproject.toml",
298
+ "datasets/",
299
+ "*.py", # Don't upload Python scripts
300
+ "test_*",
301
+ "sample_*",
302
+ "convert_*",
303
+ "upload_*",
304
+ "*.meta.json"
305
  ]
306
+
307
+ logger.info("Files to be uploaded:")
308
+ for file_path in self.directory.iterdir():
309
+ if file_path.is_file() and not any(
310
+ file_path.match(pattern) for pattern in ignore_patterns
311
+ ):
312
+ size = file_path.stat().st_size
313
+ upload_files.append(file_path.name)
314
+ logger.info(f" ✓ {file_path.name} ({size:,} bytes)")
315
+
316
+ # Upload files
317
+ logger.info("Uploading to Hugging Face Hub...")
318
+ api.upload_folder(
319
+ folder_path=str(self.directory),
320
+ repo_id=repo_id,
321
+ commit_message=commit_message,
322
+ ignore_patterns=ignore_patterns,
323
+ create_pr=create_pr,
324
+ )
325
+
326
+ hub_url = f"https://huggingface.co/{repo_id}"
327
+ logger.info(f"🎉 Upload successful!")
328
+ logger.info(f"📍 Repository URL: {hub_url}")
329
+
330
+ if create_pr:
331
+ logger.info("📝 Pull request created for review")
332
+
333
+ return True
334
+
335
+ except Exception as e:
336
+ logger.error(f"❌ Upload failed: {e}")
337
+ return False
338
 
339
 
340
  def main():
341
+ """Main entry point for the upload script."""
342
+ import argparse
343
+
344
+ parser = argparse.ArgumentParser(
345
+ description="Upload Mon tokenizer to Hugging Face Hub"
346
+ )
347
+ parser.add_argument(
348
+ "--repo-id",
349
+ default="janakhpon/mon_tokenizer",
350
+ help="Repository ID (default: janakhpon/mon_tokenizer)",
351
+ )
352
+ parser.add_argument(
353
+ "--directory",
354
+ default=".",
355
+ help="Directory containing tokenizer files (default: current directory)",
356
+ )
357
+ parser.add_argument(
358
+ "--private",
359
+ action="store_true",
360
+ help="Create private repository",
361
+ )
362
+ parser.add_argument(
363
+ "--message",
364
+ default="Upload Mon language tokenizer",
365
+ help="Commit message",
366
+ )
367
+ parser.add_argument(
368
+ "--create-pr",
369
+ action="store_true",
370
+ help="Create pull request instead of direct push",
371
  )
372
+ parser.add_argument(
373
+ "--validate-only",
374
+ action="store_true",
375
+ help="Only validate tokenizer, don't upload",
376
+ )
377
+ parser.add_argument(
378
+ "--verbose",
379
+ action="store_true",
380
+ help="Enable verbose logging",
381
+ )
382
+
383
+ args = parser.parse_args()
384
+
385
+ if args.verbose:
386
+ logging.getLogger().setLevel(logging.DEBUG)
387
+
388
+ # Create uploader
389
+ uploader = TokenizerUploader(directory=args.directory)
390
+
391
+ if args.validate_only:
392
+ # Only validate
393
+ success = uploader.validate_tokenizer()
394
+ logger.info("Validation completed.")
395
  else:
396
+ # Interactive mode if no repo ID provided
397
+ if args.repo_id == "janakhpon/mon_tokenizer":
398
+ print("\n🤗 Mon Tokenizer Hub Uploader")
399
+ print("=" * 40)
400
+
401
+ repo_input = input(f"Repository ID [{args.repo_id}]: ").strip()
402
+ if repo_input:
403
+ args.repo_id = repo_input
404
+
405
+ private_input = input("Private repository? (y/N): ").strip().lower()
406
+ args.private = private_input == 'y'
407
+
408
+ print(f"\nUploading to: {args.repo_id}")
409
+ print(f"Private: {args.private}")
410
+ print("-" * 40)
411
+
412
+ # Upload tokenizer
413
+ success = uploader.upload_to_hub(
414
+ repo_id=args.repo_id,
415
+ private=args.private,
416
+ commit_message=args.message,
417
+ create_pr=args.create_pr,
418
+ )
419
+
420
+ exit(0 if success else 1)
421
 
422
 
423
  if __name__ == "__main__":
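
The uploader class above can also be used directly from Python, which is handy in CI. A minimal sketch, assuming you are already authenticated (for example via `huggingface-cli login`) and `upload_to_hub.py` is importable:

```python
# Sketch only: validate locally, then push to the Hub via the new TokenizerUploader class.
from upload_to_hub import TokenizerUploader

uploader = TokenizerUploader(directory=".")

# Same checks as --validate-only: required files + round-trip tests, no upload yet
if uploader.validate_tokenizer():
    uploader.upload_to_hub(
        repo_id="janakhpon/mon_tokenizer",
        private=False,
        commit_message="Upload Mon language tokenizer",
        create_pr=True,  # open a PR for review instead of pushing directly
    )
```
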
uv.lock CHANGED
@@ -571,13 +571,13 @@ dev = [
571
  [package.metadata]
572
  requires-dist = [
573
  { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
574
- { name = "huggingface-hub", specifier = ">=0.15.0" },
575
  { name = "isort", marker = "extra == 'dev'", specifier = ">=5.12.0" },
576
- { name = "protobuf", specifier = ">=3.20.0" },
577
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
578
- { name = "sentencepiece", specifier = ">=0.1.99" },
579
- { name = "torch", specifier = ">=1.12.0" },
580
- { name = "transformers", specifier = ">=4.30.0" },
581
  ]
582
  provides-extras = ["dev"]
583
 
 
571
  [package.metadata]
572
  requires-dist = [
573
  { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
574
+ { name = "huggingface-hub", specifier = ">=0.24.0" },
575
  { name = "isort", marker = "extra == 'dev'", specifier = ">=5.12.0" },
576
+ { name = "protobuf", specifier = ">=4.21.0" },
577
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
578
+ { name = "sentencepiece", specifier = ">=0.2.0" },
579
+ { name = "torch", specifier = ">=2.0.0" },
580
+ { name = "transformers", specifier = ">=4.45.0" },
581
  ]
582
  provides-extras = ["dev"]
583
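
The lockfile change above raises the minimum versions of the runtime dependencies. A minimal sketch for checking an existing environment against the new floors (assumes the packages are installed; strict version comparison is left to the packaging tooling):

```python
# Sketch only: print installed versions next to the bumped minimums from uv.lock.
from importlib.metadata import version, PackageNotFoundError

minimums = {
    "huggingface-hub": "0.24.0",
    "protobuf": "4.21.0",
    "sentencepiece": "0.2.0",
    "torch": "2.0.0",
    "transformers": "4.45.0",
}

for name, floor in minimums.items():
    try:
        print(f"{name}: installed {version(name)} (required >= {floor})")
    except PackageNotFoundError:
        print(f"{name}: not installed (required >= {floor})")
```
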