janakhpon committed on
Commit e9d0f85 · 1 Parent(s): 81cf36d

feat: simplified mon tokenizer in hf format, updated tags, resolve the legacy issue

convert_to_hf.py CHANGED
@@ -9,7 +9,7 @@ convert_to_hf.py output files:
 - `tokenizer_config.json` - main config with modern `added_tokens_decoder` structure (not legacy)
 - `special_tokens_map.json` - special token definitions
 - `generation_config.json` - generation parameters
-- `mon_tokenizer.model` - the sentencepiece model file
+- `tokenizer.model` - the sentencepiece model file
 - `README.md` - comprehensive model card
 - `.gitattributes` - git lfs configuration

@@ -40,7 +40,7 @@ class MonTokenizerConverter:

     def __init__(
         self,
-        model_file: str = "mon_tokenizer.model",
+        model_file: str = "tokenizer.model",
         meta_file: str = "mon_tokenizer.meta.json",
         output_dir: str = ".",
     ):
@@ -178,7 +178,7 @@ class MonTokenizerConverter:
             "tokenizer_class": "LlamaTokenizer",
             "unk_token": analysis["unk_token"],
             "use_default_system_prompt": False,
-            "vocab_file": "mon_tokenizer.model",
+            "vocab_file": "tokenizer.model",
             "vocab_size": analysis["vocab_size"]
         }

@@ -414,7 +414,7 @@ For questions or issues, please open an issue on the repository or contact the m
         analysis = self.analyze_sentencepiece_model()

         # Copy model file (use original name for compatibility)
-        model_dest = self.output_dir / "mon_tokenizer.model"
+        model_dest = self.output_dir / "tokenizer.model"
         if not model_dest.exists() or model_dest.resolve() != self.model_file.resolve():
             logger.info("Copying SentencePiece model file")
             shutil.copy2(self.model_file, model_dest)
@@ -469,8 +469,8 @@ def main():
     )
     parser.add_argument(
         "--model",
-        default="mon_tokenizer.model",
-        help="Path to SentencePiece model file (default: mon_tokenizer.model)",
+        default="tokenizer.model",
+        help="Path to SentencePiece model file (default: tokenizer.model)",
     )
     parser.add_argument(
         "--meta",
mon_tokenizer.meta.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "model_path": "mon_tokenizer.model",
+  "model_path": "tokenizer.model",
   "vocab_path": "mon_tokenizer.vocab",
   "lines_trained": 32412,
   "total_characters": 2453293,
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
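
The new `tokenizer.json` adds a fast (Rust-backed) tokenizer alongside the SentencePiece model. The commit does not show how it was generated; one plausible route, sketched here under the assumption that it was derived from the SentencePiece file, is to let `LlamaTokenizerFast` convert the slow tokenizer and save the result:

```python
from transformers import LlamaTokenizerFast

# Convert the slow SentencePiece tokenizer into a fast tokenizer.json.
# The conversion step requires the sentencepiece and protobuf packages.
# legacy=False opts into the current (non-legacy) tokenization behavior.
fast = LlamaTokenizerFast(vocab_file="tokenizer.model", legacy=False)
fast.save_pretrained(".")  # writes tokenizer.json next to the other tokenizer files
```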
 
mon_tokenizer.model → tokenizer.model RENAMED
File without changes
tokenizer_config.json CHANGED
@@ -47,6 +47,6 @@
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",
   "use_default_system_prompt": false,
-  "vocab_file": "mon_tokenizer.model",
+  "vocab_file": "tokenizer.model",
   "vocab_size": 4000
 }
upload_to_hub.py CHANGED
@@ -5,17 +5,24 @@ Upload Mon tokenizer to Hugging Face Hub.
 This script provides functionality to validate and upload the Mon language tokenizer
 to Hugging Face Hub with comprehensive validation and modern best practices.

-required files:
-- `tokenizer_config.json`
-- `special_tokens_map.json`
-- `generation_config.json`
-- `README.md`
-- `.gitattributes`
-- `mon_tokenizer.model` (auto-detects either `tokenizer.model` or `mon_tokenizer.model`)
+Required files:
+- `tokenizer_config.json` - Main tokenizer configuration
+- `special_tokens_map.json` - Special token mappings
+- `README.md` - Model documentation and usage instructions
+- `.gitattributes` - Git LFS configuration for large files

-Nothing missing - the script validates all files exist before upload and lists each file with size before uploading.
+Required tokenizer model files (at least one):
+- `tokenizer.json` - Fast tokenizer (recommended, HuggingFace Tokenizers)
+- `tokenizer.model` - SentencePiece model file (slow tokenizer)
+- `mon_tokenizer.model` - Custom named SentencePiece model

-The scripts now use modern conventions while maintaining backward compatibility through the `legacy: true` setting, which prevents the llama tokenizer warnings while using the latest transformers architecture.
+Optional but recommended files:
+- `generation_config.json` - Text generation configuration
+- `vocab.txt` - Vocabulary file for certain tokenizer types
+- `merges.txt` - BPE merge rules for certain tokenizer types
+
+The script validates all files exist before upload, supports both fast and slow tokenizers,
+and uses modern HuggingFace Hub conventions while maintaining backward compatibility.

 """

@@ -50,12 +57,20 @@ class TokenizerUploader:
         self.required_files = [
             "tokenizer_config.json",
             "special_tokens_map.json",
-            "generation_config.json",
             "README.md",
             ".gitattributes",
         ]
-        # Check for either tokenizer.model or mon_tokenizer.model
-        self.model_files = ["tokenizer.model", "mon_tokenizer.model"]
+        # Optional but recommended files
+        self.optional_files = [
+            "generation_config.json",
+            "vocab.txt",
+            "merges.txt",
+            "tokenizer.json",  # Fast tokenizer (becomes required if no .model file)
+            "added_tokens.json",  # Additional tokens
+            "preprocessor_config.json",  # Preprocessing configuration
+        ]
+        # Tokenizer model files - check for either fast or slow tokenizer
+        self.model_files = ["tokenizer.json", "tokenizer.model", "mon_tokenizer.model"]

     def validate_files(self) -> bool:
         """
@@ -80,31 +95,44 @@
             missing_files.append(file_name)
             logger.error(f"✗ {file_name} (missing)")

-        # Check for model file (either name is acceptable)
+        # Check optional files
+        for file_name in self.optional_files:
+            file_path = self.directory / file_name
+            if file_path.exists():
+                size = file_path.stat().st_size
+                present_files.append((file_name, size))
+                logger.info(f"✓ {file_name} ({size:,} bytes) [optional]")
+
+        # Check for tokenizer model files - at least one must exist
         model_found = False
+        found_models = []
         for model_name in self.model_files:
             model_path = self.directory / model_name
             if model_path.exists():
                 size = model_path.stat().st_size
                 present_files.append((model_name, size))
+                found_models.append(model_name)
                 logger.info(f"✓ {model_name} ({size:,} bytes)")
                 model_found = True
-                break

         if not model_found:
-            missing_files.append("tokenizer.model or mon_tokenizer.model")
-            logger.error(f"✗ Model file missing (looked for: {', '.join(self.model_files)})")
+            missing_files.append("tokenizer model file (tokenizer.json, tokenizer.model, or mon_tokenizer.model)")
+            logger.error(f"✗ No tokenizer model file found (looked for: {', '.join(self.model_files)})")
+        else:
+            logger.info(f"✓ Found tokenizer model(s): {', '.join(found_models)}")

         if missing_files:
             logger.error(f"Missing required files: {', '.join(missing_files)}")
             return False

-        logger.info(f"✓ All {len(self.required_files)} required files present")
+        total_required = len(self.required_files) + 1  # +1 for model file
+        logger.info(f"✓ All {total_required} essential files present")
         return True

     def validate_tokenizer_functionality(self) -> bool:
         """
         Validate tokenizer functionality with comprehensive tests.
+        Supports both fast (tokenizer.json) and slow (tokenizer.model) tokenizers.

         Returns:
             bool: True if all tests pass, False otherwise
@@ -114,13 +142,24 @@
         try:
             # Load tokenizer with explicit local files only
             abs_directory = str(self.directory.absolute())
+
+            # Determine tokenizer type for better error handling
+            has_fast = (self.directory / "tokenizer.json").exists()
+            has_slow = any((self.directory / model).exists() for model in ["tokenizer.model", "mon_tokenizer.model"])
+
+            if has_fast:
+                logger.info("Detected fast tokenizer (tokenizer.json)")
+            if has_slow:
+                logger.info("Detected slow tokenizer (*.model)")
+
             tokenizer = AutoTokenizer.from_pretrained(
                 abs_directory,
                 local_files_only=True,
                 trust_remote_code=False  # Security best practice
             )

-            logger.info(f"✓ Tokenizer loaded (vocab: {tokenizer.vocab_size:,})")
+            tokenizer_type = "fast" if tokenizer.is_fast else "slow"
+            logger.info(f"✓ {tokenizer_type.capitalize()} tokenizer loaded (vocab: {tokenizer.vocab_size:,})")

             # Comprehensive test cases for Mon language
             test_cases = [
@@ -288,20 +327,63 @@
         # List files to upload
         upload_files = []
         ignore_patterns = [
+            # Python compilation artifacts
             "*.pyc",
             "__pycache__/",
+            "*.pyo",
+
+            # Version control and development
             ".git/",
+            ".gitignore",
             ".venv/",
+            "venv/",
+            ".env",
+            ".env.*",
+            ".python-version",
+
+            # Build and dependency files
             "*.lock",
             "uv.lock",
+            "Pipfile.lock",
+            "poetry.lock",
             "pyproject.toml",
-            "datasets/",
-            "*.py",  # Don't upload Python scripts
+            "setup.py",
+            "setup.cfg",
+            "requirements.txt",
+            "requirements-dev.txt",
+
+            # Development and testing files
             "test_*",
+            "tests/",
+            "*_test.py",
             "sample_*",
+            "example_*",
+            "demo_*",
+
+            # Build and conversion scripts
             "convert_*",
-            "upload_*",
-            "*.meta.json"
+            "upload_*",
+            "build_*",
+            "*.py",  # Don't upload Python scripts
+
+            # Dataset and training artifacts
+            "datasets/",
+            "data/",
+            "checkpoints/",
+            "logs/",
+            "wandb/",
+
+            # Temporary and cache files
+            "*.tmp",
+            "*.temp",
+            ".cache/",
+            "*.meta.json",
+            "*.backup",
+
+            # OS specific files
+            ".DS_Store",
+            "Thumbs.db",
+            "desktop.ini"
         ]

         logger.info("Files to be uploaded:")