| { | |
| "experiment": "C_tokenizer_ablation", | |
| "timestamp": "2026-04-13 09:30:52 UTC", | |
| "tokenizers": { | |
| "Ours-32K": { | |
| "vocab_size": 32000 | |
| }, | |
| "Llama-2": { | |
| "vocab_size": 32000 | |
| }, | |
| "HebrewGPT": { | |
| "vocab_size": 32000 | |
| } | |
| }, | |
| "vocabulary_composition": { | |
| "Ours-32K": { | |
| "Arabic": { | |
| "count": 14945, | |
| "pct": 46.7 | |
| }, | |
| "Hebrew": { | |
| "count": 8888, | |
| "pct": 27.8 | |
| }, | |
| "Latin": { | |
| "count": 7778, | |
| "pct": 24.3 | |
| }, | |
| "Other": { | |
| "count": 278, | |
| "pct": 0.9 | |
| }, | |
| "Digit": { | |
| "count": 110, | |
| "pct": 0.3 | |
| }, | |
| "Special": { | |
| "count": 1, | |
| "pct": 0.0 | |
| } | |
| }, | |
| "Llama-2": { | |
| "Latin": { | |
| "count": 25900, | |
| "pct": 80.9 | |
| }, | |
| "Other": { | |
| "count": 5848, | |
| "pct": 18.3 | |
| }, | |
| "Digit": { | |
| "count": 133, | |
| "pct": 0.4 | |
| }, | |
| "Arabic": { | |
| "count": 54, | |
| "pct": 0.2 | |
| }, | |
| "Hebrew": { | |
| "count": 36, | |
| "pct": 0.1 | |
| }, | |
| "Special": { | |
| "count": 16, | |
| "pct": 0.1 | |
| }, | |
| "Space": { | |
| "count": 13, | |
| "pct": 0.0 | |
| } | |
| }, | |
| "HebrewGPT": { | |
| "Hebrew": { | |
| "count": 23101, | |
| "pct": 72.2 | |
| }, | |
| "Other": { | |
| "count": 6399, | |
| "pct": 20.0 | |
| }, | |
| "Latin": { | |
| "count": 2238, | |
| "pct": 7.0 | |
| }, | |
| "Arabic": { | |
| "count": 137, | |
| "pct": 0.4 | |
| }, | |
| "Digit": { | |
| "count": 124, | |
| "pct": 0.4 | |
| }, | |
| "Special": { | |
| "count": 1, | |
| "pct": 0.0 | |
| } | |
| } | |
| }, | |
| "fertility": { | |
| "en": { | |
| "Ours-32K": { | |
| "fertility": 1.544, | |
| "bytes_per_token": 3.71, | |
| "total_tokens": 2785, | |
| "total_bytes": 10322 | |
| }, | |
| "Llama-2": { | |
| "fertility": 1.51, | |
| "bytes_per_token": 3.79, | |
| "total_tokens": 2724, | |
| "total_bytes": 10322 | |
| }, | |
| "HebrewGPT": { | |
| "fertility": 2.419, | |
| "bytes_per_token": 2.37, | |
| "total_tokens": 4364, | |
| "total_bytes": 10322 | |
| } | |
| }, | |
| "he": { | |
| "Ours-32K": { | |
| "fertility": 1.343, | |
| "bytes_per_token": 5.12, | |
| "total_tokens": 8866, | |
| "total_bytes": 45378 | |
| }, | |
| "Llama-2": { | |
| "fertility": 3.909, | |
| "bytes_per_token": 1.76, | |
| "total_tokens": 25806, | |
| "total_bytes": 45378 | |
| }, | |
| "HebrewGPT": { | |
| "fertility": 1.255, | |
| "bytes_per_token": 5.48, | |
| "total_tokens": 8283, | |
| "total_bytes": 45378 | |
| } | |
| }, | |
| "ar": { | |
| "Ours-32K": { | |
| "fertility": 2.222, | |
| "bytes_per_token": 3.48, | |
| "total_tokens": 7776, | |
| "total_bytes": 27023 | |
| }, | |
| "Llama-2": { | |
| "fertility": 4.363, | |
| "bytes_per_token": 1.77, | |
| "total_tokens": 15266, | |
| "total_bytes": 27023 | |
| }, | |
| "HebrewGPT": { | |
| "fertility": 4.154, | |
| "bytes_per_token": 1.86, | |
| "total_tokens": 14535, | |
| "total_bytes": 27023 | |
| } | |
| }, | |
| "fa": { | |
| "Ours-32K": { | |
| "fertility": 1.52, | |
| "bytes_per_token": 5.72, | |
| "total_tokens": 5302, | |
| "total_bytes": 30327 | |
| }, | |
| "Llama-2": { | |
| "fertility": 4.876, | |
| "bytes_per_token": 1.78, | |
| "total_tokens": 17014, | |
| "total_bytes": 30327 | |
| }, | |
| "HebrewGPT": { | |
| "fertility": 4.508, | |
| "bytes_per_token": 1.93, | |
| "total_tokens": 15727, | |
| "total_bytes": 30327 | |
| } | |
| } | |
| }, | |
| "efficiency": { | |
| "en": { | |
| "Ours-32K": 0.2698, | |
| "Llama-2": 0.2639, | |
| "HebrewGPT": 0.4228 | |
| }, | |
| "he": { | |
| "Ours-32K": 0.1954, | |
| "Llama-2": 0.5687, | |
| "HebrewGPT": 0.1825 | |
| }, | |
| "ar": { | |
| "Ours-32K": 0.2878, | |
| "Llama-2": 0.5649, | |
| "HebrewGPT": 0.5379 | |
| }, | |
| "fa": { | |
| "Ours-32K": 0.1748, | |
| "Llama-2": 0.561, | |
| "HebrewGPT": 0.5186 | |
| } | |
| } | |
| } |