SemiticGPT-3B / exp_c_tokenizer_ablation.json
ronnengmail's picture
Upload exp_c_tokenizer_ablation.json with huggingface_hub
e06a0c9 verified
{
"experiment": "C_tokenizer_ablation",
"timestamp": "2026-04-13 09:30:52 UTC",
"tokenizers": {
"Ours-32K": {
"vocab_size": 32000
},
"Llama-2": {
"vocab_size": 32000
},
"HebrewGPT": {
"vocab_size": 32000
}
},
"vocabulary_composition": {
"Ours-32K": {
"Arabic": {
"count": 14945,
"pct": 46.7
},
"Hebrew": {
"count": 8888,
"pct": 27.8
},
"Latin": {
"count": 7778,
"pct": 24.3
},
"Other": {
"count": 278,
"pct": 0.9
},
"Digit": {
"count": 110,
"pct": 0.3
},
"Special": {
"count": 1,
"pct": 0.0
}
},
"Llama-2": {
"Latin": {
"count": 25900,
"pct": 80.9
},
"Other": {
"count": 5848,
"pct": 18.3
},
"Digit": {
"count": 133,
"pct": 0.4
},
"Arabic": {
"count": 54,
"pct": 0.2
},
"Hebrew": {
"count": 36,
"pct": 0.1
},
"Special": {
"count": 16,
"pct": 0.1
},
"Space": {
"count": 13,
"pct": 0.0
}
},
"HebrewGPT": {
"Hebrew": {
"count": 23101,
"pct": 72.2
},
"Other": {
"count": 6399,
"pct": 20.0
},
"Latin": {
"count": 2238,
"pct": 7.0
},
"Arabic": {
"count": 137,
"pct": 0.4
},
"Digit": {
"count": 124,
"pct": 0.4
},
"Special": {
"count": 1,
"pct": 0.0
}
}
},
"fertility": {
"en": {
"Ours-32K": {
"fertility": 1.544,
"bytes_per_token": 3.71,
"total_tokens": 2785,
"total_bytes": 10322
},
"Llama-2": {
"fertility": 1.51,
"bytes_per_token": 3.79,
"total_tokens": 2724,
"total_bytes": 10322
},
"HebrewGPT": {
"fertility": 2.419,
"bytes_per_token": 2.37,
"total_tokens": 4364,
"total_bytes": 10322
}
},
"he": {
"Ours-32K": {
"fertility": 1.343,
"bytes_per_token": 5.12,
"total_tokens": 8866,
"total_bytes": 45378
},
"Llama-2": {
"fertility": 3.909,
"bytes_per_token": 1.76,
"total_tokens": 25806,
"total_bytes": 45378
},
"HebrewGPT": {
"fertility": 1.255,
"bytes_per_token": 5.48,
"total_tokens": 8283,
"total_bytes": 45378
}
},
"ar": {
"Ours-32K": {
"fertility": 2.222,
"bytes_per_token": 3.48,
"total_tokens": 7776,
"total_bytes": 27023
},
"Llama-2": {
"fertility": 4.363,
"bytes_per_token": 1.77,
"total_tokens": 15266,
"total_bytes": 27023
},
"HebrewGPT": {
"fertility": 4.154,
"bytes_per_token": 1.86,
"total_tokens": 14535,
"total_bytes": 27023
}
},
"fa": {
"Ours-32K": {
"fertility": 1.52,
"bytes_per_token": 5.72,
"total_tokens": 5302,
"total_bytes": 30327
},
"Llama-2": {
"fertility": 4.876,
"bytes_per_token": 1.78,
"total_tokens": 17014,
"total_bytes": 30327
},
"HebrewGPT": {
"fertility": 4.508,
"bytes_per_token": 1.93,
"total_tokens": 15727,
"total_bytes": 30327
}
}
},
"efficiency": {
"en": {
"Ours-32K": 0.2698,
"Llama-2": 0.2639,
"HebrewGPT": 0.4228
},
"he": {
"Ours-32K": 0.1954,
"Llama-2": 0.5687,
"HebrewGPT": 0.1825
},
"ar": {
"Ours-32K": 0.2878,
"Llama-2": 0.5649,
"HebrewGPT": 0.5379
},
"fa": {
"Ours-32K": 0.1748,
"Llama-2": 0.561,
"HebrewGPT": 0.5186
}
}
}