Spaces:
Sleeping
Sleeping
Add Llama tokenizer creation for Dutch, English, Code, Markdown and TeX.
Browse files- app.py +1 -1
- app_compression.py +1 -1
- config.py +17 -5
- stats/compress_rate.json +504 -0
- utils/compression_util.py +2 -2
- vocab/wizardcoder_15b_v1/__init__.py +4 -4
app.py
CHANGED
|
@@ -8,7 +8,7 @@ from patcher.gr_interface import TabbedInterface
|
|
| 8 |
demo = TabbedInterface(
|
| 9 |
[tab_playground, tab_compression],
|
| 10 |
[" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
|
| 11 |
-
title='
|
| 12 |
css="css/style.css"
|
| 13 |
)
|
| 14 |
|
|
|
|
| 8 |
demo = TabbedInterface(
|
| 9 |
[tab_playground, tab_compression],
|
| 10 |
[" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
|
| 11 |
+
title='Tokenizer Arena ⚔️ (with some Dutch 🇳🇱🇧🇪🇸🇷 hacked in)',
|
| 12 |
css="css/style.css"
|
| 13 |
)
|
| 14 |
|
app_compression.py
CHANGED
|
@@ -59,7 +59,7 @@ with gr.Blocks() as demo:
|
|
| 59 |
with gr.Row():
|
| 60 |
compress_rate_corpus = gr.Dropdown(
|
| 61 |
common_corpuses, # , "code"
|
| 62 |
-
value=["cc100-
|
| 63 |
label="corpus",
|
| 64 |
multiselect=True
|
| 65 |
# info=""
|
|
|
|
| 59 |
with gr.Row():
|
| 60 |
compress_rate_corpus = gr.Dropdown(
|
| 61 |
common_corpuses, # , "code"
|
| 62 |
+
value=["cc100-nl", "cc100-en"],
|
| 63 |
label="corpus",
|
| 64 |
multiselect=True
|
| 65 |
# info=""
|
config.py
CHANGED
|
@@ -11,10 +11,22 @@ LAZY_IMPORT = True
|
|
| 11 |
# DEBUG: 设置环境变量 RUST_BACKTRACE=full
|
| 12 |
#
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
ラグビーワールドカップ2023フランス"""
|
| 19 |
default_tokenizer_type_1 = "llama3"
|
| 20 |
-
default_tokenizer_type_2 = "
|
|
|
|
|
|
| 11 |
# DEBUG: 设置环境变量 RUST_BACKTRACE=full
|
| 12 |
#
|
| 13 |
|
| 14 |
+
|
| 15 |
+
default_user_input = """“We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
|
| 16 |
+
Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
|
| 17 |
+
def load_image_file(file, mode='RGB'):
|
| 18 |
+
im = PIL.Image.open(file)
|
| 19 |
+
if mode:
|
| 20 |
+
im = im.convert(mode)
|
| 21 |
+
return np.array(im)
|
| 22 |
+
\section{The expected number of intervening \mbox{H\,{\sc i}}
|
| 23 |
+
absorbers}\label{section:expected_number}
|
| 24 |
+
\begin{equation}\label{equation:expected_number}
|
| 25 |
+
\mu = \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
|
| 26 |
+
\end{equation}
|
| 27 |
+
Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
|
| 28 |
+
华为发布Mate60手机
|
| 29 |
ラグビーワールドカップ2023フランス"""
|
| 30 |
default_tokenizer_type_1 = "llama3"
|
| 31 |
+
# default_tokenizer_type_2 = "internlm_chat_7b"
|
| 32 |
+
default_tokenizer_type_2 = "mistral_7b"
|
stats/compress_rate.json
CHANGED
|
@@ -4282,5 +4282,509 @@
|
|
| 4282 |
"n_bytes": 2633047,
|
| 4283 |
"n_tokens": 757405,
|
| 4284 |
"n_chars": 927311
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4285 |
}
|
| 4286 |
}
|
|
|
|
| 4282 |
"n_bytes": 2633047,
|
| 4283 |
"n_tokens": 757405,
|
| 4284 |
"n_chars": 927311
|
| 4285 |
+
},
|
| 4286 |
+
"dutch_llama_tokenizer.cc100-en": {
|
| 4287 |
+
"vocab_size": 32000,
|
| 4288 |
+
"n_bytes": 1124813,
|
| 4289 |
+
"n_tokens": 291975,
|
| 4290 |
+
"n_chars": 1121360
|
| 4291 |
+
},
|
| 4292 |
+
"gronlp-gpt2-small-dutch.cc100-en": {
|
| 4293 |
+
"vocab_size": 40000,
|
| 4294 |
+
"n_bytes": 1124813,
|
| 4295 |
+
"n_tokens": 361710,
|
| 4296 |
+
"n_chars": 1121360
|
| 4297 |
+
},
|
| 4298 |
+
"yhavinga-gpt2-medium-dutch.cc100-en": {
|
| 4299 |
+
"vocab_size": 50257,
|
| 4300 |
+
"n_bytes": 1124813,
|
| 4301 |
+
"n_tokens": 361847,
|
| 4302 |
+
"n_chars": 1121360
|
| 4303 |
+
},
|
| 4304 |
+
"yhavinga-ul2-large-en-nl.cc100-en": {
|
| 4305 |
+
"vocab_size": 32128,
|
| 4306 |
+
"n_bytes": 1124813,
|
| 4307 |
+
"n_tokens": 297641,
|
| 4308 |
+
"n_chars": 1121360
|
| 4309 |
+
},
|
| 4310 |
+
"dutch_llama_tokenizer.cc100-zh-Hans": {
|
| 4311 |
+
"vocab_size": 32000,
|
| 4312 |
+
"n_bytes": 2633047,
|
| 4313 |
+
"n_tokens": 2621293,
|
| 4314 |
+
"n_chars": 927311
|
| 4315 |
+
},
|
| 4316 |
+
"gronlp-gpt2-small-dutch.cc100-zh-Hans": {
|
| 4317 |
+
"vocab_size": 40000,
|
| 4318 |
+
"n_bytes": 2633047,
|
| 4319 |
+
"n_tokens": 1350320,
|
| 4320 |
+
"n_chars": 927311
|
| 4321 |
+
},
|
| 4322 |
+
"yhavinga-gpt2-medium-dutch.cc100-zh-Hans": {
|
| 4323 |
+
"vocab_size": 50257,
|
| 4324 |
+
"n_bytes": 2633047,
|
| 4325 |
+
"n_tokens": 2600872,
|
| 4326 |
+
"n_chars": 927311
|
| 4327 |
+
},
|
| 4328 |
+
"yhavinga-ul2-large-en-nl.cc100-zh-Hans": {
|
| 4329 |
+
"vocab_size": 32128,
|
| 4330 |
+
"n_bytes": 2633047,
|
| 4331 |
+
"n_tokens": 2519719,
|
| 4332 |
+
"n_chars": 927311
|
| 4333 |
+
},
|
| 4334 |
+
"aya_101.cc100-nl": {
|
| 4335 |
+
"vocab_size": 250100,
|
| 4336 |
+
"n_bytes": 1513030,
|
| 4337 |
+
"n_tokens": 423616,
|
| 4338 |
+
"n_chars": 1508067
|
| 4339 |
+
},
|
| 4340 |
+
"baichuan.cc100-nl": {
|
| 4341 |
+
"vocab_size": 64000,
|
| 4342 |
+
"n_bytes": 1513030,
|
| 4343 |
+
"n_tokens": 574927,
|
| 4344 |
+
"n_chars": 1508067
|
| 4345 |
+
},
|
| 4346 |
+
"baichuan2.cc100-nl": {
|
| 4347 |
+
"vocab_size": 125696,
|
| 4348 |
+
"n_bytes": 1513030,
|
| 4349 |
+
"n_tokens": 540387,
|
| 4350 |
+
"n_chars": 1508067
|
| 4351 |
+
},
|
| 4352 |
+
"bert_base_cased.cc100-nl": {
|
| 4353 |
+
"vocab_size": 28996,
|
| 4354 |
+
"n_bytes": 1513030,
|
| 4355 |
+
"n_tokens": 630793,
|
| 4356 |
+
"n_chars": 1508067
|
| 4357 |
+
},
|
| 4358 |
+
"bert_base_chinese.cc100-nl": {
|
| 4359 |
+
"vocab_size": 21128,
|
| 4360 |
+
"n_bytes": 1513030,
|
| 4361 |
+
"n_tokens": 626052,
|
| 4362 |
+
"n_chars": 1508067
|
| 4363 |
+
},
|
| 4364 |
+
"bert_base_uncased.cc100-nl": {
|
| 4365 |
+
"vocab_size": 30522,
|
| 4366 |
+
"n_bytes": 1513030,
|
| 4367 |
+
"n_tokens": 574651,
|
| 4368 |
+
"n_chars": 1508067
|
| 4369 |
+
},
|
| 4370 |
+
"bloom.cc100-nl": {
|
| 4371 |
+
"vocab_size": 250680,
|
| 4372 |
+
"n_bytes": 1513030,
|
| 4373 |
+
"n_tokens": 488924,
|
| 4374 |
+
"n_chars": 1508067
|
| 4375 |
+
},
|
| 4376 |
+
"byt5_small.cc100-nl": {
|
| 4377 |
+
"vocab_size": 384,
|
| 4378 |
+
"n_bytes": 1513030,
|
| 4379 |
+
"n_tokens": 1523030,
|
| 4380 |
+
"n_chars": 1508067
|
| 4381 |
+
},
|
| 4382 |
+
"character_glm_6b.cc100-nl": {
|
| 4383 |
+
"vocab_size": 64789,
|
| 4384 |
+
"n_bytes": 1513030,
|
| 4385 |
+
"n_tokens": 559014,
|
| 4386 |
+
"n_chars": 1508067
|
| 4387 |
+
},
|
| 4388 |
+
"chatglm2_6b.cc100-nl": {
|
| 4389 |
+
"vocab_size": 64787,
|
| 4390 |
+
"n_bytes": 1513030,
|
| 4391 |
+
"n_tokens": 559017,
|
| 4392 |
+
"n_chars": 1508067
|
| 4393 |
+
},
|
| 4394 |
+
"chatglm3_6b.cc100-nl": {
|
| 4395 |
+
"vocab_size": 64796,
|
| 4396 |
+
"n_bytes": 1513030,
|
| 4397 |
+
"n_tokens": 559014,
|
| 4398 |
+
"n_chars": 1508067
|
| 4399 |
+
},
|
| 4400 |
+
"chatglm_6b.cc100-nl": {
|
| 4401 |
+
"vocab_size": 150344,
|
| 4402 |
+
"n_bytes": 1513030,
|
| 4403 |
+
"n_tokens": 533174,
|
| 4404 |
+
"n_chars": 1508067
|
| 4405 |
+
},
|
| 4406 |
+
"chatyuan_large_v2.cc100-nl": {
|
| 4407 |
+
"vocab_size": 32128,
|
| 4408 |
+
"n_bytes": 1513030,
|
| 4409 |
+
"n_tokens": 837963,
|
| 4410 |
+
"n_chars": 1508067
|
| 4411 |
+
},
|
| 4412 |
+
"chinese_llama.cc100-nl": {
|
| 4413 |
+
"vocab_size": 49953,
|
| 4414 |
+
"n_bytes": 1513030,
|
| 4415 |
+
"n_tokens": 488766,
|
| 4416 |
+
"n_chars": 1508067
|
| 4417 |
+
},
|
| 4418 |
+
"chinese_llama2.cc100-nl": {
|
| 4419 |
+
"vocab_size": 55296,
|
| 4420 |
+
"n_bytes": 1513030,
|
| 4421 |
+
"n_tokens": 495966,
|
| 4422 |
+
"n_chars": 1508067
|
| 4423 |
+
},
|
| 4424 |
+
"code_davinci_002.cc100-nl": {
|
| 4425 |
+
"vocab_size": 50281,
|
| 4426 |
+
"n_bytes": 1513030,
|
| 4427 |
+
"n_tokens": 559119,
|
| 4428 |
+
"n_chars": 1508067
|
| 4429 |
+
},
|
| 4430 |
+
"crystal_coder.cc100-nl": {
|
| 4431 |
+
"vocab_size": 32022,
|
| 4432 |
+
"n_bytes": 1513030,
|
| 4433 |
+
"n_tokens": 485966,
|
| 4434 |
+
"n_chars": 1508067
|
| 4435 |
+
},
|
| 4436 |
+
"dbrx_instruct.cc100-nl": {
|
| 4437 |
+
"vocab_size": 100280,
|
| 4438 |
+
"n_bytes": 1513030,
|
| 4439 |
+
"n_tokens": 449343,
|
| 4440 |
+
"n_chars": 1508067
|
| 4441 |
+
},
|
| 4442 |
+
"deepseek_coder_33b_instruct.cc100-nl": {
|
| 4443 |
+
"vocab_size": 32022,
|
| 4444 |
+
"n_bytes": 1513030,
|
| 4445 |
+
"n_tokens": 603966,
|
| 4446 |
+
"n_chars": 1508067
|
| 4447 |
+
},
|
| 4448 |
+
"deepseek_llm_7b_base.cc100-nl": {
|
| 4449 |
+
"vocab_size": 100015,
|
| 4450 |
+
"n_bytes": 1513030,
|
| 4451 |
+
"n_tokens": 536746,
|
| 4452 |
+
"n_chars": 1508067
|
| 4453 |
+
},
|
| 4454 |
+
"dutch_llama_tokenizer.cc100-nl": {
|
| 4455 |
+
"vocab_size": 32000,
|
| 4456 |
+
"n_bytes": 1513030,
|
| 4457 |
+
"n_tokens": 366481,
|
| 4458 |
+
"n_chars": 1508067
|
| 4459 |
+
},
|
| 4460 |
+
"falcon_180b.cc100-nl": {
|
| 4461 |
+
"vocab_size": 65024,
|
| 4462 |
+
"n_bytes": 1513030,
|
| 4463 |
+
"n_tokens": 438112,
|
| 4464 |
+
"n_chars": 1508067
|
| 4465 |
+
},
|
| 4466 |
+
"falcon_7b.cc100-nl": {
|
| 4467 |
+
"vocab_size": 65024,
|
| 4468 |
+
"n_bytes": 1513030,
|
| 4469 |
+
"n_tokens": 438112,
|
| 4470 |
+
"n_chars": 1508067
|
| 4471 |
+
},
|
| 4472 |
+
"fastchat_t5_3b.cc100-nl": {
|
| 4473 |
+
"vocab_size": 32110,
|
| 4474 |
+
"n_bytes": 1513030,
|
| 4475 |
+
"n_tokens": 933018,
|
| 4476 |
+
"n_chars": 1508067
|
| 4477 |
+
},
|
| 4478 |
+
"flan_t5_base.cc100-nl": {
|
| 4479 |
+
"vocab_size": 32100,
|
| 4480 |
+
"n_bytes": 1513030,
|
| 4481 |
+
"n_tokens": 696337,
|
| 4482 |
+
"n_chars": 1508067
|
| 4483 |
+
},
|
| 4484 |
+
"gemma_7b.cc100-nl": {
|
| 4485 |
+
"vocab_size": 256000,
|
| 4486 |
+
"n_bytes": 1513030,
|
| 4487 |
+
"n_tokens": 387522,
|
| 4488 |
+
"n_chars": 1508067
|
| 4489 |
+
},
|
| 4490 |
+
"gpt2.cc100-nl": {
|
| 4491 |
+
"vocab_size": 50257,
|
| 4492 |
+
"n_bytes": 1513030,
|
| 4493 |
+
"n_tokens": 559119,
|
| 4494 |
+
"n_chars": 1508067
|
| 4495 |
+
},
|
| 4496 |
+
"gpt2_chinese.cc100-nl": {
|
| 4497 |
+
"vocab_size": 21128,
|
| 4498 |
+
"n_bytes": 1513030,
|
| 4499 |
+
"n_tokens": 676651,
|
| 4500 |
+
"n_chars": 1508067
|
| 4501 |
+
},
|
| 4502 |
+
"gpt_35_turbo.cc100-nl": {
|
| 4503 |
+
"vocab_size": 100277,
|
| 4504 |
+
"n_bytes": 1513030,
|
| 4505 |
+
"n_tokens": 449343,
|
| 4506 |
+
"n_chars": 1508067
|
| 4507 |
+
},
|
| 4508 |
+
"gpt_4.cc100-nl": {
|
| 4509 |
+
"vocab_size": 100277,
|
| 4510 |
+
"n_bytes": 1513030,
|
| 4511 |
+
"n_tokens": 449343,
|
| 4512 |
+
"n_chars": 1508067
|
| 4513 |
+
},
|
| 4514 |
+
"gpt_neox_japanese_2_7b.cc100-nl": {
|
| 4515 |
+
"vocab_size": 32000,
|
| 4516 |
+
"n_bytes": 1513030,
|
| 4517 |
+
"n_tokens": 1509448,
|
| 4518 |
+
"n_chars": 1508067
|
| 4519 |
+
},
|
| 4520 |
+
"gpt_nexo_20b.cc100-nl": {
|
| 4521 |
+
"vocab_size": 50277,
|
| 4522 |
+
"n_bytes": 1513030,
|
| 4523 |
+
"n_tokens": 497728,
|
| 4524 |
+
"n_chars": 1508067
|
| 4525 |
+
},
|
| 4526 |
+
"grok_1.cc100-nl": {
|
| 4527 |
+
"vocab_size": 131072,
|
| 4528 |
+
"n_bytes": 1513030,
|
| 4529 |
+
"n_tokens": 457359,
|
| 4530 |
+
"n_chars": 1508067
|
| 4531 |
+
},
|
| 4532 |
+
"gronlp-gpt2-small-dutch.cc100-nl": {
|
| 4533 |
+
"vocab_size": 40000,
|
| 4534 |
+
"n_bytes": 1513030,
|
| 4535 |
+
"n_tokens": 332376,
|
| 4536 |
+
"n_chars": 1508067
|
| 4537 |
+
},
|
| 4538 |
+
"internlm2_chat_7b.cc100-nl": {
|
| 4539 |
+
"vocab_size": 92544,
|
| 4540 |
+
"n_bytes": 1513030,
|
| 4541 |
+
"n_tokens": 494821,
|
| 4542 |
+
"n_chars": 1508067
|
| 4543 |
+
},
|
| 4544 |
+
"internlm2_math_7b.cc100-nl": {
|
| 4545 |
+
"vocab_size": 92544,
|
| 4546 |
+
"n_bytes": 1513030,
|
| 4547 |
+
"n_tokens": 494821,
|
| 4548 |
+
"n_chars": 1508067
|
| 4549 |
+
},
|
| 4550 |
+
"internlm_chat_7b.cc100-nl": {
|
| 4551 |
+
"vocab_size": 103168,
|
| 4552 |
+
"n_bytes": 1513030,
|
| 4553 |
+
"n_tokens": 494108,
|
| 4554 |
+
"n_chars": 1508067
|
| 4555 |
+
},
|
| 4556 |
+
"internlm_xcomposer_7b.cc100-nl": {
|
| 4557 |
+
"vocab_size": 103168,
|
| 4558 |
+
"n_bytes": 1513030,
|
| 4559 |
+
"n_tokens": 494108,
|
| 4560 |
+
"n_chars": 1508067
|
| 4561 |
+
},
|
| 4562 |
+
"jamba_v0_1.cc100-nl": {
|
| 4563 |
+
"vocab_size": 65536,
|
| 4564 |
+
"n_bytes": 1513030,
|
| 4565 |
+
"n_tokens": 442176,
|
| 4566 |
+
"n_chars": 1508067
|
| 4567 |
+
},
|
| 4568 |
+
"kplug.cc100-nl": {
|
| 4569 |
+
"vocab_size": 10261,
|
| 4570 |
+
"n_bytes": 1513030,
|
| 4571 |
+
"n_tokens": 678131,
|
| 4572 |
+
"n_chars": 1508067
|
| 4573 |
+
},
|
| 4574 |
+
"llama.cc100-nl": {
|
| 4575 |
+
"vocab_size": 32000,
|
| 4576 |
+
"n_bytes": 1513030,
|
| 4577 |
+
"n_tokens": 495966,
|
| 4578 |
+
"n_chars": 1508067
|
| 4579 |
+
},
|
| 4580 |
+
"llama2.cc100-nl": {
|
| 4581 |
+
"vocab_size": 32001,
|
| 4582 |
+
"n_bytes": 1513030,
|
| 4583 |
+
"n_tokens": 495966,
|
| 4584 |
+
"n_chars": 1508067
|
| 4585 |
+
},
|
| 4586 |
+
"llama3.cc100-nl": {
|
| 4587 |
+
"vocab_size": 128256,
|
| 4588 |
+
"n_bytes": 1513030,
|
| 4589 |
+
"n_tokens": 448173,
|
| 4590 |
+
"n_chars": 1508067
|
| 4591 |
+
},
|
| 4592 |
+
"llama_3_chinese_8b.cc100-nl": {
|
| 4593 |
+
"vocab_size": 128256,
|
| 4594 |
+
"n_bytes": 1513030,
|
| 4595 |
+
"n_tokens": 458173,
|
| 4596 |
+
"n_chars": 1508067
|
| 4597 |
+
},
|
| 4598 |
+
"mistral_7b.cc100-nl": {
|
| 4599 |
+
"vocab_size": 32000,
|
| 4600 |
+
"n_bytes": 1513030,
|
| 4601 |
+
"n_tokens": 515884,
|
| 4602 |
+
"n_chars": 1508067
|
| 4603 |
+
},
|
| 4604 |
+
"mixtral_8_7b.cc100-nl": {
|
| 4605 |
+
"vocab_size": 32000,
|
| 4606 |
+
"n_bytes": 1513030,
|
| 4607 |
+
"n_tokens": 515884,
|
| 4608 |
+
"n_chars": 1508067
|
| 4609 |
+
},
|
| 4610 |
+
"mobilebert_uncased.cc100-nl": {
|
| 4611 |
+
"vocab_size": 30522,
|
| 4612 |
+
"n_bytes": 1513030,
|
| 4613 |
+
"n_tokens": 574651,
|
| 4614 |
+
"n_chars": 1508067
|
| 4615 |
+
},
|
| 4616 |
+
"moss.cc100-nl": {
|
| 4617 |
+
"vocab_size": 106072,
|
| 4618 |
+
"n_bytes": 1513030,
|
| 4619 |
+
"n_tokens": 557984,
|
| 4620 |
+
"n_chars": 1508067
|
| 4621 |
+
},
|
| 4622 |
+
"mt5_large.cc100-nl": {
|
| 4623 |
+
"vocab_size": 250100,
|
| 4624 |
+
"n_bytes": 1513030,
|
| 4625 |
+
"n_tokens": 423616,
|
| 4626 |
+
"n_chars": 1508067
|
| 4627 |
+
},
|
| 4628 |
+
"dutch_llama_tokenizer.cc100-es": {
|
| 4629 |
+
"vocab_size": 32000,
|
| 4630 |
+
"n_bytes": 1664455,
|
| 4631 |
+
"n_tokens": 610314,
|
| 4632 |
+
"n_chars": 1630297
|
| 4633 |
+
},
|
| 4634 |
+
"gronlp-gpt2-small-dutch.cc100-es": {
|
| 4635 |
+
"vocab_size": 40000,
|
| 4636 |
+
"n_bytes": 1664455,
|
| 4637 |
+
"n_tokens": 608465,
|
| 4638 |
+
"n_chars": 1630297
|
| 4639 |
+
},
|
| 4640 |
+
"yhavinga-gpt2-medium-dutch.cc100-es": {
|
| 4641 |
+
"vocab_size": 50257,
|
| 4642 |
+
"n_bytes": 1664455,
|
| 4643 |
+
"n_tokens": 605886,
|
| 4644 |
+
"n_chars": 1630297
|
| 4645 |
+
},
|
| 4646 |
+
"yhavinga-ul2-large-en-nl.cc100-es": {
|
| 4647 |
+
"vocab_size": 32128,
|
| 4648 |
+
"n_bytes": 1664455,
|
| 4649 |
+
"n_tokens": 686255,
|
| 4650 |
+
"n_chars": 1630297
|
| 4651 |
+
},
|
| 4652 |
+
"olmo_7b.cc100-nl": {
|
| 4653 |
+
"vocab_size": 50280,
|
| 4654 |
+
"n_bytes": 1513030,
|
| 4655 |
+
"n_tokens": 497728,
|
| 4656 |
+
"n_chars": 1508067
|
| 4657 |
+
},
|
| 4658 |
+
"orion_14b_chat.cc100-nl": {
|
| 4659 |
+
"vocab_size": 84608,
|
| 4660 |
+
"n_bytes": 1513030,
|
| 4661 |
+
"n_tokens": 599429,
|
| 4662 |
+
"n_chars": 1508067
|
| 4663 |
+
},
|
| 4664 |
+
"phi_1.cc100-nl": {
|
| 4665 |
+
"vocab_size": 50295,
|
| 4666 |
+
"n_bytes": 1513030,
|
| 4667 |
+
"n_tokens": 559124,
|
| 4668 |
+
"n_chars": 1508067
|
| 4669 |
+
},
|
| 4670 |
+
"phi_2.cc100-nl": {
|
| 4671 |
+
"vocab_size": 50295,
|
| 4672 |
+
"n_bytes": 1513030,
|
| 4673 |
+
"n_tokens": 559124,
|
| 4674 |
+
"n_chars": 1508067
|
| 4675 |
+
},
|
| 4676 |
+
"phi_3_mini.cc100-nl": {
|
| 4677 |
+
"vocab_size": 32011,
|
| 4678 |
+
"n_bytes": 1513030,
|
| 4679 |
+
"n_tokens": 495966,
|
| 4680 |
+
"n_chars": 1508067
|
| 4681 |
+
},
|
| 4682 |
+
"pko_t5_large.cc100-nl": {
|
| 4683 |
+
"vocab_size": 50358,
|
| 4684 |
+
"n_bytes": 1513030,
|
| 4685 |
+
"n_tokens": 1017288,
|
| 4686 |
+
"n_chars": 1508067
|
| 4687 |
+
},
|
| 4688 |
+
"prompt_clue.cc100-nl": {
|
| 4689 |
+
"vocab_size": 32128,
|
| 4690 |
+
"n_bytes": 1513030,
|
| 4691 |
+
"n_tokens": 837963,
|
| 4692 |
+
"n_chars": 1508067
|
| 4693 |
+
},
|
| 4694 |
+
"qwen1_5_14b_chat.cc100-nl": {
|
| 4695 |
+
"vocab_size": 151646,
|
| 4696 |
+
"n_bytes": 1513030,
|
| 4697 |
+
"n_tokens": 453342,
|
| 4698 |
+
"n_chars": 1508067
|
| 4699 |
+
},
|
| 4700 |
+
"qwen_1_8b_chat.cc100-nl": {
|
| 4701 |
+
"vocab_size": 151851,
|
| 4702 |
+
"n_bytes": 1513030,
|
| 4703 |
+
"n_tokens": 453342,
|
| 4704 |
+
"n_chars": 1508067
|
| 4705 |
+
},
|
| 4706 |
+
"qwen_72b_chat.cc100-nl": {
|
| 4707 |
+
"vocab_size": 151851,
|
| 4708 |
+
"n_bytes": 1513030,
|
| 4709 |
+
"n_tokens": 453342,
|
| 4710 |
+
"n_chars": 1508067
|
| 4711 |
+
},
|
| 4712 |
+
"qwen_7b_chat.cc100-nl": {
|
| 4713 |
+
"vocab_size": 151851,
|
| 4714 |
+
"n_bytes": 1513030,
|
| 4715 |
+
"n_tokens": 453342,
|
| 4716 |
+
"n_chars": 1508067
|
| 4717 |
+
},
|
| 4718 |
+
"roberta_chinese_clue.cc100-nl": {
|
| 4719 |
+
"vocab_size": 8021,
|
| 4720 |
+
"n_bytes": 1513030,
|
| 4721 |
+
"n_tokens": 821246,
|
| 4722 |
+
"n_chars": 1508067
|
| 4723 |
+
},
|
| 4724 |
+
"skywork_13b_base.cc100-nl": {
|
| 4725 |
+
"vocab_size": 65519,
|
| 4726 |
+
"n_bytes": 1513030,
|
| 4727 |
+
"n_tokens": 495958,
|
| 4728 |
+
"n_chars": 1508067
|
| 4729 |
+
},
|
| 4730 |
+
"skywork_13b_math.cc100-nl": {
|
| 4731 |
+
"vocab_size": 65519,
|
| 4732 |
+
"n_bytes": 1513030,
|
| 4733 |
+
"n_tokens": 495958,
|
| 4734 |
+
"n_chars": 1508067
|
| 4735 |
+
},
|
| 4736 |
+
"solar_10_7b.cc100-nl": {
|
| 4737 |
+
"vocab_size": 32000,
|
| 4738 |
+
"n_bytes": 1513030,
|
| 4739 |
+
"n_tokens": 515884,
|
| 4740 |
+
"n_chars": 1508067
|
| 4741 |
+
},
|
| 4742 |
+
"starchat_alpha.cc100-nl": {
|
| 4743 |
+
"vocab_size": 49156,
|
| 4744 |
+
"n_bytes": 1513030,
|
| 4745 |
+
"n_tokens": 532871,
|
| 4746 |
+
"n_chars": 1508067
|
| 4747 |
+
},
|
| 4748 |
+
"switch_c_2048.cc100-nl": {
|
| 4749 |
+
"vocab_size": 32100,
|
| 4750 |
+
"n_bytes": 1513030,
|
| 4751 |
+
"n_tokens": 696333,
|
| 4752 |
+
"n_chars": 1508067
|
| 4753 |
+
},
|
| 4754 |
+
"t5_base.cc100-nl": {
|
| 4755 |
+
"vocab_size": 32100,
|
| 4756 |
+
"n_bytes": 1513030,
|
| 4757 |
+
"n_tokens": 696333,
|
| 4758 |
+
"n_chars": 1508067
|
| 4759 |
+
},
|
| 4760 |
+
"t5_large.cc100-nl": {
|
| 4761 |
+
"vocab_size": 32100,
|
| 4762 |
+
"n_bytes": 1513030,
|
| 4763 |
+
"n_tokens": 696333,
|
| 4764 |
+
"n_chars": 1508067
|
| 4765 |
+
},
|
| 4766 |
+
"t5_small.cc100-nl": {
|
| 4767 |
+
"vocab_size": 32100,
|
| 4768 |
+
"n_bytes": 1513030,
|
| 4769 |
+
"n_tokens": 696333,
|
| 4770 |
+
"n_chars": 1508067
|
| 4771 |
+
},
|
| 4772 |
+
"text_davinci_003.cc100-nl": {
|
| 4773 |
+
"vocab_size": 50281,
|
| 4774 |
+
"n_bytes": 1513030,
|
| 4775 |
+
"n_tokens": 559119,
|
| 4776 |
+
"n_chars": 1508067
|
| 4777 |
+
},
|
| 4778 |
+
"tigerbot_13b_chat_v2.cc100-nl": {
|
| 4779 |
+
"vocab_size": 60515,
|
| 4780 |
+
"n_bytes": 1513030,
|
| 4781 |
+
"n_tokens": 486271,
|
| 4782 |
+
"n_chars": 1508067
|
| 4783 |
+
},
|
| 4784 |
+
"tigerbot_70b_chat_v4_4k.cc100-nl": {
|
| 4785 |
+
"vocab_size": 65110,
|
| 4786 |
+
"n_bytes": 1513030,
|
| 4787 |
+
"n_tokens": 486472,
|
| 4788 |
+
"n_chars": 1508067
|
| 4789 |
}
|
| 4790 |
}
|
utils/compression_util.py
CHANGED
|
@@ -20,7 +20,7 @@ from typing import List, Optional, Union, Literal
|
|
| 20 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 21 |
|
| 22 |
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
| 23 |
-
common_corpuses = sorted(["cc100-
|
| 24 |
"cc100-fa", "cc100-ar", "cc100-ja"])
|
| 25 |
|
| 26 |
VALID_CODES_CC100 = [
|
|
@@ -155,7 +155,7 @@ def tokenize_corpus(
|
|
| 155 |
|
| 156 |
|
| 157 |
def get_compression_leaderboard(
|
| 158 |
-
corpuses: List[str] = ['cc100-
|
| 159 |
unit: str = "b_tokens/g_bytes",
|
| 160 |
tokenizer_filter: Optional[str] = None,
|
| 161 |
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
|
|
|
| 20 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 21 |
|
| 22 |
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
| 23 |
+
common_corpuses = sorted(["cc100-nl", "cc100-en", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
|
| 24 |
"cc100-fa", "cc100-ar", "cc100-ja"])
|
| 25 |
|
| 26 |
VALID_CODES_CC100 = [
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
def get_compression_leaderboard(
|
| 158 |
+
corpuses: List[str] = ['cc100-nl'],
|
| 159 |
unit: str = "b_tokens/g_bytes",
|
| 160 |
tokenizer_filter: Optional[str] = None,
|
| 161 |
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
vocab/wizardcoder_15b_v1/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
from transformers import AutoTokenizer
|
| 3 |
-
|
| 4 |
-
tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# from transformers import AutoTokenizer
|
| 3 |
+
#
|
| 4 |
+
# tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
|