# Build a tiny random DeBERTa model, plus a matching shrunk tokenizer, to be
# used for basic testing: small enough that tests and debug runs stay fast.

import os

# Workaround: force the pure-Python protobuf implementation; this avoids
# protobuf errors that can surface when manipulating fast tokenizers, and must
# be set before transformers (and thus protobuf) gets imported.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from transformers import DebertaConfig, DebertaForMaskedLM

mname_orig = "microsoft/deberta-base"
mname_tiny = "tiny-deberta"

# Step 1. Shrink the tokenizer: keep only the first vocab_keep_items entries of
# the original vocab so that the tiny model's embedding matrix stays small.
import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer

vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
if tokenizer_json["model"]["type"] == "BPE":
    # BPE: keep the first vocab_keep_items tokens and drop any merge rule whose
    # inputs or merged output no longer exist in the shrunk vocab.
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
    merges = tokenizer_json["model"]["merges"]
    new_merges = []
    for i in range(len(merges)):
        a, b = merges[i].split()
        new_token = "".join((a, b))
        if a in new_vocab and b in new_vocab and new_token in new_vocab:
            new_merges.append(merges[i])
    tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
    # Unigram: the vocab is a list of [token, score] pairs, so slicing suffices.
    new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] in ("WordPiece", "WordLevel"):
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
else:
    raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
tokenizer_json["model"]["vocab"] = new_vocab
# Rebuild the fast tokenizer from the modified JSON.
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer_fast_tiny = tokenizer

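# Optional sanity check (not part of the original recipe): the shrunk tokenizer
# should still tokenize arbitrary text, and len() now reflects the truncated
# vocab plus whatever special/added tokens survived on top of it.
print(f"shrunk tokenizer length: {len(tokenizer_fast_tiny)}")
print(tokenizer_fast_tiny.tokenize("The capital of France is Paris."))
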
# Step 2. Shrink the config: start from deberta-base's config and scale down
# the dimensions that dominate the parameter count.
config_tiny = DebertaConfig.from_pretrained(mname_orig)
print(config_tiny)
# config.update() simply overwrites these attributes on the config object.
config_tiny.update(dict(
    vocab_size=vocab_keep_items,
    embedding_size=32,
    pooler_size=32,
    hidden_size=32,
    intermediate_size=64,
    max_position_embeddings=128,
    num_attention_heads=2,
    num_hidden_layers=2,
))
print("New config", config_tiny)

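# Back-of-the-envelope check of why this helps (a sketch, not in the original
# script): the word-embedding matrix alone drops from roughly 50k x 768
# (~38M weights) in deberta-base to 5000 x 32 = 160k weights here.
print("tiny word-embedding weights:", config_tiny.vocab_size * config_tiny.embedding_size)
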
# Step 3. Build a tiny randomly-initialized model from the shrunk config.
model_tiny = DebertaForMaskedLM(config_tiny)
print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
# Align the embedding matrix with the final tokenizer length (the shrunk vocab
# plus any special tokens the fast tokenizer keeps on top of it).
model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))

# Smoke test: one forward pass through the randomly initialized model.
inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
outputs = model_tiny(**inputs)
print("Test with normal tokenizer:", len(outputs.logits[0]))

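# Optional (sketch): decode the argmax predictions. The weights are random, so
# the output is gibberish; this only shows that the logits map back through
# the shrunk tokenizer.
pred_ids = outputs.logits.argmax(dim=-1)
print(tokenizer_fast_tiny.decode(pred_ids[0]))
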
# Step 4. Save in fp16 to halve the footprint of this test artifact.
model_tiny.half()
model_tiny.save_pretrained(".")
tokenizer_fast_tiny.save_pretrained(".")

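# Round-trip sketch (an assumption-level check, not part of the original
# script): reload the artifacts we just saved. torch_dtype="auto" keeps the
# fp16 weights; by default from_pretrained would upcast them to fp32.
model_reloaded = DebertaForMaskedLM.from_pretrained(".", torch_dtype="auto")
tokenizer_reloaded = AutoTokenizer.from_pretrained(".")
print("reloaded dtype:", model_reloaded.dtype)
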
# Step 5. Add a README, unless one already exists.
readme = "README.md"
if not os.path.exists(readme):
    with open(readme, "w") as f:
        f.write(f"This is a {mname_tiny} random model to be used for basic testing.\n")

print(f"Generated {mname_tiny}")

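# Sketch of how such a tiny model is meant to be consumed in a test suite
# (loading from "." since that is where this script saves the artifacts):
def test_forward_pass():
    model = DebertaForMaskedLM.from_pretrained(".")
    tok = AutoTokenizer.from_pretrained(".")
    out = model(**tok("hello world", return_tensors="pt"))
    assert out.logits.shape[-1] == model.config.vocab_size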