| | import torch |
| | from typing import Dict |
| | import transformers |
| | from transformers import LlamaForCausalLM, LlamaTokenizer |
| |
|
# Special-token strings registered on the LLaMA tokenizer at load time.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
# NOTE(review): bos_token and unk_token are both set to the EOS string "</s>"
# rather than LLaMA's usual "<s>"/"<unk>" -- presumably intentional for the
# xu1998hz/InstructScore checkpoint; confirm against the released tokenizer.
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"
# Token budgets: prompt (input) length and generated critique (output) length.
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 512
print("Max source length: ", MAX_SOURCE_LENGTH)
print("MAX target length: ", MAX_TARGET_LENGTH)
| |
|
| |
|
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
) -> int:
    """Register pad/eos/bos/unk special tokens on ``tokenizer``.

    Despite the historical name (kept for backward compatibility), this
    function does NOT resize any model embedding matrix -- it takes no model
    argument and only mutates the tokenizer in place.  It assumes the loaded
    checkpoint already has embedding rows for any token added here; verify
    that before registering tokens that grow the vocabulary.

    Args:
        special_tokens_dict: Caller-supplied special tokens to register
            first, e.g. ``{"pad_token": DEFAULT_PAD_TOKEN}``.
        tokenizer: Tokenizer mutated in place.

    Returns:
        Total number of tokens newly added to the vocabulary.  (Previously
        this count was silently discarded and the function returned ``None``;
        callers that ignore the return value are unaffected.)
    """
    # Register caller-supplied tokens first, then force the module defaults.
    # Two separate calls preserve the original semantics: on any overlap the
    # second call's eos/bos/unk values win.
    num_added = tokenizer.add_special_tokens(special_tokens_dict)
    num_added += tokenizer.add_special_tokens(
        {
            "eos_token": DEFAULT_EOS_TOKEN,
            "bos_token": DEFAULT_BOS_TOKEN,
            "unk_token": DEFAULT_UNK_TOKEN,
        }
    )
    return num_added
| |
|
| |
|
# Choose the compute device once at import time: prefer CUDA when available.
device_id = torch.device("cuda" if torch.cuda.is_available() else "cpu")
| |
|
| |
|
class InstructScore:
    """Explainable MT metric: prompts a LLaMA checkpoint to list translation
    errors and converts the listed Major/Minor labels into a numeric score.
    """

    def __init__(self) -> None:
        # Load the tokenizer for the released InstructScore checkpoint.
        # Slow tokenizer (use_fast=False) and prompt-length cap applied here.
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "xu1998hz/InstructScore", model_max_length=MAX_SOURCE_LENGTH, use_fast=False
        )
        # Left padding so that generation continues from the real prompt end
        # in a batched decoder-only setting.
        self.tokenizer.padding_side = "left"

        # Adds [PAD] plus eos/bos/unk to the tokenizer only.
        # NOTE(review): the model's embedding matrix is NOT resized here --
        # presumably the checkpoint already contains a row for [PAD]; verify.
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=self.tokenizer,
        )
        self.model = LlamaForCausalLM.from_pretrained("xu1998hz/InstructScore").to(
            device_id
        )
        # Inference only: disable dropout etc.
        self.model.eval()

    def score(self, ref_ls, out_ls):
        """Score model outputs against references.

        Args:
            ref_ls: Sequence of reference (correct) translations.
            out_ls: Sequence of model-generated translations, parallel to
                ``ref_ls`` (paired via ``zip``; extra items are dropped).

        Returns:
            Tuple ``(batch_outputs, scores_ls)``: the decoded critique text
            per pair, and a penalty score per pair (-1 per minor error,
            -5 per major error; 0 means no errors were reported).
        """
        # One diagnostic prompt per (reference, output) pair.  The trailing
        # backslash inside the literal is a line continuation within the
        # f-string, not a newline in the prompt text.
        prompt_ls = [
            f'You are evaluating Chinese-to-English Machine translation task. The correct translation is "{ref}". The model generated translation is "{out}". Please identify all errors within each model output, up to a maximum of five. For each error, please give me the corresponding error type, major/minor label, error location of the model generated translation and explanation for the error. Major errors can confuse or mislead the reader due to significant change in meaning, while minor\
errors don\'t lead to loss of meaning but will be noticed.'
            for ref, out in zip(ref_ls, out_ls)
        ]

        with torch.no_grad():
            # Batch-encode with left padding; prompts longer than
            # MAX_SOURCE_LENGTH tokens are truncated.
            inputs = self.tokenizer(
                prompt_ls,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_SOURCE_LENGTH,
            )
            outputs = self.model.generate(
                inputs["input_ids"].to(device_id),
                attention_mask=inputs["attention_mask"].to(device_id),
                max_new_tokens=MAX_TARGET_LENGTH,
            )
            batch_outputs = self.tokenizer.batch_decode(
                outputs,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            # Turn the textual critique into a number by counting the
            # "Major/minor:" labels the model emitted.
            scores_ls = [
                (-1) * output.count("Major/minor: Minor")
                + (-5) * output.count("Major/minor: Major")
                for output in batch_outputs
            ]
            return batch_outputs, scores_ls
| |
|
| |
|
def main():
    """Demo entry point: score two example translation pairs and print the
    raw critiques followed by the numeric scores."""
    reference_texts = [
        "SEScore is a simple but effective next generation text generation evaluation metric",
        "SEScore it really works",
    ]
    candidate_texts = [
        "SEScore is a simple effective text evaluation metric for next generation",
        "SEScore is not working",
    ]

    # Building the scorer loads the checkpoint; score() runs batched generation.
    diagnostics, scores = InstructScore().score(reference_texts, candidate_texts)
    print(diagnostics)
    print(scores)
| |
|
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|