import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.evaluator import Evaluator
from bigcode_eval.tasks import ALL_TASKS
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel


@dataclass
class BigcodeEvalArguments:
    prefix: str = ""
    do_sample: bool = True
    temperature: Optional[float] = 0.8
    top_k: Optional[int] = 0
    top_p: Optional[float] = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False


def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """
    Run code evaluation on the provided task using the specified model and tokenizer.

    Args:
        model: The model to use for evaluation.
        tokenizer: The tokenizer to use for evaluation.
        task: The name of the task to evaluate.
        dmx_config: Name of a rule set from dmx.compressor.config_rules
            (e.g. "BASELINE"), or None to evaluate the unmodified model.
        args: Optional dictionary of arguments overriding the defaults in
            BigcodeEvalArguments.
        accelerator: Optional Accelerator instance; one is created if not
            provided.

    Returns:
        result: A dictionary mapping the task name to its metric results.
    """
    if accelerator is None:
        from accelerate import Accelerator

        accelerator = Accelerator()

    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")

    # Warm-up/setup: run one dummy forward pass whose output is discarded;
    # for a DmxModel this materializes the transformed graph before evaluation.
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        model.transform(model.dmx_config, *getattr(config_rules, dmx_config))
    else:
        model = model.to("cuda")
    dummy_input = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
    with torch.no_grad():
        model(dummy_input)

    # bigcode-eval needs an eos_token and a pad_token; fall back to the
    # bos_token when the tokenizer defines no eos_token.
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        print("Not setting pad_token to eos_token")

    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)

    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        print(f"Error evaluating task {task}: {e}")
        raise

    # Report the pass@k metric matching the sampling setup; for any other
    # n_samples, return the full metric dictionary.
    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}

    return result


def evaluate_model(
    model_repo_name,
    revision_name="main",
    dmx_config="BASELINE",
    task_name="humaneval",
    pass_k=1,
):
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }

    # pass@10 uses sampling (10 samples per problem); the default pass@1
    # uses a single greedy generation.
    if pass_k == 10:
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
    else:
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }

    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        **model_kwargs,
        padding_side="right",
    )

    try:
        result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
        return result, None
    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message
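

# A minimal usage sketch, assuming a CUDA device and access to the Hugging Face
# Hub; "bigcode/santacoder" is an illustrative checkpoint, not one required by
# this module.
if __name__ == "__main__":
    result, error = evaluate_model(
        "bigcode/santacoder",
        revision_name="main",
        dmx_config="BASELINE",
        task_name="humaneval",
        pass_k=1,
    )
    if error is None:
        print(result)  # e.g. {"humaneval": {"pass@1": ...}}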