| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """CodeBLEU metric.""" |
|
|
| import evaluate |
| import datasets |
|
|
| |
| from .bleu import corpus_bleu |
| from .utils import pad_sequence |
| from .weighted_ngram_match import ngrams |
| from .syntax_match import calc_syntax_match |
| from .parser_DFG import DFG_python |
| from .parser_utils import tree_to_token_index |
| from .dataflow_match import calc_dataflow_match |
|
|
| from .my_codebleu import calc_codebleu |
|
|
|
|
| |
| _CITATION = """\ |
| @InProceedings{huggingface:module, |
| title = {CodeBLEU: A Metric for Evaluating Code Generation}, |
| authors={Sedykh, Ivan}, |
| year={2022} |
| } |
| """ |
|
|
| |
| _DESCRIPTION = """\ |
| This new module is an adaptation of the original CodeBLEU metric from CodexGLUE benchmark |
| for evaluating code generation. |
| """ |
|
|
|
|
| |
| _KWARGS_DESCRIPTION = """ |
| Calculates how good are predictions given some references, using certain scores |
| Args: |
| predictions: list of predictions to score. Each predictions |
| should be a string with tokens separated by spaces. |
| references: list of lists of references. Each list |
| should contain len(predictions) items. |
| lang: programming language in ['java','js','c_sharp','php','go','python','ruby'] |
| tokenizer: tokenizer function str -> List[str], Defaults to lambda s: s.split() |
| params: str, weights for averaging(see CodeBLEU paper). |
| Defaults to equal weights "0.25,0.25,0.25,0.25". |
| Returns: |
| CodeBLEU: resulting score, |
| ngram_match_score: See paper CodeBLEU, |
| weighted_ngram_match_score: See paper CodeBLEU, |
| syntax_match_score: See paper CodeBLEU, |
| dataflow_match_score: See paper CodeBLEU, |
| Examples: |
| |
| >>> codebleu = evaluate.load("my_new_module") |
| >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1]) |
| >>> print(results) |
| {'accuracy': 1.0} |
| """ |
|
|
| |
| |
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class codebleu(evaluate.Metric):
    """CodeBLEU metric, adapted from the CodeXGLUE evaluator.

    The class is a thin adapter for the ``evaluate`` framework: it declares
    the accepted input schemas, downloads two resource files, and forwards
    the actual scoring to ``calc_codebleu``.
    """

    def _info(self):
        """Return the ``MetricInfo`` describing this metric.

        Two input schemas are declared: one where each prediction comes with
        a sequence of reference strings, and one where each prediction comes
        with a single reference string.
        """
        # Schema 1: several reference strings per prediction.
        multi_reference_schema = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Sequence(
                    datasets.Value("string", id="sequence"), id="references"
                ),
            }
        )
        # Schema 2: exactly one reference string per prediction.
        single_reference_schema = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }
        )
        source_links = [
            "https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans/evaluator",
            "https://arxiv.org/abs/2009.10297",
        ]
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[multi_reference_schema, single_reference_schema],
            reference_urls=source_links,
        )

    def _download_and_prepare(self, dl_manager):
        """Download the external resources later passed to ``calc_codebleu``.

        Sets ``self.kw_dir`` to the result of downloading and extracting the
        keywords archive, and ``self.langso_dir`` to the result of downloading
        ``my-languages.so`` (downloaded only, not extracted). Both locations
        are printed to stdout.
        """
        keywords_url = "https://huggingface.co/spaces/dvitel/codebleu/resolve/main/keywords.tar.gz"
        langso_url = "https://huggingface.co/spaces/dvitel/codebleu/resolve/main/my-languages.so"

        self.kw_dir = dl_manager.download_and_extract(keywords_url)
        print("Downloaded keywords to", self.kw_dir)

        # NOTE(review): presumably a compiled parser-grammar library used by the
        # syntax/dataflow matching code - confirm against calc_codebleu.
        self.langso_dir = dl_manager.download(langso_url)
        print("Downloaded languages.so to", self.langso_dir)

    def _compute(self, predictions, references, lang="python", tokenizer=None, params="0.25,0.25,0.25,0.25"):
        """Score ``predictions`` against ``references`` via ``calc_codebleu``.

        All arguments are forwarded unchanged, together with the resource
        locations stored by ``_download_and_prepare`` (``self.kw_dir`` and
        ``self.langso_dir``), and the result of ``calc_codebleu`` is returned
        as-is.
        """
        return calc_codebleu(
            predictions=predictions,
            references=references,
            lang=lang,
            tokenizer=tokenizer,
            params=params,
            kw_dir=self.kw_dir,
            langso_dir=self.langso_dir,
        )
|
|