# (Removed non-code scraping residue from the HuggingFace Spaces page header: "Spaces: Sleeping")
# -*- coding: utf-8 -*-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Chinese METEOR β with PyCantonese pre-segmentation + CwnGraph Chinese WordNet
"""
# Suppress WordNet / NLTK warnings so they don't pollute metric output.
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger("nltk").setLevel(logging.CRITICAL)
import os
import sys
import jieba_fast as jieba  # NOTE(review): imported but unused in the visible code — segmentation uses pycantonese
import datasets
from typing import List, Dict
import numpy as np
from nltk.translate import meteor_score
from nltk import word_tokenize  # NOTE(review): unused in the visible code
import nltk
import evaluate
import re
import pycantonese
# Download NLTK corpora once at import time (quiet: no progress output).
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)
# ------------------------------------------------------------------- #
# REAL Chinese WordNet (CwnGraph) Integration
# ------------------------------------------------------------------- #
| _cwn = None | |
| def _load_cwn(): | |
| global _cwn | |
| if _cwn is None: | |
| try: | |
| from CwnGraph import CwnImage | |
| print("Loading Chinese WordNet (CwnGraph, first time only)...") | |
| _cwn = CwnImage.latest() | |
| except ImportError: | |
| raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph") | |
| return _cwn | |
| # Helper to get lemma name (with fallback for API versions) | |
| def _get_lemma_name(lemma): | |
| try: | |
| return lemma.name | |
| except AttributeError: | |
| return str(lemma).split(': ')[1].split('_')[0] | |
| # Custom Lemma & Synset for NLTK compatibility | |
| class _CwnLemma: | |
| def __init__(self, name): self._name = name | |
| def name(self): return self._name | |
| class _CwnSynset: | |
| def __init__(self, lemmas, synset_id): | |
| self._lemmas = lemmas | |
| self._id = synset_id | |
| def lemmas(self): | |
| return [_CwnLemma(name) for name in self._lemmas] | |
# ------------------------------------------------------------------- #
# HuggingFace Evaluation Metric
# ------------------------------------------------------------------- #
# These two constants feed evaluate.MetricInfo(description=..., inputs_description=...).
_DESCRIPTION = """\
This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
"""

# NOTE(review): the Chinese sentences in the doctest below appear mojibake
# (double-decoded UTF-8) — verify the intended characters against the
# upstream repository before relying on the example.
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
    predictions (str): translation sentence to score.
    references (str): reference sentence for each translation.
Returns:
    meteor: the average METEOR score
    scores: the METEOR score for each sentence pairs
Examples:
    Examples should be written in doctest format, and should illustrate how
    to use the function.
    >>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
    >>> results = cmeteor.compute(references=["ζε¨ι裑ει£―"], predictions=["ζε¨ιε εζι£―"])
    >>> print(results)
    {'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
"""
# ------------------------------------------------------------------- #
# HuggingFace evaluate template
# ------------------------------------------------------------------- #
class ChineseMETEOR(evaluate.Metric):
    """METEOR metric adapted for Chinese: sentences are word-segmented with
    PyCantonese and synonym matching is backed by a CwnGraph Chinese WordNet
    instead of the English WordNet."""

    def _info(self):
        """Declare the metric's metadata and its input schema
        (one prediction string paired with one reference string)."""
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="""@inproceedings{denkowski-lavie-2014-meteor,
            title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
            author = "Denkowski, Michael and Lavie, Alon",
            booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
            year = "2014"
            }""",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://yourappapp.com",
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/nltk/nltk"],
            reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
        )

    def _download_and_prepare(self, dl_manager) -> None:
        """Download external resources useful to compute the scores"""
        import nltk
        nltk.download("wordnet", quiet=True)
        nltk.download("omw-1.4", quiet=True)
        nltk.download("punkt", quiet=True)
        nltk.download('punkt_tab', quiet=True)
        # CwnGraph auto-downloads on first use

    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Score each prediction/reference pair with METEOR.

        Args:
            predictions: translation sentences to score.
            references: one reference sentence per prediction.

        Returns:
            dict with "meteor" (mean over all pairs; NOTE(review): NaN if the
            inputs are empty, since np.mean([]) is nan) and "scores" (the
            per-pair METEOR scores).
        """
        # Segmentation/CwnGraph loading print progress messages; silence them
        # by routing stdout to the null device for the whole computation.
        original_stdout = sys.stdout  # store original output
        sys.stdout = open(os.devnull, 'w')
        try:
            # Tokenize using PyCantonese. NOTE(review): the module docstring
            # mentions Jieba, but segmentation here is PyCantonese.
            pred_seg = [pycantonese.segment(p.strip()) for p in predictions]
            ref_seg = [pycantonese.segment(r.strip()) for r in references]

            # --- Apply Real Chinese WordNet into METEOR algorithm --- #
            def _cwn_synsets(self, word, pos=None):  # Matches NLTK method call
                """Return a (max one-element) list of _CwnSynset for `word`,
                or [] when the word is absent from the Chinese WordNet."""
                # METEOR may probe with non-string or empty tokens; skip them.
                if not isinstance(word, str) or not word.strip():
                    return []
                cwn = _load_cwn()
                try:
                    # Use escaped regex for exact match (CwnGraph expects string pattern)
                    pattern = f"^{re.escape(word)}$"
                    lemmas = cwn.find_lemma(pattern)
                except Exception as e:
                    # Best-effort: any CwnGraph query failure counts as "no synsets".
                    return []
                # find_lemma is a regex search; keep only exact surface matches.
                exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
                if not exact_lemmas:
                    return []
                synsets_list = []
                seen_synset_ids = set()
                for lemma in exact_lemmas:
                    for sense in lemma.senses:
                        synset = sense.synset
                        if synset:
                            # Synset identity: prefer .id, fall back to the repr
                            # (API shape varies across CwnGraph versions).
                            try:
                                synset_id = synset.id
                            except AttributeError:
                                synset_id = str(synset)
                            if synset_id not in seen_synset_ids:
                                seen_synset_ids.add(synset_id)
                                # Collect the synset's member lemmas; newer CwnGraph
                                # exposes synset.lemmas, older versions require walking
                                # synset.senses and pulling each sense's lemma(s).
                                try:
                                    synset_lemmas = synset.lemmas
                                    syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                                except AttributeError:
                                    synset_lemmas = []
                                    for s in synset.senses:
                                        try:
                                            # Access the single lemma via lemmas[0]
                                            lemma = s.lemmas[0]
                                            synset_lemmas.append(lemma)
                                        except (AttributeError, IndexError, TypeError):
                                            try:
                                                lemma = s.lemma
                                                synset_lemmas.append(lemma)
                                            except AttributeError:
                                                # Could not extract a lemma from this sense.
                                                continue
                                    syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                                # Deduplicate lemma surface forms within the synset.
                                syn_lemmas_set = set(syn_lemma_names)
                                if syn_lemmas_set:
                                    synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
                # NOTE(review): only the first synset is exposed to METEOR —
                # presumably to keep synonym matching conservative; confirm.
                return synsets_list[:1]

            # Use class for proper method binding: NLTK's METEOR calls
            # wordnet.synsets(word) on the object passed via `wordnet=`.
            class ChineseWordNet:
                def synsets(self, word, pos=None):
                    return _cwn_synsets(self, word, pos)

            chinese_wn = ChineseWordNet()
            # single_meteor_score takes (reference, hypothesis) in that order.
            scores = [
                meteor_score.single_meteor_score(
                    ref,
                    hyp,
                    wordnet=chinese_wn
                )
                for ref, hyp in zip(ref_seg, pred_seg)
            ]
        finally:
            sys.stdout.close()
            sys.stdout = original_stdout  # restore original output
        return {
            "meteor": float(np.mean(scores)),
            "scores": scores,
        }