# -*- coding: utf-8 -*-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Chinese METEOR: PyCantonese word segmentation + CwnGraph Chinese WordNet."""

# Suppress WordNet warnings
import warnings

warnings.filterwarnings("ignore")

import logging

logging.getLogger("nltk").setLevel(logging.CRITICAL)

import os
import re
import sys
from typing import Dict, List

import datasets
import evaluate
import nltk
import numpy as np
import pycantonese
from nltk.translate import meteor_score

# Download NLTK resources once at import time
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ------------------------------------------------------------------- #
# REAL Chinese WordNet (CwnGraph) Integration
# ------------------------------------------------------------------- #
_cwn = None


def _load_cwn():
    """Lazily load the Chinese WordNet image (downloaded on first use)."""
    global _cwn
    if _cwn is None:
        try:
            from CwnGraph import CwnImage

            print("Loading Chinese WordNet (CwnGraph, first time only)...")
            _cwn = CwnImage.latest()
        except ImportError:
            raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
    return _cwn


def _get_lemma_name(lemma):
    """Return the surface form of a CwnGraph lemma, falling back to parsing
    the repr for API versions without a `.name` attribute."""
    try:
        return lemma.name
    except AttributeError:
        return str(lemma).split(': ')[1].split('_')[0]


# Custom Lemma & Synset wrappers for NLTK compatibility
class _CwnLemma:
    def __init__(self, name):
        self._name = name

    def name(self):
        return self._name


class _CwnSynset:
    def __init__(self, lemmas, synset_id):
        self._lemmas = lemmas
        self._id = synset_id

    def lemmas(self):
        return [_CwnLemma(name) for name in self._lemmas]
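
# Illustrative sketch (comments only, not executed at import): the CwnGraph
# traversal that _compute relies on below. Attribute names follow the calls
# made in this file; they may differ across CwnGraph versions, which is why
# _get_lemma_name and the synset-lemma extraction carry fallbacks.
#
#     cwn = _load_cwn()
#     lemmas = cwn.find_lemma("^吃$")    # regex pattern, exact match
#     for lemma in lemmas:
#         for sense in lemma.senses:     # senses of the matched lemma
#             synset = sense.synset      # may be None for some senses
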
# ------------------------------------------------------------------- #
# HuggingFace Evaluation Metric
# ------------------------------------------------------------------- #
_DESCRIPTION = """\
This metric is tailor-made for evaluating the translation quality of Chinese
text: sentences are word-segmented with PyCantonese, and METEOR synonym
matching is backed by the Chinese WordNet (CwnGraph).
"""

_KWARGS_DESCRIPTION = """
Computes METEOR scores for predictions against references.
Args:
    predictions (list of str): translation sentences to score.
    references (list of str): one reference sentence per translation.
Returns:
    meteor: the average METEOR score
    scores: the METEOR score for each sentence pair
Examples:
    >>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
    >>> results = cmeteor.compute(references=["我在這裡吃飯"], predictions=["我在這兒吃晚飯"])
    >>> print(results)
    {'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
"""


# ------------------------------------------------------------------- #
# HuggingFace evaluate template
# ------------------------------------------------------------------- #
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ChineseMETEOR(evaluate.Metric):
    """METEOR for Chinese: PyCantonese segmentation + Chinese WordNet synonyms."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="""@inproceedings{denkowski-lavie-2014-meteor,
    title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
    author = "Denkowski, Michael and Lavie, Alon",
    booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
    year = "2014"
}""",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://yourappapp.com",
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/nltk/nltk"],
            reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
        )

    def _download_and_prepare(self, dl_manager) -> None:
        """Download external resources needed to compute the scores."""
        nltk.download("wordnet", quiet=True)
        nltk.download("omw-1.4", quiet=True)
        nltk.download("punkt", quiet=True)
        nltk.download("punkt_tab", quiet=True)
        # CwnGraph auto-downloads on first use

    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        original_stdout = sys.stdout  # store original output
        sys.stdout = open(os.devnull, "w")  # silence CwnGraph / NLTK chatter
        try:
            # Tokenize using PyCantonese
            pred_seg = [pycantonese.segment(p.strip()) for p in predictions]
            ref_seg = [pycantonese.segment(r.strip()) for r in references]

            # --- Plug the real Chinese WordNet into the METEOR algorithm ---
            def _cwn_synsets(word, pos=None):  # signature matches NLTK's synsets()
                if not isinstance(word, str) or not word.strip():
                    return []
                cwn = _load_cwn()
                try:
                    # Escaped regex for an exact match
                    # (CwnGraph expects a string pattern)
                    pattern = f"^{re.escape(word)}$"
                    lemmas = cwn.find_lemma(pattern)
                except Exception:
                    return []

                exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
                if not exact_lemmas:
                    return []

                synsets_list = []
                seen_synset_ids = set()
                for lemma in exact_lemmas:
                    for sense in lemma.senses:
                        synset = sense.synset
                        if not synset:
                            continue
                        try:
                            synset_id = synset.id
                        except AttributeError:
                            synset_id = str(synset)
                        if synset_id in seen_synset_ids:
                            continue
                        seen_synset_ids.add(synset_id)
                        try:
                            synset_lemmas = synset.lemmas
                            syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                        except AttributeError:
                            # Older API: collect lemmas sense by sense
                            synset_lemmas = []
                            for s in synset.senses:
                                try:
                                    # Access the single lemma via lemmas[0]
                                    synset_lemmas.append(s.lemmas[0])
                                except (AttributeError, IndexError, TypeError):
                                    try:
                                        synset_lemmas.append(s.lemma)
                                    except AttributeError:
                                        continue
                            syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                        syn_lemmas_set = set(syn_lemma_names)
                        if syn_lemmas_set:
                            synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
                # Keep at most the first synset found
                return synsets_list[:1]
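
            # Design note: NLTK's METEOR only requires a duck-typed WordNet
            # object, i.e. synsets(word) returning synset-like objects whose
            # lemmas() yield lemma-like objects with a name() method. The
            # wrapper class below satisfies that contract via _cwn_synsets,
            # so no NLTK internals need patching.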

            # Use a class so the method binds properly
            class ChineseWordNet:
                def synsets(self, word, pos=None):
                    return _cwn_synsets(word, pos)

            chinese_wn = ChineseWordNet()

            scores = [
                meteor_score.single_meteor_score(ref, hyp, wordnet=chinese_wn)
                for ref, hyp in zip(ref_seg, pred_seg)
            ]
        finally:
            sys.stdout.close()
            sys.stdout = original_stdout  # restore original output

        return {
            "meteor": float(np.mean(scores)),
            "scores": scores,
        }
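

# ------------------------------------------------------------------- #
# Usage sketch: assumes this script is published on the Hugging Face Hub
# under the id from the docstring above; point evaluate.load() at a local
# copy of this file instead if it is not.
# ------------------------------------------------------------------- #
if __name__ == "__main__":
    cmeteor = evaluate.load("raptorkwok/chinesemeteor")
    results = cmeteor.compute(
        references=["我在這裡吃飯"],
        predictions=["我在這兒吃晚飯"],
    )
    print(results)  # per the doctest: {'meteor': 0.5111..., 'scores': [0.5111...]}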