# chinesemeteor / chinesemeteor.py
# Source: Hugging Face Hub space "raptorkwok/chinesemeteor"
# Commit 29744eb — "simplify codes and bug fix"
# -*- coding: utf-8 -*-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Chinese METEOR β€” with Jieba pre-segmentation + CwnGraph Chinese WordNet
"""
# suppress WordNet warnings
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger("nltk").setLevel(logging.CRITICAL)
import os
import sys
import jieba_fast as jieba
import datasets
from typing import List, Dict
import numpy as np
from nltk.translate import meteor_score
from nltk import word_tokenize
import nltk
import evaluate
import re
import pycantonese
# Download once
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)
# ------------------------------------------------------------------- #
# REAL Chinese WordNet (CwnGraph) Integration
# ------------------------------------------------------------------- #
_cwn = None
def _load_cwn():
global _cwn
if _cwn is None:
try:
from CwnGraph import CwnImage
print("Loading Chinese WordNet (CwnGraph, first time only)...")
_cwn = CwnImage.latest()
except ImportError:
raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
return _cwn
# Helper to get lemma name (with fallback for API versions)
def _get_lemma_name(lemma):
try:
return lemma.name
except AttributeError:
return str(lemma).split(': ')[1].split('_')[0]
# Custom Lemma & Synset for NLTK compatibility
class _CwnLemma:
def __init__(self, name): self._name = name
def name(self): return self._name
class _CwnSynset:
def __init__(self, lemmas, synset_id):
self._lemmas = lemmas
self._id = synset_id
def lemmas(self):
return [_CwnLemma(name) for name in self._lemmas]
# ------------------------------------------------------------------- #
# HuggingFace Evaluation Metric
# ------------------------------------------------------------------- #
_DESCRIPTION = """\
This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
"""
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
predictions (str): translation sentence to score.
references (str): reference sentence for each translation.
Returns:
meteor: the average METEOR score
scores: the METEOR score for each sentence pairs
Examples:
Examples should be written in doctest format, and should illustrate how
to use the function.
>>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
>>> results = cmeteor.compute(references=["ζˆ‘εœ¨ι€™θ£‘εƒι£―"], predictions=["ζˆ‘εœ¨ι€™ε…’εƒζ™šι£―"])
>>> print(results)
{'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
"""
# ------------------------------------------------------------------- #
# HuggingFace evaluate template
# ------------------------------------------------------------------- #
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ChineseMETEOR(evaluate.Metric):
"""TODO: Short description not ready yet."""
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation="""@inproceedings{denkowski-lavie-2014-meteor,
title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
author = "Denkowski, Michael and Lavie, Alon",
booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
year = "2014"
}""",
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string"),
"references": datasets.Value("string"),
}
),
# Homepage of the module for documentation
homepage="https://yourappapp.com",
# Additional links to the codebase or references
codebase_urls=["https://github.com/nltk/nltk"],
reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
)
def _download_and_prepare(self, dl_manager) -> None:
"""Download external resources useful to compute the scores"""
import nltk
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)
# CwnGraph auto-downloads on first use
def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
original_stdout = sys.stdout # store original output
sys.stdout = open(os.devnull, 'w')
try:
# Tokenize using PyCantonese
pred_seg = [pycantonese.segment(p.strip()) for p in predictions]
ref_seg = [pycantonese.segment(r.strip()) for r in references]
# --- Apply Real Chinese WordNet into METEOR algorithm ---
def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
if not isinstance(word, str) or not word.strip():
#print(f"DEBUG: Skipping non-string input: {type(word)}")
return []
cwn = _load_cwn()
try:
# Use escaped regex for exact match (CwnGraph expects string pattern)
pattern = f"^{re.escape(word)}$"
lemmas = cwn.find_lemma(pattern)
except Exception as e:
#print(f"DEBUG: Error querying CWN for '{word}': {e}")
return []
exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
if not exact_lemmas:
#print(f"DEBUG: No exact lemma found for '{word}'")
return []
synsets_list = []
seen_synset_ids = set()
for lemma in exact_lemmas:
for sense in lemma.senses:
synset = sense.synset
if synset:
try:
synset_id = synset.id
except AttributeError:
synset_id = str(synset)
if synset_id not in seen_synset_ids:
seen_synset_ids.add(synset_id)
try:
synset_lemmas = synset.lemmas
syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
except AttributeError:
synset_lemmas = []
for s in synset.senses:
try:
# Access the single lemma via lemmas[0]
lemma = s.lemmas[0]
synset_lemmas.append(lemma)
except (AttributeError, IndexError, TypeError):
try:
lemma = s.lemma
synset_lemmas.append(lemma)
except AttributeError:
#print(f"DEBUG: Could not extract lemma from sense {s}")
continue
syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
syn_lemmas_set = set(syn_lemma_names)
if syn_lemmas_set:
synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
#print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
return synsets_list[:1]
# Use class for proper method binding
class ChineseWordNet:
def synsets(self, word, pos=None):
return _cwn_synsets(self, word, pos)
chinese_wn = ChineseWordNet()
scores = [
meteor_score.single_meteor_score(
ref,
hyp,
wordnet=chinese_wn
)
for ref, hyp in zip(ref_seg, pred_seg)
]
finally:
sys.stdout.close()
sys.stdout = original_stdout # restore original output
return {
"meteor": float(np.mean(scores)),
"scores": scores,
}