# chinesemeteor / chinesemeteor.py
# Source: Hugging Face Hub space "raptorkwok/chinesemeteor"
# Commit 29744eb — "simplify codes and bug fix"
# -*- coding: utf-8 -*-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Chinese METEOR β€” with Jieba pre-segmentation + CwnGraph Chinese WordNet
"""
# suppress WordNet warnings
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger("nltk").setLevel(logging.CRITICAL)
import os
import sys
import jieba_fast as jieba
import datasets
from typing import List, Dict
import numpy as np
from nltk.translate import meteor_score
from nltk import word_tokenize
import nltk
import evaluate
import re
import pycantonese
# Download once
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)
# ------------------------------------------------------------------- #
# REAL Chinese WordNet (CwnGraph) Integration
# ------------------------------------------------------------------- #
_cwn = None
def _load_cwn():
global _cwn
if _cwn is None:
try:
from CwnGraph import CwnImage
print("Loading Chinese WordNet (CwnGraph, first time only)...")
_cwn = CwnImage.latest()
except ImportError:
raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
return _cwn
# Helper to get lemma name (with fallback for API versions)
def _get_lemma_name(lemma):
try:
return lemma.name
except AttributeError:
return str(lemma).split(': ')[1].split('_')[0]
# Custom Lemma & Synset for NLTK compatibility
class _CwnLemma:
def __init__(self, name): self._name = name
def name(self): return self._name
class _CwnSynset:
def __init__(self, lemmas, synset_id):
self._lemmas = lemmas
self._id = synset_id
def lemmas(self):
return [_CwnLemma(name) for name in self._lemmas]
# ------------------------------------------------------------------- #
# HuggingFace Evaluation Metric
# ------------------------------------------------------------------- #
_DESCRIPTION = """\
This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
"""
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
predictions (str): translation sentence to score.
references (str): reference sentence for each translation.
Returns:
meteor: the average METEOR score
scores: the METEOR score for each sentence pairs
Examples:
Examples should be written in doctest format, and should illustrate how
to use the function.
>>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
>>> results = cmeteor.compute(references=["ζˆ‘εœ¨ι€™θ£‘εƒι£―"], predictions=["ζˆ‘εœ¨ι€™ε…’εƒζ™šι£―"])
>>> print(results)
{'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
"""
# ------------------------------------------------------------------- #
# HuggingFace evaluate template
# ------------------------------------------------------------------- #
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ChineseMETEOR(evaluate.Metric):
"""TODO: Short description not ready yet."""
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation="""@inproceedings{denkowski-lavie-2014-meteor,
title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
author = "Denkowski, Michael and Lavie, Alon",
booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
year = "2014"
}""",
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string"),
"references": datasets.Value("string"),
}
),
# Homepage of the module for documentation
homepage="https://yourappapp.com",
# Additional links to the codebase or references
codebase_urls=["https://github.com/nltk/nltk"],
reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
)
def _download_and_prepare(self, dl_manager) -> None:
"""Download external resources useful to compute the scores"""
import nltk
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)
# CwnGraph auto-downloads on first use
def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
original_stdout = sys.stdout # store original output
sys.stdout = open(os.devnull, 'w')
try:
# Tokenize using PyCantonese
pred_seg = [pycantonese.segment(p.strip()) for p in predictions]
ref_seg = [pycantonese.segment(r.strip()) for r in references]
# --- Apply Real Chinese WordNet into METEOR algorithm ---
def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
if not isinstance(word, str) or not word.strip():
#print(f"DEBUG: Skipping non-string input: {type(word)}")
return []
cwn = _load_cwn()
try:
# Use escaped regex for exact match (CwnGraph expects string pattern)
pattern = f"^{re.escape(word)}$"
lemmas = cwn.find_lemma(pattern)
except Exception as e:
#print(f"DEBUG: Error querying CWN for '{word}': {e}")
return []
exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
if not exact_lemmas:
#print(f"DEBUG: No exact lemma found for '{word}'")
return []
synsets_list = []
seen_synset_ids = set()
for lemma in exact_lemmas:
for sense in lemma.senses:
synset = sense.synset
if synset:
try:
synset_id = synset.id
except AttributeError:
synset_id = str(synset)
if synset_id not in seen_synset_ids:
seen_synset_ids.add(synset_id)
try:
synset_lemmas = synset.lemmas
syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
except AttributeError:
synset_lemmas = []
for s in synset.senses:
try:
# Access the single lemma via lemmas[0]
lemma = s.lemmas[0]
synset_lemmas.append(lemma)
except (AttributeError, IndexError, TypeError):
try:
lemma = s.lemma
synset_lemmas.append(lemma)
except AttributeError:
#print(f"DEBUG: Could not extract lemma from sense {s}")
continue
syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
syn_lemmas_set = set(syn_lemma_names)
if syn_lemmas_set:
synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
#print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
return synsets_list[:1]
# Use class for proper method binding
class ChineseWordNet:
def synsets(self, word, pos=None):
return _cwn_synsets(self, word, pos)
chinese_wn = ChineseWordNet()
scores = [
meteor_score.single_meteor_score(
ref,
hyp,
wordnet=chinese_wn
)
for ref, hyp in zip(ref_seg, pred_seg)
]
finally:
sys.stdout.close()
sys.stdout = original_stdout # restore original output
return {
"meteor": float(np.mean(scores)),
"scores": scores,
}