# -*- coding: utf-8 -*-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Chinese METEOR: PyCantonese word segmentation + CwnGraph Chinese WordNet."""

# Suppress WordNet warnings
import warnings

warnings.filterwarnings("ignore")

import logging

logging.getLogger("nltk").setLevel(logging.CRITICAL)

import os
import re
import sys
from typing import Dict, List

import datasets
import evaluate
import nltk
import numpy as np
import pycantonese
from nltk.translate import meteor_score

# Download NLTK resources once at import time
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ------------------------------------------------------------------- #
# REAL Chinese WordNet (CwnGraph) Integration
# ------------------------------------------------------------------- #
_cwn = None


def _load_cwn():
    """Lazily load the Chinese WordNet image (downloaded on first use)."""
    global _cwn
    if _cwn is None:
        try:
            from CwnGraph import CwnImage

            print("Loading Chinese WordNet (CwnGraph, first time only)...")
            _cwn = CwnImage.latest()
        except ImportError:
            raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
    return _cwn


def _get_lemma_name(lemma):
    """Return the surface form of a CwnGraph lemma, falling back to parsing
    the repr for API versions without a `.name` attribute."""
    try:
        return lemma.name
    except AttributeError:
        return str(lemma).split(': ')[1].split('_')[0]


# Custom Lemma & Synset wrappers for NLTK compatibility
class _CwnLemma:
    def __init__(self, name):
        self._name = name

    def name(self):
        return self._name


class _CwnSynset:
    def __init__(self, lemmas, synset_id):
        self._lemmas = lemmas
        self._id = synset_id

    def lemmas(self):
        return [_CwnLemma(name) for name in self._lemmas]
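
# Illustrative sketch (comments only, not executed at import): the CwnGraph
# traversal that _compute relies on below. Attribute names follow the calls
# made in this file; they may differ across CwnGraph versions, which is why
# _get_lemma_name and the synset-lemma extraction carry fallbacks.
#
#     cwn = _load_cwn()
#     lemmas = cwn.find_lemma("^吃$")    # regex pattern, exact match
#     for lemma in lemmas:
#         for sense in lemma.senses:     # senses of the matched lemma
#             synset = sense.synset      # may be None for some senses
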
# ------------------------------------------------------------------- #
# HuggingFace Evaluation Metric
# ------------------------------------------------------------------- #
_DESCRIPTION = """\
This metric is tailor-made for evaluating the translation quality of Chinese
text: sentences are word-segmented with PyCantonese, and METEOR synonym
matching is backed by the Chinese WordNet (CwnGraph).
"""

_KWARGS_DESCRIPTION = """
Computes METEOR scores for predictions against references.
Args:
    predictions (list of str): translation sentences to score.
    references (list of str): one reference sentence per translation.
Returns:
    meteor: the average METEOR score
    scores: the METEOR score for each sentence pair
Examples:
    >>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
    >>> results = cmeteor.compute(references=["我在這裡吃飯"], predictions=["我在這兒吃晚飯"])
    >>> print(results)
    {'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
"""


# ------------------------------------------------------------------- #
# HuggingFace evaluate template
# ------------------------------------------------------------------- #
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ChineseMETEOR(evaluate.Metric):
    """METEOR for Chinese: PyCantonese segmentation + Chinese WordNet synonyms."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="""@inproceedings{denkowski-lavie-2014-meteor,
    title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
    author = "Denkowski, Michael and Lavie, Alon",
    booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
    year = "2014"
}""",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://yourappapp.com",
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/nltk/nltk"],
            reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
        )

    def _download_and_prepare(self, dl_manager) -> None:
        """Download external resources needed to compute the scores."""
        nltk.download("wordnet", quiet=True)
        nltk.download("omw-1.4", quiet=True)
        nltk.download("punkt", quiet=True)
        nltk.download("punkt_tab", quiet=True)
        # CwnGraph auto-downloads on first use

    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        original_stdout = sys.stdout  # store original output
        sys.stdout = open(os.devnull, "w")  # silence CwnGraph / NLTK chatter
        try:
            # Tokenize using PyCantonese
            pred_seg = [pycantonese.segment(p.strip()) for p in predictions]
            ref_seg = [pycantonese.segment(r.strip()) for r in references]

            # --- Plug the real Chinese WordNet into the METEOR algorithm ---
            def _cwn_synsets(word, pos=None):  # signature matches NLTK's synsets()
                if not isinstance(word, str) or not word.strip():
                    return []
                cwn = _load_cwn()
                try:
                    # Escaped regex for an exact match
                    # (CwnGraph expects a string pattern)
                    pattern = f"^{re.escape(word)}$"
                    lemmas = cwn.find_lemma(pattern)
                except Exception:
                    return []

                exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
                if not exact_lemmas:
                    return []

                synsets_list = []
                seen_synset_ids = set()
                for lemma in exact_lemmas:
                    for sense in lemma.senses:
                        synset = sense.synset
                        if not synset:
                            continue
                        try:
                            synset_id = synset.id
                        except AttributeError:
                            synset_id = str(synset)
                        if synset_id in seen_synset_ids:
                            continue
                        seen_synset_ids.add(synset_id)
                        try:
                            synset_lemmas = synset.lemmas
                            syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                        except AttributeError:
                            # Older API: collect lemmas sense by sense
                            synset_lemmas = []
                            for s in synset.senses:
                                try:
                                    # Access the single lemma via lemmas[0]
                                    synset_lemmas.append(s.lemmas[0])
                                except (AttributeError, IndexError, TypeError):
                                    try:
                                        synset_lemmas.append(s.lemma)
                                    except AttributeError:
                                        continue
                            syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                        syn_lemmas_set = set(syn_lemma_names)
                        if syn_lemmas_set:
                            synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
                # Keep at most the first synset found
                return synsets_list[:1]
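
            # Design note: NLTK's METEOR only requires a duck-typed WordNet
            # object, i.e. synsets(word) returning synset-like objects whose
            # lemmas() yield lemma-like objects with a name() method. The
            # wrapper class below satisfies that contract via _cwn_synsets,
            # so no NLTK internals need patching.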

            # Use a class so the method binds properly
            class ChineseWordNet:
                def synsets(self, word, pos=None):
                    return _cwn_synsets(word, pos)

            chinese_wn = ChineseWordNet()

            scores = [
                meteor_score.single_meteor_score(ref, hyp, wordnet=chinese_wn)
                for ref, hyp in zip(ref_seg, pred_seg)
            ]
        finally:
            sys.stdout.close()
            sys.stdout = original_stdout  # restore original output

        return {
            "meteor": float(np.mean(scores)),
            "scores": scores,
        }
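

# ------------------------------------------------------------------- #
# Usage sketch: assumes this script is published on the Hugging Face Hub
# under the id from the docstring above; point evaluate.load() at a local
# copy of this file instead if it is not.
# ------------------------------------------------------------------- #
if __name__ == "__main__":
    cmeteor = evaluate.load("raptorkwok/chinesemeteor")
    results = cmeteor.compute(
        references=["我在這裡吃飯"],
        predictions=["我在這兒吃晚飯"],
    )
    print(results)  # per the doctest: {'meteor': 0.5111..., 'scores': [0.5111...]}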