#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import stanza
import warnings

# Silence third-party warnings (e.g. from torch/stanza) so web logs stay readable
warnings.filterwarnings('ignore')

class CopticParserCore:
    """Lightweight Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None  # reserved for an alternative DiaParser backend; unused here

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded

        print("Loading Coptic NLP models...")

        try:
            # Load Stanza with all processors; download_method=None disables
            # implicit downloads, so missing models raise and are handled below
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic parser loaded successfully")

        except Exception as e:
            # If models not found, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)

                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text):
        """
        Parse Coptic text and return structured results

        Args:
            text: Coptic text to parse

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
            or None if the input is empty or yields no sentences.
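
        Example return shape (illustrative values only):
            {'sentences': [{'id': 1, 'text': '...', 'words': [...]}],
             'total_sentences': 1, 'total_tokens': 3, 'text': '...'}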
        """
        if not text or not text.strip():
            return None

        # Ensure parser is loaded
        self.load_parser()

        # Parse with Stanza
        doc = self.nlp(text)

        if not doc.sentences:
            return None

        # Extract structured data
        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []

            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head-1].text
                }
                words_data.append(word_data)
                total_tokens += 1

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        return {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")

            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)

            lines.append("")  # Blank line between sentences

        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""

        output = []

        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")

            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)