#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)
Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog (symbolic)
for enhanced grammatical validation and error detection.
Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""
import stanza
import warnings
warnings.filterwarnings('ignore')
class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None
        self.prolog = None  # Prolog engine for grammatical validation
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ Prolog validation not available: {e}")
            print("  Parser will continue with neural-only mode")
            self.prolog = None
    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded
        print("Loading Coptic NLP models...")
        try:
            # Try to load Stanza with all processors
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")
        except Exception as e:
            # If the models are not found locally, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)
                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise
    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results (if enabled and available)
        """
        if not text or not text.strip():
            return None
        # Ensure parser is loaded
        self.load_parser()
        # Parse with Stanza (neural)
        doc = self.nlp(text)
        if not doc.sentences:
            return None
        # Extract structured data
        sentences = []
        total_tokens = 0
        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text
                }
                words_data.append(word_data)
                total_tokens += 1
            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })
        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }
        # Add Prolog validation (symbolic) if available and requested
        if include_prolog_validation and self.prolog and getattr(self.prolog, 'prolog_initialized', False):
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ Prolog validation skipped: {e}")
                result['prolog_validation'] = None
        return result
    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results including patterns detected and warnings
        """
        if not self.prolog:
            return None
        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }
        for sentence in sentences:
            # Extract tokens, POS tags, heads, and dependency relations
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]
            # Validate with Prolog
            try:
                sent_validation = self.prolog.validate_parse_tree(tokens, pos_tags, heads, deprels)
                if sent_validation:
                    # Merge results
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(sent_validation['patterns'])
                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(sent_validation['warnings'])
                        validation_results['has_errors'] = True
            except Exception as e:
                print(f"ℹ Prolog validation error for sentence: {e}")
        return validation_results
    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""
        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)
            lines.append("")  # Blank line between sentences
        return "\n".join(lines)
    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""
        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )
        return "\n".join(output)