#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog
(symbolic) for enhanced grammatical validation and error detection.

Extracted from coptic-parser.py for integration with web interfaces.

Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import stanza
import warnings

warnings.filterwarnings('ignore')


class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None
        self.prolog = None  # Prolog engine for grammatical validation
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ Prolog validation not available: {e}")
            print("  Parser will continue in neural-only mode")
            self.prolog = None

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded

        print("Loading Coptic NLP models...")
        try:
            # Try to load Stanza with all processors
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")
        except Exception as e:
            # If models not found, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)
                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation.

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical
                validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results
                  (if enabled and available)
        """
        if not text or not text.strip():
            return None

        # Ensure parser is loaded
        self.load_parser()

        # Parse with Stanza (neural)
        doc = self.nlp(text)

        if not doc.sentences:
            return None

        # Extract structured data
        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text
                }
                words_data.append(word_data)
                total_tokens += 1

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

        # Add Prolog validation (symbolic) if available and requested
        if (include_prolog_validation and self.prolog
                and hasattr(self.prolog, 'prolog_initialized')
                and self.prolog.prolog_initialized):
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ Prolog validation skipped: {e}")
                result['prolog_validation'] = None

        return result
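
    # Note on the symbolic back end: coptic_prolog_rules is a separate
    # module, so the contract sketched here is an assumption inferred
    # from how _validate_with_prolog below consumes its output.
    # validate_parse_tree(tokens, pos_tags, heads, deprels) is expected
    # to return a per-sentence dict of the rough shape
    #
    #     {'patterns': ['cleft_sentence', ...],       # constructions found
    #      'warnings': ['head mismatch at 3', ...]}   # rule violations
    #
    # with both keys optional; the merge logic tolerates either key
    # being absent or the whole result being falsy.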

    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules.

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results, including patterns detected
            and warnings
        """
        if not self.prolog:
            return None

        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }

        for sentence in sentences:
            # Extract tokens, POS tags, heads, and dependency relations
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]

            # Validate with Prolog
            try:
                sent_validation = self.prolog.validate_parse_tree(
                    tokens, pos_tags, heads, deprels
                )
                if sent_validation:
                    # Merge per-sentence results into the summary
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(
                            sent_validation['patterns']
                        )
                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(
                            sent_validation['warnings']
                        )
                        validation_results['has_errors'] = True
            except Exception as e:
                print(f"ℹ Prolog validation error for sentence: {e}")

        return validation_results

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)
            lines.append("")  # Blank line between sentences

        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format parse result as a markdown table"""
        if not parse_result:
            return ""

        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)
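

# ----------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the extracted module):
# shows the intended call sequence — construct the core once, parse a
# string, then render the result. The sample sentence is an arbitrary
# short Sahidic clause ("the man heard") chosen only for demonstration;
# running this downloads the Coptic Stanza models on first use.
# ----------------------------------------------------------------------
if __name__ == "__main__":
    core = CopticParserCore()
    result = core.parse_text("ⲁϥⲥⲱⲧⲙ ⲛϭⲓⲡⲣⲱⲙⲉ")
    if result:
        print(f"Parsed {result['total_sentences']} sentence(s), "
              f"{result['total_tokens']} token(s)")
        print(core.format_conllu(result))
        if result.get('prolog_validation'):
            print("Prolog validation:", result['prolog_validation'])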