#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)
Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog (symbolic)
for enhanced grammatical validation and error detection.
Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""
import stanza
import warnings
warnings.filterwarnings('ignore')
class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None
        self.prolog = None  # Prolog engine for grammatical validation
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ Prolog validation not available: {e}")
            print("  Parser will continue with neural-only mode")
            self.prolog = None
    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded
        print("Loading Coptic NLP models...")
        try:
            # Try to load Stanza with all processors
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")
        except Exception as e:
            # If the models are not found locally, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)
                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise
    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results (if enabled and available)
        """
        if not text or not text.strip():
            return None
        # Ensure parser is loaded
        self.load_parser()
        # Parse with Stanza (neural)
        doc = self.nlp(text)
        if not doc.sentences:
            return None
        # Extract structured data
        sentences = []
        total_tokens = 0
        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text
                }
                words_data.append(word_data)
                total_tokens += 1
            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })
        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }
        # Add Prolog validation (symbolic) if available and requested
        if include_prolog_validation and self.prolog and getattr(self.prolog, 'prolog_initialized', False):
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ Prolog validation skipped: {e}")
                result['prolog_validation'] = None
        return result
    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results including patterns detected and warnings
        """
        if not self.prolog:
            return None
        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }
        for sentence in sentences:
            # Extract tokens, POS tags, heads, and dependency relations
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]
            # Validate with Prolog
            try:
                sent_validation = self.prolog.validate_parse_tree(tokens, pos_tags, heads, deprels)
                if sent_validation:
                    # Merge results
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(sent_validation['patterns'])
                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(sent_validation['warnings'])
                        validation_results['has_errors'] = True
            except Exception as e:
                print(f"ℹ Prolog validation error for sentence: {e}")
        return validation_results
    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""
        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)
            lines.append("")  # Blank line between sentences
        return "\n".join(lines)
    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""
        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )
        return "\n".join(output)