# coptic-translation-interface / coptic_parser_core.py
# Author: Rogaton
# fix: Auto-download Stanza Coptic models on first use (commit cc8e202)
#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)
Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""
import stanza
import warnings
warnings.filterwarnings('ignore')
class CopticParserCore:
    """Lightweight Coptic parser for web applications.

    Wraps a lazily-initialized Stanza pipeline for Coptic (``lang='cop'``)
    and provides helpers to parse text and render results as CoNLL-U or a
    markdown table.
    """

    def __init__(self):
        # Stanza pipeline; created lazily on first use (see load_parser).
        self.nlp = None
        # NOTE(review): reserved for an alternative DiaParser backend;
        # currently never assigned or read anywhere in this module.
        self.diaparser = None

    def load_parser(self):
        """Initialize Stanza parser with Coptic models.

        Idempotent: returns immediately if the pipeline is already loaded.
        If the models are missing locally, downloads them once and retries.

        Raises:
            Exception: re-raises any Stanza load or download failure after
                printing a diagnostic message.
        """
        if self.nlp is not None:
            return  # Already loaded
        print("Loading Coptic NLP models...")
        try:
            # download_method=None disables Stanza's own auto-download so a
            # missing-model failure surfaces here, letting us show a
            # friendlier message before downloading explicitly below.
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic parser loaded successfully")
        except Exception as e:
            # If models not found, download them and retry once.
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)
                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text):
        """Parse Coptic text and return structured results.

        Args:
            text: Coptic text to parse.

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
            or None when the input is empty/whitespace or yields no sentences.
        """
        if not text or not text.strip():
            return None
        # Ensure parser is loaded (lazy, idempotent)
        self.load_parser()
        # Parse with Stanza
        doc = self.nlp(text)
        if not doc.sentences:
            return None
        # Extract structured data
        sentences = []
        total_tokens = 0
        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    # FIX: upos/deprel may be None in Stanza output; fall back
                    # to '_' like the other fields so format_conllu's join
                    # never receives None (original raised TypeError there).
                    'upos': word.upos or '_',
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel or '_',
                    # FIX: guard against a missing head (None) — treat it as
                    # ROOT instead of crashing on words[None - 1].
                    'head_text': sentence.words[word.head - 1].text if word.head else 'ROOT'
                }
                words_data.append(word_data)
                total_tokens += 1
            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })
        return {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

    def format_conllu(self, parse_result):
        """Format a parse_text() result as a CoNLL-U string.

        Args:
            parse_result: dict returned by parse_text(), or None.

        Returns:
            CoNLL-U text ("" for a falsy parse_result).
        """
        if not parse_result:
            return ""
        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)
            lines.append("")  # Blank line between sentences
        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format a parse_text() result as a markdown table.

        Args:
            parse_result: dict returned by parse_text(), or None.

        Returns:
            Markdown text ("" for a falsy parse_result), one table
            per sentence.
        """
        if not parse_result:
            return ""
        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )
        return "\n".join(output)