|
|
|
|
|
""" |
|
|
Coptic Dependency Parser - Core Module (Web-Compatible) |
|
|
|
|
|
Extracted from coptic-parser.py for integration with web interfaces. |
|
|
Author: André Linden (2025) |
|
|
License: CC BY-NC-SA 4.0 |
|
|
""" |
|
|
|
|
|
import stanza |
|
|
import warnings |
|
|
# Silence all third-party warnings (Stanza/torch emit noisy deprecation and
# config warnings) so web-facing output stays clean.
warnings.filterwarnings('ignore')
|
|
|
|
|
class CopticParserCore:
    """Lightweight Coptic parser for web applications.

    Wraps a Stanza pipeline for Coptic ('cop') that is created lazily on
    first use.  Exposes plain-dict parse results plus CoNLL-U and
    markdown-table formatters so web front-ends never handle Stanza objects.
    """

    # Pipeline configuration shared by the initial load and the post-download
    # retry, so the two call sites cannot drift apart.
    _PIPELINE_KWARGS = {
        'lang': 'cop',
        'processors': 'tokenize,pos,lemma,depparse',
        'download_method': None,
        'verbose': False,
    }

    def __init__(self):
        self.nlp = None        # stanza.Pipeline, built lazily by load_parser()
        self.diaparser = None  # placeholder for an alternative backend (unused here)

    def _build_pipeline(self):
        """Construct a Stanza pipeline from the shared configuration."""
        return stanza.Pipeline(**self._PIPELINE_KWARGS)

    def load_parser(self):
        """Initialize the Stanza parser with Coptic models.

        Idempotent: returns immediately if the pipeline already exists.
        If the models are not installed locally, downloads them once and
        retries.  Re-raises on any unrecoverable failure.
        """
        if self.nlp is not None:
            return

        print("Loading Coptic NLP models...")
        try:
            self.nlp = self._build_pipeline()
            print("✓ Coptic parser loaded successfully")
        except Exception as e:
            # Stanza signals missing resources with a generic exception, so we
            # distinguish "models not downloaded" from real failures by the
            # message text.
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    stanza.download('cop', verbose=False)
                    self.nlp = self._build_pipeline()
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text):
        """
        Parse Coptic text and return structured results

        Args:
            text: Coptic text to parse

        Returns:
            dict with:
            - sentences: list of parsed sentence data
            - total_sentences: int
            - total_tokens: int
            - text: original text
            or None for empty/whitespace input or when no sentences parse.
        """
        if not text or not text.strip():
            return None

        self.load_parser()

        doc = self.nlp(text)
        if not doc.sentences:
            return None

        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                words_data.append({
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    # Default every optional annotation to '_' so that
                    # format_conllu()'s join never receives a None.
                    'upos': word.upos or '_',
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel or '_',
                    # head == 0 is the Universal Dependencies root convention.
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text,
                })
            total_tokens += len(words_data)

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data,
            })

        return {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text,
        }

    def format_conllu(self, parse_result):
        """Format a parse_text() result as CoNLL-U text ('' for falsy input)."""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")

            for word in sentence['words']:
                # Columns 9 (DEPS) and 10 (MISC) are not produced -> '_'.
                lines.append("\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',
                    '_',
                ]))

            lines.append("")  # blank line terminates each sentence block

        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format a parse_text() result as a markdown table ('' for falsy input)."""
        if not parse_result:
            return ""

        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")

            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)
|
|
|