File size: 5,376 Bytes
eee0fe0 cc8e202 eee0fe0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)
Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""
import stanza
import warnings
# Globally suppress all warnings: filterwarnings('ignore') matches every
# category, keeping library chatter out of the web-facing output.
warnings.filterwarnings('ignore')
class CopticParserCore:
    """Lightweight Coptic parser for web applications.

    Wraps a lazily-initialized Stanza pipeline (tokenize, pos, lemma,
    depparse for lang='cop') and exposes helpers to parse raw text and to
    render the result as CoNLL-U or a markdown table.
    """

    def __init__(self):
        # Stanza pipeline; created on first use via load_parser().
        self.nlp = None
        # NOTE(review): never assigned elsewhere in this file — presumably a
        # slot for an alternative DiaParser backend; kept for compatibility.
        self.diaparser = None

    def _build_pipeline(self):
        """Construct the Coptic Stanza pipeline from models already on disk.

        download_method=None prevents Stanza from attempting its own
        download, so a missing-model condition surfaces as an exception
        that load_parser() can handle explicitly.
        """
        return stanza.Pipeline(
            lang='cop',
            processors='tokenize,pos,lemma,depparse',
            download_method=None,
            verbose=False
        )

    def load_parser(self):
        """Initialize Stanza parser with Coptic models.

        Idempotent: returns immediately if a pipeline is already loaded.
        If the models are missing, downloads them once and retries.

        Raises:
            Exception: re-raised from Stanza when loading or downloading
                fails for any reason other than a recoverable missing-model
                condition.
        """
        if self.nlp is not None:
            return  # Already loaded
        print("Loading Coptic NLP models...")
        try:
            self.nlp = self._build_pipeline()
            print("✓ Coptic parser loaded successfully")
        except Exception as e:
            # Missing models are detected by message text because Stanza
            # does not raise a dedicated exception type for this case.
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    stanza.download('cop', verbose=False)
                    # Retry once now that the models should be on disk.
                    self.nlp = self._build_pipeline()
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text):
        """Parse Coptic text and return structured results.

        Args:
            text: Coptic text to parse.

        Returns:
            dict with keys:
                - sentences: list of per-sentence dicts (id, text, words)
                - total_sentences: int
                - total_tokens: int
                - text: original input text
            or None when the input is empty/whitespace or Stanza yields
            no sentences.
        """
        if not text or not text.strip():
            return None
        # Ensure the pipeline exists before parsing (lazy initialization).
        self.load_parser()
        doc = self.nlp(text)
        if not doc.sentences:
            return None
        sentences = []
        total_tokens = 0
        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                words_data.append({
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    # Surface form of the governing word; head == 0 means
                    # the word is the sentence root.
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text,
                })
            total_tokens += len(words_data)
            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data,
            })
        return {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text,
        }

    def format_conllu(self, parse_result):
        """Format a parse_text() result as CoNLL-U.

        Args:
            parse_result: dict returned by parse_text(), or None.

        Returns:
            CoNLL-U formatted string ("" when parse_result is falsy).
        """
        if not parse_result:
            return ""
        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                lines.append("\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # DEPS column: enhanced dependencies not produced
                    '_',  # MISC column: no extra annotations
                ]))
            lines.append("")  # Blank line between sentences
        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format a parse_text() result as a markdown table.

        Args:
            parse_result: dict returned by parse_text(), or None.

        Returns:
            Markdown string with one header + table per sentence
            ("" when parse_result is falsy).
        """
        if not parse_result:
            return ""
        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )
        return "\n".join(output)
|