#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import stanza
import warnings

# Silence third-party warnings (e.g. from torch/stanza) so web logs stay readable
warnings.filterwarnings('ignore')

class CopticParserCore:
    """Lightweight Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None  # reserved for an alternative DiaParser backend; unused here

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded

        print("Loading Coptic NLP models...")

        try:
            # Load Stanza with all processors; download_method=None disables
            # implicit downloads, so missing models raise and are handled below
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic parser loaded successfully")

        except Exception as e:
            # If models not found, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)

                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text):
        """
        Parse Coptic text and return structured results

        Args:
            text: Coptic text to parse

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
            or None if the input is empty or yields no sentences.
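
        Example return shape (illustrative values only):
            {'sentences': [{'id': 1, 'text': '...', 'words': [...]}],
             'total_sentences': 1, 'total_tokens': 3, 'text': '...'}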
        """
        if not text or not text.strip():
            return None

        # Ensure parser is loaded
        self.load_parser()

        # Parse with Stanza
        doc = self.nlp(text)

        if not doc.sentences:
            return None

        # Extract structured data
        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []

            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head-1].text
                }
                words_data.append(word_data)
                total_tokens += 1

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        return {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")

            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)

            lines.append("")  # Blank line between sentences

        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""

        output = []

        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")

            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)