Rogaton Claude committed on
Commit
eee0fe0
·
1 Parent(s): 90f0f33

feat: Integrate dependency parser with translation interface

Browse files

**NEW: Unified Coptic NLP Interface**

Adds three powerful analysis modes:
1. **Dependency Parse** - Full syntactic analysis with CoNLL-U export
2. **Translation** - Apertus-8B translation to 8 languages
3. **Parse + Translate** - Combined workflow showing both side-by-side

**Parser Integration:**
- Created lightweight `coptic_parser_core.py` (web-compatible)
- Extracted from full GUI parser, optimized for Streamlit
- Uses Stanza for tokenization, POS tagging, lemmatization, dependencies
- Cached initialization for performance

**New Features:**
- 📊 Dependency tables with markdown formatting
- 📥 CoNLL-U export for linguistic research
- 🔍 Parse-first workflow (validates before translation)
- 🌍 Side-by-side parse + translation view
- ⚡ No API token needed for parsing (a token is required only for translation)

**UI Enhancements:**
- Reorganized analysis type dropdown
- "Dependency Parse" as first option
- "Parse And Translate" for comprehensive analysis
- Clear section headers for parse vs. translate results

**Dependencies:**
- Added stanza (Coptic NLP models)
- Added torch (required by Stanza)

**Use Cases:**
- Scholars: Validate syntax before trusting translation
- Researchers: Export CoNLL-U for corpus analysis
- Students: Learn Coptic grammar through visualization
- Linguists: Compare parse structure across texts

This creates the first integrated web interface combining:
- Dependency parsing (your parser)
- Neural translation (Apertus-8B)
- Lexicon lookup (Comprehensive Coptic Lexicon)
- All in one tool!

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (3) hide show
  1. apertus_ui.py +123 -6
  2. coptic_parser_core.py +146 -0
  3. requirements.txt +2 -0
apertus_ui.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import xml.etree.ElementTree as ET
4
  import re
5
  from huggingface_hub import InferenceClient
 
6
 
7
  # Coptic alphabet helper
8
  COPTIC_ALPHABET = {
@@ -266,11 +267,11 @@ with st.sidebar:
266
  if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
267
  st.subheader("Analysis Type")
268
  analysis_type = st.selectbox("Choose analysis:",
269
- options=['translation', 'dialect_analysis', 'transcription', 'morphology', 'lexicon_lookup'],
270
  format_func=lambda x: x.replace('_', ' ').title())
271
 
272
  # Target language selector for translation
273
- if analysis_type == 'translation':
274
  st.subheader("Target Language")
275
  target_lang = st.selectbox("Translate to:",
276
  options=[k for k in LANGUAGES.keys() if k not in ['cop', 'cop-sa', 'cop-bo']],
@@ -281,8 +282,9 @@ with st.sidebar:
281
  # For non-translation tasks, use English as default output language
282
  target_language_name = "English"
283
 
284
- # Get prompts for the target language
285
- COPTIC_PROMPTS = get_coptic_prompts(target_language_name)
 
286
 
287
  # Use HuggingFace Inference API instead of loading model locally
288
  # This is much faster and doesn't require GPU
@@ -305,6 +307,18 @@ def get_inference_client(token=None):
305
  st.error(f"Error initializing inference client: {e}")
306
  return None
307
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  # Chat interface
309
  if "messages" not in st.session_state:
310
  st.session_state.messages = []
@@ -316,7 +330,49 @@ for message in st.session_state.messages:
316
 
317
  # User input
318
  if prompt := st.chat_input("Type your message..."):
319
- # Check if API token is available
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  if not hf_token_input:
321
  st.error("⚠️ Please enter your HuggingFace API token in the sidebar to use translation.")
322
  st.stop()
@@ -328,7 +384,68 @@ if prompt := st.chat_input("Type your message..."):
328
  st.error("❌ Failed to initialize inference client. Please check your API token.")
329
  st.stop()
330
 
331
- # Add Coptic-specific prompt prefix if applicable
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals():
333
  full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"
334
 
 
3
  import xml.etree.ElementTree as ET
4
  import re
5
  from huggingface_hub import InferenceClient
6
+ from coptic_parser_core import CopticParserCore
7
 
8
  # Coptic alphabet helper
9
  COPTIC_ALPHABET = {
 
267
  if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
268
  st.subheader("Analysis Type")
269
  analysis_type = st.selectbox("Choose analysis:",
270
+ options=['dependency_parse', 'translation', 'parse_and_translate', 'dialect_analysis', 'transcription', 'morphology', 'lexicon_lookup'],
271
  format_func=lambda x: x.replace('_', ' ').title())
272
 
273
  # Target language selector for translation
274
+ if analysis_type in ['translation', 'parse_and_translate']:
275
  st.subheader("Target Language")
276
  target_lang = st.selectbox("Translate to:",
277
  options=[k for k in LANGUAGES.keys() if k not in ['cop', 'cop-sa', 'cop-bo']],
 
282
  # For non-translation tasks, use English as default output language
283
  target_language_name = "English"
284
 
285
+ # Get prompts for the target language (only for LLM-based tasks)
286
+ if analysis_type not in ['dependency_parse', 'parse_and_translate']:
287
+ COPTIC_PROMPTS = get_coptic_prompts(target_language_name)
288
 
289
  # Use HuggingFace Inference API instead of loading model locally
290
  # This is much faster and doesn't require GPU
 
307
  st.error(f"Error initializing inference client: {e}")
308
  return None
309
 
310
+ # Initialize Coptic Dependency Parser
311
+ @st.cache_resource
312
+ def get_parser():
313
+ """Initialize and cache the Coptic parser"""
314
+ try:
315
+ parser = CopticParserCore()
316
+ parser.load_parser() # Pre-load to avoid delays
317
+ return parser
318
+ except Exception as e:
319
+ st.error(f"Failed to initialize parser: {e}")
320
+ return None
321
+
322
  # Chat interface
323
  if "messages" not in st.session_state:
324
  st.session_state.messages = []
 
330
 
331
  # User input
332
  if prompt := st.chat_input("Type your message..."):
333
+ # Handle dependency parsing (doesn't need API token)
334
+ if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals() and analysis_type == 'dependency_parse':
335
+ st.session_state.messages.append({"role": "user", "content": prompt})
336
+
337
+ with st.chat_message("user"):
338
+ st.markdown(f"**Parse this text:** {prompt}")
339
+
340
+ with st.chat_message("assistant"):
341
+ with st.spinner("🔍 Parsing Coptic text..."):
342
+ parser = get_parser()
343
+ if parser:
344
+ try:
345
+ parse_result = parser.parse_text(prompt)
346
+
347
+ if parse_result:
348
+ # Display parse results
349
+ st.success(f"✅ Parsed {parse_result['total_sentences']} sentence(s), {parse_result['total_tokens']} tokens")
350
+
351
+ # Show formatted table
352
+ table_output = parser.format_table(parse_result)
353
+ st.markdown(table_output)
354
+
355
+ # Offer CoNLL-U download
356
+ conllu_output = parser.format_conllu(parse_result)
357
+ st.download_button(
358
+ label="📥 Download CoNLL-U",
359
+ data=conllu_output,
360
+ file_name="coptic_parse.conllu",
361
+ mime="text/plain"
362
+ )
363
+
364
+ response = f"Parse complete. {parse_result['total_sentences']} sentences analyzed."
365
+ st.session_state.messages.append({"role": "assistant", "content": response})
366
+ else:
367
+ st.error("Failed to parse text. Please check the input.")
368
+ except Exception as e:
369
+ st.error(f"Parsing error: {e}")
370
+ else:
371
+ st.error("Parser not available. Please check Stanza installation.")
372
+
373
+ st.stop() # Don't continue to translation
374
+
375
+ # For translation tasks, check API token
376
  if not hf_token_input:
377
  st.error("⚠️ Please enter your HuggingFace API token in the sidebar to use translation.")
378
  st.stop()
 
384
  st.error("❌ Failed to initialize inference client. Please check your API token.")
385
  st.stop()
386
 
387
+ # Handle parse_and_translate mode
388
+ if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals() and analysis_type == 'parse_and_translate':
389
+ st.session_state.messages.append({"role": "user", "content": prompt})
390
+
391
+ with st.chat_message("user"):
392
+ st.markdown(f"**Parse and translate:** {prompt}")
393
+
394
+ with st.chat_message("assistant"):
395
+ # First, parse
396
+ st.subheader("📊 Dependency Analysis")
397
+ with st.spinner("🔍 Parsing..."):
398
+ parser = get_parser()
399
+ if parser:
400
+ parse_result = parser.parse_text(prompt)
401
+ if parse_result:
402
+ table_output = parser.format_table(parse_result)
403
+ st.markdown(table_output)
404
+
405
+ # Then, translate
406
+ st.divider()
407
+ st.subheader(f"🌍 Translation to {LANGUAGES[target_lang]}")
408
+
409
+ # Get translation prompts
410
+ COPTIC_PROMPTS_TRANSLATE = get_coptic_prompts(target_language_name)
411
+ translate_prompt = f"{COPTIC_PROMPTS_TRANSLATE['translation']} {prompt}"
412
+
413
+ with st.spinner("🤖 Translating..."):
414
+ try:
415
+ messages = [
416
+ {"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
417
+ {"role": "user", "content": translate_prompt}
418
+ ]
419
+
420
+ response_stream = inference_client.chat_completion(
421
+ model=MODEL_NAME,
422
+ messages=messages,
423
+ max_tokens=512,
424
+ temperature=0.5,
425
+ top_p=0.9,
426
+ stream=True
427
+ )
428
+
429
+ # Stream the translation
430
+ response_placeholder = st.empty()
431
+ full_response = ""
432
+
433
+ for message in response_stream:
434
+ if message.choices[0].delta.content:
435
+ full_response += message.choices[0].delta.content
436
+ response_placeholder.markdown(full_response + "▌")
437
+
438
+ response_placeholder.markdown(full_response)
439
+
440
+ combined_response = f"Parse complete. Translation: {full_response}"
441
+ st.session_state.messages.append({"role": "assistant", "content": combined_response})
442
+
443
+ except Exception as e:
444
+ st.error(f"❌ Translation error: {e}")
445
+
446
+ st.stop() # Special handling complete
447
+
448
+ # Standard translation/analysis handling
449
  if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals():
450
  full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"
451
 
coptic_parser_core.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Coptic Dependency Parser - Core Module (Web-Compatible)
4
+
5
+ Extracted from coptic-parser.py for integration with web interfaces.
6
+ Author: André Linden (2025)
7
+ License: CC BY-NC-SA 4.0
8
+ """
9
+
10
+ import stanza
11
+ import warnings
12
+ warnings.filterwarnings('ignore')
13
+
14
+ class CopticParserCore:
15
+ """Lightweight Coptic parser for web applications"""
16
+
17
+ def __init__(self):
18
+ self.nlp = None
19
+ self.diaparser = None
20
+
21
+ def load_parser(self):
22
+ """Initialize Stanza parser with Coptic models"""
23
+ if self.nlp is not None:
24
+ return # Already loaded
25
+
26
+ print("Loading Coptic NLP models...")
27
+
28
+ # Load Stanza with all processors
29
+ self.nlp = stanza.Pipeline(
30
+ lang='cop',
31
+ processors='tokenize,pos,lemma,depparse',
32
+ download_method=None,
33
+ verbose=False
34
+ )
35
+
36
+ print("✓ Coptic parser loaded successfully")
37
+
38
+ def parse_text(self, text):
39
+ """
40
+ Parse Coptic text and return structured results
41
+
42
+ Args:
43
+ text: Coptic text to parse
44
+
45
+ Returns:
46
+ dict with:
47
+ - sentences: list of parsed sentence data
48
+ - total_sentences: int
49
+ - total_tokens: int
50
+ - text: original text
51
+ """
52
+ if not text or not text.strip():
53
+ return None
54
+
55
+ # Ensure parser is loaded
56
+ self.load_parser()
57
+
58
+ # Parse with Stanza
59
+ doc = self.nlp(text)
60
+
61
+ if not doc.sentences:
62
+ return None
63
+
64
+ # Extract structured data
65
+ sentences = []
66
+ total_tokens = 0
67
+
68
+ for sent_idx, sentence in enumerate(doc.sentences, 1):
69
+ words_data = []
70
+
71
+ for word in sentence.words:
72
+ word_data = {
73
+ 'id': word.id,
74
+ 'form': word.text,
75
+ 'lemma': word.lemma or '_',
76
+ 'upos': word.upos,
77
+ 'xpos': word.xpos or '_',
78
+ 'feats': word.feats or '_',
79
+ 'head': word.head,
80
+ 'deprel': word.deprel,
81
+ 'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head-1].text
82
+ }
83
+ words_data.append(word_data)
84
+ total_tokens += 1
85
+
86
+ sentences.append({
87
+ 'id': sent_idx,
88
+ 'text': sentence.text,
89
+ 'words': words_data
90
+ })
91
+
92
+ return {
93
+ 'sentences': sentences,
94
+ 'total_sentences': len(sentences),
95
+ 'total_tokens': total_tokens,
96
+ 'text': text
97
+ }
98
+
99
+ def format_conllu(self, parse_result):
100
+ """Format parse result as CoNLL-U"""
101
+ if not parse_result:
102
+ return ""
103
+
104
+ lines = []
105
+ for sentence in parse_result['sentences']:
106
+ lines.append(f"# sent_id = {sentence['id']}")
107
+ lines.append(f"# text = {sentence['text']}")
108
+
109
+ for word in sentence['words']:
110
+ line = "\t".join([
111
+ str(word['id']),
112
+ word['form'],
113
+ word['lemma'],
114
+ word['upos'],
115
+ word['xpos'],
116
+ word['feats'],
117
+ str(word['head']),
118
+ word['deprel'],
119
+ '_', # deps
120
+ '_' # misc
121
+ ])
122
+ lines.append(line)
123
+
124
+ lines.append("") # Blank line between sentences
125
+
126
+ return "\n".join(lines)
127
+
128
+ def format_table(self, parse_result):
129
+ """Format parse result as markdown table"""
130
+ if not parse_result:
131
+ return ""
132
+
133
+ output = []
134
+
135
+ for sentence in parse_result['sentences']:
136
+ output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
137
+ output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
138
+ output.append("|:---|:-----|:------|:-----|:-----|:-------|")
139
+
140
+ for word in sentence['words']:
141
+ output.append(
142
+ f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
143
+ f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
144
+ )
145
+
146
+ return "\n".join(output)
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  streamlit
2
  huggingface_hub
3
  lxml
 
 
 
1
  streamlit
2
  huggingface_hub
3
  lxml
4
+ stanza
5
+ torch