# coptic-translation-interface / coptic_parser_core.py
# Author: Rogaton
# fix: Auto-download Stanza Coptic models on first use (commit cc8e202)
#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)
Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""
import stanza
import warnings
warnings.filterwarnings('ignore')
class CopticParserCore:
    """Lightweight Coptic parser for web applications.

    Wraps a lazily-initialized Stanza pipeline for Coptic (``lang='cop'``)
    and provides helpers to parse text and render results as CoNLL-U or a
    markdown table.
    """

    def __init__(self):
        # Stanza pipeline; created lazily on first use (see load_parser).
        self.nlp = None
        # NOTE(review): reserved for an alternative DiaParser backend;
        # currently never assigned or read anywhere in this module.
        self.diaparser = None

    def load_parser(self):
        """Initialize Stanza parser with Coptic models.

        Idempotent: returns immediately if the pipeline is already loaded.
        If the models are missing locally, downloads them once and retries.

        Raises:
            Exception: re-raises any Stanza load or download failure after
                printing a diagnostic message.
        """
        if self.nlp is not None:
            return  # Already loaded
        print("Loading Coptic NLP models...")
        try:
            # download_method=None disables Stanza's own auto-download so a
            # missing-model failure surfaces here, letting us show a
            # friendlier message before downloading explicitly below.
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic parser loaded successfully")
        except Exception as e:
            # If models not found, download them and retry once.
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)
                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text):
        """Parse Coptic text and return structured results.

        Args:
            text: Coptic text to parse.

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
            or None when the input is empty/whitespace or yields no sentences.
        """
        if not text or not text.strip():
            return None
        # Ensure parser is loaded (lazy, idempotent)
        self.load_parser()
        # Parse with Stanza
        doc = self.nlp(text)
        if not doc.sentences:
            return None
        # Extract structured data
        sentences = []
        total_tokens = 0
        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    # FIX: upos/deprel may be None in Stanza output; fall back
                    # to '_' like the other fields so format_conllu's join
                    # never receives None (original raised TypeError there).
                    'upos': word.upos or '_',
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel or '_',
                    # FIX: guard against a missing head (None) — treat it as
                    # ROOT instead of crashing on words[None - 1].
                    'head_text': sentence.words[word.head - 1].text if word.head else 'ROOT'
                }
                words_data.append(word_data)
                total_tokens += 1
            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })
        return {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

    def format_conllu(self, parse_result):
        """Format a parse_text() result as a CoNLL-U string.

        Args:
            parse_result: dict returned by parse_text(), or None.

        Returns:
            CoNLL-U text ("" for a falsy parse_result).
        """
        if not parse_result:
            return ""
        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)
            lines.append("")  # Blank line between sentences
        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format a parse_text() result as a markdown table.

        Args:
            parse_result: dict returned by parse_text(), or None.

        Returns:
            Markdown text ("" for a falsy parse_result), one table
            per sentence.
        """
        if not parse_result:
            return ""
        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )
        return "\n".join(output)