Spaces:

Norelad
/

coptic-translation-interface

Sleeping

coptic-translation-interface / apertus_ui.py

Rogaton

fix: Add missing coptic_keyboard module and update UI with LFS

c0bf168 30 days ago

10.9 kB

	import hashlib
	import os
	import re
	import tempfile
	import xml.etree.ElementTree as ET

	import streamlit as st
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Coptic alphabet helper: maps each capital letter to the display name shown
	# in the alphabet reference. Insertion order is the order letters are listed.
	COPTIC_ALPHABET = {
	    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon',
	    # Zeta is U+2C8C. The previous key was U+2C8A (the numeral sign "sou"),
	    # which is not the letter and did not match the keyboard's U+2C8D.
	    'Ⲍ': 'Zeta',
	    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
	    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
	    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
	    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti',
	}

	# Coptic linguistic prompts: instruction text keyed by analysis type.
	# Each key is a valid identifier, so the table is built with keyword syntax.
	COPTIC_PROMPTS = dict(
	    dialect_analysis="Analyze the Coptic dialect of this text and identify linguistic features:",
	    translation="Translate this Coptic text to English, preserving theological and cultural context:",
	    transcription="Provide a romanized transcription of this Coptic text:",
	    morphology="Analyze the morphological structure of these Coptic words:",
	    lexicon_lookup="Look up these Coptic words in the lexicon and provide Greek etymologies:",
	)

	# Lexicon loader
	_TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

	# Separators tried in priority order for text lexicons. The pipe must be a
	# bare '|': the former '\|' was a two-character string (backslash + pipe), so
	# pipe-separated lines were never split.
	_LEXICON_SEPARATORS = ('\t', '|', ',', ';')

	# Removes every character that is not in U+2C80-2CFF, U+03B0-03FF or
	# U+1F00-1FFF and is not a word character, whitespace or a hyphen.
	_HEADWORD_NOISE = re.compile(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]')


	def _iter_tei_entries(file_path, max_entries):
	    """Yield (headword, definition) pairs from a TEI XML lexicon file."""
	    root = ET.parse(file_path).getroot()
	    for entry in root.findall('.//tei:entry', _TEI_NS)[:max_entries]:
	        # Prefer the lemma form, falling back to the first form of any type.
	        # Compare with None explicitly: an Element without children is
	        # falsy, so `find(a) or find(b)` can discard an element it found.
	        form = entry.find('.//tei:form[@type="lemma"]', _TEI_NS)
	        if form is None:
	            form = entry.find('.//tei:form', _TEI_NS)
	        if form is None:
	            continue

	        orth = form.find('.//tei:orth', _TEI_NS)
	        if orth is None or not orth.text:
	            continue
	        headword = orth.text.strip()

	        # Only the first two senses contribute to the stored definition.
	        definitions = []
	        for sense in entry.findall('.//tei:sense', _TEI_NS)[:2]:
	            def_elem = sense.find('.//tei:def', _TEI_NS)
	            if def_elem is not None and def_elem.text:
	                definitions.append(def_elem.text.strip())
	        definition = "; ".join(definitions)
	        if not headword or not definition:
	            continue

	        headword = _HEADWORD_NOISE.sub('', headword).strip()
	        if headword:
	            yield headword, definition[:200]  # Limit definition length


	def _iter_delimited_entries(file_path):
	    """Yield (headword, definition) pairs from a one-entry-per-line text file."""
	    with open(file_path, 'r', encoding='utf-8') as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue

	            separator = next((sep for sep in _LEXICON_SEPARATORS if sep in line), None)
	            if separator is None:
	                continue  # no recognised separator on this line

	            # Split on the first occurrence only, so the definition may
	            # itself contain the separator character.
	            headword, definition = line.split(separator, 1)
	            headword = headword.strip()
	            if headword:
	                yield headword, definition.strip()


	@st.cache_data
	def load_coptic_lexicon(file_path=None, max_entries=100):
	    """Load Coptic lexicon from various formats including TEI XML.

	    Files whose name ends in ``.xml`` (case-insensitive) are parsed as TEI
	    XML; anything else is read as UTF-8 text with one
	    ``headword<separator>definition`` pair per line, where the separator is
	    the first of TAB, ``|``, ``,`` or ``;`` found in the line.

	    Args:
	        file_path: Path of the lexicon file. A missing or empty path, or a
	            path that does not exist, yields an empty lexicon.
	        max_entries: Maximum number of TEI ``entry`` elements to read
	            (default 100, kept for performance). ``None`` reads all of
	            them. Ignored for text files.

	    Returns:
	        A dict mapping headword to definition. On a read or parse error the
	        error is reported with ``st.error`` and the entries collected up to
	        that point are returned.
	    """
	    if not file_path or not os.path.exists(file_path):
	        return {}

	    lexicon = {}

	    try:
	        if file_path.lower().endswith('.xml'):
	            pairs = _iter_tei_entries(file_path, max_entries)
	        else:
	            pairs = _iter_delimited_entries(file_path)

	        # Consume the generator inside the try block so that entries read
	        # before a failure are still kept.
	        for headword, definition in pairs:
	            lexicon[headword] = definition

	    except Exception as e:
	        st.error(f"Error loading lexicon: {str(e)}")

	    return lexicon

	# Language detection and UI: language code -> label shown in the selector.
	# The three 'cop*' codes are the ones the rest of the script treats as Coptic.
	LANGUAGES = {
	    'en': 'English',
	    'es': 'Español',
	    'fr': 'Français',
	    'de': 'Deutsch',
	    'zh': '中文',
	    'ja': '日本語',
	    'ar': 'العربية',
	    'hi': 'हिन्दी',
	    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)',
	    'cop-sa': 'Sahidic Coptic',
	    'cop-bo': 'Bohairic Coptic',
	}

	st.set_page_config(page_title="Apertus Chat", layout="wide")

	# Language selector
	selected_lang = st.selectbox(
	    "Language / Langue / Idioma",
	    options=list(LANGUAGES),
	    format_func=lambda code: LANGUAGES[code],
	)


	def _append_keyboard_letter(letter):
	    """Append a virtual-keyboard letter to the lexicon search box.

	    Used as a button callback: it updates the session-state entry that backs
	    the search text input, so successive clicks build up a word instead of
	    each click replacing the box with a single letter.
	    """
	    st.session_state["lexicon_search_term"] = (
	        st.session_state.get("lexicon_search_term", "") + letter
	    )


	# Sidebar for Coptic tools
	# NOTE(review): the indentation of this script was lost in the copy under
	# review; everything down to the analysis selector is assumed to belong to
	# the sidebar - confirm against the deployed file.
	with st.sidebar:
	    st.header("Coptic Tools")

	    # Lexicon file uploader
	    lexicon_file = st.file_uploader(
	        "Upload Coptic Lexicon",
	        type=['txt', 'tsv', 'csv', 'xml'],
	        help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV",
	    )

	    # Load lexicon
	    if lexicon_file:
	        # Save the upload under a name derived from its content and its
	        # original extension. The loader picks the parser from the extension
	        # and is cached by path, so a fixed "temp_lexicon.txt" both disabled
	        # XML parsing for uploads and served stale results for new uploads.
	        uploaded_bytes = lexicon_file.getvalue()
	        extension = os.path.splitext(lexicon_file.name)[1].lower() or ".txt"
	        digest = hashlib.sha256(uploaded_bytes).hexdigest()[:16]
	        temp_path = os.path.join(
	            tempfile.gettempdir(), f"coptic_lexicon_{digest}{extension}"
	        )
	        with open(temp_path, "wb") as f:
	            f.write(uploaded_bytes)
	        coptic_lexicon = load_coptic_lexicon(temp_path)
	        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
	    else:
	        # Try to load the comprehensive lexicon if available
	        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
	        coptic_lexicon = {}
	        if os.path.exists(comprehensive_lexicon_path):
	            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
	            if coptic_lexicon:
	                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")

	    # Coptic alphabet reference. `with` (not `if`) is required: the expander
	    # object is always truthy, so `if` rendered the letters outside it.
	    with st.expander("Coptic Alphabet"):
	        for letter, name in COPTIC_ALPHABET.items():
	            st.text(f"{letter} - {name}")

	    # Lexicon search
	    if coptic_lexicon:
	        st.subheader("Lexicon Search")

	        # Virtual Coptic keyboard
	        st.write("Virtual Keyboard:")
	        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']

	        # Lay the keys out in rows of eight; the row count is the ceiling of
	        # len(coptic_letters) / keys_per_row.
	        keys_per_row = 8
	        row_count = -(-len(coptic_letters) // keys_per_row)
	        keyboard_rows = [st.columns(keys_per_row) for _ in range(row_count)]

	        for i, letter in enumerate(coptic_letters):
	            keyboard_rows[i // keys_per_row][i % keys_per_row].button(
	                letter,
	                key=f"key_{letter}",
	                on_click=_append_keyboard_letter,
	                args=(letter,),
	            )

	        # Search input; its value lives in session state under this key so
	        # the keyboard callback above can extend it.
	        search_term = st.text_input("Search Coptic word:", key="lexicon_search_term")

	        if search_term:
	            if search_term in coptic_lexicon:
	                st.write(f"{search_term}")
	                st.write(coptic_lexicon[search_term])
	            else:
	                # Partial matches
	                matches = [k for k in coptic_lexicon if search_term in k]
	                if matches:
	                    st.write("Partial matches:")
	                    for match in matches[:5]:  # Show first 5 matches
	                        st.write(f"{match} → {coptic_lexicon[match][:100]}...")
	                else:
	                    st.write("No matches found")

	    # Linguistic analysis options. `analysis_type` is only defined when a
	    # Coptic language is selected.
	    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
	        st.subheader("Analysis Type")
	        analysis_type = st.selectbox(
	            "Choose analysis:",
	            options=list(COPTIC_PROMPTS),
	            format_func=lambda x: x.replace('_', ' ').title(),
	        )

	# Load model (cached)
	@st.cache_resource
	def load_model():
	    """Load the Apertus tokenizer and causal language model.

	    The model weights are requested in bfloat16. The function is wrapped in
	    ``st.cache_resource``.

	    Returns:
	        ``(tokenizer, model)`` on success. If loading raises, the error is
	        shown with ``st.error`` and ``(None, None)`` is returned instead.
	    """
	    checkpoint = "swiss-ai/Apertus-8B-Instruct-2509"
	    loaded = (None, None)
	    try:
	        tok = AutoTokenizer.from_pretrained(checkpoint)
	        lm = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
	    except Exception as err:
	        st.error(f"Failed to load model: {str(err)}")
	    else:
	        loaded = (tok, lm)
	    return loaded

	tokenizer, model = load_model()

	# Chat interface
	if "messages" not in st.session_state:
	    st.session_state.messages = []

	# Display chat history
	for message in st.session_state.messages:
	    with st.chat_message(message["role"]):
	        st.markdown(message["content"])

	# User input
	if prompt := st.chat_input("Type your message..."):
	    # Add Coptic-specific prompt prefix if applicable. `analysis_type` only
	    # exists when the analysis selector was rendered, hence the locals() check
	    # (at module level locals() is the module namespace).
	    if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals():
	        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"

	        # Add lexicon context for lexicon lookup: whitespace-separated words
	        # of the message that are exact lexicon headwords.
	        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
	            lexicon_matches = [
	                f"{word} = {coptic_lexicon[word]}"
	                for word in prompt.split()
	                if word in coptic_lexicon
	            ]
	            if lexicon_matches:
	                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
	    else:
	        full_prompt = prompt

	    st.session_state.messages.append({"role": "user", "content": full_prompt})

	    with st.chat_message("user"):
	        st.markdown(full_prompt)

	    # Generate response
	    with st.chat_message("assistant"):
	        if tokenizer is None or model is None:
	            # load_model() returns (None, None) when loading failed; report
	            # that instead of crashing on an attribute access on None.
	            st.error("The language model is not available, so no response can be generated.")
	        else:
	            # Only the current turn is sent to the model; earlier messages
	            # are displayed above but not included in the prompt.
	            messages = [{"role": "user", "content": full_prompt}]
	            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	            # Keep the input tensors on the same device as the model.
	            inputs = tokenizer([text], return_tensors="pt").to(model.device)

	            with torch.no_grad():
	                # do_sample=True is stated explicitly because temperature and
	                # top_p only take effect in sampling mode.
	                outputs = model.generate(
	                    **inputs,
	                    max_new_tokens=512,
	                    do_sample=True,
	                    temperature=0.8,
	                    top_p=0.9,
	                )

	            # Drop the prompt tokens so only the newly generated text is decoded.
	            prompt_length = inputs.input_ids.shape[1]
	            response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
	            st.markdown(response)
	            st.session_state.messages.append({"role": "assistant", "content": response})