|
|
import hashlib
import os
import re
import xml.etree.ElementTree as ET

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
|
|
|
# Reference table shown in the sidebar: uppercase Coptic letter -> letter name.
# The first 24 letters come from the Unicode Coptic block (U+2C80-U+2CB0); the
# last seven are the Demotic-derived letters encoded at U+03E2-U+03EE.
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon',
    # Zeta is U+2C8C (ZATA). U+2C8A (SOU) is the numeral six, not the letter.
    'Ⲍ': 'Zeta',
    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti',
}
|
|
|
|
|
|
|
|
# Instruction prefixes for the Coptic analysis modes. The key is the value of
# the "Choose analysis" selector; the text is prepended to the user's message
# before it is sent to the model.
COPTIC_PROMPTS = {
    'dialect_analysis': "Analyze the Coptic dialect of this text and identify linguistic features:",
    'translation': "Translate this Coptic text to English, preserving theological and cultural context:",
    'transcription': "Provide a romanized transcription of this Coptic text:",
    'morphology': "Analyze the morphological structure of these Coptic words:",
    'lexicon_lookup': "Look up these Coptic words in the lexicon and provide Greek etymologies:",
}
|
|
|
|
|
|
|
|
@st.cache_data
def load_coptic_lexicon(file_path=None, *, max_entries=100):
    """Load a Coptic lexicon from a TEI XML file or a delimited text file.

    Files whose name ends in ``.xml`` (case-insensitive) are parsed as TEI XML;
    anything else is read as UTF-8 text with one ``word<sep>definition`` pair
    per line, where the separator is the first of TAB, ``|``, ``,`` or ``;``
    that occurs in the line.

    Args:
        file_path: Path of the lexicon file. ``None``, an empty string, or a
            path that does not exist yields an empty lexicon.
        max_entries: Maximum number of TEI ``<entry>`` elements to read
            (default 100, the previous hard-coded limit). ``None`` reads all
            entries. Ignored for delimited text files.

    Returns:
        dict mapping each Coptic word to its definition string. If reading
        fails part-way, the entries collected so far are returned and the
        error is reported through ``st.error``.
    """
    if not file_path or not os.path.exists(file_path):
        return {}

    lexicon = {}
    try:
        if file_path.lower().endswith('.xml'):
            _read_tei_entries(file_path, lexicon, max_entries)
        else:
            _read_delimited_entries(file_path, lexicon)
    except Exception as e:
        # Deliberately broad: this is a best-effort loader for user-supplied
        # files, so any parse or I/O failure is shown in the UI, not raised.
        st.error(f"Error loading lexicon: {str(e)}")

    return lexicon


# Characters removed from TEI headwords before they are used as lexicon keys.
_HEADWORD_NOISE = re.compile(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]')


def _read_tei_entries(file_path, lexicon, max_entries):
    """Add headword -> definition pairs from a TEI XML file to *lexicon* in place."""
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(file_path).getroot()

    entries = root.findall('.//tei:entry', ns)
    if max_entries is not None:
        entries = entries[:max_entries]

    for entry in entries:
        # Compare against None explicitly: an Element without children is
        # falsy, so `find(...) or find(...)` could discard a found lemma.
        form = entry.find('.//tei:form[@type="lemma"]', ns)
        if form is None:
            form = entry.find('.//tei:form', ns)
        if form is None:
            continue

        orth = form.find('.//tei:orth', ns)
        if orth is None or not orth.text:
            continue
        coptic_word = orth.text.strip()

        # Only the first two senses contribute to the stored definition.
        definitions = []
        for sense in entry.findall('.//tei:sense', ns)[:2]:
            def_elem = sense.find('.//tei:def', ns)
            if def_elem is not None and def_elem.text:
                definitions.append(def_elem.text.strip())
        definition = "; ".join(definitions)

        if not (coptic_word and definition):
            continue

        coptic_word = _HEADWORD_NOISE.sub('', coptic_word).strip()
        if coptic_word:
            # Definitions are capped at 200 characters.
            lexicon[coptic_word] = definition[:200]


def _read_delimited_entries(file_path, lexicon):
    """Add word -> definition pairs from a delimited text file to *lexicon* in place."""
    # utf-8-sig reads plain UTF-8 as well, but drops a leading byte-order mark
    # that would otherwise become part of the first key.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # The separator is detected per line, in this priority order.
            separator = next(
                (sep for sep in ('\t', '|', ',', ';') if sep in line),
                None,
            )
            if separator is None:
                continue

            coptic_word, _, definition = line.partition(separator)
            coptic_word = coptic_word.strip()
            # A line that starts with the separator has no headword; skip it
            # rather than storing an empty-string key.
            if coptic_word:
                lexicon[coptic_word] = definition.strip()
|
|
|
|
|
|
|
|
# Language code -> label shown in the language selector. The three 'cop*'
# codes switch on the Coptic analysis tools.
LANGUAGES = {
    'en': 'English', 'es': 'Español', 'fr': 'Français', 'de': 'Deutsch',
    'zh': '中文', 'ja': '日本語', 'ar': 'العربية', 'hi': 'हिन्दी',
    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)', 'cop-sa': 'Sahidic Coptic', 'cop-bo': 'Bohairic Coptic',
}
|
|
|
|
|
# Page-level configuration: browser tab title and wide layout.
st.set_page_config(page_title="Apertus Chat", layout="wide")

# Language selector. The stored value is the language code; the label shown
# to the user comes from LANGUAGES.
selected_lang = st.selectbox(
    "Language / Langue / Idioma",
    options=list(LANGUAGES.keys()),
    format_func=lambda x: LANGUAGES[x],
)
|
|
|
|
|
|
|
|
# Sidebar: lexicon loading, alphabet reference, lexicon search with a virtual
# keyboard, and (for Coptic languages) the analysis-type selector.
with st.sidebar:
    st.header("Coptic Tools")

    lexicon_file = st.file_uploader(
        "Upload Coptic Lexicon",
        type=['txt', 'tsv', 'csv', 'xml'],
        help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV",
    )

    if lexicon_file:
        upload_bytes = lexicon_file.getvalue()
        # Keep the uploaded extension so the loader can recognise XML, and put
        # a content hash in the name: load_coptic_lexicon is cached on its
        # path argument, so a constant file name would keep returning the
        # entries of the first upload.
        upload_ext = os.path.splitext(lexicon_file.name)[1].lower() or ".txt"
        upload_digest = hashlib.sha256(upload_bytes).hexdigest()[:16]
        temp_lexicon_path = f"temp_lexicon_{upload_digest}{upload_ext}"

        with open(temp_lexicon_path, "wb") as f:
            f.write(upload_bytes)
        try:
            coptic_lexicon = load_coptic_lexicon(temp_lexicon_path)
        finally:
            # The parsed result is held by the cache, so the on-disk copy is
            # no longer needed. Cleanup is best-effort.
            try:
                os.remove(temp_lexicon_path)
            except OSError:
                pass
        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
    else:
        # No upload: fall back to the bundled lexicon if it is present.
        coptic_lexicon = {}
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")

    # `with`, not `if`: st.expander returns a container, which is always
    # truthy, so an `if` would render the table outside the expander.
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")

    if coptic_lexicon:
        st.subheader("Lexicon Search")

        st.write("**Virtual Keyboard:**")
        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']

        # Lay the letters out in rows of eight columns.
        keys_per_row = 8
        row_count = -(-len(coptic_letters) // keys_per_row)  # ceiling division
        keyboard_rows = [st.columns(keys_per_row) for _ in range(row_count)]

        # The search box is bound to session state so that key presses append
        # to the current term instead of replacing it with a single letter.
        # Widget state may only be written before the widget is created in a
        # run, which is why the buttons are handled before st.text_input.
        if "lexicon_search" not in st.session_state:
            st.session_state["lexicon_search"] = ""

        for i, letter in enumerate(coptic_letters):
            key_cell = keyboard_rows[i // keys_per_row][i % keys_per_row]
            if key_cell.button(letter, key=f"key_{letter}"):
                st.session_state["lexicon_search"] += letter

        search_term = st.text_input("Search Coptic word:", key="lexicon_search")

        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # No exact hit: show up to five headwords containing the term.
                matches = [k for k in coptic_lexicon if search_term in k]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")

    # analysis_type is only defined when a Coptic language is selected.
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox(
            "Choose analysis:",
            options=list(COPTIC_PROMPTS.keys()),
            format_func=lambda x: x.replace('_', ' ').title(),
        )
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_model_cached(model_path):
    """Load and cache the tokenizer and model for *model_path*.

    Raises whatever ``from_pretrained`` raises. A call that raises stores
    nothing in the cache, so a failed load is retried on the next call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    return tokenizer, model


def load_model():
    """Return ``(tokenizer, model)`` for the Apertus chat model.

    The successful result is cached by ``_load_model_cached``. On failure the
    error is shown with ``st.error`` and ``(None, None)`` is returned. The
    failure itself is not cached; caching it would keep the app without a
    model until the server restarts.
    """
    model_path = "swiss-ai/Apertus-8B-Instruct-2509"
    try:
        return _load_model_cached(model_path)
    except Exception as e:
        # Broad on purpose: download, authentication, and out-of-memory
        # failures all surface here and are reported the same way.
        st.error(f"Failed to load model: {str(e)}")
        return None, None
|
|
|
|
|
# Both values are None when the model could not be loaded.
tokenizer, model = load_model()

# The chat history lives in session state so it survives Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the stored conversation on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
|
|
|
|
|
|
|
|
# Handle one chat turn: build the prompt, echo it, and generate the reply.
if prompt := st.chat_input("Type your message..."):
    coptic_mode = selected_lang in ['cop', 'cop-sa', 'cop-bo']

    # analysis_type is assigned by the sidebar only when a Coptic language is
    # selected, hence the existence check before it is read.
    if coptic_mode and 'analysis_type' in locals():
        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"

        # For lexicon lookups, append the entries of every whitespace-separated
        # word that is an exact key in the loaded lexicon.
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            lexicon_matches = [
                f"{word} = {coptic_lexicon[word]}"
                for word in prompt.split()
                if word in coptic_lexicon
            ]
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt

    st.session_state.messages.append({"role": "user", "content": full_prompt})
    with st.chat_message("user"):
        st.markdown(full_prompt)

    with st.chat_message("assistant"):
        if tokenizer is None or model is None:
            # load_model() returns (None, None) on failure; report that
            # instead of raising AttributeError on None.
            st.error("The model is not loaded, so no response can be generated.")
        else:
            # Only the current turn is sent; earlier messages are not
            # included in the model input.
            messages = [{"role": "user", "content": full_prompt}]
            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # NOTE(review): if the chat template already emits the BOS token,
            # add_special_tokens=False may be needed here - confirm against
            # the model card.
            inputs = tokenizer([text], return_tensors="pt").to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    # Set explicitly so temperature/top_p apply regardless of
                    # the model's default generation config.
                    do_sample=True,
                    temperature=0.8,
                    top_p=0.9,
                )

            # Drop the echoed prompt tokens and decode only the new ones.
            prompt_length = inputs.input_ids.shape[1]
            response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
            st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})
|
|
|