Spaces:

Norelad
/

coptic-translation-interface

Sleeping

App Files Files Community

Norelad committed on Oct 15

Commit

3e1d91b

verified ·

1 Parent(s): 6e7487a

Upload apertus_ui.py with huggingface_hub

Browse files

Files changed (1) hide show

apertus_ui.py +258 -0

apertus_ui.py ADDED Viewed

	@@ -0,0 +1,258 @@

import hashlib
import os
import re
import xml.etree.ElementTree as ET

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Coptic alphabet helper: maps each uppercase Coptic letter to the name shown
# in the sidebar reference list. The first 24 entries are the Greek-derived
# letters; the last seven are the Demotic-derived letters.
COPTIC_ALPHABET = {
    # Zeta is U+2C8C (Zata). U+2C8A (Sou) is the numeral six, not a letter of
    # the alphabet, so it must not be used as the Zeta key.
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲍ': 'Zeta',
    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti'
}
# Coptic linguistic prompts: instruction prefixes keyed by analysis type.
# The chat handler prepends the selected prefix to the user's message.
COPTIC_PROMPTS = dict(
    dialect_analysis="Analyze the Coptic dialect of this text and identify linguistic features:",
    translation="Translate this Coptic text to English, preserving theological and cultural context:",
    transcription="Provide a romanized transcription of this Coptic text:",
    morphology="Analyze the morphological structure of these Coptic words:",
    lexicon_lookup="Look up these Coptic words in the lexicon and provide Greek etymologies:",
)
# Lexicon loader
_TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Checked in this order; the first separator present in a line wins.
_LEXICON_SEPARATORS = ('\t', '|', ',', ';')
# Characters removed from TEI headwords (everything outside these classes).
_HEADWORD_NOISE = re.compile(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]')


def _tei_headword(entry):
    """Return the stripped <orth> text of the entry's headword, or "" if none.

    The lemma form is tried first, then the first form of any type.
    """
    for form_path in ('.//tei:form[@type="lemma"]', './/tei:form'):
        form = entry.find(form_path, _TEI_NAMESPACES)
        # Compare with None explicitly: the truth value of an Element is
        # deprecated and is False for an element without children, so
        # `find(a) or find(b)` can silently discard a matched element.
        if form is None:
            continue
        orth = form.find('.//tei:orth', _TEI_NAMESPACES)
        if orth is not None and orth.text:
            return orth.text.strip()
    return ""


def _tei_definition(entry, max_senses):
    """Join the <def> texts of the entry's first `max_senses` senses with "; "."""
    definitions = []
    for sense in entry.findall('.//tei:sense', _TEI_NAMESPACES)[:max_senses]:
        def_elem = sense.find('.//tei:def', _TEI_NAMESPACES)
        if def_elem is not None and def_elem.text:
            definitions.append(def_elem.text.strip())
    return "; ".join(definitions)


def _read_tei_lexicon(file_path, lexicon, max_entries, max_senses, max_definition_length):
    """Add headword -> definition pairs from a TEI XML file to `lexicon`."""
    root = ET.parse(file_path).getroot()
    for entry in root.findall('.//tei:entry', _TEI_NAMESPACES)[:max_entries]:
        coptic_word = _tei_headword(entry)
        definition = _tei_definition(entry, max_senses)
        if not (coptic_word and definition):
            continue
        # Clean Coptic word (preserve Coptic and Greek Unicode)
        coptic_word = _HEADWORD_NOISE.sub('', coptic_word).strip()
        if coptic_word:
            lexicon[coptic_word] = definition[:max_definition_length]


def _read_delimited_lexicon(file_path, lexicon):
    """Add `word<sep>definition` lines from a text file to `lexicon`."""
    # utf-8-sig reads plain UTF-8 too, but keeps a leading BOM out of the
    # first headword.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            separator = next((sep for sep in _LEXICON_SEPARATORS if sep in line), None)
            if separator is None:
                continue
            coptic_word, definition = line.split(separator, 1)
            lexicon[coptic_word.strip()] = definition.strip()


@st.cache_data
def load_coptic_lexicon(file_path=None, max_entries=100, max_senses=2,
                        max_definition_length=200):
    """Load a Coptic lexicon from a TEI XML file or a delimited text file.

    Files whose name ends in ``.xml`` (case-insensitive) are parsed as TEI XML;
    every other file is read line by line as ``word<sep>definition``, where
    the separator is the first of TAB, ``|``, ``,`` or ``;`` found in the line.

    Args:
        file_path: Path of the lexicon file. A falsy or non-existent path
            yields an empty lexicon.
        max_entries: Number of TEI entries to read (``None`` for all).
        max_senses: Number of senses per TEI entry whose definitions are
            joined (``None`` for all).
        max_definition_length: Maximum stored length of a TEI definition
            (``None`` for no truncation).

    Returns:
        A dict mapping headword to definition. If reading fails, the error is
        shown with ``st.error`` and the entries collected so far are returned.
    """
    if not file_path or not os.path.exists(file_path):
        return {}
    lexicon = {}
    try:
        # Handle XML format (TEI structure for Comprehensive Coptic Lexicon)
        if file_path.lower().endswith('.xml'):
            _read_tei_lexicon(file_path, lexicon, max_entries, max_senses,
                              max_definition_length)
        # Handle text formats
        else:
            _read_delimited_lexicon(file_path, lexicon)
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app.
        st.error(f"Error loading lexicon: {str(e)}")
    return lexicon
# Language detection and UI: language code -> name shown in the selector.
LANGUAGES = {
    'en': 'English',
    'es': 'Español',
    'fr': 'Français',
    'de': 'Deutsch',
    'zh': '中文',
    'ja': '日本語',
    'ar': 'العربية',
    'hi': 'हिन्दी',
    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)',
    'cop-sa': 'Sahidic Coptic',
    'cop-bo': 'Bohairic Coptic',
}
st.set_page_config(page_title="Apertus Chat", layout="wide")

# Language selector: the options are the LANGUAGES codes, each displayed
# through its entry in LANGUAGES.
selected_lang = st.selectbox(
    "Language / Langue / Idioma",
    options=list(LANGUAGES),
    format_func=LANGUAGES.__getitem__,
)
# Sidebar for Coptic tools: lexicon upload/loading, alphabet reference,
# lexicon search with a virtual keyboard, and the analysis-type selector.
with st.sidebar:
    st.header("Coptic Tools")

    # Lexicon file uploader
    lexicon_file = st.file_uploader(
        "Upload Coptic Lexicon",
        type=['txt', 'tsv', 'csv', 'xml'],
        help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV",
    )

    # Load lexicon
    coptic_lexicon = {}
    if lexicon_file:
        upload_bytes = lexicon_file.getvalue()
        # Keep the upload's own extension: load_coptic_lexicon chooses the XML
        # parser from the file name, so a fixed ".txt" name would send XML
        # uploads through the plain-text reader.
        upload_suffix = os.path.splitext(lexicon_file.name)[1].lower() or ".txt"
        # Put a content digest in the name: the loader is cached on its path
        # argument, so a constant path would keep returning the first upload.
        upload_digest = hashlib.sha256(upload_bytes).hexdigest()[:16]
        temp_lexicon_path = f"temp_lexicon_{upload_digest}{upload_suffix}"
        if not os.path.exists(temp_lexicon_path):
            with open(temp_lexicon_path, "wb") as temp_file:
                temp_file.write(upload_bytes)
        coptic_lexicon = load_coptic_lexicon(temp_lexicon_path)
        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
    else:
        # Try to load the comprehensive lexicon if available
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")

    # Coptic alphabet reference. The expander has to be entered with `with`;
    # testing it in an `if` is always true and draws the list outside it.
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")

    # Lexicon search
    if coptic_lexicon:
        st.subheader("Lexicon Search")

        # Virtual Coptic keyboard
        st.write("**Virtual Keyboard:**")
        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']
        keys_per_row = 8

        # The search box is bound to this session-state key so that key
        # presses append to the current text instead of replacing it.
        if "lexicon_search" not in st.session_state:
            st.session_state["lexicon_search"] = ""

        # Create keyboard layout in rows
        for row_start in range(0, len(coptic_letters), keys_per_row):
            row_columns = st.columns(keys_per_row)
            row_letters = coptic_letters[row_start:row_start + keys_per_row]
            for column, letter in zip(row_columns, row_letters):
                if column.button(letter, key=f"key_{letter}"):
                    # Allowed here because the text input below has not been
                    # created yet in this run.
                    st.session_state["lexicon_search"] += letter

        # Search input
        search_term = st.text_input("Search Coptic word:", key="lexicon_search")
        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # Partial matches
                matches = [word for word in coptic_lexicon if search_term in word]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:  # Show first 5 matches
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")

    # Linguistic analysis options (only offered for the Coptic language codes)
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox(
            "Choose analysis:",
            options=list(COPTIC_PROMPTS.keys()),
            format_func=lambda x: x.replace('_', ' ').title(),
        )
# Load model (cached)
@st.cache_resource
def load_model(model_path=None):
    """Load the tokenizer and causal language model once per process.

    Args:
        model_path: Directory or identifier passed to ``from_pretrained``.
            When omitted, the ``APERTUS_MODEL_PATH`` environment variable is
            used, falling back to the original local default, so the app is
            not tied to one machine's directory layout.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is loaded in bfloat16.
    """
    if model_path is None:
        model_path = os.environ.get(
            "APERTUS_MODEL_PATH", "/home/aldn/Téléchargements/Apertus8B"
        )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    return tokenizer, model
tokenizer, model = load_model()

# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# User input
if prompt := st.chat_input("Type your message..."):
    # Add Coptic-specific prompt prefix if applicable. analysis_type only
    # exists when the sidebar rendered its selector, hence the namespace check.
    if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals():
        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"
        # Add lexicon context for lexicon lookup
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            lexicon_matches = [
                f"{word} = {coptic_lexicon[word]}"
                for word in prompt.split()
                if word in coptic_lexicon
            ]
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt

    st.session_state.messages.append({"role": "user", "content": full_prompt})
    with st.chat_message("user"):
        st.markdown(full_prompt)

    # Generate response. Only the current message is sent to the model; the
    # stored history is used for display.
    with st.chat_message("assistant"):
        messages = [{"role": "user", "content": full_prompt}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Keep the inputs on the same device as the model.
        inputs = tokenizer([text], return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            # do_sample=True is what makes temperature/top_p take effect.
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.8,
                top_p=0.9,
            )
        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        st.markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})