import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
import xml.etree.ElementTree as ET
import re

# Coptic alphabet helper
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲋ': 'Zeta',
    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti'
}

# Coptic linguistic prompts
COPTIC_PROMPTS = {
    'dialect_analysis': "Analyze the Coptic dialect of this text and identify linguistic features:",
    'translation': "Translate this Coptic text to English, preserving theological and cultural context:",
    'transcription': "Provide a romanized transcription of this Coptic text:",
    'morphology': "Analyze the morphological structure of these Coptic words:",
    'lexicon_lookup': "Look up these Coptic words in the lexicon and provide Greek etymologies:"
}
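
# Sketch of how one of these prompts is composed downstream (the user input
# here is hypothetical; the chat handler joins prompt prefix and input with
# a single space):
#     COPTIC_PROMPTS['transcription'] + " ⲛⲟⲩⲧⲉ"
#     -> "Provide a romanized transcription of this Coptic text: ⲛⲟⲩⲧⲉ"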

# Lexicon loader
@st.cache_data
def load_coptic_lexicon(file_path=None):
    """Load Coptic lexicon from various formats including TEI XML"""
    if not file_path or not os.path.exists(file_path):
        return {}
    
    lexicon = {}
    
    try:
        # Handle XML format (TEI structure for Comprehensive Coptic Lexicon)
        if file_path.endswith('.xml'):
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            # Handle TEI namespace
            ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
            
            # Find entries in TEI format
            entries = root.findall('.//tei:entry', ns)
            
            for entry in entries[:100]:  # Limit to first 100 entries for performance
                coptic_word = ""
                definition = ""
                
                # Extract Coptic headword from TEI structure
                form = entry.find('.//tei:form[@type="lemma"]', ns) or entry.find('.//tei:form', ns)
                if form is not None:
                    orth = form.find('.//tei:orth', ns)
                    if orth is not None and orth.text:
                        coptic_word = orth.text.strip()
                
                # Extract definition from sense elements
                senses = entry.findall('.//tei:sense', ns)
                definitions = []
                for sense in senses[:2]:  # Limit to first 2 senses
                    def_elem = sense.find('.//tei:def', ns)
                    if def_elem is not None and def_elem.text:
                        definitions.append(def_elem.text.strip())
                
                if definitions:
                    definition = "; ".join(definitions)
                
                # Clean and store
                if coptic_word and definition:
                    # Clean Coptic word (preserve Coptic and Greek Unicode)
                    coptic_word = re.sub(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]', '', coptic_word).strip()
                    if coptic_word:
                        lexicon[coptic_word] = definition[:200]  # Limit definition length
        
        # Handle text formats
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    
                    # Support multiple separators
                    separator = None
                    for sep in ['\t', '|', ',', ';']:
                        if sep in line:
                            separator = sep
                            break
                    
                    if separator:
                        parts = line.split(separator, 1)
                        if len(parts) >= 2:
                            coptic_word = parts[0].strip()
                            definition = parts[1].strip()
                            lexicon[coptic_word] = definition
    
    except Exception as e:
        st.error(f"Error loading lexicon: {str(e)}")
    
    return lexicon
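
# A minimal sketch of the inputs load_coptic_lexicon accepts, with
# hypothetical entries. Plain-text lines may use TAB, |, ',' or ';' as
# the separator:
#
#     ⲛⲟⲩⲧⲉ<TAB>god, God
#     ⲣⲱⲙⲉ|man, human being
#
# The XML branch looks for TEI fragments shaped like:
#
#     <entry>
#       <form type="lemma"><orth>ⲛⲟⲩⲧⲉ</orth></form>
#       <sense><def>god, God</def></sense>
#     </entry>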

# Language detection and UI
LANGUAGES = {
    'en': 'English', 'es': 'Español', 'fr': 'Français', 'de': 'Deutsch',
    'zh': '中文', 'ja': '日本語', 'ar': 'العربية', 'hi': 'हिन्दी',
    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)', 'cop-sa': 'Sahidic Coptic', 'cop-bo': 'Bohairic Coptic'
}

st.set_page_config(page_title="Apertus Chat", layout="wide")

# Language selector
selected_lang = st.selectbox("Language / Langue / Idioma", 
                           options=list(LANGUAGES.keys()),
                           format_func=lambda x: LANGUAGES[x])

# Sidebar for Coptic tools
with st.sidebar:
    st.header("Coptic Tools")
    
    # Lexicon file uploader
    lexicon_file = st.file_uploader("Upload Coptic Lexicon", 
                                   type=['txt', 'tsv', 'csv', 'xml'],
                                   help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV")
    
    # Load lexicon
    if lexicon_file:
        # Save the upload with its original extension so XML lexicons
        # actually reach the TEI parsing branch (a fixed .txt name would not)
        temp_path = f"temp_lexicon{os.path.splitext(lexicon_file.name)[1]}"
        with open(temp_path, "wb") as f:
            f.write(lexicon_file.getbuffer())
        coptic_lexicon = load_coptic_lexicon(temp_path)
        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
    else:
        # Try to load the comprehensive lexicon if available
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")
            else:
                coptic_lexicon = {}
        else:
            coptic_lexicon = {}
    
    # Coptic alphabet reference
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")
    
    # Lexicon search
    if coptic_lexicon:
        st.subheader("Lexicon Search")
        
        # Virtual Coptic keyboard
        st.write("**Virtual Keyboard:**")
        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']
        
        # Lay the 31 letters out over four rows of eight columns; a button
        # click reruns the script with that letter captured for the search box
        rows = [st.columns(8) for _ in range(4)]
        keyboard_input = ""
        for i, letter in enumerate(coptic_letters):
            if rows[i // 8][i % 8].button(letter, key=f"key_{letter}"):
                keyboard_input = letter
        
        # Search input
        search_term = st.text_input("Search Coptic word:", value=keyboard_input)
        
        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # Partial matches
                matches = [k for k in coptic_lexicon.keys() if search_term in k]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:  # Show first 5 matches
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")
    
    # Linguistic analysis options
    analysis_type = None
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox("Choose analysis:",
                                     options=list(COPTIC_PROMPTS.keys()),
                                     format_func=lambda x: x.replace('_', ' ').title())

# Load model (cached)
@st.cache_resource
def load_model():
    model_path = "swiss-ai/Apertus-8B-Instruct-2509"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        # device_map="auto" (requires accelerate) places weights on GPU when available
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto")
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load model: {str(e)}")
        return None, None

tokenizer, model = load_model()
if tokenizer is None or model is None:
    st.stop()  # Halt the script so the code below never touches a None model

# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# User input
if prompt := st.chat_input("Type your message..."):
    # Add Coptic-specific prompt prefix if applicable
    if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type:
        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"
        
        # Add lexicon context for lexicon lookup
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            words_in_prompt = prompt.split()
            lexicon_matches = []
            for word in words_in_prompt:
                if word in coptic_lexicon:
                    lexicon_matches.append(f"{word} = {coptic_lexicon[word]}")
            
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt
    
    st.session_state.messages.append({"role": "user", "content": full_prompt})
    
    with st.chat_message("user"):
        st.markdown(full_prompt)
    
    # Generate response
    with st.chat_message("assistant"):
        messages = [{"role": "user", "content": full_prompt}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([text], return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            # do_sample=True is required; otherwise temperature/top_p are ignored
            outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True,
                                     temperature=0.8, top_p=0.9)
        
        # Decode only the newly generated tokens, skipping the prompt
        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
        st.markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})
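
# A sketch of how to run this app locally, assuming the dependencies used
# above are installed and the file is saved as app.py (the filename is
# hypothetical):
#
#     pip install streamlit transformers torch accelerate
#     streamlit run app.py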