Norelad committed on
Commit
3e1d91b
·
verified ·
1 Parent(s): 6e7487a

Upload apertus_ui.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. apertus_ui.py +258 -0
apertus_ui.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import torch
4
+ import os
5
+ import xml.etree.ElementTree as ET
6
+ import re
7
+
8
# Coptic alphabet helper: maps each capital letter to the name shown in the
# sidebar reference. The first 24 entries carry Greek letter names; the last
# seven are the additional Coptic letters.
# NOTE: Zeta is U+2C8C (Ⲍ). U+2C8A (Ⲋ) is the numeral sign "sou", not a letter
# of the alphabet, so it must not be used as the Zeta key.
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲍ': 'Zeta',
    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti',
}
16
+
17
# Instruction prefixes for the Coptic analysis modes. The keys double as the
# option values of the "Analysis Type" selector; the value is prepended to the
# user's message before it is sent to the model.
COPTIC_PROMPTS = {
    "dialect_analysis": (
        "Analyze the Coptic dialect of this text and identify linguistic features:"
    ),
    "translation": (
        "Translate this Coptic text to English, preserving theological and cultural context:"
    ),
    "transcription": (
        "Provide a romanized transcription of this Coptic text:"
    ),
    "morphology": (
        "Analyze the morphological structure of these Coptic words:"
    ),
    "lexicon_lookup": (
        "Look up these Coptic words in the lexicon and provide Greek etymologies:"
    ),
}
25
+
26
# Lexicon loader
_TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Characters removed from headwords: anything outside the listed Unicode
# ranges, word characters, whitespace and hyphens.
_HEADWORD_NOISE = re.compile(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]')
# Tried in this order; the first separator present in a line wins.
_TEXT_SEPARATORS = ('\t', '|', ',', ';')
_MAX_SENSES_PER_ENTRY = 2
_MAX_DEFINITION_LENGTH = 200


def _iter_tei_entries(root, max_entries):
    """Yield (headword, definition) pairs from the TEI <entry> elements under *root*."""
    for entry in root.findall('.//tei:entry', _TEI_NAMESPACES)[:max_entries]:
        # Compare against None explicitly: an Element without children is
        # falsy, so `find(...) or find(...)` can discard a valid match.
        form = entry.find('.//tei:form[@type="lemma"]', _TEI_NAMESPACES)
        if form is None:
            form = entry.find('.//tei:form', _TEI_NAMESPACES)
        if form is None:
            continue

        orth = form.find('.//tei:orth', _TEI_NAMESPACES)
        if orth is None or not orth.text:
            continue

        definitions = []
        for sense in entry.findall('.//tei:sense', _TEI_NAMESPACES)[:_MAX_SENSES_PER_ENTRY]:
            def_elem = sense.find('.//tei:def', _TEI_NAMESPACES)
            if def_elem is None:
                continue
            text = (def_elem.text or '').strip()
            if text:  # skip whitespace-only definitions
                definitions.append(text)
        if not definitions:
            continue

        headword = _HEADWORD_NOISE.sub('', orth.text.strip()).strip()
        if headword:
            yield headword, "; ".join(definitions)[:_MAX_DEFINITION_LENGTH]


def _iter_delimited_entries(lines):
    """Yield (headword, definition) pairs from 'word<sep>definition' text lines."""
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        separator = next((sep for sep in _TEXT_SEPARATORS if sep in line), None)
        if separator is None:
            continue  # no recognised separator on this line
        word, definition = line.split(separator, 1)
        yield word.strip(), definition.strip()


@st.cache_data
def load_coptic_lexicon(file_path=None, max_entries=100):
    """Load a Coptic lexicon from a TEI XML file or a delimited text file.

    Files whose name ends in ``.xml`` (case-insensitive) are parsed as TEI:
    each ``<entry>`` contributes its lemma ``<orth>`` text as the headword and
    the ``<def>`` text of its first two senses, joined by ``"; "`` and cut to
    200 characters. Any other file is read as UTF-8 text with one
    ``word<sep>definition`` pair per line, where ``<sep>`` is the first of
    TAB, ``|``, ``,`` or ``;`` found in the line.

    Args:
        file_path: Path of the lexicon file. ``None``, an empty string or a
            non-existent path yields an empty lexicon.
        max_entries: Maximum number of TEI ``<entry>`` elements to read
            (``None`` reads them all). Ignored for text files.

    Returns:
        A dict mapping headword to definition. If reading fails part-way, the
        error is reported through ``st.error`` and the entries collected so
        far are returned.
    """
    if not file_path or not os.path.exists(file_path):
        return {}

    lexicon = {}
    try:
        if file_path.lower().endswith('.xml'):
            root = ET.parse(file_path).getroot()
            for word, definition in _iter_tei_entries(root, max_entries):
                lexicon[word] = definition
        else:
            # utf-8-sig also reads plain UTF-8 and drops a leading BOM that
            # would otherwise become part of the first headword.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for word, definition in _iter_delimited_entries(f):
                    lexicon[word] = definition
    except (ET.ParseError, OSError, UnicodeDecodeError) as e:
        st.error(f"Error loading lexicon: {e}")

    return lexicon
102
+
103
# Interface languages offered by the selector: language code -> display label.
# The three 'cop*' codes are the ones that enable the Coptic analysis modes.
LANGUAGES = {
    "en": "English",
    "es": "Español",
    "fr": "Français",
    "de": "Deutsch",
    "zh": "中文",
    "ja": "日本語",
    "ar": "العربية",
    "hi": "हिन्दी",
    "cop": "Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)",
    "cop-sa": "Sahidic Coptic",
    "cop-bo": "Bohairic Coptic",
}
109
+
110
st.set_page_config(page_title="Apertus Chat", layout="wide")

# Language selector: the option values are the LANGUAGES codes and each one is
# rendered with its display label.
selected_lang = st.selectbox(
    "Language / Langue / Idioma",
    options=list(LANGUAGES),
    format_func=LANGUAGES.__getitem__,
)
116
+
117
# Sidebar for Coptic tools: lexicon upload, alphabet reference, lexicon search
# with a virtual keyboard, and the analysis-mode selector.
with st.sidebar:
    st.header("Coptic Tools")

    # Lexicon file uploader
    lexicon_file = st.file_uploader(
        "Upload Coptic Lexicon",
        type=['txt', 'tsv', 'csv', 'xml'],
        help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV",
    )

    # Load lexicon
    if lexicon_file:
        import hashlib
        import tempfile

        # load_coptic_lexicon is cached on its path argument and picks the
        # parser from the file extension. Saving every upload under one fixed
        # "*.txt" name would return stale entries for a second upload and
        # would never parse XML, so the name is derived from the content hash
        # and keeps the original extension.
        upload_bytes = lexicon_file.getvalue()
        upload_suffix = os.path.splitext(lexicon_file.name)[1].lower()
        upload_digest = hashlib.sha256(upload_bytes).hexdigest()[:16]
        upload_path = os.path.join(
            tempfile.gettempdir(),
            f"coptic_lexicon_{upload_digest}{upload_suffix}",
        )
        if not os.path.exists(upload_path):
            with open(upload_path, "wb") as f:
                f.write(upload_bytes)
        coptic_lexicon = load_coptic_lexicon(upload_path)
        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
    else:
        # Try to load the comprehensive lexicon if available
        coptic_lexicon = {}
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")

    # Coptic alphabet reference. The expander must be entered with `with`;
    # testing it with `if` is always true and renders the letters outside it.
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")

    # Lexicon search
    if coptic_lexicon:
        st.subheader("Lexicon Search")

        # Virtual Coptic keyboard
        st.write("**Virtual Keyboard:**")
        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']
        keys_per_row = 8

        # The search box is a keyed widget so keyboard clicks can append to
        # its current text. The value is updated here, before the text_input
        # below is instantiated in this run.
        if "lexicon_search" not in st.session_state:
            st.session_state["lexicon_search"] = ""

        for row_start in range(0, len(coptic_letters), keys_per_row):
            row_columns = st.columns(keys_per_row)
            row_letters = coptic_letters[row_start:row_start + keys_per_row]
            for column, letter in zip(row_columns, row_letters):
                if column.button(letter, key=f"key_{letter}"):
                    st.session_state["lexicon_search"] += letter

        # Search input
        search_term = st.text_input("Search Coptic word:", key="lexicon_search")

        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # Partial matches
                matches = [k for k in coptic_lexicon if search_term in k]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:  # Show first 5 matches
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")

    # Linguistic analysis options (only offered for the Coptic language codes)
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox(
            "Choose analysis:",
            options=list(COPTIC_PROMPTS.keys()),
            format_func=lambda x: x.replace('_', ' ').title(),
        )
203
+
204
# Load model (cached)
@st.cache_resource
def load_model():
    """Load the Apertus tokenizer and causal language model.

    The model directory is read from the ``APERTUS_MODEL_PATH`` environment
    variable; when it is unset, the original hard-coded local path is used so
    existing setups keep working. The weights are loaded in bfloat16.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_path = os.environ.get(
        "APERTUS_MODEL_PATH",
        "/home/aldn/Téléchargements/Apertus8B",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    return tokenizer, model
211
+
212
tokenizer, model = load_model()

# Chat interface: the conversation lives in session state so it survives
# Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the stored conversation, one chat bubble per message.
for past_message in st.session_state.messages:
    bubble = st.chat_message(past_message["role"])
    bubble.markdown(past_message["content"])
222
+
223
# User input: build the prompt, show it, generate a reply and store both turns.
if prompt := st.chat_input("Type your message..."):
    # Add Coptic-specific prompt prefix if applicable. `analysis_type` is only
    # assigned when the sidebar rendered the analysis selector, hence the
    # existence check.
    if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals():
        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"

        # Add lexicon context for lexicon lookup
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            lexicon_matches = [
                f"{word} = {coptic_lexicon[word]}"
                for word in prompt.split()
                if word in coptic_lexicon
            ]
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt

    st.session_state.messages.append({"role": "user", "content": full_prompt})

    with st.chat_message("user"):
        st.markdown(full_prompt)

    # Generate response
    with st.chat_message("assistant"):
        # Send the whole stored conversation (it already ends with the new
        # user turn) so the model sees earlier turns, not just the last one.
        text = tokenizer.apply_chat_template(
            st.session_state.messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # Keep the input tensors on the same device as the model weights.
        inputs = tokenizer([text], return_tensors="pt").to(model.device)

        with torch.no_grad():
            # temperature/top_p only take effect when sampling is enabled;
            # without do_sample=True generate() decodes greedily.
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.8,
                top_p=0.9,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        st.markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})