Rogaton committed
Commit 5461265 · Parent: cc8e202

Add automatic model upload to HuggingFace Hub

hf_space_megalaa_training/app.py ADDED
@@ -0,0 +1,470 @@
+ #!/usr/bin/env python3
+ """
+ HuggingFace Space for fine-tuning the megalaa Coptic translation model
+
+ This Gradio app provides a user-friendly interface for training the
+ megalaa/coptic-english-translator model on your CopticScriptorium corpus.
+ """
+
+ import gradio as gr
+ import os
+ import shutil
+ import subprocess
+ import threading
+ import time
+ from pathlib import Path
+
+ # Global variable to track training status
+ training_status = {
+     "running": False,
+     "log": [],
+     "completed": False,
+     "error": None
+ }
+
+
+ def train_model(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
+     """
+     Start model training with uploaded data files
+     """
+     global training_status
+
+     # Reset status
+     training_status = {
+         "running": True,
+         "log": ["🚀 Starting training setup...\n"],
+         "completed": False,
+         "error": None
+     }
+
+     try:
+         # Copy the uploaded files into the working directory
+         # (gr.File yields a local file path by default, so copy the file
+         # rather than writing the path string as bytes)
+         train_path = "train.jsonl"
+         val_path = "val.jsonl"
+
+         shutil.copy(train_file, train_path)
+         shutil.copy(val_file, val_path)
+
+         training_status["log"].append(f"✓ Training data saved: {train_path}\n")
+         training_status["log"].append(f"✓ Validation data saved: {val_path}\n")
+
+         # Create training script
+         script_content = f'''#!/usr/bin/env python3
+ import os
+ import json
+ import torch
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     Seq2SeqTrainingArguments,
+     Seq2SeqTrainer,
+     DataCollatorForSeq2Seq,
+ )
+ from huggingface_hub import HfApi, login
+ from evaluate import load
+ import numpy as np
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # HuggingFace Hub configuration
+ HF_TOKEN = "{hf_token}"
+ MODEL_REPO_NAME = "{model_repo_name}"
+
+ if HF_TOKEN:
+     login(token=HF_TOKEN)
+     logger.info("✓ Logged in to HuggingFace Hub")
+
+ # Greekification for megalaa models
+ COPTIC_TO_GREEK = {{
+     "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
+     "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
+     "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
+     "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
+     "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
+     "ϭ": "c", "ϯ": "t",
+ }}
+
+ def greekify(text):
+     if not text:
+         return ""
+     return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)
+
+ def extract_parallel_texts(examples):
97
+ coptic_texts = []
98
+ english_texts = []
99
+
100
+ for messages in examples['messages']:
101
+ coptic_text = None
102
+ english_text = None
103
+
104
+ for msg in messages:
105
+ if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
106
+ coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
107
+ elif msg['role'] == 'assistant':
108
+ english_text = msg['content']
109
+
110
+ coptic_texts.append(coptic_text)
111
+ english_texts.append(english_text)
112
+
113
+ return {{'coptic': coptic_texts, 'english': english_texts}}
114
+
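+ # Assumed JSONL record shape, inferred from the parsing above:
+ # {{"messages": [{{"role": "user", "content": "...Coptic text to English: <coptic>"}},
+ #                {{"role": "assistant", "content": "<english translation>"}}]}}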
+ def preprocess_function(examples, tokenizer, max_length=256):
+     greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]
+
+     model_inputs = tokenizer(
+         greekified_coptic,
+         max_length=max_length,
+         truncation=True,
+         padding="max_length"
+     )
+
+     labels = tokenizer(
+         # Guard against records with no assistant reply (english may be None)
+         text_target=[text if text else "" for text in examples["english"]],
+         max_length=max_length,
+         truncation=True,
+         padding="max_length"
+     )
+
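+     # Pad positions in the labels are replaced with -100 below so that the
+     # cross-entropy loss (ignore_index=-100) skips padding during training.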
+     labels["input_ids"] = [
+         [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
+         for labels_example in labels["input_ids"]
+     ]
+
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ def compute_metrics(eval_preds, tokenizer, metric):
+     preds, labels = eval_preds
+
+     if isinstance(preds, tuple):
+         preds = preds[0]
+
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+
+     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+     decoded_labels = [[label] for label in decoded_labels]
+
+     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+     return {{"bleu": result["score"]}}
+
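+ # sacreBLEU expects one or more references per prediction, hence the
+ # [[label], ...] nesting above; "score" is corpus-level BLEU on a 0-100 scale.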
+ # Configuration
+ model_name = "megalaa/coptic-english-translator"
+ output_dir = "coptic_megalaa_finetuned"
+ num_epochs = {num_epochs}
+ batch_size = {batch_size}
+ learning_rate = {learning_rate}
+
+ logger.info("="*60)
+ logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
+ logger.info("="*60)
+ logger.info(f"Base model: {{model_name}}")
+ logger.info(f"Epochs: {{num_epochs}}")
+ logger.info(f"Batch size: {{batch_size}}")
+ logger.info(f"Learning rate: {{learning_rate}}")
+
+ # Check GPU
+ if torch.cuda.is_available():
+     logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
+     logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
+ else:
+     logger.warning("No GPU detected!")
+
+ # Load model
+ logger.info("\\nLoading model...")
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ # Load datasets
+ logger.info("Loading datasets...")
+ train_dataset = load_dataset('json', data_files='{train_path}', split='train')
+ val_dataset = load_dataset('json', data_files='{val_path}', split='train')
+
+ logger.info(f"Train samples: {{len(train_dataset):,}}")
+ logger.info(f"Validation samples: {{len(val_dataset):,}}")
+
+ # Extract and tokenize
+ logger.info("Processing datasets...")
+ train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
+ val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
+
+ tokenized_train = train_dataset.map(
+     lambda examples: preprocess_function(examples, tokenizer),
+     batched=True,
+     remove_columns=['coptic', 'english']
+ )
+ tokenized_val = val_dataset.map(
+     lambda examples: preprocess_function(examples, tokenizer),
+     batched=True,
+     remove_columns=['coptic', 'english']
+ )
+
+ # Setup training
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
+ metric = load("sacrebleu")
+
+ training_args = Seq2SeqTrainingArguments(
+     output_dir=output_dir,
+     num_train_epochs=num_epochs,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     gradient_accumulation_steps=2,
+     learning_rate=learning_rate,
+     warmup_steps=500,
+     max_grad_norm=1.0,
+     weight_decay=0.01,
+     eval_strategy="steps",
+     eval_steps=500,
+     logging_steps=50,
+     save_steps=500,
+     save_total_limit=3,
+     load_best_model_at_end=True,
+     metric_for_best_model="bleu",
+     greater_is_better=True,
+     predict_with_generate=True,
+     generation_max_length=256,
+     generation_num_beams=5,
+     fp16=torch.cuda.is_available(),
+     report_to="tensorboard",
+     logging_dir=f"{{output_dir}}/logs",
+     push_to_hub=False,
+ )
+
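+ # Effective train batch size = per_device_train_batch_size * gradient_accumulation_steps
+ # (e.g. 8 * 2 = 16 examples per optimizer step with the UI defaults).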
+ trainer = Seq2SeqTrainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_train,
+     eval_dataset=tokenized_val,
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+     compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric)
+ )
+
+ logger.info("\\nSTARTING TRAINING")
+ logger.info("="*60)
+
+ # Train
+ trainer.train()
+
+ # Save locally
+ logger.info("\\nSaving final model...")
+ trainer.save_model(f"{{output_dir}}/final")
+ tokenizer.save_pretrained(f"{{output_dir}}/final")
+
+ # Push to HuggingFace Hub
+ if HF_TOKEN and MODEL_REPO_NAME:
+     logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
+     try:
+         api = HfApi()
+         api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)
+
+         # Upload all files
+         api.upload_folder(
+             folder_path=f"{{output_dir}}/final",
+             repo_id=MODEL_REPO_NAME,
+             repo_type="model",
+         )
+         logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
+     except Exception as e:
+         logger.error(f"❌ Failed to push to Hub: {{e}}")
+
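+ # create_repo(exist_ok=True) makes repeated runs idempotent, and upload_folder
+ # pushes everything under final/ (weights, config, tokenizer) in one call.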
+ # Final evaluation
+ logger.info("\\nFinal evaluation...")
+ eval_results = trainer.evaluate()
+
+ logger.info("\\n" + "="*60)
+ logger.info("TRAINING COMPLETE!")
+ logger.info("="*60)
+ for key, value in eval_results.items():
+     logger.info(f"{{key}}: {{value}}")
+
+ logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
+ if HF_TOKEN and MODEL_REPO_NAME:
+     logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
+ '''
+
+         with open("train_script.py", "w") as f:
+             f.write(script_content)
+
+         training_status["log"].append("✓ Training script created\n")
+         training_status["log"].append("🚀 Starting training...\n\n")
+
+         # Run training in subprocess
+         process = subprocess.Popen(
+             ["python", "train_script.py"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             bufsize=1
+         )
+
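+         # stderr is merged into stdout and bufsize=1 requests line buffering,
+         # so each log line can be appended to the UI buffer as it is produced.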
+         # Stream output
+         for line in process.stdout:
+             training_status["log"].append(line)
+             time.sleep(0.01)  # Small delay to allow UI updates
+
+         process.wait()
+
+         if process.returncode == 0:
+             training_status["completed"] = True
+             training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
+             training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
+             if hf_token and model_repo_name:
+                 training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{model_repo_name}\n")
+         else:
+             training_status["error"] = f"Training failed with exit code {process.returncode}"
+             training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")
+
+     except Exception as e:
+         training_status["error"] = str(e)
+         training_status["log"].append(f"\n\n❌ Error: {str(e)}\n")
+
+     finally:
+         training_status["running"] = False
+
+
+ def start_training(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
+     """
+     Start training in a background thread
+     """
+     if training_status["running"]:
+         return "⚠️ Training already in progress!"
+
+     if not hf_token or not model_repo_name:
+         return "⚠️ Please provide both a HuggingFace Token and a Model Repository Name!"
+
+     # Start training thread
+     thread = threading.Thread(
+         target=train_model,
+         args=(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name)
+     )
+     thread.daemon = True
+     thread.start()
+
+     return "🚀 Training started! Monitor progress in the logs below."
+
+
+ def get_training_log():
+     """
+     Return the current training log
+     """
+     return "".join(training_status["log"])
+
+
+ def check_status():
+     """
+     Return the training status
+     """
+     if training_status["completed"]:
+         return "✅ Training completed!"
+     elif training_status["error"]:
+         return f"❌ Error: {training_status['error']}"
+     elif training_status["running"]:
+         return "🔄 Training in progress..."
+     else:
+         return "⏸️ Ready to train"
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="Megalaa Coptic Fine-tuning") as demo:
+     gr.Markdown("""
+     # 🏛️ Megalaa Coptic Translation Fine-tuning
+
+     Fine-tune the megalaa/coptic-english-translator model on your CopticScriptorium corpus.
+
+     **⚙️ IMPORTANT:** Make sure this Space is running on a **T4 Small GPU** for reasonable training times!
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 🔑 HuggingFace Hub Configuration")
+             hf_token_input = gr.Textbox(
+                 label="HuggingFace Token",
+                 placeholder="hf_...",
+                 type="password",
+                 info="Get your token from https://huggingface.co/settings/tokens"
+             )
+             model_repo_input = gr.Textbox(
+                 label="Model Repository Name",
+                 placeholder="username/coptic-megalaa-finetuned",
+                 info="Example: john-doe/coptic-megalaa-finetuned"
+             )
+
+             gr.Markdown("### 📤 Upload Training Data")
+             train_file_upload = gr.File(
+                 label="Training Data (train.jsonl)",
+                 file_types=[".jsonl"]
+             )
+             val_file_upload = gr.File(
+                 label="Validation Data (val.jsonl)",
+                 file_types=[".jsonl"]
+             )
+
+             gr.Markdown("### ⚙️ Training Parameters")
+             num_epochs = gr.Slider(
+                 minimum=1,
+                 maximum=10,
+                 value=5,
+                 step=1,
+                 label="Number of Epochs"
+             )
+             batch_size = gr.Slider(
+                 minimum=4,
+                 maximum=16,
+                 value=8,
+                 step=4,
+                 label="Batch Size"
+             )
+             learning_rate = gr.Number(
+                 value=2e-5,
+                 label="Learning Rate"
+             )
+
+             start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
+             status_text = gr.Textbox(label="Status", value="⏸️ Ready to train")
+
+         with gr.Column():
+             gr.Markdown("### 📊 Training Log")
+             # every= is only meaningful with a callable value, so polling is
+             # handled by the demo.load hooks below instead
+             log_output = gr.Textbox(
+                 label="Real-time Training Log",
+                 lines=30,
+                 max_lines=30,
+                 autoscroll=True
+             )
+
+     # Button actions
+     start_btn.click(
+         fn=start_training,
+         inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input],
+         outputs=status_text
+     )
+
+     # Auto-refresh log and status
+     demo.load(fn=get_training_log, outputs=log_output, every=2)
+     demo.load(fn=check_status, outputs=status_text, every=2)
+
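+     # The demo.load(..., every=2) hooks above re-run their callbacks every
+     # 2 seconds while a client is connected, refreshing the log and status boxes.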
+     gr.Markdown("""
+     ---
+     ### 📥 After Training
+
+     When training completes, your fine-tuned model will be automatically pushed to the HuggingFace Hub!
+
+     **Next steps:**
+     1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME`
+     2. Download and test with: `python evaluate_megalaa_model.py`
+     3. Integrate it into your Coptic translation interface
+     4. Share your model with the community!
+
+     **Estimated training time:** 6-8 hours on a T4 GPU
+
+     **Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training,
+     but this local copy will be lost when the Space restarts. Use the HuggingFace Hub version!
+     """)
+
+ if __name__ == "__main__":
+     demo.launch()
hf_space_megalaa_training/requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch>=2.0.0
+ transformers>=4.41.0  # eval_strategy arg in Seq2SeqTrainingArguments requires >= 4.41
+ datasets>=2.14.0
+ accelerate>=0.24.0
+ evaluate>=0.4.1
+ sacrebleu>=2.3.1
+ sentencepiece>=0.1.99
+ protobuf>=3.20.0
+ gradio>=4.44.0
+ tensorboard>=2.15.0
+ huggingface_hub>=0.20.0