#!/usr/bin/env python3
"""
HuggingFace Space for fine-tuning the megalaa Coptic translation model.

This Gradio app provides a user-friendly interface for training the
megalaa/coptic-english-translator model on your CopticScriptorium corpus.
"""

import gradio as gr
import os
import subprocess
import threading
import time

# Global variable to track training status
training_status = {
    "running": False,
    "log": [],
    "completed": False,
    "error": None,
}


def train_model(train_file, val_file, num_epochs, batch_size, learning_rate,
                hf_token, model_repo_name):
    """Run model training with the uploaded data files."""
    global training_status

    # Reset status
    training_status = {
        "running": True,
        "log": ["🚀 Starting training setup...\n"],
        "completed": False,
        "error": None,
    }

    try:
        # Save uploaded files (the gr.File components use type="binary",
        # so train_file and val_file arrive as raw bytes)
        train_path = "train.jsonl"
        val_path = "val.jsonl"

        with open(train_path, "wb") as f:
            f.write(train_file)
        with open(val_path, "wb") as f:
            f.write(val_file)

        training_status["log"].append(f"✓ Training data saved: {train_path}\n")
        training_status["log"].append(f"✓ Validation data saved: {val_path}\n")

        # Create training script
        script_content = f'''#!/usr/bin/env python3
import os
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from huggingface_hub import HfApi, login
from evaluate import load
import numpy as np
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HuggingFace Hub configuration
HF_TOKEN = "{hf_token}"
MODEL_REPO_NAME = "{model_repo_name}"

if HF_TOKEN:
    login(token=HF_TOKEN)
    logger.info("✓ Logged in to HuggingFace Hub")

# Greekification mapping for megalaa models
COPTIC_TO_GREEK = {{
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ", "ⲍ": "ζ",
    "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ", "ⲙ": "μ", "ⲛ": "ν",
    "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ", "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ",
    "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ", "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k",
    "ϩ": "h", "ϫ": "j", "ϭ": "c", "ϯ": "t",
}}

def greekify(text):
    if not text:
        return ""
    return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)

def extract_parallel_texts(examples):
    coptic_texts = []
    english_texts = []
    for messages in examples['messages']:
        coptic_text = None
        english_text = None
        for msg in messages:
            if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
                coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
            elif msg['role'] == 'assistant':
                english_text = msg['content']
        coptic_texts.append(coptic_text)
        english_texts.append(english_text)
    return {{'coptic': coptic_texts, 'english': english_texts}}
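
# A sketch of the JSONL record format this script expects (an assumption,
# inferred from extract_parallel_texts above): one object per line, with a
# user message containing 'Coptic text to English:' followed by the source
# text, and an assistant message holding the English translation, e.g.
# {{"messages": [
#     {{"role": "user", "content": "... Coptic text to English: <coptic source>"}},
#     {{"role": "assistant", "content": "<English translation>"}}
# ]}}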

def preprocess_function(examples, tokenizer, max_length=256):
    greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]
    model_inputs = tokenizer(
        greekified_coptic,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["english"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    # Replace pad tokens in the labels with -100 so they are ignored by the loss
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def compute_metrics(eval_preds, tokenizer, metric):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [[label] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {{"bleu": result["score"]}}

# Configuration
model_name = "megalaa/coptic-english-translator"
output_dir = "coptic_megalaa_finetuned"
num_epochs = {num_epochs}
batch_size = {batch_size}
learning_rate = {learning_rate}

logger.info("="*60)
logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
logger.info("="*60)
logger.info(f"Base model: {{model_name}}")
logger.info(f"Epochs: {{num_epochs}}")
logger.info(f"Batch size: {{batch_size}}")
logger.info(f"Learning rate: {{learning_rate}}")

# Check GPU
if torch.cuda.is_available():
    logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
    logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
else:
    logger.warning("No GPU detected!")

# Load model
logger.info("\\nLoading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load datasets
logger.info("Loading datasets...")
train_dataset = load_dataset('json', data_files='{train_path}', split='train')
val_dataset = load_dataset('json', data_files='{val_path}', split='train')

logger.info(f"Train samples: {{len(train_dataset):,}}")
logger.info(f"Validation samples: {{len(val_dataset):,}}")

# Extract and tokenize
logger.info("Processing datasets...")
train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])

tokenized_train = train_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english']
)
tokenized_val = val_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english']
)

# Setup training
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
metric = load("sacrebleu")

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    learning_rate=learning_rate,
    warmup_steps=500,
    max_grad_norm=1.0,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=5,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    logging_dir=f"{{output_dir}}/logs",
    push_to_hub=False,
)
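
# With gradient_accumulation_steps=2, the effective batch size per optimizer
# step is per_device_train_batch_size * 2 (e.g. 8 * 2 = 16 on a single GPU).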
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric)
)

logger.info("\\nSTARTING TRAINING")
logger.info("="*60)

# Train
trainer.train()

# Save locally
logger.info("\\nSaving final model...")
trainer.save_model(f"{{output_dir}}/final")
tokenizer.save_pretrained(f"{{output_dir}}/final")

# Push to HuggingFace Hub
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
    try:
        api = HfApi()
        api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)

        # Upload all files
        api.upload_folder(
            folder_path=f"{{output_dir}}/final",
            repo_id=MODEL_REPO_NAME,
            repo_type="model",
        )
        logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
    except Exception as e:
        logger.error(f"❌ Failed to push to Hub: {{e}}")

# Final evaluation
logger.info("\\nFinal evaluation...")
eval_results = trainer.evaluate()

logger.info("\\n" + "="*60)
logger.info("TRAINING COMPLETE!")
logger.info("="*60)
for key, value in eval_results.items():
    logger.info(f"{{key}}: {{value}}")

logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
'''

        with open("train_script.py", "w") as f:
            f.write(script_content)

        training_status["log"].append("✓ Training script created\n")
        training_status["log"].append("🚀 Starting training...\n\n")

        # Run training in subprocess
        process = subprocess.Popen(
            ["python", "train_script.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )

        # Stream output
        for line in process.stdout:
            training_status["log"].append(line)
            time.sleep(0.01)  # Small delay to allow UI updates

        process.wait()

        if process.returncode == 0:
            training_status["completed"] = True
            training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
            training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
            if hf_token and model_repo_name:
                training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{model_repo_name}\n")
        else:
            training_status["error"] = f"Training failed with exit code {process.returncode}"
            training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")

    except Exception as e:
        training_status["error"] = str(e)
        training_status["log"].append(f"\n\n❌ Error: {str(e)}\n")
    finally:
        training_status["running"] = False


def start_training(train_file, val_file, num_epochs, batch_size, learning_rate,
                   hf_token, model_repo_name):
    """Start training in a background thread."""
    if training_status["running"]:
        return "⚠️ Training already in progress!"

    if not hf_token or not model_repo_name:
        return "⚠️ Please provide both HuggingFace Token and Model Repository Name!"

    if train_file is None or val_file is None:
        return "⚠️ Please upload both training and validation data files!"

    # Start training thread
    thread = threading.Thread(
        target=train_model,
        args=(train_file, val_file, num_epochs, batch_size, learning_rate,
              hf_token, model_repo_name)
    )
    thread.daemon = True
    thread.start()

    return "🚀 Training started! Monitor progress in the logs below."


def get_training_log():
    """Return the current training log."""
    return "".join(training_status["log"])


def check_status():
    """Return the current training status."""
    if training_status["completed"]:
        return "✅ Training completed!"
    elif training_status["error"]:
        return f"❌ Error: {training_status['error']}"
    elif training_status["running"]:
        return "🔄 Training in progress..."
    else:
        return "⏸️ Ready to train"
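
# Hypothetical helper (not wired into the UI): a quick sanity check one could
# run on an uploaded JSONL file before starting a long training run. Assumes
# the chat-style record format consumed by extract_parallel_texts in the
# generated training script.
def validate_jsonl(path):
    """Return the record count; raise ValueError on a malformed line."""
    import json
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if not line.strip():
                continue  # skip blank lines
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Line {i} is not valid JSON: {e}")
            if "messages" not in record:
                raise ValueError(f"Line {i} is missing the 'messages' key")
            count += 1
    return count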
""") with gr.Row(): with gr.Column(): gr.Markdown("### 🔑 HuggingFace Hub Configuration") hf_token_input = gr.Textbox( label="HuggingFace Token", placeholder="hf_...", type="password", info="Get your token from https://huggingface.co/settings/tokens" ) model_repo_input = gr.Textbox( label="Model Repository Name", placeholder="username/coptic-megalaa-finetuned", info="Example: john-doe/coptic-megalaa-finetuned" ) gr.Markdown("### 📤 Upload Training Data") train_file_upload = gr.File( label="Training Data (train.jsonl)", file_types=[".jsonl"] ) val_file_upload = gr.File( label="Validation Data (val.jsonl)", file_types=[".jsonl"] ) gr.Markdown("### ⚙️ Training Parameters") num_epochs = gr.Slider( minimum=1, maximum=10, value=5, step=1, label="Number of Epochs" ) batch_size = gr.Slider( minimum=4, maximum=16, value=8, step=4, label="Batch Size" ) learning_rate = gr.Number( value=2e-5, label="Learning Rate" ) start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg") status_text = gr.Textbox(label="Status", value="⏸️ Ready to train") with gr.Column(): gr.Markdown("### 📊 Training Log") log_output = gr.Textbox( label="Real-time Training Log", lines=30, max_lines=30, autoscroll=True, every=2 ) # Button actions start_btn.click( fn=start_training, inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input], outputs=status_text ) # Auto-refresh log and status demo.load(fn=get_training_log, outputs=log_output, every=2) demo.load(fn=check_status, outputs=status_text, every=2) gr.Markdown(""" --- ### 📥 After Training When training completes, your fine-tuned model will be automatically pushed to HuggingFace Hub! **Next steps:** 1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME` 2. Download and test with: `python evaluate_megalaa_model.py` 3. Integrate into your Coptic translation interface 4. Share your model with the community! **Estimated training time:** 6-8 hours on T4 GPU **Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training, but this local copy will be lost when the Space restarts. Use the HuggingFace Hub version! """) if __name__ == "__main__": demo.launch()