Rogaton committed
Commit 5461265 · Parent: cc8e202

Add automatic model upload to HuggingFace Hub

hf_space_megalaa_training/app.py ADDED
@@ -0,0 +1,470 @@
+ #!/usr/bin/env python3
+ """
+ HuggingFace Space for fine-tuning the megalaa Coptic translation model
+
+ This Gradio app provides a user-friendly interface for training the
+ megalaa/coptic-english-translator model on your CopticScriptorium corpus.
+ """
+
+ import gradio as gr
+ import os
+ import shutil
+ import subprocess
+ import threading
+ import time
+ from pathlib import Path
+
+ # Global variable to track training status
+ training_status = {
+     "running": False,
+     "log": [],
+     "completed": False,
+     "error": None
+ }
+
+
+ def train_model(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
+     """
+     Start model training with uploaded data files
+     """
+     global training_status
+
+     # Reset status
+     training_status = {
+         "running": True,
+         "log": ["🚀 Starting training setup...\n"],
+         "completed": False,
+         "error": None
+     }
+
+     try:
+         # Copy the uploaded files into the working directory
+         # (gr.File yields a local file path by default, so copy the file
+         # rather than writing the path string as bytes)
+         train_path = "train.jsonl"
+         val_path = "val.jsonl"
+
+         shutil.copy(train_file, train_path)
+         shutil.copy(val_file, val_path)
+
+         training_status["log"].append(f"✓ Training data saved: {train_path}\n")
+         training_status["log"].append(f"✓ Validation data saved: {val_path}\n")
+
+         # Create training script
+         script_content = f'''#!/usr/bin/env python3
+ import os
+ import json
+ import torch
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     Seq2SeqTrainingArguments,
+     Seq2SeqTrainer,
+     DataCollatorForSeq2Seq,
+ )
+ from huggingface_hub import HfApi, login
+ from evaluate import load
+ import numpy as np
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # HuggingFace Hub configuration
+ HF_TOKEN = "{hf_token}"
+ MODEL_REPO_NAME = "{model_repo_name}"
+
+ if HF_TOKEN:
+     login(token=HF_TOKEN)
+     logger.info("✓ Logged in to HuggingFace Hub")
+
+ # Greekification for megalaa models
+ COPTIC_TO_GREEK = {{
+     "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
+     "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
+     "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
+     "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
+     "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
+     "ϭ": "c", "ϯ": "t",
+ }}
+
+ def greekify(text):
+     if not text:
+         return ""
+     return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)
+
+ def extract_parallel_texts(examples):
97
+ coptic_texts = []
98
+ english_texts = []
99
+
100
+ for messages in examples['messages']:
101
+ coptic_text = None
102
+ english_text = None
103
+
104
+ for msg in messages:
105
+ if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
106
+ coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
107
+ elif msg['role'] == 'assistant':
108
+ english_text = msg['content']
109
+
110
+ coptic_texts.append(coptic_text)
111
+ english_texts.append(english_text)
112
+
113
+ return {{'coptic': coptic_texts, 'english': english_texts}}
114
+
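+ # Assumed JSONL record shape, inferred from the parsing above:
+ # {{"messages": [{{"role": "user", "content": "...Coptic text to English: <coptic>"}},
+ #                {{"role": "assistant", "content": "<english translation>"}}]}}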
+ def preprocess_function(examples, tokenizer, max_length=256):
+     greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]
+
+     model_inputs = tokenizer(
+         greekified_coptic,
+         max_length=max_length,
+         truncation=True,
+         padding="max_length"
+     )
+
+     labels = tokenizer(
+         # Guard against records with no assistant reply (english may be None)
+         text_target=[text if text else "" for text in examples["english"]],
+         max_length=max_length,
+         truncation=True,
+         padding="max_length"
+     )
+
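+     # Pad positions in the labels are replaced with -100 below so that the
+     # cross-entropy loss (ignore_index=-100) skips padding during training.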
+     labels["input_ids"] = [
+         [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
+         for labels_example in labels["input_ids"]
+     ]
+
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ def compute_metrics(eval_preds, tokenizer, metric):
+     preds, labels = eval_preds
+
+     if isinstance(preds, tuple):
+         preds = preds[0]
+
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+
+     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+     decoded_labels = [[label] for label in decoded_labels]
+
+     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+     return {{"bleu": result["score"]}}
+
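+ # sacreBLEU expects one or more references per prediction, hence the
+ # [[label], ...] nesting above; "score" is corpus-level BLEU on a 0-100 scale.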
+ # Configuration
+ model_name = "megalaa/coptic-english-translator"
+ output_dir = "coptic_megalaa_finetuned"
+ num_epochs = {num_epochs}
+ batch_size = {batch_size}
+ learning_rate = {learning_rate}
+
+ logger.info("="*60)
+ logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
+ logger.info("="*60)
+ logger.info(f"Base model: {{model_name}}")
+ logger.info(f"Epochs: {{num_epochs}}")
+ logger.info(f"Batch size: {{batch_size}}")
+ logger.info(f"Learning rate: {{learning_rate}}")
+
+ # Check GPU
+ if torch.cuda.is_available():
+     logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
+     logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
+ else:
+     logger.warning("No GPU detected!")
+
+ # Load model
+ logger.info("\\nLoading model...")
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ # Load datasets
+ logger.info("Loading datasets...")
+ train_dataset = load_dataset('json', data_files='{train_path}', split='train')
+ val_dataset = load_dataset('json', data_files='{val_path}', split='train')
+
+ logger.info(f"Train samples: {{len(train_dataset):,}}")
+ logger.info(f"Validation samples: {{len(val_dataset):,}}")
+
+ # Extract and tokenize
+ logger.info("Processing datasets...")
+ train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
+ val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
+
+ tokenized_train = train_dataset.map(
+     lambda examples: preprocess_function(examples, tokenizer),
+     batched=True,
+     remove_columns=['coptic', 'english']
+ )
+ tokenized_val = val_dataset.map(
+     lambda examples: preprocess_function(examples, tokenizer),
+     batched=True,
+     remove_columns=['coptic', 'english']
+ )
+
+ # Setup training
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
+ metric = load("sacrebleu")
+
+ training_args = Seq2SeqTrainingArguments(
+     output_dir=output_dir,
+     num_train_epochs=num_epochs,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     gradient_accumulation_steps=2,
+     learning_rate=learning_rate,
+     warmup_steps=500,
+     max_grad_norm=1.0,
+     weight_decay=0.01,
+     eval_strategy="steps",
+     eval_steps=500,
+     logging_steps=50,
+     save_steps=500,
+     save_total_limit=3,
+     load_best_model_at_end=True,
+     metric_for_best_model="bleu",
+     greater_is_better=True,
+     predict_with_generate=True,
+     generation_max_length=256,
+     generation_num_beams=5,
+     fp16=torch.cuda.is_available(),
+     report_to="tensorboard",
+     logging_dir=f"{{output_dir}}/logs",
+     push_to_hub=False,
+ )
+
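+ # Effective train batch size = per_device_train_batch_size * gradient_accumulation_steps
+ # (e.g. 8 * 2 = 16 examples per optimizer step with the UI defaults).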
+ trainer = Seq2SeqTrainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_train,
+     eval_dataset=tokenized_val,
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+     compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric)
+ )
+
+ logger.info("\\nSTARTING TRAINING")
+ logger.info("="*60)
+
+ # Train
+ trainer.train()
+
+ # Save locally
+ logger.info("\\nSaving final model...")
+ trainer.save_model(f"{{output_dir}}/final")
+ tokenizer.save_pretrained(f"{{output_dir}}/final")
+
+ # Push to HuggingFace Hub
+ if HF_TOKEN and MODEL_REPO_NAME:
+     logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
+     try:
+         api = HfApi()
+         api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)
+
+         # Upload all files
+         api.upload_folder(
+             folder_path=f"{{output_dir}}/final",
+             repo_id=MODEL_REPO_NAME,
+             repo_type="model",
+         )
+         logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
+     except Exception as e:
+         logger.error(f"❌ Failed to push to Hub: {{e}}")
+
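+ # create_repo(exist_ok=True) makes repeated runs idempotent, and upload_folder
+ # pushes everything under final/ (weights, config, tokenizer) in one call.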
+ # Final evaluation
+ logger.info("\\nFinal evaluation...")
+ eval_results = trainer.evaluate()
+
+ logger.info("\\n" + "="*60)
+ logger.info("TRAINING COMPLETE!")
+ logger.info("="*60)
+ for key, value in eval_results.items():
+     logger.info(f"{{key}}: {{value}}")
+
+ logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
+ if HF_TOKEN and MODEL_REPO_NAME:
+     logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
+ '''
+
+         with open("train_script.py", "w") as f:
+             f.write(script_content)
+
+         training_status["log"].append("✓ Training script created\n")
+         training_status["log"].append("🚀 Starting training...\n\n")
+
+         # Run training in subprocess
+         process = subprocess.Popen(
+             ["python", "train_script.py"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             bufsize=1
+         )
+
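+         # stderr is merged into stdout and bufsize=1 requests line buffering,
+         # so each log line can be appended to the UI buffer as it is produced.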
+         # Stream output
+         for line in process.stdout:
+             training_status["log"].append(line)
+             time.sleep(0.01)  # Small delay to allow UI updates
+
+         process.wait()
+
+         if process.returncode == 0:
+             training_status["completed"] = True
+             training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
+             training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
+             if hf_token and model_repo_name:
+                 training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{model_repo_name}\n")
+         else:
+             training_status["error"] = f"Training failed with exit code {process.returncode}"
+             training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")
+
+     except Exception as e:
+         training_status["error"] = str(e)
+         training_status["log"].append(f"\n\n❌ Error: {str(e)}\n")
+
+     finally:
+         training_status["running"] = False
+
+
+ def start_training(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
+     """
+     Start training in a background thread
+     """
+     if training_status["running"]:
+         return "⚠️ Training already in progress!"
+
+     if not hf_token or not model_repo_name:
+         return "⚠️ Please provide both a HuggingFace Token and a Model Repository Name!"
+
+     # Start training thread
+     thread = threading.Thread(
+         target=train_model,
+         args=(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name)
+     )
+     thread.daemon = True
+     thread.start()
+
+     return "🚀 Training started! Monitor progress in the logs below."
+
+
+ def get_training_log():
+     """
+     Return the current training log
+     """
+     return "".join(training_status["log"])
+
+
+ def check_status():
+     """
+     Return the training status
+     """
+     if training_status["completed"]:
+         return "✅ Training completed!"
+     elif training_status["error"]:
+         return f"❌ Error: {training_status['error']}"
+     elif training_status["running"]:
+         return "🔄 Training in progress..."
+     else:
+         return "⏸️ Ready to train"
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="Megalaa Coptic Fine-tuning") as demo:
+     gr.Markdown("""
+     # 🏛️ Megalaa Coptic Translation Fine-tuning
+
+     Fine-tune the megalaa/coptic-english-translator model on your CopticScriptorium corpus.
+
+     **⚙️ IMPORTANT:** Make sure this Space is running on a **T4 Small GPU** for reasonable training times!
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 🔑 HuggingFace Hub Configuration")
+             hf_token_input = gr.Textbox(
+                 label="HuggingFace Token",
+                 placeholder="hf_...",
+                 type="password",
+                 info="Get your token from https://huggingface.co/settings/tokens"
+             )
+             model_repo_input = gr.Textbox(
+                 label="Model Repository Name",
+                 placeholder="username/coptic-megalaa-finetuned",
+                 info="Example: john-doe/coptic-megalaa-finetuned"
+             )
+
+             gr.Markdown("### 📤 Upload Training Data")
+             train_file_upload = gr.File(
+                 label="Training Data (train.jsonl)",
+                 file_types=[".jsonl"]
+             )
+             val_file_upload = gr.File(
+                 label="Validation Data (val.jsonl)",
+                 file_types=[".jsonl"]
+             )
+
+             gr.Markdown("### ⚙️ Training Parameters")
+             num_epochs = gr.Slider(
+                 minimum=1,
+                 maximum=10,
+                 value=5,
+                 step=1,
+                 label="Number of Epochs"
+             )
+             batch_size = gr.Slider(
+                 minimum=4,
+                 maximum=16,
+                 value=8,
+                 step=4,
+                 label="Batch Size"
+             )
+             learning_rate = gr.Number(
+                 value=2e-5,
+                 label="Learning Rate"
+             )
+
+             start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
+             status_text = gr.Textbox(label="Status", value="⏸️ Ready to train")
+
+         with gr.Column():
+             gr.Markdown("### 📊 Training Log")
+             # every= is only meaningful with a callable value, so polling is
+             # handled by the demo.load hooks below instead
+             log_output = gr.Textbox(
+                 label="Real-time Training Log",
+                 lines=30,
+                 max_lines=30,
+                 autoscroll=True
+             )
+
+     # Button actions
+     start_btn.click(
+         fn=start_training,
+         inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input],
+         outputs=status_text
+     )
+
+     # Auto-refresh log and status
+     demo.load(fn=get_training_log, outputs=log_output, every=2)
+     demo.load(fn=check_status, outputs=status_text, every=2)
+
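+     # The demo.load(..., every=2) hooks above re-run their callbacks every
+     # 2 seconds while a client is connected, refreshing the log and status boxes.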
+     gr.Markdown("""
+     ---
+     ### 📥 After Training
+
+     When training completes, your fine-tuned model will be automatically pushed to the HuggingFace Hub!
+
+     **Next steps:**
+     1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME`
+     2. Download and test with: `python evaluate_megalaa_model.py`
+     3. Integrate it into your Coptic translation interface
+     4. Share your model with the community!
+
+     **Estimated training time:** 6-8 hours on a T4 GPU
+
+     **Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training,
+     but this local copy will be lost when the Space restarts. Use the HuggingFace Hub version!
+     """)
+
+ if __name__ == "__main__":
+     demo.launch()
hf_space_megalaa_training/requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch>=2.0.0
+ transformers>=4.41.0  # eval_strategy arg in Seq2SeqTrainingArguments requires >= 4.41
+ datasets>=2.14.0
+ accelerate>=0.24.0
+ evaluate>=0.4.1
+ sacrebleu>=2.3.1
+ sentencepiece>=0.1.99
+ protobuf>=3.20.0
+ gradio>=4.44.0
+ tensorboard>=2.15.0
+ huggingface_hub>=0.20.0