Spaces:

harismlnaslm
/

Textilindo-AI

Sleeping

File size: 13,472 Bytes

701eb48

#!/usr/bin/env python3
"""
Textilindo AI Training API
Pure API-based training system for Hugging Face Spaces
Uses free GPU tier and your training data/configs
"""

import os
import json
import yaml
import logging
import torch
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel
import uvicorn

# Setup logging: INFO level on the root logger, plus a module-level logger
# used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Training API: the FastAPI application that all /train/* routes attach to.
training_app = FastAPI(title="Textilindo AI Training API")

# Training status storage: a single process-wide dict that the background
# training function mutates and the /train/status endpoint returns as-is.
# Only one training run is tracked at a time.
training_status = {
    # True while a training run is considered active.
    "is_training": False,
    # Completion percentage in the range 0-100.
    "progress": 0,
    # One of the string states used in this file: "idle", "starting",
    # "training", "completed", "failed", "stopped".
    "status": "idle",
    # Trainer step counters reported by the progress callback.
    "current_step": 0,
    "total_steps": 0,
    # Most recent training loss reported in the trainer logs.
    "loss": 0.0,
    # ISO-8601 timestamps (datetime.now().isoformat()) or None.
    "start_time": None,
    "end_time": None,
    # str() of the exception when a run fails, otherwise None.
    "error": None
}

class TrainingRequest(BaseModel):
    # Request body for POST /train/start. Every field has a default, so an
    # empty JSON object starts a small run.

    # Model identifier handed to AutoTokenizer / AutoModelForCausalLM
    # .from_pretrained(); defaults to a small model.
    model_name: str = "distilgpt2"  # Start with small model
    # JSONL dataset; each record is read for its 'instruction' and 'output' keys.
    dataset_path: str = "data/lora_dataset_20250829_113330.jsonl"
    # YAML config path. The start endpoint checks that it exists; the training
    # function receives it but does not read it.
    config_path: str = "configs/training_config.yaml"
    # Upper bound on how many samples are loaded from the dataset.
    max_samples: int = 10  # Limit for free tier
    # Forwarded to TrainingArguments.num_train_epochs.
    epochs: int = 1
    # Forwarded to TrainingArguments.per_device_train_batch_size.
    batch_size: int = 1
    # Forwarded to TrainingArguments.learning_rate.
    learning_rate: float = 5e-5

class TrainingResponse(BaseModel):
    # Response body returned by POST /train/start.

    # True when the training task was scheduled.
    success: bool
    # Human-readable result message.
    message: str
    # Timestamp-based identifier of the form "train_YYYYMMDD_HHMMSS".
    training_id: str
    # State label reported to the caller ("started" on success).
    status: str

def load_training_config(config_path: str) -> Dict[str, Any]:
    """Load the training configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration. An empty dict is returned when the file
        cannot be read or parsed, or when it contains no YAML document, so
        callers never receive ``None``.
    """
    try:
        # YAML files are UTF-8 by convention; do not depend on the locale's
        # default encoding (the JSONL loader in this file does the same).
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        # Best effort: a missing or broken config is reported, not fatal.
        logger.error(f"Error loading config: {e}")
        return {}

    # safe_load() yields None for an empty file; honour the declared
    # return type instead of leaking None to callers.
    if config is None:
        return {}
    return config

def load_training_data(dataset_path: str, max_samples: int = 10) -> list:
    """Load training samples from a JSONL file.

    Each non-blank line must be a JSON object; its ``instruction`` and
    ``output`` values (empty string when missing) are combined into one
    ``"Question: ... Answer: ..."`` training text.

    Args:
        dataset_path: Path to the UTF-8 encoded JSONL dataset.
        max_samples: Maximum number of samples to return. Blank and
            unusable lines do not count against this limit.

    Returns:
        A list of ``{"text": ...}`` dicts, at most ``max_samples`` long.
        An empty list is returned when the file cannot be read.
    """
    data = []
    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                # Limit on samples actually loaded, not on lines seen, so
                # blank or skipped lines do not shrink the dataset.
                if len(data) >= max_samples:
                    break
                if not line.strip():
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError as e:
                    # One bad record should not discard the whole dataset.
                    logger.warning(f"Skipping malformed JSON on line {line_number}: {e}")
                    continue
                if not isinstance(item, dict):
                    logger.warning(f"Skipping non-object record on line {line_number}")
                    continue
                # Create training text
                instruction = item.get('instruction', '')
                output = item.get('output', '')
                text = f"Question: {instruction} Answer: {output}"
                data.append({"text": text})
    except (OSError, UnicodeDecodeError) as e:
        logger.error(f"Error loading data: {e}")
        return []

    logger.info(f"Loaded {len(data)} training samples")
    return data

def check_gpu_availability() -> bool:
    """Report whether a CUDA device is usable, logging what was detected.

    Returns:
        True when torch reports CUDA as available, False otherwise or when
        the probe itself raises.
    """
    try:
        # Guard clause: the CPU-only case exits first.
        if not torch.cuda.is_available():
            logger.info("No GPU available, using CPU")
            return False

        device_total = torch.cuda.device_count()
        device_label = torch.cuda.get_device_name(0)
        logger.info(f"GPU available: {device_label} (Count: {device_total})")
        return True
    except Exception as exc:
        logger.error(f"Error checking GPU: {exc}")
        return False

def train_model_async(
    model_name: str,
    dataset_path: str,
    config_path: str,
    max_samples: int,
    epochs: int,
    batch_size: int,
    learning_rate: float
):
    """Fine-tune a causal language model and publish progress in ``training_status``.

    Despite the name this is a plain synchronous function; it is intended to
    be scheduled as a background task so the HTTP request returns at once.
    It never raises: any failure is logged and recorded in
    ``training_status`` ("failed" status plus the error text).

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        dataset_path: JSONL dataset read through ``load_training_data``.
        config_path: Accepted for interface compatibility; not read here.
        max_samples: Maximum number of samples to train on.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        learning_rate: Optimizer learning rate.

    Side effects:
        Updates ``training_status`` throughout the run and, on success,
        writes the model and tokenizer to ``./models/textilindo-trained``.
        When ``is_training`` is cleared while the run is active, training
        halts after the current step and nothing further is saved.
    """
    output_dir = "./models/textilindo-trained"

    try:
        # Reset every field so values from a previous run are not shown.
        training_status.update({
            "is_training": True,
            "status": "starting",
            "progress": 0,
            "current_step": 0,
            "total_steps": 0,
            "loss": 0.0,
            "start_time": datetime.now().isoformat(),
            "end_time": None,
            "error": None
        })

        logger.info("🚀 Starting training...")

        # Import training libraries lazily so the API can start without
        # paying their import cost.
        from transformers import (
            AutoTokenizer,
            AutoModelForCausalLM,
            TrainingArguments,
            Trainer,
            TrainerCallback,
            DataCollatorForLanguageModeling
        )
        from datasets import Dataset

        # Load training data first: it is cheap and fails fast, before any
        # model weights are fetched.
        training_data = load_training_data(dataset_path, max_samples)
        if not training_data:
            raise Exception("No training data loaded")

        # Check GPU
        gpu_available = check_gpu_availability()

        # Load model and tokenizer
        logger.info(f"📥 Loading model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # GPT-2 style tokenizers ship without a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        # Keep the weights in full precision. With fp16=True the Trainer runs
        # mixed precision with a gradient scaler, which cannot unscale the
        # gradients of parameters that are themselves fp16. The Trainer also
        # moves the model to the training device on its own.
        model = AutoModelForCausalLM.from_pretrained(model_name)

        logger.info("✅ Model loaded successfully")

        # Convert to dataset
        dataset = Dataset.from_list(training_data)

        def tokenize_function(examples):
            # No padding / tensor conversion here: the data collator pads
            # each batch dynamically and builds the tensors.
            return tokenizer(
                examples["text"],
                truncation=True,
                max_length=256
            )

        # Drop the raw text column; with remove_unused_columns=False it would
        # otherwise reach the collator, which cannot turn strings into tensors.
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )

        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            warmup_steps=5,
            save_steps=10,
            logging_steps=1,
            save_total_limit=1,
            prediction_loss_only=True,
            remove_unused_columns=False,
            fp16=gpu_available,  # Use fp16 only if GPU available
            dataloader_pin_memory=gpu_available,
            report_to="none",  # the string "none" disables all reporting integrations
        )

        # Data collator for causal language modeling (labels = inputs)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )

        # Create trainer
        # NOTE(review): newer transformers releases rename ``tokenizer`` to
        # ``processing_class`` - confirm against the pinned version.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )

        class ProgressCallback(TrainerCallback):
            """Mirror trainer progress into ``training_status`` and honour stop requests.

            Subclassing TrainerCallback supplies no-op handlers for every
            event the Trainer dispatches besides the two overridden here.
            """

            def on_log(self, args, state, control, logs=None, **kwargs):
                if not logs:
                    return
                # max_steps is the trainer's own count of optimizer steps, so
                # it already reflects batch size and gradient accumulation.
                total_steps = state.max_steps
                percent = (state.global_step / total_steps) * 100 if total_steps > 0 else 0
                update = {
                    "current_step": state.global_step,
                    "total_steps": total_steps,
                    "progress": min(100, percent),
                    # The final summary log has no 'loss' key; keep the last
                    # reported value instead of resetting it to zero.
                    "loss": logs.get('loss', training_status["loss"]),
                }
                # Do not overwrite a "stopped" status set by a stop request.
                if training_status["is_training"]:
                    update["status"] = "training"
                training_status.update(update)

            def on_step_end(self, args, state, control, **kwargs):
                # is_training is cleared externally to request a stop.
                if not training_status["is_training"]:
                    control.should_training_stop = True
                return control

        # Add callback
        trainer.add_callback(ProgressCallback())

        # Start training
        training_status["status"] = "training"
        trainer.train()

        if not training_status["is_training"]:
            # A stop was requested mid-run: leave the status written by the
            # requester in place and do not publish a half-trained model.
            logger.info("Training stopped before completion")
            return

        # Save model
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        # Update status
        training_status.update({
            "is_training": False,
            "status": "completed",
            "progress": 100,
            "end_time": datetime.now().isoformat()
        })

        logger.info("✅ Training completed successfully!")

    except Exception as e:
        # Top-level boundary of the background task: record the failure for
        # /train/status and keep the traceback in the logs.
        logger.exception(f"Training failed: {e}")
        training_status.update({
            "is_training": False,
            "status": "failed",
            "error": str(e),
            "end_time": datetime.now().isoformat()
        })

# API Endpoints

@training_app.post("/train/start", response_model=TrainingResponse)
async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
    """Validate the request and schedule a training run in the background.

    Args:
        request: Training parameters (model, dataset, hyper-parameters).
        background_tasks: FastAPI task queue the training function is added to.

    Returns:
        A TrainingResponse carrying a timestamp-based training id.

    Raises:
        HTTPException: 400 when a run is already active, 404 when the
            dataset or config path does not exist.
    """
    if training_status["is_training"]:
        raise HTTPException(status_code=400, detail="Training already in progress")

    # Validate inputs
    if not Path(request.dataset_path).exists():
        raise HTTPException(status_code=404, detail=f"Dataset not found: {request.dataset_path}")

    if not Path(request.config_path).exists():
        raise HTTPException(status_code=404, detail=f"Config not found: {request.config_path}")

    training_id = f"train_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    # Claim the training slot now rather than inside the background task.
    # There is no await between the check above and this update, so a second
    # request cannot slip through before the task starts. The stale values of
    # the previous run are cleared at the same time.
    training_status.update({
        "is_training": True,
        "status": "starting",
        "progress": 0,
        "current_step": 0,
        "total_steps": 0,
        "loss": 0.0,
        "start_time": None,
        "end_time": None,
        "error": None
    })

    # Start training in background
    background_tasks.add_task(
        train_model_async,
        request.model_name,
        request.dataset_path,
        request.config_path,
        request.max_samples,
        request.epochs,
        request.batch_size,
        request.learning_rate
    )

    return TrainingResponse(
        success=True,
        message="Training started successfully",
        training_id=training_id,
        status="started"
    )

@training_app.get("/train/status")
async def get_training_status():
    """Return the shared training status dictionary to the caller."""
    current_state = training_status
    return current_state

@training_app.get("/train/data")
async def get_training_data_info():
    """Describe the JSONL datasets found in the ``data`` directory.

    Returns:
        A dict with ``files`` (one entry per ``*.jsonl`` file, sorted by
        path, holding its name, size in bytes and line count, or an
        ``error`` string when the file could not be read) and ``count``
        (number of files found). Both are empty/zero when ``data`` is not
        a directory.
    """
    data_dir = Path("data")
    if not data_dir.is_dir():
        return {"files": [], "count": 0}

    # Sorted so the listing is stable between calls.
    jsonl_files = sorted(data_dir.glob("*.jsonl"))
    files_info = []

    for file in jsonl_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                # Stream the file: counting lines must not load an entire
                # dataset into memory.
                line_count = sum(1 for _ in f)
            files_info.append({
                "name": file.name,
                "size": file.stat().st_size,
                "lines": line_count
            })
        except (OSError, UnicodeDecodeError) as e:
            # Report unreadable files individually instead of failing the
            # whole listing.
            files_info.append({
                "name": file.name,
                "error": str(e)
            })

    return {
        "files": files_info,
        "count": len(jsonl_files)
    }

@training_app.get("/train/config")
async def get_training_config():
    """Return the contents of the default training config file.

    Responds with an ``error`` entry when the file is absent or loading
    raises.
    """
    config_path = "configs/training_config.yaml"

    if Path(config_path).exists():
        try:
            return load_training_config(config_path)
        except Exception as exc:
            return {"error": str(exc)}

    return {"error": "Config file not found"}

@training_app.get("/train/models")
async def get_available_models():
    """List the base models this API suggests for training."""
    # (name, size, description) rows for the static catalogue.
    catalog = (
        ("distilgpt2", "82M", "Small, fast model for quick training"),
        ("gpt2", "124M", "Original GPT-2 model"),
        ("microsoft/DialoGPT-small", "117M", "Conversational model"),
    )
    return {
        "models": [
            {"name": name, "size": size, "description": description}
            for name, size, description in catalog
        ]
    }

@training_app.get("/train/gpu")
async def get_gpu_info():
    """Describe the first CUDA device, if torch reports one.

    Returns ``{"available": False}`` without a GPU, device details with
    one, and an ``error`` entry when the query raises.
    """
    try:
        if not torch.cuda.is_available():
            return {"available": False}

        device_total = torch.cuda.device_count()
        device_label = torch.cuda.get_device_name(0)
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        # Convert bytes to GiB for display.
        memory_gib = total_bytes / (1024**3)
        return {
            "available": True,
            "count": device_total,
            "name": device_label,
            "memory_gb": round(memory_gib, 2)
        }
    except Exception as exc:
        return {"error": str(exc)}

@training_app.post("/train/stop")
async def stop_training():
    """Flag the current training run as stopped.

    This handler only updates the shared status dictionary; it does not
    itself interrupt any running work.
    """
    if training_status["is_training"]:
        training_status.update(
            is_training=False,
            status="stopped",
            end_time=datetime.now().isoformat(),
        )
        return {"message": "Training stopped"}

    return {"message": "No training in progress"}

@training_app.get("/train/test")
async def test_trained_model():
    """Generate a sample answer with the saved model as a smoke test.

    Returns an ``error`` entry when no model directory exists or when
    loading/generation raises; otherwise the prompt, the decoded output
    and the model path.
    """
    model_path = "./models/textilindo-trained"
    if not Path(model_path).exists():
        return {"error": "No trained model found"}

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        tok = AutoTokenizer.from_pretrained(model_path)
        lm = AutoModelForCausalLM.from_pretrained(model_path)

        # Fixed prompt in the same "Question: ... Answer:" shape used for training.
        test_prompt = "Question: dimana lokasi textilindo? Answer:"
        encoded = tok(test_prompt, return_tensors="pt")

        # Allow up to 30 tokens beyond the prompt.
        prompt_length = encoded.input_ids.shape[1]
        generation_options = {
            "max_length": prompt_length + 30,
            "temperature": 0.7,
            "do_sample": True,
            "pad_token_id": tok.eos_token_id,
        }

        with torch.no_grad():
            generated = lm.generate(**encoded, **generation_options)

        response = tok.decode(generated[0], skip_special_tokens=True)
    except Exception as exc:
        return {"error": str(exc)}

    return {
        "success": True,
        "test_prompt": test_prompt,
        "response": response,
        "model_path": model_path
    }

# Script entry point: serve the training API with uvicorn on all network
# interfaces, port 7861.
if __name__ == "__main__":
    uvicorn.run(training_app, host="0.0.0.0", port=7861)