#!/usr/bin/env python3
"""
Textilindo AI Training API
Pure API-based training system for Hugging Face Spaces
Uses the free GPU tier and your training data/configs
"""
import os
import json
import yaml
import logging
import torch
from pathlib import Path
from datetime import datetime
from typing import Dict, Any
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel
import uvicorn

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Training API
training_app = FastAPI(title="Textilindo AI Training API")

# In-memory training status storage
training_status = {
    "is_training": False,
    "progress": 0,
    "status": "idle",
    "current_step": 0,
    "total_steps": 0,
    "loss": 0.0,
    "start_time": None,
    "end_time": None,
    "error": None,
}
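
# Illustrative mid-run snapshot of training_status (values are made up,
# only the keys match the dict above):
#   {"is_training": True, "status": "training", "progress": 40.0,
#    "current_step": 2, "total_steps": 5, "loss": 3.21, ...}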

class TrainingRequest(BaseModel):
    model_name: str = "distilgpt2"  # Start with a small model
    dataset_path: str = "data/lora_dataset_20250829_113330.jsonl"
    config_path: str = "configs/training_config.yaml"
    max_samples: int = 10  # Limit for the free tier
    epochs: int = 1
    batch_size: int = 1
    learning_rate: float = 5e-5


class TrainingResponse(BaseModel):
    success: bool
    message: str
    training_id: str
    status: str

def load_training_config(config_path: str) -> Dict[str, Any]:
    """Load the training configuration from a YAML file."""
    try:
        with open(config_path, 'r') as f:
            return yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Error loading config: {e}")
        return {}

def load_training_data(dataset_path: str, max_samples: int = 10) -> list:
    """Load training data from a JSONL file."""
    data = []
    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_samples:
                    break
                if line.strip():
                    item = json.loads(line)
                    # Build the training text from the instruction/output pair
                    instruction = item.get('instruction', '')
                    output = item.get('output', '')
                    text = f"Question: {instruction} Answer: {output}"
                    data.append({"text": text})
        logger.info(f"Loaded {len(data)} training samples")
        return data
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return []
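
# Expected JSONL record shape (illustrative sample; only the 'instruction'
# and 'output' keys are what load_training_data actually reads):
#   {"instruction": "dimana lokasi textilindo?", "output": "..."}
# Each record becomes one training example of the form
#   "Question: <instruction> Answer: <output>"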

def check_gpu_availability() -> bool:
    """Check whether a CUDA GPU is available."""
    try:
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            logger.info(f"GPU available: {gpu_name} (Count: {gpu_count})")
            return True
        else:
            logger.info("No GPU available, using CPU")
            return False
    except Exception as e:
        logger.error(f"Error checking GPU: {e}")
        return False

def train_model_async(
    model_name: str,
    dataset_path: str,
    config_path: str,
    max_samples: int,
    epochs: int,
    batch_size: int,
    learning_rate: float,
):
    """Background training job; reports progress via the shared status dict."""
    global training_status
    try:
        training_status.update({
            "is_training": True,
            "status": "starting",
            "progress": 0,
            "start_time": datetime.now().isoformat(),
            "error": None,
        })
        logger.info("🚀 Starting training...")
        # Import training libraries lazily so the API can start without them
        from transformers import (
            AutoTokenizer,
            AutoModelForCausalLM,
            TrainingArguments,
            Trainer,
            TrainerCallback,
            DataCollatorForLanguageModeling,
        )
        from datasets import Dataset
        # Check GPU
        gpu_available = check_gpu_availability()

        # Load model and tokenizer
        logger.info(f"📥 Loading model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the model onto the GPU if one is available
        if gpu_available:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto",
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name)
        logger.info("✅ Model loaded successfully")

        # Load training data
        training_data = load_training_data(dataset_path, max_samples)
        if not training_data:
            raise Exception("No training data loaded")

        # Convert to a Hugging Face dataset
        dataset = Dataset.from_list(training_data)
        def tokenize_function(examples):
            # Truncate only; the data collator pads each batch dynamically.
            # Returning tensors from map() is unnecessary, and the raw "text"
            # column must be dropped so the collator only sees token fields.
            return tokenizer(
                examples["text"],
                truncation=True,
                max_length=256,
            )

        tokenized_dataset = dataset.map(
            tokenize_function, batched=True, remove_columns=["text"]
        )
        # Training arguments
        training_args = TrainingArguments(
            output_dir="./models/textilindo-trained",
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            warmup_steps=5,
            save_steps=10,
            logging_steps=1,
            save_total_limit=1,
            prediction_loss_only=True,
            fp16=gpu_available,  # Use fp16 only if a GPU is available
            dataloader_pin_memory=gpu_available,
            report_to="none",  # the string "none" disables logging integrations
        )
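        # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps.
        # With the request defaults above, that is 1 * 2 = 2 examples per optimizer
        # step, so 10 samples for 1 epoch is ceil(10 / 2) = 5 optimizer steps.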
        # Data collator (causal LM: mlm=False, labels are the shifted inputs)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )

        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        # Custom callback for progress tracking (must subclass TrainerCallback
        # so Trainer can dispatch all callback events to it)
        class ProgressCallback(TrainerCallback):
            def on_log(self, args, state, control, logs=None, **kwargs):
                if logs:
                    # state.max_steps is the optimizer-step total the Trainer
                    # computes from dataset size, batch size, and accumulation
                    total_steps = state.max_steps or 1
                    training_status.update({
                        "current_step": state.global_step,
                        "total_steps": total_steps,
                        "progress": min(100, (state.global_step / total_steps) * 100),
                        "loss": logs.get('loss', 0.0),
                        "status": "training",
                    })

        # Add callback
        trainer.add_callback(ProgressCallback())
        # Start training
        training_status["status"] = "training"
        trainer.train()

        # Save model
        model.save_pretrained("./models/textilindo-trained")
        tokenizer.save_pretrained("./models/textilindo-trained")

        # Update status
        training_status.update({
            "is_training": False,
            "status": "completed",
            "progress": 100,
            "end_time": datetime.now().isoformat(),
        })
        logger.info("✅ Training completed successfully!")
    except Exception as e:
        logger.error(f"Training failed: {e}")
        training_status.update({
            "is_training": False,
            "status": "failed",
            "error": str(e),
            "end_time": datetime.now().isoformat(),
        })

# API endpoints (route paths are assumed here; adjust to your deployment)
@training_app.post("/train", response_model=TrainingResponse)
async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
    """Start the training process in the background."""
    if training_status["is_training"]:
        raise HTTPException(status_code=400, detail="Training already in progress")

    # Validate inputs
    if not Path(request.dataset_path).exists():
        raise HTTPException(status_code=404, detail=f"Dataset not found: {request.dataset_path}")
    if not Path(request.config_path).exists():
        raise HTTPException(status_code=404, detail=f"Config not found: {request.config_path}")

    # Start training in the background
    training_id = f"train_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    background_tasks.add_task(
        train_model_async,
        request.model_name,
        request.dataset_path,
        request.config_path,
        request.max_samples,
        request.epochs,
        request.batch_size,
        request.learning_rate,
    )
    return TrainingResponse(
        success=True,
        message="Training started successfully",
        training_id=training_id,
        status="started",
    )

@training_app.get("/status")
async def get_training_status():
    """Get the current training status."""
    return training_status

@training_app.get("/data")
async def get_training_data_info():
    """Get information about the available training data files."""
    data_dir = Path("data")
    if not data_dir.exists():
        return {"files": [], "count": 0}

    jsonl_files = list(data_dir.glob("*.jsonl"))
    files_info = []
    for file in jsonl_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            files_info.append({
                "name": file.name,
                "size": file.stat().st_size,
                "lines": len(lines),
            })
        except Exception as e:
            files_info.append({
                "name": file.name,
                "error": str(e),
            })
    return {
        "files": files_info,
        "count": len(jsonl_files),
    }

@training_app.get("/config")
async def get_training_config():
    """Get the current training configuration."""
    config_path = "configs/training_config.yaml"
    if not Path(config_path).exists():
        return {"error": "Config file not found"}
    try:
        return load_training_config(config_path)
    except Exception as e:
        return {"error": str(e)}

@training_app.get("/models")
async def get_available_models():
    """Get the list of available base models."""
    return {
        "models": [
            {
                "name": "distilgpt2",
                "size": "82M",
                "description": "Small, fast model for quick training",
            },
            {
                "name": "gpt2",
                "size": "124M",
                "description": "Original GPT-2 model",
            },
            {
                "name": "microsoft/DialoGPT-small",
                "size": "117M",
                "description": "Conversational model",
            },
        ]
    }

@training_app.get("/gpu")
async def get_gpu_info():
    """Get GPU information."""
    try:
        gpu_available = torch.cuda.is_available()
        if gpu_available:
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            return {
                "available": True,
                "count": gpu_count,
                "name": gpu_name,
                "memory_gb": round(gpu_memory, 2),
            }
        else:
            return {"available": False}
    except Exception as e:
        return {"error": str(e)}

@training_app.post("/stop")
async def stop_training():
    """Mark the current training run as stopped."""
    global training_status
    if not training_status["is_training"]:
        return {"message": "No training in progress"}

    # Note: this only flips the status flag; a FastAPI background task cannot
    # be forcibly cancelled once trainer.train() is already running.
    training_status.update({
        "is_training": False,
        "status": "stopped",
        "end_time": datetime.now().isoformat(),
    })
    return {"message": "Training stopped"}

@training_app.post("/test")
async def test_trained_model():
    """Run a quick generation test against the trained model."""
    model_path = "./models/textilindo-trained"
    if not Path(model_path).exists():
        return {"error": "No trained model found"}

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)

        # Test prompt
        test_prompt = "Question: dimana lokasi textilindo? Answer:"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=30,  # i.e. prompt length + up to 30 new tokens
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {
            "success": True,
            "test_prompt": test_prompt,
            "response": response,
            "model_path": model_path,
        }
    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    uvicorn.run(training_app, host="0.0.0.0", port=7861)
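
# --- Example client usage (illustrative sketch; assumes the route paths
# registered above and a server on localhost:7861) ---
#   import requests
#   base = "http://localhost:7861"
#   requests.post(f"{base}/train", json={"model_name": "distilgpt2", "max_samples": 10}).json()
#   requests.get(f"{base}/status").json()   # poll training progress
#   requests.post(f"{base}/test").json()    # smoke-test the trained model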