#!/usr/bin/env python3
"""
Textilindo AI Training API - Pure API Version
No HTML interfaces, only API endpoints for training and chat
"""
import os
import json
import logging
import torch
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel
import uvicorn
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI(
    title="Textilindo AI Training API",
    description="Pure API-based training system for Textilindo AI Assistant",
    version="1.0.0"
)

# Training status storage
training_status = {
    "is_training": False,
    "progress": 0,
    "status": "idle",
    "current_step": 0,
    "total_steps": 0,
    "loss": 0.0,
    "start_time": None,
    "end_time": None,
    "error": None
}
# Request/Response models
class TrainingRequest(BaseModel):
    model_name: str = "distilgpt2"
    dataset_path: str = "data/lora_dataset_20250910_145055.jsonl"
    config_path: str = "configs/training_config.yaml"
    max_samples: int = 20
    epochs: int = 1
    batch_size: int = 1
    learning_rate: float = 5e-5
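
# Example request body for POST /api/train/start. Every field is optional and
# falls back to the defaults above; the values shown are just those defaults:
#   {"model_name": "distilgpt2", "max_samples": 20, "epochs": 1, "learning_rate": 5e-5}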

class TrainingResponse(BaseModel):
    success: bool
    message: str
    training_id: str
    status: str

class ChatRequest(BaseModel):
    message: str
    conversation_id: Optional[str] = None

class ChatResponse(BaseModel):
    response: str
    conversation_id: str
    status: str = "success"
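
# Example /chat exchange. The shapes follow ChatRequest/ChatResponse above;
# the message content is illustrative:
#   request:  {"message": "dimana lokasi textilindo?"}
#   response: {"response": "...", "conversation_id": "default", "status": "success"}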

# API Information
@app.get("/")  # root path assumed; the other routes follow the endpoint map below
async def api_info():
    """API information endpoint"""
    return {
        "name": "Textilindo AI Training API",
        "version": "1.0.0",
        "description": "Pure API-based training system for Textilindo AI Assistant",
        "hardware": "2 vCPU, 16 GB RAM (CPU basic)",
        "status": "ready",
        "endpoints": {
            "training": {
                "start": "POST /api/train/start",
                "status": "GET /api/train/status",
                "data": "GET /api/train/data",
                "gpu": "GET /api/train/gpu",
                "test": "POST /api/train/test"
            },
            "chat": {
                "chat": "POST /chat",
                "health": "GET /health"
            }
        }
    }

# Health check
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "hardware": "2 vCPU, 16 GB RAM"
    }
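
# Example /health response (timestamp illustrative):
#   {"status": "healthy", "timestamp": "2025-09-10T14:50:55", "hardware": "2 vCPU, 16 GB RAM"}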

# Training API endpoints
@app.post("/api/train/start", response_model=TrainingResponse)
async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
    """Start training process"""
    global training_status
    if training_status["is_training"]:
        raise HTTPException(status_code=400, detail="Training already in progress")
    # Validate inputs
    if not Path(request.dataset_path).exists():
        raise HTTPException(status_code=404, detail=f"Dataset not found: {request.dataset_path}")
    if not Path(request.config_path).exists():
        raise HTTPException(status_code=404, detail=f"Config not found: {request.config_path}")
    # Start training in background
    training_id = f"train_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    background_tasks.add_task(
        train_model_async,
        request.model_name,
        request.dataset_path,
        request.config_path,
        request.max_samples,
        request.epochs,
        request.batch_size,
        request.learning_rate
    )
    return TrainingResponse(
        success=True,
        message="Training started successfully",
        training_id=training_id,
        status="started"
    )

@app.get("/api/train/status")
async def get_training_status():
    """Get current training status"""
    return training_status

@app.get("/api/train/data")
async def get_training_data_info():
    """Get information about available training data"""
    data_dir = Path("data")
    if not data_dir.exists():
        return {"files": [], "count": 0}
    jsonl_files = list(data_dir.glob("*.jsonl"))
    files_info = []
    for file in jsonl_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            files_info.append({
                "name": file.name,
                "size": file.stat().st_size,
                "lines": len(lines)
            })
        except Exception as e:
            files_info.append({
                "name": file.name,
                "error": str(e)
            })
    return {
        "files": files_info,
        "count": len(jsonl_files)
    }

@app.get("/api/train/gpu")
async def get_gpu_info():
    """Get GPU information"""
    try:
        gpu_available = torch.cuda.is_available()
        if gpu_available:
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            return {
                "available": True,
                "count": gpu_count,
                "name": gpu_name,
                "memory_gb": round(gpu_memory, 2)
            }
        else:
            return {"available": False}
    except Exception as e:
        return {"error": str(e)}

@app.post("/api/train/test")
async def test_trained_model():
    """Test the trained model"""
    model_path = "./models/textilindo-trained"
    if not Path(model_path).exists():
        return {"error": "No trained model found"}
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)
        # Test prompt
        test_prompt = "Question: dimana lokasi textilindo? Answer:"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=inputs.input_ids.shape[1] + 30,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {
            "success": True,
            "test_prompt": test_prompt,
            "response": response,
            "model_path": model_path
        }
    except Exception as e:
        return {"error": str(e)}

# Chat API endpoint
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Chat with the AI assistant"""
    try:
        # Simple mock responses for now
        mock_responses = {
            "dimana lokasi textilindo": "Textilindo berkantor pusat di Jl. Raya Prancis No.39, Kosambi Tim., Kec. Kosambi, Kabupaten Tangerang, Banten 15213",
            "jam berapa textilindo beroperasional": "Jam operasional Senin-Jumat 08:00-17:00, Sabtu 08:00-12:00.",
            "berapa ketentuan pembelian": "Minimal order 1 roll per jenis kain",
            "apa ada gratis ongkir": "Gratis ongkir untuk order minimal 5 roll.",
            "apa bisa dikirimkan sample": "Hallo kak untuk sampel kita bisa kirimkan gratis ya kak 😊"
        }
        # Simple keyword matching: the first key sharing any word with the message wins
        user_lower = request.message.lower()
        response = "Halo! Saya adalah asisten AI Textilindo. Bagaimana saya bisa membantu Anda hari ini? 😊"
        for key, mock_response in mock_responses.items():
            if any(word in user_lower for word in key.split()):
                response = mock_response
                break
        return ChatResponse(
            response=response,
            conversation_id=request.conversation_id or "default",
            status="success"
        )
    except Exception as e:
        logger.error(f"Chat error: {e}")
        return ChatResponse(
            response="Maaf, terjadi kesalahan. Silakan coba lagi.",
            conversation_id=request.conversation_id or "default",
            status="error"
        )

# Training function
def train_model_async(
    model_name: str,
    dataset_path: str,
    config_path: str,
    max_samples: int,
    epochs: int,
    batch_size: int,
    learning_rate: float
):
    """Background training function.

    Kept synchronous so FastAPI's BackgroundTasks runs it in a thread pool;
    an async def here would block the event loop for the whole trainer.train() call.
    """
    global training_status
    try:
        training_status.update({
            "is_training": True,
            "status": "starting",
            "progress": 0,
            "start_time": datetime.now().isoformat(),
            "error": None
        })
        logger.info("🚀 Starting training...")
        # Import training libraries
        from transformers import (
            AutoTokenizer,
            AutoModelForCausalLM,
            TrainingArguments,
            Trainer,
            DataCollatorForLanguageModeling
        )
        from datasets import Dataset
        # Check GPU
        gpu_available = torch.cuda.is_available()
        logger.info(f"GPU available: {gpu_available}")
        # Load model and tokenizer
        logger.info(f"📥 Loading model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Load model
        if gpu_available:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name)
        logger.info("✅ Model loaded successfully")
        # Load training data
        training_data = load_training_data(dataset_path, max_samples)
        if not training_data:
            raise Exception("No training data loaded")
        # Convert to dataset
        dataset = Dataset.from_list(training_data)

        def tokenize_function(examples):
            # Return plain lists; the data collator below handles padding and
            # tensor conversion, so return_tensors="pt" is not needed in map()
            return tokenizer(
                examples["text"],
                truncation=True,
                max_length=256
            )

        # Drop the raw "text" column so the collator only sees token fields
        tokenized_dataset = dataset.map(
            tokenize_function, batched=True, remove_columns=["text"]
        )
        # Training arguments
        training_args = TrainingArguments(
            output_dir="./models/textilindo-trained",
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            warmup_steps=5,
            save_steps=10,
            logging_steps=1,
            save_total_limit=1,
            prediction_loss_only=True,
            remove_unused_columns=False,
            fp16=gpu_available,
            dataloader_pin_memory=gpu_available,
            report_to="none",  # the string "none" disables logging integrations; None does not
        )
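        # With the request defaults above, the effective batch size is
        # per_device_train_batch_size * gradient_accumulation_steps = 1 * 2 = 2.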
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )
        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        # Start training
        training_status["status"] = "training"
        trainer.train()
        # Save model
        model.save_pretrained("./models/textilindo-trained")
        tokenizer.save_pretrained("./models/textilindo-trained")
        # Update status
        training_status.update({
            "is_training": False,
            "status": "completed",
            "progress": 100,
            "end_time": datetime.now().isoformat()
        })
        logger.info("✅ Training completed successfully!")
    except Exception as e:
        logger.error(f"Training failed: {e}")
        training_status.update({
            "is_training": False,
            "status": "failed",
            "error": str(e),
            "end_time": datetime.now().isoformat()
        })

def load_training_data(dataset_path: str, max_samples: int = 20) -> list:
    """Load training data from JSONL file"""
    data = []
    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_samples:
                    break
                if line.strip():
                    item = json.loads(line)
                    # Create training text
                    instruction = item.get('instruction', '')
                    output = item.get('output', '')
                    text = f"Question: {instruction} Answer: {output}"
                    data.append({"text": text})
        logger.info(f"Loaded {len(data)} training samples")
        return data
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return []
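
# Expected dataset layout, inferred from the keys read above: one JSON object
# per line with "instruction" and "output" fields (content illustrative), e.g.
#   {"instruction": "dimana lokasi textilindo?", "output": "Textilindo berkantor pusat di ..."}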

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
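
# Example calls against a local run (port 7860 as configured above; payloads illustrative):
#   curl http://localhost:7860/health
#   curl -X POST http://localhost:7860/api/train/start \
#        -H "Content-Type: application/json" -d '{"epochs": 1, "max_samples": 20}'
#   curl http://localhost:7860/api/train/status
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" -d '{"message": "dimana lokasi textilindo?"}'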