#!/usr/bin/env python3
"""
Textilindo AI Training API
Pure API-based training system for Hugging Face Spaces
Uses the free GPU tier and your training data/configs
"""
import os
import json
import yaml
import logging
import torch
from pathlib import Path
from datetime import datetime
from typing import Dict, Any
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel
import uvicorn

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Training API
training_app = FastAPI(title="Textilindo AI Training API")

# In-memory training status storage
training_status = {
    "is_training": False,
    "progress": 0,
    "status": "idle",
    "current_step": 0,
    "total_steps": 0,
    "loss": 0.0,
    "start_time": None,
    "end_time": None,
    "error": None,
}
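
# Illustrative mid-run snapshot of training_status (values are made up,
# only the keys match the dict above):
#   {"is_training": True, "status": "training", "progress": 40.0,
#    "current_step": 2, "total_steps": 5, "loss": 3.21, ...}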

class TrainingRequest(BaseModel):
    model_name: str = "distilgpt2"  # Start with a small model
    dataset_path: str = "data/lora_dataset_20250829_113330.jsonl"
    config_path: str = "configs/training_config.yaml"
    max_samples: int = 10  # Limit for the free tier
    epochs: int = 1
    batch_size: int = 1
    learning_rate: float = 5e-5


class TrainingResponse(BaseModel):
    success: bool
    message: str
    training_id: str
    status: str

def load_training_config(config_path: str) -> Dict[str, Any]:
    """Load the training configuration from a YAML file."""
    try:
        with open(config_path, 'r') as f:
            return yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Error loading config: {e}")
        return {}

def load_training_data(dataset_path: str, max_samples: int = 10) -> list:
    """Load training data from a JSONL file."""
    data = []
    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_samples:
                    break
                if line.strip():
                    item = json.loads(line)
                    # Build the training text from the instruction/output pair
                    instruction = item.get('instruction', '')
                    output = item.get('output', '')
                    text = f"Question: {instruction} Answer: {output}"
                    data.append({"text": text})
        logger.info(f"Loaded {len(data)} training samples")
        return data
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return []
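
# Expected JSONL record shape (illustrative sample; only the 'instruction'
# and 'output' keys are what load_training_data actually reads):
#   {"instruction": "dimana lokasi textilindo?", "output": "..."}
# Each record becomes one training example of the form
#   "Question: <instruction> Answer: <output>"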

def check_gpu_availability() -> bool:
    """Check whether a CUDA GPU is available."""
    try:
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            logger.info(f"GPU available: {gpu_name} (Count: {gpu_count})")
            return True
        else:
            logger.info("No GPU available, using CPU")
            return False
    except Exception as e:
        logger.error(f"Error checking GPU: {e}")
        return False

def train_model_async(
    model_name: str,
    dataset_path: str,
    config_path: str,
    max_samples: int,
    epochs: int,
    batch_size: int,
    learning_rate: float,
):
    """Background training job; reports progress via the shared status dict."""
    global training_status
    try:
        training_status.update({
            "is_training": True,
            "status": "starting",
            "progress": 0,
            "start_time": datetime.now().isoformat(),
            "error": None,
        })
        logger.info("🚀 Starting training...")
        # Import training libraries lazily so the API can start without them
        from transformers import (
            AutoTokenizer,
            AutoModelForCausalLM,
            TrainingArguments,
            Trainer,
            TrainerCallback,
            DataCollatorForLanguageModeling,
        )
        from datasets import Dataset
        # Check GPU
        gpu_available = check_gpu_availability()

        # Load model and tokenizer
        logger.info(f"📥 Loading model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the model onto the GPU if one is available
        if gpu_available:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto",
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name)
        logger.info("✅ Model loaded successfully")

        # Load training data
        training_data = load_training_data(dataset_path, max_samples)
        if not training_data:
            raise Exception("No training data loaded")

        # Convert to a Hugging Face dataset
        dataset = Dataset.from_list(training_data)
        def tokenize_function(examples):
            # Truncate only; the data collator pads each batch dynamically.
            # Returning tensors from map() is unnecessary, and the raw "text"
            # column must be dropped so the collator only sees token fields.
            return tokenizer(
                examples["text"],
                truncation=True,
                max_length=256,
            )

        tokenized_dataset = dataset.map(
            tokenize_function, batched=True, remove_columns=["text"]
        )
        # Training arguments
        training_args = TrainingArguments(
            output_dir="./models/textilindo-trained",
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            warmup_steps=5,
            save_steps=10,
            logging_steps=1,
            save_total_limit=1,
            prediction_loss_only=True,
            fp16=gpu_available,  # Use fp16 only if a GPU is available
            dataloader_pin_memory=gpu_available,
            report_to="none",  # the string "none" disables logging integrations
        )
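        # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps.
        # With the request defaults above, that is 1 * 2 = 2 examples per optimizer
        # step, so 10 samples for 1 epoch is ceil(10 / 2) = 5 optimizer steps.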
        # Data collator (causal LM: mlm=False, labels are the shifted inputs)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )

        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        # Custom callback for progress tracking (must subclass TrainerCallback
        # so Trainer can dispatch all callback events to it)
        class ProgressCallback(TrainerCallback):
            def on_log(self, args, state, control, logs=None, **kwargs):
                if logs:
                    # state.max_steps is the optimizer-step total the Trainer
                    # computes from dataset size, batch size, and accumulation
                    total_steps = state.max_steps or 1
                    training_status.update({
                        "current_step": state.global_step,
                        "total_steps": total_steps,
                        "progress": min(100, (state.global_step / total_steps) * 100),
                        "loss": logs.get('loss', 0.0),
                        "status": "training",
                    })

        # Add callback
        trainer.add_callback(ProgressCallback())
        # Start training
        training_status["status"] = "training"
        trainer.train()

        # Save model
        model.save_pretrained("./models/textilindo-trained")
        tokenizer.save_pretrained("./models/textilindo-trained")

        # Update status
        training_status.update({
            "is_training": False,
            "status": "completed",
            "progress": 100,
            "end_time": datetime.now().isoformat(),
        })
        logger.info("✅ Training completed successfully!")
    except Exception as e:
        logger.error(f"Training failed: {e}")
        training_status.update({
            "is_training": False,
            "status": "failed",
            "error": str(e),
            "end_time": datetime.now().isoformat(),
        })

# API endpoints (route paths are assumed here; adjust to your deployment)
@training_app.post("/train", response_model=TrainingResponse)
async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
    """Start the training process in the background."""
    if training_status["is_training"]:
        raise HTTPException(status_code=400, detail="Training already in progress")

    # Validate inputs
    if not Path(request.dataset_path).exists():
        raise HTTPException(status_code=404, detail=f"Dataset not found: {request.dataset_path}")
    if not Path(request.config_path).exists():
        raise HTTPException(status_code=404, detail=f"Config not found: {request.config_path}")

    # Start training in the background
    training_id = f"train_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    background_tasks.add_task(
        train_model_async,
        request.model_name,
        request.dataset_path,
        request.config_path,
        request.max_samples,
        request.epochs,
        request.batch_size,
        request.learning_rate,
    )
    return TrainingResponse(
        success=True,
        message="Training started successfully",
        training_id=training_id,
        status="started",
    )

@training_app.get("/status")
async def get_training_status():
    """Get the current training status."""
    return training_status

@training_app.get("/data")
async def get_training_data_info():
    """Get information about the available training data files."""
    data_dir = Path("data")
    if not data_dir.exists():
        return {"files": [], "count": 0}

    jsonl_files = list(data_dir.glob("*.jsonl"))
    files_info = []
    for file in jsonl_files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            files_info.append({
                "name": file.name,
                "size": file.stat().st_size,
                "lines": len(lines),
            })
        except Exception as e:
            files_info.append({
                "name": file.name,
                "error": str(e),
            })
    return {
        "files": files_info,
        "count": len(jsonl_files),
    }

@training_app.get("/config")
async def get_training_config():
    """Get the current training configuration."""
    config_path = "configs/training_config.yaml"
    if not Path(config_path).exists():
        return {"error": "Config file not found"}
    try:
        return load_training_config(config_path)
    except Exception as e:
        return {"error": str(e)}

@training_app.get("/models")
async def get_available_models():
    """Get the list of available base models."""
    return {
        "models": [
            {
                "name": "distilgpt2",
                "size": "82M",
                "description": "Small, fast model for quick training",
            },
            {
                "name": "gpt2",
                "size": "124M",
                "description": "Original GPT-2 model",
            },
            {
                "name": "microsoft/DialoGPT-small",
                "size": "117M",
                "description": "Conversational model",
            },
        ]
    }

@training_app.get("/gpu")
async def get_gpu_info():
    """Get GPU information."""
    try:
        gpu_available = torch.cuda.is_available()
        if gpu_available:
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            return {
                "available": True,
                "count": gpu_count,
                "name": gpu_name,
                "memory_gb": round(gpu_memory, 2),
            }
        else:
            return {"available": False}
    except Exception as e:
        return {"error": str(e)}

@training_app.post("/stop")
async def stop_training():
    """Mark the current training run as stopped."""
    global training_status
    if not training_status["is_training"]:
        return {"message": "No training in progress"}

    # Note: this only flips the status flag; a FastAPI background task cannot
    # be forcibly cancelled once trainer.train() is already running.
    training_status.update({
        "is_training": False,
        "status": "stopped",
        "end_time": datetime.now().isoformat(),
    })
    return {"message": "Training stopped"}

@training_app.post("/test")
async def test_trained_model():
    """Run a quick generation test against the trained model."""
    model_path = "./models/textilindo-trained"
    if not Path(model_path).exists():
        return {"error": "No trained model found"}

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)

        # Test prompt
        test_prompt = "Question: dimana lokasi textilindo? Answer:"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=30,  # i.e. prompt length + up to 30 new tokens
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {
            "success": True,
            "test_prompt": test_prompt,
            "response": response,
            "model_path": model_path,
        }
    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    uvicorn.run(training_app, host="0.0.0.0", port=7861)
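
# --- Example client usage (illustrative sketch; assumes the route paths
# registered above and a server on localhost:7861) ---
#   import requests
#   base = "http://localhost:7861"
#   requests.post(f"{base}/train", json={"model_name": "distilgpt2", "max_samples": 10}).json()
#   requests.get(f"{base}/status").json()   # poll training progress
#   requests.post(f"{base}/test").json()    # smoke-test the trained model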