#!/usr/bin/env python3
"""
Quick training script for Hugging Face Spaces
Optimized for CPU-only training with limited resources
"""

import os
import json
import logging
import sys
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def quick_training():
    """Quick training suitable for HF Spaces."""
    print("🚀 Starting Quick Training for Hugging Face Spaces")
    print("=" * 60)

    try:
        # Import the heavy libraries lazily so a missing dependency is
        # reported cleanly instead of crashing at module import time
        from transformers import (
            AutoTokenizer,
            AutoModelForCausalLM,
            DataCollatorForLanguageModeling,
            Trainer,
            TrainingArguments,
        )
        from datasets import Dataset
        import torch

        print("✅ Successfully imported training libraries")

        # Use a very small model so training fits within HF Spaces' CPU budget
        model_name = "distilgpt2"
        print(f"📥 Loading model: {model_name}")

        # Load tokenizer and model; GPT-2-family tokenizers ship without a
        # pad token, so reuse the EOS token for padding
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print("✅ Model loaded successfully")

        # Load training data (limit to a small amount for HF Spaces)
        data_file = Path("data/lora_dataset_20250829_113330.jsonl")
        if not data_file.exists():
            print("❌ Training data not found")
            return False
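
        # Each JSONL line is assumed to be an object with "instruction" and
        # "output" fields; a hypothetical sample (the real dataset is not
        # shown here):
        #   {"instruction": "dimana lokasi textilindo?", "output": "..."}
        # Missing fields fall back to empty strings below.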
        # Load and prepare data
        training_data = []
        with open(data_file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= 5:  # Limit to 5 samples for quick training
                    break
                if line.strip():
                    data = json.loads(line)
                    # Create simple training text
                    text = f"Question: {data.get('instruction', '')} Answer: {data.get('output', '')}"
                    training_data.append({"text": text})

        print(f"✅ Loaded {len(training_data)} training samples")
        if not training_data:
            print("❌ No training data found")
            return False

        # Convert to a Hugging Face Dataset
        dataset = Dataset.from_list(training_data)

        def tokenize_function(examples):
            return tokenizer(
                examples["text"],
                truncation=True,
                padding=True,
                max_length=128,  # Short sequences for quick training
            )

        # Drop the raw "text" column so only token tensors reach the collator
        tokenized_dataset = dataset.map(
            tokenize_function, batched=True, remove_columns=["text"]
        )
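
        # Note: no "labels" column is built here. For causal-LM fine-tuning,
        # DataCollatorForLanguageModeling(mlm=False) below copies input_ids
        # into labels at batch time, which is what Trainer needs to compute
        # a loss.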

        # Training arguments optimized for HF Spaces
        training_args = TrainingArguments(
            output_dir="./models/quick-trained",
            num_train_epochs=1,              # Single epoch
            per_device_train_batch_size=1,   # Small batch
            gradient_accumulation_steps=2,
            learning_rate=5e-5,
            warmup_steps=2,
            save_steps=10,
            logging_steps=1,
            save_total_limit=1,
            prediction_loss_only=True,
            remove_unused_columns=False,
            fp16=False,                      # Disable fp16 for CPU
            dataloader_pin_memory=False,
            report_to="none",                # Disable wandb/tensorboard ("none", not None)
        )
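
        # Effective batch size = per_device_train_batch_size *
        # gradient_accumulation_steps = 2; with the 5-sample cap above, one
        # epoch is 5 forward passes and roughly 3 optimizer updates.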

        # Create trainer; the collator pads batches and turns input_ids into
        # labels for causal-LM training (without it the model returns no
        # loss and Trainer raises an error)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
print("πŸš€ Starting training...")
print("⚠️ This is a quick demo training with limited data")
# Train
trainer.train()
# Save the model
model.save_pretrained("./models/quick-trained")
tokenizer.save_pretrained("./models/quick-trained")
print("βœ… Quick training completed successfully!")
print("πŸ“ Model saved to: ./models/quick-trained")

        # Test the model
        print("\n🧪 Testing the trained model...")
        test_prompt = "Question: dimana lokasi textilindo? Answer:"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,  # generate 20 tokens past the prompt
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"📝 Test response: {response}")

        return True

    except ImportError as e:
        print(f"❌ Missing required library: {e}")
        print("💡 Install with: pip install transformers datasets torch")
        return False
    except Exception as e:
        print(f"❌ Training failed: {e}")
        return False


def main():
    """Main function"""
    print("🤖 Textilindo AI - Quick Training on Hugging Face Spaces")
    print("=" * 70)

    # Check if we're on HF Spaces (the platform sets SPACE_ID)
    if os.getenv('SPACE_ID'):
        print("✅ Running on Hugging Face Spaces")
    else:
        print("⚠️ Not running on Hugging Face Spaces")

    # Check available data
    data_dir = Path("data")
    if data_dir.exists():
        jsonl_files = list(data_dir.glob("*.jsonl"))
        print(f"📊 Found {len(jsonl_files)} training data files")
        for file in jsonl_files:
            print(f"   - {file.name}")
    else:
        print("❌ No data directory found")
        return 1

    # Run quick training
    if quick_training():
        print("\n🎉 Quick training completed successfully!")
        print("📋 Next steps:")
        print("1. Check the trained model in ./models/quick-trained/")
        print("2. Test the model with your chat interface")
        print("3. For full training, use external resources")
        return 0
    else:
        print("\n❌ Quick training failed")
        return 1
if __name__ == "__main__":
import sys
sys.exit(main())