Spaces:
Sleeping
Sleeping
File size: 7,641 Bytes
e207dc8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
#!/usr/bin/env python3
"""
Script untuk training dengan monitoring GPU dan logging yang lengkap
"""
import os
import sys
import time
import json
import psutil
import GPUtil
from pathlib import Path
from datetime import datetime
import logging
from finetune_lora import main as finetune_main
def setup_logging():
    """Configure root logging to a timestamped file plus stdout.

    Creates ``logs/`` if needed, writes to ``logs/training_<timestamp>.log``
    (UTF-8), and mirrors every record to stdout.

    Returns:
        The module-level ``logging.Logger``.
    """
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    logfile = logs_dir / f"training_{stamp}.log"

    # One file handler + one console handler share the same format.
    log_handlers = [
        logging.FileHandler(logfile, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=log_handlers,
    )
    return logging.getLogger(__name__)
def get_system_info():
    """Snapshot host CPU, memory and disk usage.

    Returns:
        dict with ISO timestamp, logical CPU count, total/available RAM in
        GiB, and a per-mountpoint ``disk_usage`` mapping (totals in GiB plus
        a used-percentage). Unreadable mountpoints are skipped silently.
    """
    gib = 1024 ** 3
    vm = psutil.virtual_memory()
    snapshot = {
        "timestamp": datetime.now().isoformat(),
        "cpu_count": psutil.cpu_count(),
        "memory_total_gb": round(vm.total / gib, 2),
        "memory_available_gb": round(vm.available / gib, 2),
        "disk_usage": {},
    }
    for part in psutil.disk_partitions():
        try:
            du = psutil.disk_usage(part.mountpoint)
        except PermissionError:
            # Some mountpoints (e.g. protected system volumes) deny stat.
            continue
        snapshot["disk_usage"][part.mountpoint] = {
            "total_gb": round(du.total / gib, 2),
            "used_gb": round(du.used / gib, 2),
            "free_gb": round(du.free / gib, 2),
            "percent": du.percent,
        }
    return snapshot
def get_gpu_info():
    """Query GPUtil for per-GPU stats.

    Returns:
        A list of dicts (one per GPU) with memory, utilization and
        temperature figures; an empty list when GPUtil is unavailable
        or the query fails for any reason.
    """
    try:
        return [
            {
                "id": g.id,
                "name": g.name,
                "memory_total_mb": g.memoryTotal,
                "memory_used_mb": g.memoryUsed,
                "memory_free_mb": g.memoryFree,
                "memory_utilization_percent": g.memoryUtil * 100,
                "gpu_utilization_percent": g.load * 100,
                "temperature_celsius": g.temperature,
            }
            for g in GPUtil.getGPUs()
        ]
    except Exception as e:
        # Best-effort: GPU stats are optional, so swallow and report.
        logging.warning(f"Could not get GPU info: {e}")
        return []
def monitor_resources(logger, interval=30):
    """Poll CPU/RAM/GPU usage until interrupted.

    Logs a one-line summary each cycle and periodically dumps the
    accumulated samples to a JSON file under ``logs/``.

    Args:
        logger: configured ``logging.Logger`` for summaries.
        interval: seconds to sleep between samples.

    Returns:
        The list of collected sample dicts (on KeyboardInterrupt).
    """
    logger.info("๐ Starting resource monitoring...")
    start_time = time.time()
    monitoring_data = []
    # Fix: the original regenerated a fresh timestamped filename on every
    # save, littering logs/ with cumulative duplicate snapshots.  One file
    # per monitoring session now accumulates all samples.
    monitoring_file = Path("logs") / f"monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    try:
        while True:
            elapsed_time = time.time() - start_time

            # Base host snapshot plus live usage figures.
            system_info = get_system_info()
            system_info["elapsed_time_seconds"] = elapsed_time
            memory = psutil.virtual_memory()
            system_info["memory_used_gb"] = round(memory.used / (1024**3), 2)
            system_info["memory_percent"] = memory.percent
            # cpu_percent(interval=1) blocks 1s to measure a real average.
            system_info["cpu_percent"] = psutil.cpu_percent(interval=1)

            gpu_info = get_gpu_info()

            monitoring_data.append({
                "timestamp": datetime.now().isoformat(),
                "elapsed_time_seconds": elapsed_time,
                "system": system_info,
                "gpu": gpu_info,
            })

            logger.info(f"โฑ๏ธ Elapsed: {elapsed_time/60:.1f}min | "
                        f"CPU: {system_info['cpu_percent']:.1f}% | "
                        f"RAM: {system_info['memory_percent']:.1f}%")
            for gpu in gpu_info:
                logger.info(f"๐ฎ GPU {gpu['id']}: "
                            f"Util: {gpu['gpu_utilization_percent']:.1f}% | "
                            f"Memory: {gpu['memory_utilization_percent']:.1f}% | "
                            f"Temp: {gpu['temperature_celsius']:.1f}ยฐC")

            # Persist every 10th sample so a crash loses at most ~10 cycles.
            if len(monitoring_data) % 10 == 0:
                with open(monitoring_file, 'w') as f:
                    json.dump(monitoring_data, f, indent=2)
                logger.info(f"๐พ Monitoring data saved: {monitoring_file}")

            time.sleep(interval)
    except KeyboardInterrupt:
        logger.info("โน๏ธ Resource monitoring stopped by user")
    return monitoring_data
def main():
    """Run LoRA fine-tuning with background resource monitoring.

    Verifies the base model, dataset and config exist, starts a daemon
    thread that logs CPU/RAM/GPU usage, then invokes the fine-tuning
    entry point from ``finetune_lora``.
    """
    print("๐ Training dengan Monitoring - Llama 3.1 8B LoRA")
    print("=" * 60)

    logger = setup_logging()

    # Log host details so a run is explainable from its log file alone.
    logger.info("๐ฅ๏ธ System Information:")
    system_info = get_system_info()
    for key, value in system_info.items():
        if key != "disk_usage":  # per-mountpoint detail is too verbose here
            logger.info(f"  {key}: {value}")

    gpu_info = get_gpu_info()
    if gpu_info:
        logger.info("๐ฎ GPU Information:")
        for gpu in gpu_info:
            logger.info(f"  GPU {gpu['id']}: {gpu['name']}")
            logger.info(f"    Memory: {gpu['memory_total_mb']}MB total")
            logger.info(f"    Temperature: {gpu['temperature_celsius']}ยฐC")
    else:
        logger.warning("โ ๏ธ No GPU detected. Training will be very slow on CPU!")

    # Prerequisite checks: fail fast, pointing at the script that produces
    # each missing artifact.  (Data-driven to avoid three copy-pasted blocks.)
    logger.info("๐ Checking prerequisites...")
    prerequisites = [
        (Path("models/llama-3.1-8b-instruct"),
         "โ Base model not found. Please run download_model.py first!"),
        (Path("data/training_data.jsonl"),
         "โ Training dataset not found. Please run create_sample_dataset.py first!"),
        (Path("configs/llama_config.yaml"),
         "โ Model configuration not found. Please run download_model.py first!"),
    ]
    for required_path, error_message in prerequisites:
        if not required_path.exists():
            logger.error(error_message)
            return
    logger.info("โ All prerequisites met!")

    # Daemon thread: monitoring dies automatically with the main process.
    import threading
    monitoring_thread = threading.Thread(
        target=monitor_resources,
        args=(logger, 30),  # sample every 30 seconds
        daemon=True,
    )
    monitoring_thread.start()

    logger.info("๐ Starting LoRA fine-tuning...")
    try:
        finetune_main()
        logger.info("โ Training completed successfully!")
    except Exception as e:
        logger.error(f"โ Training failed: {e}")
        raise
    finally:
        # Fix: the original logged "Final monitoring data saved: <file>"
        # without ever writing that file (its own comment admitted it).
        # The daemon thread owns persistence; do not claim a save here.
        logger.info("๐ Training session ended")


if __name__ == "__main__":
    main()
|