"""
OncoAgent — QLoRA Fine-Tuning Script (Unsloth-Accelerated).
Uses Unsloth's FastLanguageModel for 2x faster training with 60% less VRAM
on AMD Instinct MI300X (ROCm). 4-bit NF4 quantization via bitsandbytes.
Key features:
- Unsloth-optimized kernels for Attention + Loss computation
- Tier-adaptive hyperparameters (batch size, LoRA rank, seq length)
- Automatic checkpoint resume on crash recovery
- Eval split monitoring to detect overfitting
- Training metadata saved for reproducibility audits
Hardware Target: AMD Instinct MI300X (gfx942) via ROCm 7.0 + HIP.
Rule Compliance: #3 (Qwen 3.5/3.6, format ChatML), #14 (QLoRA + bitsandbytes + PEFT),
#22 (reproducibility seeds), #24 (.env), #26 (type hints).
"""
import argparse
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
# ── Critical AMD MI300X Environment ─────────────────────────────────────────
# Must be set BEFORE any ROCm/HIP library is invoked.
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "9.4.2"
os.environ["HF_HUB_DISABLE_XET"] = "1"
from dotenv import load_dotenv
load_dotenv()
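# Expected .env keys (Rule #24), both optional:
#   HF_TOKEN=hf_...   # only needed for gated model downloads
#   LOG_LEVEL=INFO    # logging verbosity, defaults to INFO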
import torch
from datasets import Dataset
# Unsloth MUST be imported before trl to apply monkey-patches correctly
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
# ── Reproducibility (Rule #22) ──────────────────────────────────────────────
SEED: int = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
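# Note: seeding makes runs comparable rather than bitwise identical; some GPU
# kernels (including fused attention paths) are nondeterministic by design.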
# ── Logging ─────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO"),
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)
# ── Tier-Adaptive Configuration ─────────────────────────────────────────────
@dataclass
class TierConfig:
    """Tier-specific hyperparameters tuned for MI300X 192GB HBM3."""

    model_id: str
    lora_r: int
    lora_alpha: int
    max_seq_length: int
    batch_size: int
    gradient_accumulation: int
    learning_rate: float
    num_epochs: int
    save_steps: int

TIER_CONFIGS: Dict[int, TierConfig] = {
    1: TierConfig(
        model_id="Qwen/Qwen3.5-9B",
        lora_r=16,
        lora_alpha=32,
        max_seq_length=2048,
        batch_size=8,             # Proven ~16s/step throughput on MI300X with Unsloth
        gradient_accumulation=2,  # Effective batch = 16
        learning_rate=2e-4,
        num_epochs=3,             # Maximize clinical knowledge retention within ~2.5h on MI300X
        save_steps=500,
    ),
    2: TierConfig(
        model_id="Qwen/Qwen3.6-27B",
        lora_r=32,                # Higher rank for 27B capacity
        lora_alpha=64,
        max_seq_length=2048,
        batch_size=4,             # 27B needs smaller micro-batch
        gradient_accumulation=4,  # Effective batch = 16
        learning_rate=1e-4,       # Lower LR for larger model stability
        num_epochs=1,
        save_steps=250,           # More frequent checkpoints for 27B
    ),
}
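# Both tiers follow the common alpha = 2*r LoRA scaling heuristic. Optimizer steps
# per epoch = ceil(train_samples / effective_batch); e.g. 10,000 samples at effective
# batch 16 give 625 steps/epoch (illustrative figures, not tied to this dataset).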
# ── Shared Constants ────────────────────────────────────────────────────────
OUTPUT_DIR: str = os.path.join("models", "oncoagent_adapters")
TRAIN_FILE: str = os.path.join("data", "final", "train_oncoagent.jsonl")
EVAL_FILE: str = os.path.join("data", "final", "train_oncoagent_eval.jsonl")
LORA_DROPOUT: float = 0.0 # Unsloth optimized: 0 dropout for speed
LORA_TARGET_MODULES: List[str] = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
WARMUP_RATIO: float = 0.03
WEIGHT_DECAY: float = 0.01
MAX_GRAD_NORM: float = 1.0
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join("data", "final"), exist_ok=True)
# ── Data Loading ────────────────────────────────────────────────────────────
def load_jsonl_dataset(path: str, label: str = "data") -> Dataset:
"""Load a JSONL file into a HuggingFace Dataset.
Args:
path: Path to the JSONL file.
label: Human-readable label for logging.
Returns:
HuggingFace Dataset object.
Raises:
FileNotFoundError: If the file does not exist.
"""
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"{label} file not found: {path}. "
            "Run download_hf_datasets.py and synthetic_generator.py first, "
            "then unify with dataset_builder.py."
        )
    texts: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                entry = json.loads(line.strip())
                text = entry.get("text", "")
                if text:
                    texts.append(text)
            except json.JSONDecodeError:
                continue
    logger.info(f"📊 Loaded {len(texts):,} {label} samples from {path}")
    return Dataset.from_dict({"text": texts})

# ── Model Setup (Unsloth) ──────────────────────────────────────────────────
def setup_model_and_tokenizer(
    config: TierConfig,
) -> tuple:
    """Load model with Unsloth's FastLanguageModel (4-bit, LoRA-ready).

    Args:
        config: The tier-specific configuration.

    Returns:
        Tuple of (model, tokenizer).
    """
    # FastLanguageModel imported at module level
    hf_token: Optional[str] = os.getenv("HF_TOKEN")
    logger.info(f"📦 Loading model via Unsloth: {config.model_id}")
    logger.info("   Device: cuda (ROCm maps cuda→hip, gfx942 via HSA_OVERRIDE)")
    logger.info("   Quantization: 4-bit NF4 (Unsloth managed)")
    logger.info(f"   LoRA rank: {config.lora_r}, alpha: {config.lora_alpha}")
    # Unsloth handles quantization, model loading, and optimization in one call
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config.model_id,
        max_seq_length=config.max_seq_length,
        load_in_4bit=True,
        token=hf_token,
    )
    # Extract the actual tokenizer for pad_token setup
    # (Qwen3.5 returns a Qwen3VLProcessor wrapper around the text tokenizer).
    actual_tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
    logger.info(f"   Tokenizer type: {type(tokenizer).__name__} → inner: {type(actual_tokenizer).__name__}")
    logger.info(f"   EOS token: {actual_tokenizer.eos_token!r}")
    if actual_tokenizer.pad_token is None:
        actual_tokenizer.pad_token = actual_tokenizer.eos_token
    # Apply LoRA via Unsloth's optimized path
    model = FastLanguageModel.get_peft_model(
        model,
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        bias="none",
        use_gradient_checkpointing="unsloth",  # Unsloth's 2x speedup
        random_state=SEED,
    )
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logger.info(
        f"🧠 LoRA applied: {trainable:,} trainable / {total:,} total "
        f"({100 * trainable / total:.2f}%)"
    )
    return model, actual_tokenizer  # Return inner tokenizer for SFTTrainer compatibility

# ── Training Metadata ───────────────────────────────────────────────────────
def _save_training_metadata(
    output_dir: str,
    config: TierConfig,
    train_samples: int,
    eval_samples: int,
    duration_seconds: float,
    final_metrics: Optional[Dict[str, Any]] = None,
) -> None:
    """Save a JSON metadata file alongside the adapter for audit trails.

    Args:
        output_dir: Directory to save to.
        config: The tier config used.
        train_samples: Number of training samples.
        eval_samples: Number of eval samples.
        duration_seconds: Wall-clock training time.
        final_metrics: Optional metrics from trainer.
    """
    meta: Dict[str, Any] = {
        "project": "OncoAgent",
        "model_id": config.model_id,
        "tier": 1 if "9B" in config.model_id else 2,
        "lora_r": config.lora_r,
        "lora_alpha": config.lora_alpha,
        "max_seq_length": config.max_seq_length,
        "effective_batch_size": config.batch_size * config.gradient_accumulation,
        "learning_rate": config.learning_rate,
        "num_epochs": config.num_epochs,
        "train_samples": train_samples,
        "eval_samples": eval_samples,
        "duration_seconds": round(duration_seconds, 1),
        "duration_human": time.strftime("%Hh %Mm %Ss", time.gmtime(duration_seconds)),
        "hardware": "AMD Instinct MI300X (192GB HBM3)",
        "framework": "Unsloth 2026.5.2 + ROCm 7.0 + bitsandbytes (NF4) + PEFT/LoRA",
        "seed": SEED,
    }
    if final_metrics:
        meta["final_metrics"] = {
            k: round(v, 6) if isinstance(v, float) else v
            for k, v in final_metrics.items()
        }
    path = os.path.join(output_dir, "training_metadata.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2, ensure_ascii=False)
    logger.info(f"📝 Training metadata saved to {path}")

# ── Training ────────────────────────────────────────────────────────────────
def train(
    tier: int,
    max_steps: int = -1,
    resume: bool = False,
) -> str:
    """Execute the Unsloth-accelerated QLoRA training pipeline.

    Args:
        tier: The architectural tier to train (1 for 9B, 2 for 27B).
        max_steps: If > 0, stop training after this many steps (useful for dry runs).
        resume: If True, resume from the last checkpoint in the output directory.

    Returns:
        Path to the saved adapter directory.
    """
    # SFTTrainer, SFTConfig imported at module level
    config = TIER_CONFIGS.get(tier)
    if not config:
        raise ValueError(f"Invalid tier: {tier}. Must be 1 or 2.")
    logger.info(f"🚀 Starting OncoAgent Unsloth Training Pipeline (Tier {tier})")
    logger.info("=" * 60)
    logger.info(f"   Model: {config.model_id}")
    logger.info("   Engine: Unsloth FastLanguageModel (2x speedup)")
    logger.info(f"   LoRA rank: {config.lora_r}")
    logger.info(f"   Batch size: {config.batch_size} × {config.gradient_accumulation} = {config.batch_size * config.gradient_accumulation}")
    logger.info(f"   Learning rate: {config.learning_rate}")
    logger.info(f"   Epochs: {config.num_epochs}")
    logger.info(f"   Seq length: {config.max_seq_length}")
    logger.info(f"   HSA_GFX: {os.environ.get('HSA_OVERRIDE_GFX_VERSION', 'NOT SET')}")
    logger.info("=" * 60)
    # Setup
    model, tokenizer = setup_model_and_tokenizer(config)
    train_dataset = load_jsonl_dataset(TRAIN_FILE, "training")
    # Eval dataset (optional — graceful degradation if missing)
    eval_dataset: Optional[Dataset] = None
    eval_samples: int = 0
    try:
        eval_dataset = load_jsonl_dataset(EVAL_FILE, "evaluation")
        eval_samples = len(eval_dataset)
    except FileNotFoundError:
        logger.warning("⚠️ Eval file not found — training without evaluation metrics")
    # Output directory
    tier_output_dir: str = os.path.join(OUTPUT_DIR, f"tier{tier}")
    os.makedirs(tier_output_dir, exist_ok=True)
    # Detect checkpoint for resume. Sort numerically by step: a plain lexicographic
    # sort would place checkpoint-1000 before checkpoint-500.
    resume_checkpoint: Optional[str] = None
    if resume:
        checkpoints = sorted(
            [
                os.path.join(tier_output_dir, d)
                for d in os.listdir(tier_output_dir)
                if d.startswith("checkpoint-")
            ],
            key=lambda p: int(p.rsplit("-", 1)[-1]),
        )
        if checkpoints:
            resume_checkpoint = checkpoints[-1]
            logger.info(f"♻️ Resuming from checkpoint: {resume_checkpoint}")
        else:
            logger.warning("⚠️ --resume requested but no checkpoint found, starting fresh")
    # SFTConfig = TrainingArguments + SFT-specific fields (trl >= 0.14)
    sft_config = SFTConfig(
        output_dir=tier_output_dir,
        num_train_epochs=config.num_epochs,
        per_device_train_batch_size=config.batch_size,
        gradient_accumulation_steps=config.gradient_accumulation,
        learning_rate=config.learning_rate,
        weight_decay=WEIGHT_DECAY,
        warmup_steps=int(WARMUP_RATIO * (len(train_dataset) // (config.batch_size * config.gradient_accumulation))),
        lr_scheduler_type="cosine",
        logging_steps=10,
        max_steps=max_steps,
        save_steps=config.save_steps,
        save_total_limit=3,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        max_grad_norm=MAX_GRAD_NORM,
        seed=SEED,
        report_to="none",
        optim="adamw_8bit",
        # SFT-specific fields
        max_length=config.max_seq_length,
        packing=True,  # Pack multiple short samples into one sequence for ~2.5x throughput
        dataset_text_field="text",
        eos_token=None,  # Prevent EOS_TOKEN mismatch with Qwen3 tokenizers
        # No eval during training — too slow for 5h target. Eval post-training.
        eval_strategy="no",
    )
    # SFTTrainer from trl — Unsloth patches this for 2x speed
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,  # Inner tokenizer (not VLM processor) for packing compat
        train_dataset=train_dataset,
        # Pass the eval split at init so SFTTrainer preprocesses it; the
        # post-training trainer.evaluate() call below needs tokenized data.
        eval_dataset=eval_dataset,
        args=sft_config,
    )
    # Train
    t0 = time.time()
    logger.info("🏋️ Training started...")
    train_result = trainer.train(resume_from_checkpoint=resume_checkpoint)
    duration = time.time() - t0
    # Save final adapter
    final_path: str = os.path.join(tier_output_dir, "final")
    model.save_pretrained(final_path)
    tokenizer.save_pretrained(final_path)
    # Save training metadata
    _save_training_metadata(
        output_dir=final_path,
        config=config,
        train_samples=len(train_dataset),
        eval_samples=eval_samples,
        duration_seconds=duration,
        final_metrics=train_result.metrics if train_result else None,
    )
    # Final report
    logger.info("=" * 60)
    logger.info(f"🏁 TRAINING COMPLETE FOR TIER {tier} ({config.model_id})")
    logger.info(f"   Adapter saved to: {final_path}")
    logger.info(f"   Train samples: {len(train_dataset):,}")
    logger.info(f"   Eval samples: {eval_samples:,}")
    logger.info(f"   Duration: {time.strftime('%Hh %Mm %Ss', time.gmtime(duration))}")
    logger.info(f"   Final train loss: {train_result.metrics.get('train_loss', 'N/A')}")
    if eval_dataset:
        eval_results = trainer.evaluate()
        logger.info(f"   Final eval loss: {eval_results.get('eval_loss', 'N/A')}")
    logger.info("=" * 60)
    # Cleanup VRAM
    del model, trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return final_path

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="OncoAgent Unsloth QLoRA Fine-Tuning (Dual-Tier Qwen Architecture)"
)
parser.add_argument(
"--tier",
type=int,
choices=[1, 2],
required=True,
help="Select the architectural tier to train (1 = Qwen 3.5 9B Speed, 2 = Qwen 3.6 27B Reasoning)",
)
parser.add_argument(
"--max_steps",
type=int,
default=-1,
help="Maximum number of training steps (useful for validation/dry-runs)",
)
parser.add_argument(
"--resume",
action="store_true",
help="Resume from the last checkpoint (critical for crash recovery on cloud instances)",
)
args = parser.parse_args()
train(args.tier, max_steps=args.max_steps, resume=args.resume)
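# Example invocations (script filename is illustrative):
#   python finetune_oncoagent.py --tier 1 --max_steps 50   # smoke-test the 9B pipeline
#   python finetune_oncoagent.py --tier 2 --resume         # restart the 27B run after a crash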
