| """ | |
| OncoAgent — QLoRA Fine-Tuning Script (Unsloth-Accelerated). | |
| Uses Unsloth's FastLanguageModel for 2x faster training with 60% less VRAM | |
| on AMD Instinct MI300X (ROCm). 4-bit NF4 quantization via bitsandbytes. | |
| Key features: | |
| - Unsloth-optimized kernels for Attention + Loss computation | |
| - Tier-adaptive hyperparameters (batch size, LoRA rank, seq length) | |
| - Automatic checkpoint resume on crash recovery | |
| - Eval split monitoring to detect overfitting | |
| - Training metadata saved for reproducibility audits | |
| Hardware Target: AMD Instinct MI300X (gfx942) via ROCm 7.0 + HIP. | |
| Rule Compliance: #3 (Qwen 3.5/3.6, format ChatML), #14 (QLoRA + bitsandbytes + PEFT), | |
| #22 (reproducibility seeds), #24 (.env), #26 (type hints). | |
| """ | |
import argparse
import json
import os
import time
import logging
from dataclasses import dataclass
from typing import Dict, List, Optional, Any

# ── Critical AMD MI300X Environment ─────────────────────────────────────────
# Must be set BEFORE any ROCm/HIP library is invoked.
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "9.4.2"
os.environ["HF_HUB_DISABLE_XET"] = "1"

from dotenv import load_dotenv

load_dotenv()

import torch
from datasets import Dataset

# Unsloth MUST be imported before trl to apply monkey-patches correctly
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig

# ── Reproducibility (Rule #22) ──────────────────────────────────────────────
SEED: int = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ── Logging ──────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO"),
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

# ── Tier-Adaptive Configuration ─────────────────────────────────────────────
@dataclass
class TierConfig:
    """Tier-specific hyperparameters tuned for MI300X 192GB HBM3."""

    model_id: str
    lora_r: int
    lora_alpha: int
    max_seq_length: int
    batch_size: int
    gradient_accumulation: int
    learning_rate: float
    num_epochs: int
    save_steps: int

TIER_CONFIGS: Dict[int, TierConfig] = {
    1: TierConfig(
        model_id="Qwen/Qwen3.5-9B",
        lora_r=16,
        lora_alpha=32,
        max_seq_length=2048,
        batch_size=8,             # Proven ~16s/step throughput on MI300X with Unsloth
        gradient_accumulation=2,  # Effective batch = 16
        learning_rate=2e-4,
        num_epochs=3,             # Maximize clinical knowledge retention within ~2.5h on MI300X
        save_steps=500,
    ),
    2: TierConfig(
        model_id="Qwen/Qwen3.6-27B",
        lora_r=32,                # Higher rank for 27B capacity
        lora_alpha=64,
        max_seq_length=2048,
        batch_size=4,             # 27B needs smaller micro-batch
        gradient_accumulation=4,  # Effective batch = 16
        learning_rate=1e-4,       # Lower LR for larger model stability
        num_epochs=1,
        save_steps=250,           # More frequent checkpoints for 27B
    ),
}

# ── Shared Constants ────────────────────────────────────────────────────────
OUTPUT_DIR: str = os.path.join("models", "oncoagent_adapters")
TRAIN_FILE: str = os.path.join("data", "final", "train_oncoagent.jsonl")
EVAL_FILE: str = os.path.join("data", "final", "train_oncoagent_eval.jsonl")

LORA_DROPOUT: float = 0.0  # Unsloth optimized: 0 dropout for speed
LORA_TARGET_MODULES: List[str] = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
WARMUP_RATIO: float = 0.03
WEIGHT_DECAY: float = 0.01
MAX_GRAD_NORM: float = 1.0

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join("data", "final"), exist_ok=True)

# ── Data Loading ─────────────────────────────────────────────────────────────
def load_jsonl_dataset(path: str, label: str = "data") -> Dataset:
    """Load a JSONL file into a HuggingFace Dataset.

    Args:
        path: Path to the JSONL file.
        label: Human-readable label for logging.

    Returns:
        HuggingFace Dataset object.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"{label} file not found: {path}. "
            "Run download_hf_datasets.py and synthetic_generator.py first, "
            "then unify with dataset_builder.py."
        )
    texts: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                entry = json.loads(line.strip())
                text = entry.get("text", "")
                if text:
                    texts.append(text)
            except json.JSONDecodeError:
                continue
    logger.info(f"📊 Loaded {len(texts):,} {label} samples from {path}")
    return Dataset.from_dict({"text": texts})
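
# Illustrative input record for the loader above (one JSON object per line; only
# the "text" field is consumed — assumed here to already be ChatML-formatted):
#   {"text": "<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n...<|im_end|>"}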

# ── Model Setup (Unsloth) ────────────────────────────────────────────────────
def setup_model_and_tokenizer(
    config: TierConfig,
) -> tuple:
    """Load model with Unsloth's FastLanguageModel (4-bit, LoRA-ready).

    Args:
        config: The tier-specific configuration.

    Returns:
        Tuple of (model, tokenizer).
    """
    # FastLanguageModel imported at module level
    hf_token: Optional[str] = os.getenv("HF_TOKEN")

    logger.info(f"📦 Loading model via Unsloth: {config.model_id}")
    logger.info("   Device: cuda (ROCm maps cuda→hip, gfx942 via HSA_OVERRIDE)")
    logger.info("   Quantization: 4-bit NF4 (Unsloth managed)")
    logger.info(f"   LoRA rank: {config.lora_r}, alpha: {config.lora_alpha}")

    # Unsloth handles quantization, model loading, and optimization in one call
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config.model_id,
        max_seq_length=config.max_seq_length,
        load_in_4bit=True,
        token=hf_token,
    )

    # Extract the actual tokenizer for pad_token setup
    # Qwen3.5 returns a Qwen3VLProcessor wrapper
    actual_tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
    logger.info(f"   Tokenizer type: {type(tokenizer).__name__} → inner: {type(actual_tokenizer).__name__}")
    logger.info(f"   EOS token: {actual_tokenizer.eos_token!r}")
    if actual_tokenizer.pad_token is None:
        actual_tokenizer.pad_token = actual_tokenizer.eos_token

    # Apply LoRA via Unsloth's optimized path
    model = FastLanguageModel.get_peft_model(
        model,
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        bias="none",
        use_gradient_checkpointing="unsloth",  # Unsloth's 2x speedup
        random_state=SEED,
    )

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logger.info(
        f"🧠 LoRA applied: {trainable:,} trainable / {total:,} total "
        f"({100 * trainable / total:.2f}%)"
    )
    return model, actual_tokenizer  # Return inner tokenizer for SFTTrainer compatibility
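
# Example use (hypothetical standalone call — in this script the function is only
# invoked from train() below):
#   model, tokenizer = setup_model_and_tokenizer(TIER_CONFIGS[1])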

# ── Training Metadata ────────────────────────────────────────────────────────
def _save_training_metadata(
    output_dir: str,
    config: TierConfig,
    train_samples: int,
    eval_samples: int,
    duration_seconds: float,
    final_metrics: Optional[Dict[str, Any]] = None,
) -> None:
    """Save a JSON metadata file alongside the adapter for audit trails.

    Args:
        output_dir: Directory to save to.
        config: The tier config used.
        train_samples: Number of training samples.
        eval_samples: Number of eval samples.
        duration_seconds: Wall-clock training time.
        final_metrics: Optional metrics from trainer.
    """
    meta: Dict[str, Any] = {
        "project": "OncoAgent",
        "model_id": config.model_id,
        "tier": 1 if "9B" in config.model_id else 2,
        "lora_r": config.lora_r,
        "lora_alpha": config.lora_alpha,
        "max_seq_length": config.max_seq_length,
        "effective_batch_size": config.batch_size * config.gradient_accumulation,
        "learning_rate": config.learning_rate,
        "num_epochs": config.num_epochs,
        "train_samples": train_samples,
        "eval_samples": eval_samples,
        "duration_seconds": round(duration_seconds, 1),
        "duration_human": time.strftime("%Hh %Mm %Ss", time.gmtime(duration_seconds)),
        "hardware": "AMD Instinct MI300X (192GB HBM3)",
        "framework": "Unsloth 2026.5.2 + ROCm 7.0 + bitsandbytes (NF4) + PEFT/LoRA",
        "seed": SEED,
    }
    if final_metrics:
        meta["final_metrics"] = {
            k: round(v, 6) if isinstance(v, float) else v
            for k, v in final_metrics.items()
        }
    path = os.path.join(output_dir, "training_metadata.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2, ensure_ascii=False)
    logger.info(f"📝 Training metadata saved to {path}")

# ── Training ─────────────────────────────────────────────────────────────────
def train(
    tier: int,
    max_steps: int = -1,
    resume: bool = False,
) -> str:
    """Execute the Unsloth-accelerated QLoRA training pipeline.

    Args:
        tier: The architectural tier to train (1 for 9B, 2 for 27B).
        max_steps: If > 0, stop training after this many steps (useful for dry runs).
        resume: If True, resume from the last checkpoint in the output directory.

    Returns:
        Path to the saved adapter directory.
    """
    # SFTTrainer, SFTConfig imported at module level
    config = TIER_CONFIGS.get(tier)
    if not config:
        raise ValueError(f"Invalid tier: {tier}. Must be 1 or 2.")

    logger.info(f"🚀 Starting OncoAgent Unsloth Training Pipeline (Tier {tier})")
    logger.info("=" * 60)
    logger.info(f"   Model: {config.model_id}")
    logger.info("   Engine: Unsloth FastLanguageModel (2x speedup)")
    logger.info(f"   LoRA rank: {config.lora_r}")
    logger.info(f"   Batch size: {config.batch_size} × {config.gradient_accumulation} = {config.batch_size * config.gradient_accumulation}")
    logger.info(f"   Learning rate: {config.learning_rate}")
    logger.info(f"   Epochs: {config.num_epochs}")
    logger.info(f"   Seq length: {config.max_seq_length}")
    logger.info(f"   HSA_GFX: {os.environ.get('HSA_OVERRIDE_GFX_VERSION', 'NOT SET')}")
    logger.info("=" * 60)

    # Setup
    model, tokenizer = setup_model_and_tokenizer(config)
    train_dataset = load_jsonl_dataset(TRAIN_FILE, "training")

    # Eval dataset (optional — graceful degradation if missing)
    eval_dataset: Optional[Dataset] = None
    eval_samples: int = 0
    try:
        eval_dataset = load_jsonl_dataset(EVAL_FILE, "evaluation")
        eval_samples = len(eval_dataset)
    except FileNotFoundError:
        logger.warning("⚠️ Eval file not found — training without evaluation metrics")

    # Output directory
    tier_output_dir: str = os.path.join(OUTPUT_DIR, f"tier{tier}")
    os.makedirs(tier_output_dir, exist_ok=True)

    # Detect checkpoint for resume
    resume_checkpoint: Optional[str] = None
    if resume:
        # Sort numerically by step so that e.g. checkpoint-1000 ranks after checkpoint-500
        # (plain lexicographic sorting would pick the wrong "latest" checkpoint).
        checkpoints = sorted(
            [
                os.path.join(tier_output_dir, d)
                for d in os.listdir(tier_output_dir)
                if d.startswith("checkpoint-")
            ],
            key=lambda p: int(p.rsplit("-", 1)[-1]),
        )
        if checkpoints:
            resume_checkpoint = checkpoints[-1]
            logger.info(f"♻️ Resuming from checkpoint: {resume_checkpoint}")
        else:
            logger.warning("⚠️ --resume requested but no checkpoint found, starting fresh")

    # SFTConfig = TrainingArguments + SFT-specific fields (trl >=0.14)
    sft_config = SFTConfig(
        output_dir=tier_output_dir,
        num_train_epochs=config.num_epochs,
        per_device_train_batch_size=config.batch_size,
        gradient_accumulation_steps=config.gradient_accumulation,
        learning_rate=config.learning_rate,
        weight_decay=WEIGHT_DECAY,
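        # Note: this warms up over ~3% of one epoch's optimizer steps
        # (dataset size // effective batch), not 3% of all training steps.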
        warmup_steps=int(WARMUP_RATIO * (len(train_dataset) // (config.batch_size * config.gradient_accumulation))),
        lr_scheduler_type="cosine",
        logging_steps=10,
        max_steps=max_steps,
        save_steps=config.save_steps,
        save_total_limit=3,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        max_grad_norm=MAX_GRAD_NORM,
        seed=SEED,
        report_to="none",
        optim="adamw_8bit",
        # SFT-specific fields
        max_length=config.max_seq_length,
        packing=True,  # Pack multiple short samples into one sequence for ~2.5x throughput
        dataset_text_field="text",
        eos_token=None,  # Prevent EOS_TOKEN mismatch with Qwen3 tokenizers
        # No eval during training — too slow for 5h target. Eval post-training.
        eval_strategy="no",
    )

    # SFTTrainer from trl — Unsloth patches this for 2x speed
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,  # Inner tokenizer (not VLM processor) for packing compat
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,  # Needed for the post-training trainer.evaluate() call below
        args=sft_config,
    )

    # Train
    t0 = time.time()
    logger.info("🏋️ Training started...")
    train_result = trainer.train(resume_from_checkpoint=resume_checkpoint)
    duration = time.time() - t0

    # Save final adapter
    final_path: str = os.path.join(tier_output_dir, "final")
    model.save_pretrained(final_path)
    tokenizer.save_pretrained(final_path)

    # Save training metadata
    _save_training_metadata(
        output_dir=final_path,
        config=config,
        train_samples=len(train_dataset),
        eval_samples=eval_samples,
        duration_seconds=duration,
        final_metrics=train_result.metrics if train_result else None,
    )

    # Final report
    logger.info("=" * 60)
    logger.info(f"🏁 TRAINING COMPLETE FOR TIER {tier} ({config.model_id})")
    logger.info(f"   Adapter saved to: {final_path}")
    logger.info(f"   Train samples: {len(train_dataset):,}")
    logger.info(f"   Eval samples: {eval_samples:,}")
    logger.info(f"   Duration: {time.strftime('%Hh %Mm %Ss', time.gmtime(duration))}")
    logger.info(f"   Final train loss: {train_result.metrics.get('train_loss', 'N/A')}")
    if eval_dataset:
        eval_results = trainer.evaluate()
        logger.info(f"   Final eval loss: {eval_results.get('eval_loss', 'N/A')}")
    logger.info("=" * 60)

    # Cleanup VRAM
    del model, trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return final_path


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="OncoAgent Unsloth QLoRA Fine-Tuning (Dual-Tier Qwen Architecture)"
    )
    parser.add_argument(
        "--tier",
        type=int,
        choices=[1, 2],
        required=True,
        help="Select the architectural tier to train (1 = Qwen 3.5 9B Speed, 2 = Qwen 3.6 27B Reasoning)",
    )
    parser.add_argument(
        "--max_steps",
        type=int,
        default=-1,
        help="Maximum number of training steps (useful for validation/dry-runs)",
    )
    parser.add_argument(
        "--resume",
        action="store_true",
        help="Resume from the last checkpoint (critical for crash recovery on cloud instances)",
    )
    args = parser.parse_args()

    train(args.tier, max_steps=args.max_steps, resume=args.resume)
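
# Example invocations (the script filename below is illustrative — substitute the
# actual file name used in this repo):
#   python finetune_oncoagent.py --tier 1                 # full Tier-1 (9B) run
#   python finetune_oncoagent.py --tier 2 --max_steps 20  # Tier-2 dry run
#   python finetune_oncoagent.py --tier 1 --resume        # resume after a crash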