version: '3.8'

services:
  azimuth-training:
    image: nvcr.io/nvidia/pytorch:24.01-py3
    container_name: azimuth-training
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    volumes:
      - /workspace:/workspace
    working_dir: /workspace
    # The command is one single-quoted bash script holding two double-quoted
    # inline Python programs. Rules for editing it:
    #   * no single quote (apostrophe) anywhere inside, comments included;
    #   * every double quote inside a Python program is written \" and every
    #     backslash is written \\;
    #   * a literal dollar sign is written $$ so Compose does not try to
    #     interpolate it;
    #   * top-level Python statements stay at the block indent, otherwise
    #     Python sees leading whitespace and raises IndentationError.
    command: |
      bash -c '
      # Stop at the first failing step so training never starts on missing data.
      set -e

      echo "============================================================"
      echo " AZIMUTH CONVERSATIONAL TRAINING"
      echo " GPU: $$(nvidia-smi --query-gpu=name --format=csv,noheader)"
      echo "============================================================"

      pip install datasets transformers einops tqdm torch
      mkdir -p /workspace/data /workspace/checkpoints

      # Download and convert conversational data from HuggingFace
      python -c "
      import torch
      from datasets import load_dataset
      from itertools import islice

      # Texts with this many characters or fewer are skipped.
      MIN_CHARS = 50


      def encode_sample(text):
          # Build a next-byte prediction pair from the UTF-8 bytes of text:
          # input_ids is every byte but the last, labels is every byte but the first.
          # Returns None when the text is too short to keep.
          if len(text) <= MIN_CHARS:
              return None
          raw = list(text.encode(\"utf-8\"))
          return {
              \"input_ids\": torch.tensor(raw[:-1], dtype=torch.long),
              \"labels\": torch.tensor(raw[1:], dtype=torch.long),
          }


      print(\"Downloading conversational datasets from HuggingFace...\")
      samples = []

      print(\" Loading OpenAssistant/oasst1...\")
      ds = load_dataset(\"OpenAssistant/oasst1\", split=\"train\")
      # islice walks only the first rows instead of copying the whole split into a list.
      for row in islice(ds, 20000):
          sample = encode_sample(row.get(\"text\", \"\"))
          if sample is not None:
              samples.append(sample)

      print(\" Loading tatsu-lab/alpaca...\")
      ds = load_dataset(\"tatsu-lab/alpaca\", split=\"train\")
      for row in islice(ds, 30000):
          # Fields are read into locals first: reusing the enclosing quote character
          # inside an f-string replacement field is a SyntaxError before Python 3.12.
          instruction = row.get(\"instruction\", \"\")
          output = row.get(\"output\", \"\")
          sample = encode_sample(f\"User: {instruction}\\nAssistant: {output}\")
          if sample is not None:
              samples.append(sample)

      print(f\"Total samples: {len(samples)}\")
      torch.save(samples, \"/workspace/data/train.pt\")
      print(\"Saved to /workspace/data/train.pt\")
      "

      # Training script
      python -c "
      import torch
      import torch.nn as nn
      import torch.nn.functional as F
      import random
      import time

      print(\"============================================================\")
      print(\" TRAINING AZIMUTH (Binary-Native Transformer)\")
      print(\"============================================================\")


      class AzimuthModel(nn.Module):
          # Byte-level causal transformer: 256-entry byte embedding plus a learned
          # position embedding, a pre-norm encoder stack, and a 256-way output head.

          def __init__(self, d_model=1024, n_layers=24, n_heads=16, max_seq=1024):
              super().__init__()
              self.emb = nn.Embedding(256, d_model)
              self.pos = nn.Embedding(max_seq, d_model)
              layer = nn.TransformerEncoderLayer(
                  d_model,
                  n_heads,
                  d_model * 4,
                  dropout=0.1,
                  batch_first=True,
                  norm_first=True,
              )
              self.transformer = nn.TransformerEncoder(layer, n_layers)
              self.head = nn.Linear(d_model, 256)
              self.d_model = d_model

          def forward(self, x):
              # x is a (batch, seq) tensor of byte values; seq must not exceed max_seq.
              # Returns (batch, seq, 256) logits over the next byte.
              B, T = x.shape
              pos = torch.arange(T, device=x.device)
              h = self.emb(x) + self.pos(pos)
              # Causal mask: each position attends only to itself and earlier positions.
              mask = nn.Transformer.generate_square_subsequent_mask(T, device=x.device)
              h = self.transformer(h, mask=mask, is_causal=True)
              return self.head(h)


      STEPS = 100000
      BATCH = 8
      SEQ_LEN = 512
      # Label value used for padding. It lies outside the 0-255 byte range, so a
      # genuine NUL byte in the data is still a scored target.
      IGNORE_INDEX = -100

      data = torch.load(\"/workspace/data/train.pt\")
      print(f\"Loaded {len(data)} samples\")

      model = AzimuthModel(d_model=1024, n_layers=24, n_heads=16).cuda()
      params = sum(p.numel() for p in model.parameters())
      print(f\"Model: {params:,} parameters\")

      opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
      # The cosine schedule spans exactly the configured number of steps.
      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=STEPS)

      print(f\"Training for {STEPS} steps...\")
      print(\"-\" * 60)

      start = time.time()
      for step in range(STEPS):
          # Assemble a batch of random samples, truncated or right-padded to SEQ_LEN.
          batch_x, batch_y = [], []
          for _ in range(BATCH):
              s = random.choice(data)
              x = s[\"input_ids\"][:SEQ_LEN]
              y = s[\"labels\"][:SEQ_LEN]
              pad = SEQ_LEN - len(x)
              if pad > 0:
                  x = F.pad(x, (0, pad))
                  y = F.pad(y, (0, pad), value=IGNORE_INDEX)
              batch_x.append(x)
              batch_y.append(y)

          x = torch.stack(batch_x).cuda()
          y = torch.stack(batch_y).cuda()

          logits = model(x)
          loss = F.cross_entropy(logits.view(-1, 256), y.view(-1), ignore_index=IGNORE_INDEX)

          opt.zero_grad()
          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          opt.step()
          scheduler.step()

          if step % 100 == 0:
              elapsed = time.time() - start
              # Average seconds per step so far, times remaining steps, in minutes.
              eta = elapsed / (step + 1) * (STEPS - step) / 60
              print(f\"Step {step:6d}/{STEPS} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.2e} | ETA: {eta:.0f}m\")

          if step > 0 and step % 5000 == 0:
              torch.save({\"step\": step, \"model\": model.state_dict()}, f\"/workspace/checkpoints/step_{step}.pt\")
              print(f\" Saved checkpoint: step_{step}.pt\")

          if step > 0 and step % 2000 == 0:
              # Periodic qualitative check: sample up to 50 bytes at temperature 0.8,
              # stopping early at the first newline.
              model.eval()
              prompt = \"User: Hello!\\nAssistant:\"
              ctx = torch.tensor([list(prompt.encode())], device=\"cuda\")
              with torch.no_grad():
                  for _ in range(50):
                      sample_logits = model(ctx[:, -SEQ_LEN:])
                      probs = F.softmax(sample_logits[0, -1] / 0.8, dim=-1)
                      next_byte = torch.multinomial(probs, 1)
                      ctx = torch.cat([ctx, next_byte.unsqueeze(0)], dim=1)
                      if next_byte.item() == ord(\"\\n\"):
                          break
              response = bytes(ctx[0].tolist()).decode(\"utf-8\", errors=\"replace\")
              print(f\" Sample: {response[len(prompt):80]}...\")
              model.train()

      torch.save({\"step\": STEPS, \"model\": model.state_dict()}, \"/workspace/checkpoints/final.pt\")
      print(\"\\n\" + \"=\" * 60)
      print(\"TRAINING COMPLETE!\")
      print(\"Final checkpoint: /workspace/checkpoints/final.pt\")
      "
      '
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]