# Upload docker-compose.yml with huggingface_hub

# 7abebba verified 4 months ago

# 5.95 kB

	version: '3.8'

	services:
	azimuth-training:
	image: nvcr.io/nvidia/pytorch:24.01-py3
	container_name: azimuth-training
	environment:
	- NVIDIA_VISIBLE_DEVICES=all
	volumes:
	- /workspace:/workspace
	working_dir: /workspace
	command: \|
	bash -c '
	echo "============================================================"
	echo " AZIMUTH CONVERSATIONAL TRAINING"
	echo " GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader)"
	echo "============================================================"

	pip install datasets transformers einops tqdm torch
	mkdir -p /workspace/data /workspace/checkpoints

	# Download and convert conversational data from HuggingFace
	python -c "
	import torch
	from datasets import load_dataset
	from pathlib import Path

	print(\"Downloading conversational datasets from HuggingFace...\")
	samples = []

	# OpenAssistant
	print(\" Loading OpenAssistant/oasst1...\")
	ds = load_dataset(\"OpenAssistant/oasst1\", split=\"train\")
	for s in list(ds)[:20000]:
	text = s.get(\"text\", \"\")
	if len(text) > 50:
	b = list(text.encode(\"utf-8\"))
	if len(b) > 10:
	samples.append({\"input_ids\": torch.tensor(b[:-1], dtype=torch.long), \"labels\": torch.tensor(b[1:], dtype=torch.long)})

	# Alpaca
	print(\" Loading tatsu-lab/alpaca...\")
	ds = load_dataset(\"tatsu-lab/alpaca\", split=\"train\")
	for s in list(ds)[:30000]:
	text = f\"User: {s.get(\"instruction\", \"\")}\\nAssistant: {s.get(\"output\", \"\")}\"
	if len(text) > 50:
	b = list(text.encode(\"utf-8\"))
	samples.append({\"input_ids\": torch.tensor(b[:-1], dtype=torch.long), \"labels\": torch.tensor(b[1:], dtype=torch.long)})

	print(f\"Total samples: {len(samples)}\")
	torch.save(samples, \"/workspace/data/train.pt\")
	print(\"Saved to /workspace/data/train.pt\")
	"

	# Training script
	python -c "
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import random
	import time

	print(\"============================================================\")
	print(\" TRAINING AZIMUTH (Binary-Native Transformer)\")
	print(\"============================================================\")

	class AzimuthModel(nn.Module):
	def __init__(self, d_model=1024, n_layers=24, n_heads=16, max_seq=1024):
	super().__init__()
	self.emb = nn.Embedding(256, d_model)
	self.pos = nn.Embedding(max_seq, d_model)
	layer = nn.TransformerEncoderLayer(d_model, n_heads, d_model*4, dropout=0.1, batch_first=True, norm_first=True)
	self.transformer = nn.TransformerEncoder(layer, n_layers)
	self.head = nn.Linear(d_model, 256)
	self.d_model = d_model

	def forward(self, x):
	B, T = x.shape
	pos = torch.arange(T, device=x.device)
	h = self.emb(x) + self.pos(pos)
	mask = nn.Transformer.generate_square_subsequent_mask(T, device=x.device)
	h = self.transformer(h, mask=mask, is_causal=True)
	return self.head(h)

	# Load data
	data = torch.load(\"/workspace/data/train.pt\")
	print(f\"Loaded {len(data)} samples\")

	# Create model
	model = AzimuthModel(d_model=1024, n_layers=24, n_heads=16).cuda()
	params = sum(p.numel() for p in model.parameters())
	print(f\"Model: {params:,} parameters\")

	opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
	scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=100000)

	# Training loop
	STEPS = 100000
	BATCH = 8
	SEQ_LEN = 512

	print(f\"Training for {STEPS} steps...\")
	print(\"-\" * 60)

	start = time.time()
	for step in range(STEPS):
	# Get batch
	batch_x, batch_y = [], []
	for _ in range(BATCH):
	s = random.choice(data)
	x = s[\"input_ids\"][:SEQ_LEN]
	y = s[\"labels\"][:SEQ_LEN]
	if len(x) < SEQ_LEN:
	x = F.pad(x, (0, SEQ_LEN - len(x)))
	y = F.pad(y, (0, SEQ_LEN - len(y)))
	batch_x.append(x)
	batch_y.append(y)

	x = torch.stack(batch_x).cuda()
	y = torch.stack(batch_y).cuda()

	# Forward
	logits = model(x)
	loss = F.cross_entropy(logits.view(-1, 256), y.view(-1), ignore_index=0)

	# Backward
	opt.zero_grad()
	loss.backward()
	torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
	opt.step()
	scheduler.step()

	# Log
	if step % 100 == 0:
	elapsed = time.time() - start
	eta = elapsed / (step + 1) * (STEPS - step) / 60
	print(f\"Step {step:6d}/{STEPS} \| Loss: {loss.item():.4f} \| LR: {scheduler.get_last_lr()[0]:.2e} \| ETA: {eta:.0f}m\")

	# Checkpoint
	if step > 0 and step % 5000 == 0:
	torch.save({\"step\": step, \"model\": model.state_dict()}, f\"/workspace/checkpoints/step_{step}.pt\")
	print(f\" Saved checkpoint: step_{step}.pt\")

	# Generation sample
	if step > 0 and step % 2000 == 0:
	model.eval()
	prompt = \"User: Hello!\\nAssistant:\"
	x = torch.tensor([list(prompt.encode())], device=\"cuda\")
	with torch.no_grad():
	for _ in range(50):
	logits = model(x[:, -512:])
	probs = F.softmax(logits[0, -1] / 0.8, dim=-1)
	next_byte = torch.multinomial(probs, 1)
	x = torch.cat([x, next_byte.unsqueeze(0)], dim=1)
	if next_byte.item() == ord(\"\\n\"): break
	response = bytes(x[0].tolist()).decode(\"utf-8\", errors=\"replace\")
	print(f\" Sample: {response[len(prompt):80]}...\")
	model.train()

	# Save final
	torch.save({\"step\": STEPS, \"model\": model.state_dict()}, \"/workspace/checkpoints/final.pt\")
	print(\"\\n\" + \"=\" * 60)
	print(\"TRAINING COMPLETE!\")
	print(f\"Final checkpoint: /workspace/checkpoints/final.pt\")
	"
	'
	deploy:
	resources:
	reservations:
	devices:
	- driver: nvidia
	count: all
	capabilities: [gpu]