Spaces:

lablab-ai-amd-developer-hackathon
/

ReplayLab

Running

App Files Files Community

ReplayLab / run_gpu_experiments.sh

Roopalgn

Deploy ReplayLab to HF Spaces

fe42aec 2 days ago

raw

history blame contribute delete

7.8 kB

	#!/bin/bash
	# ReplayLab GPU Experiment Runner for AMD Developer Cloud
	# Run this on the MI300X droplet after cloning the repo.
	#
	# The vLLM Quick Start image runs vLLM inside Docker.
	# This script handles both Docker and direct execution.

	set -e

	echo "========================================="
	echo "ReplayLab - AMD MI300X GPU Experiments"
	echo "========================================="

	# Check ROCm
	echo ""
	echo "[1/6] Checking ROCm and GPU..."
	rocm-smi --showid
	echo ""
	rocm-smi --showmeminfo vram

	# Save GPU telemetry baseline
	echo ""
	echo "[2/6] Saving GPU telemetry baseline..."
	mkdir -p replaylab/runs/gpu_evidence
	rocm-smi --json > replaylab/runs/gpu_evidence/rocm_smi_baseline.json 2>/dev/null \|\| echo "JSON output not available, using text"
	rocm-smi > replaylab/runs/gpu_evidence/rocm_smi_baseline.txt

	# Determine if vLLM is available directly or via Docker
	VLLM_CMD=""
	if python3 -c "import vllm" 2>/dev/null; then
	VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server"
	echo "[*] vLLM available directly"
	elif docker exec -it rocm python3 -c "import vllm" 2>/dev/null; then
	VLLM_CMD="docker exec rocm python3 -m vllm.entrypoints.openai.api_server"
	echo "[*] vLLM available via Docker (rocm container)"
	else
	echo "[!] vLLM not found. Installing..."
	pip install vllm
	VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server"
	fi

	# --- SCENARIO 1: OOM (Bad config) ---
	echo ""
	echo "[3/6] Running OOM scenario (bad config)..."
	echo " Starting vLLM with max_model_len=65536, gpu_memory_utilization=0.99"
	echo " This SHOULD fail with OOM..."

	mkdir -p replaylab/runs/gpu_oom
	timeout 120 $VLLM_CMD \
	--model Qwen/Qwen2.5-7B-Instruct \
	--max-model-len 65536 \
	--gpu-memory-utilization 0.99 \
	--port 8000 \
	> replaylab/runs/gpu_oom/stdout.txt 2> replaylab/runs/gpu_oom/stderr.txt \|\| true

	OOM_EXIT=$?
	echo " Exit code: $OOM_EXIT"

	# Capture GPU state after OOM attempt
	rocm-smi --json > replaylab/runs/gpu_evidence/rocm_smi_after_oom.json 2>/dev/null \|\| true
	rocm-smi > replaylab/runs/gpu_evidence/rocm_smi_after_oom.txt

	# Write metrics
	python3 -c "
	import json, time
	metrics = {
	'status': 'failed',
	'failure_type': 'gpu_oom',
	'scenario': 'oom',
	'exit_code': $OOM_EXIT,
	'model': 'Qwen/Qwen2.5-7B-Instruct',
	'max_model_len': 65536,
	'gpu_memory_utilization': 0.99,
	'timestamp': time.time(),
	'memory_pressure': True,
	'batch_size': 65536,
	'available_memory_mb': 192000,
	'estimated_memory_mb': 250000,
	'throughput_items_per_sec': 0,
	}
	with open('replaylab/runs/gpu_oom/metrics.json', 'w') as f:
	json.dump(metrics, f, indent=2)

	artifact = {
	'status': 'failed',
	'cause': 'gpu_oom_max_model_len_too_large',
	'summary': 'vLLM failed: max_model_len=65536 with gpu_memory_utilization=0.99 exceeds MI300X VRAM.',
	'recommendation': 'Reduce max_model_len to 4096 and gpu_memory_utilization to 0.9.',
	}
	with open('replaylab/runs/gpu_oom/artifact.json', 'w') as f:
	json.dump(artifact, f, indent=2)
	"
	echo " ❌ OOM scenario recorded"

	# --- SCENARIO 2: Recovered (Good config) ---
	echo ""
	echo "[4/6] Running RECOVERED scenario (good config)..."
	echo " Starting vLLM with max_model_len=4096, gpu_memory_utilization=0.9"

	mkdir -p replaylab/runs/gpu_recovered

	# Start vLLM server in background with safe settings
	$VLLM_CMD \
	--model Qwen/Qwen2.5-7B-Instruct \
	--max-model-len 4096 \
	--gpu-memory-utilization 0.9 \
	--port 8000 \
	> replaylab/runs/gpu_recovered/server_stdout.txt 2> replaylab/runs/gpu_recovered/server_stderr.txt &

	VLLM_PID=$!
	echo " vLLM server PID: $VLLM_PID"
	echo " Waiting for server to be ready..."

	# Wait for server health
	for i in $(seq 1 90); do
	if curl -s http://localhost:8000/health > /dev/null 2>&1; then
	echo " Server ready after ${i}s"
	break
	fi
	if ! kill -0 $VLLM_PID 2>/dev/null; then
	echo " Server died. Check logs."
	cat replaylab/runs/gpu_recovered/server_stderr.txt \| tail -20
	break
	fi
	sleep 2
	done

	# Send inference requests
	echo " Sending inference requests..."
	python3 -c "
	import json, time, urllib.request

	prompts = [
	'Explain GPU memory management in one paragraph.',
	'What is ROCm and why does it matter for AI?',
	'Describe the difference between batch size 8 and 64 for inference.',
	'What causes out-of-memory errors in model serving?',
	'How does vLLM optimize GPU memory usage?',
	'What is the MI300X architecture advantage?',
	'Explain KV cache memory in transformer inference.',
	'Why does larger context length require more GPU memory?',
	]

	start = time.perf_counter()
	results = []
	errors = []

	for prompt in prompts:
	payload = json.dumps({
	'model': 'Qwen/Qwen2.5-7B-Instruct',
	'prompt': prompt,
	'max_tokens': 128,
	'temperature': 0.0,
	}).encode('utf-8')
	req = urllib.request.Request(
	'http://localhost:8000/v1/completions',
	data=payload,
	headers={'Content-Type': 'application/json'},
	)
	try:
	with urllib.request.urlopen(req, timeout=60) as resp:
	result = json.loads(resp.read().decode('utf-8'))
	results.append(result)
	print(f' ✓ Prompt {len(results)}: {len(result[\"choices\"][0][\"text\"])} chars')
	except Exception as e:
	errors.append(str(e))
	print(f' ✗ Error: {e}')

	duration = time.perf_counter() - start
	throughput = len(results) / max(duration, 0.001)

	print(f' Completed: {len(results)}/{len(prompts)} in {duration:.1f}s')
	print(f' Throughput: {throughput:.2f} prompts/sec')

	metrics = {
	'status': 'succeeded',
	'failure_type': None,
	'scenario': 'recovered',
	'exit_code': 0,
	'model': 'Qwen/Qwen2.5-7B-Instruct',
	'max_model_len': 4096,
	'gpu_memory_utilization': 0.9,
	'timestamp': time.time(),
	'memory_pressure': False,
	'batch_size': 8,
	'available_memory_mb': 192000,
	'estimated_memory_mb': 45000,
	'throughput_items_per_sec': round(throughput, 3),
	'total_prompts': len(prompts),
	'successful': len(results),
	'errors': len(errors),
	'duration_sec': round(duration, 3),
	}
	with open('replaylab/runs/gpu_recovered/metrics.json', 'w') as f:
	json.dump(metrics, f, indent=2)

	artifact = {
	'status': 'succeeded',
	'cause': None,
	'summary': f'Inference succeeded: {len(results)}/{len(prompts)} prompts in {duration:.1f}s.',
	'recommendation': 'Keep max_model_len=4096 and gpu_memory_utilization=0.9 for stable serving.',
	'sample_output': results[0]['choices'][0]['text'][:200] if results else '',
	}
	with open('replaylab/runs/gpu_recovered/artifact.json', 'w') as f:
	json.dump(artifact, f, indent=2)
	"

	# Kill the vLLM server
	kill $VLLM_PID 2>/dev/null \|\| true
	wait $VLLM_PID 2>/dev/null \|\| true
	echo " ✅ Recovered scenario complete"

	# Capture GPU state after successful run
	rocm-smi --json > replaylab/runs/gpu_evidence/rocm_smi_after_recovery.json 2>/dev/null \|\| true
	rocm-smi > replaylab/runs/gpu_evidence/rocm_smi_after_recovery.txt

	# --- Save final evidence ---
	echo ""
	echo "[5/6] Saving final GPU evidence..."
	rocm-smi --showmeminfo vram > replaylab/runs/gpu_evidence/vram_final.txt
	cat /proc/driver/amdgpu/version > replaylab/runs/gpu_evidence/amdgpu_version.txt 2>/dev/null \|\| true

	# --- Summary ---
	echo ""
	echo "[6/6] Done!"
	echo "========================================="
	echo "Results saved in replaylab/runs/"
	echo " gpu_oom/ - Failed OOM scenario"
	echo " gpu_recovered/ - Successful recovery"
	echo " gpu_evidence/ - GPU telemetry snapshots"
	echo "========================================="
	echo ""
	echo "Next: commit and push results"
	echo " git add -A"
	echo " git commit -m 'Add real MI300X experiment results'"
	echo " git push origin main"