| #!/bin/bash |
| |
| |
| |
| |
| |
|
|
| set -e |
|
|
| echo "=========================================" |
| echo "ReplayLab - AMD MI300X GPU Experiments" |
| echo "=========================================" |
|
|
| |
| echo "" |
| echo "[1/6] Checking ROCm and GPU..." |
| rocm-smi --showid |
| echo "" |
| rocm-smi --showmeminfo vram |
|
|
| |
| echo "" |
| echo "[2/6] Saving GPU telemetry baseline..." |
| mkdir -p replaylab/runs/gpu_evidence |
| rocm-smi --json > replaylab/runs/gpu_evidence/rocm_smi_baseline.json 2>/dev/null || echo "JSON output not available, using text" |
| rocm-smi > replaylab/runs/gpu_evidence/rocm_smi_baseline.txt |
|
|
| |
| VLLM_CMD="" |
| if python3 -c "import vllm" 2>/dev/null; then |
| VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server" |
| echo "[*] vLLM available directly" |
| elif docker exec -it rocm python3 -c "import vllm" 2>/dev/null; then |
| VLLM_CMD="docker exec rocm python3 -m vllm.entrypoints.openai.api_server" |
| echo "[*] vLLM available via Docker (rocm container)" |
| else |
| echo "[!] vLLM not found. Installing..." |
| pip install vllm |
| VLLM_CMD="python3 -m vllm.entrypoints.openai.api_server" |
| fi |
|
|
| |
| echo "" |
| echo "[3/6] Running OOM scenario (bad config)..." |
| echo " Starting vLLM with max_model_len=65536, gpu_memory_utilization=0.99" |
| echo " This SHOULD fail with OOM..." |
|
|
| mkdir -p replaylab/runs/gpu_oom |
| timeout 120 $VLLM_CMD \ |
| --model Qwen/Qwen2.5-7B-Instruct \ |
| --max-model-len 65536 \ |
| --gpu-memory-utilization 0.99 \ |
| --port 8000 \ |
| > replaylab/runs/gpu_oom/stdout.txt 2> replaylab/runs/gpu_oom/stderr.txt || true |
|
|
| OOM_EXIT=$? |
| echo " Exit code: $OOM_EXIT" |
|
|
| |
| rocm-smi --json > replaylab/runs/gpu_evidence/rocm_smi_after_oom.json 2>/dev/null || true |
| rocm-smi > replaylab/runs/gpu_evidence/rocm_smi_after_oom.txt |
|
|
| |
| python3 -c " |
| import json, time |
| metrics = { |
| 'status': 'failed', |
| 'failure_type': 'gpu_oom', |
| 'scenario': 'oom', |
| 'exit_code': $OOM_EXIT, |
| 'model': 'Qwen/Qwen2.5-7B-Instruct', |
| 'max_model_len': 65536, |
| 'gpu_memory_utilization': 0.99, |
| 'timestamp': time.time(), |
| 'memory_pressure': True, |
| 'batch_size': 65536, |
| 'available_memory_mb': 192000, |
| 'estimated_memory_mb': 250000, |
| 'throughput_items_per_sec': 0, |
| } |
| with open('replaylab/runs/gpu_oom/metrics.json', 'w') as f: |
| json.dump(metrics, f, indent=2) |
| |
| artifact = { |
| 'status': 'failed', |
| 'cause': 'gpu_oom_max_model_len_too_large', |
| 'summary': 'vLLM failed: max_model_len=65536 with gpu_memory_utilization=0.99 exceeds MI300X VRAM.', |
| 'recommendation': 'Reduce max_model_len to 4096 and gpu_memory_utilization to 0.9.', |
| } |
| with open('replaylab/runs/gpu_oom/artifact.json', 'w') as f: |
| json.dump(artifact, f, indent=2) |
| " |
| echo " ❌ OOM scenario recorded" |
|
|
| |
| echo "" |
| echo "[4/6] Running RECOVERED scenario (good config)..." |
| echo " Starting vLLM with max_model_len=4096, gpu_memory_utilization=0.9" |
|
|
| mkdir -p replaylab/runs/gpu_recovered |
|
|
| |
| $VLLM_CMD \ |
| --model Qwen/Qwen2.5-7B-Instruct \ |
| --max-model-len 4096 \ |
| --gpu-memory-utilization 0.9 \ |
| --port 8000 \ |
| > replaylab/runs/gpu_recovered/server_stdout.txt 2> replaylab/runs/gpu_recovered/server_stderr.txt & |
|
|
| VLLM_PID=$! |
| echo " vLLM server PID: $VLLM_PID" |
| echo " Waiting for server to be ready..." |
|
|
| |
| for i in $(seq 1 90); do |
| if curl -s http://localhost:8000/health > /dev/null 2>&1; then |
| echo " Server ready after ${i}s" |
| break |
| fi |
| if ! kill -0 $VLLM_PID 2>/dev/null; then |
| echo " Server died. Check logs." |
| cat replaylab/runs/gpu_recovered/server_stderr.txt | tail -20 |
| break |
| fi |
| sleep 2 |
| done |
|
|
| |
| echo " Sending inference requests..." |
| python3 -c " |
| import json, time, urllib.request |
| |
| prompts = [ |
| 'Explain GPU memory management in one paragraph.', |
| 'What is ROCm and why does it matter for AI?', |
| 'Describe the difference between batch size 8 and 64 for inference.', |
| 'What causes out-of-memory errors in model serving?', |
| 'How does vLLM optimize GPU memory usage?', |
| 'What is the MI300X architecture advantage?', |
| 'Explain KV cache memory in transformer inference.', |
| 'Why does larger context length require more GPU memory?', |
| ] |
| |
| start = time.perf_counter() |
| results = [] |
| errors = [] |
| |
| for prompt in prompts: |
| payload = json.dumps({ |
| 'model': 'Qwen/Qwen2.5-7B-Instruct', |
| 'prompt': prompt, |
| 'max_tokens': 128, |
| 'temperature': 0.0, |
| }).encode('utf-8') |
| req = urllib.request.Request( |
| 'http://localhost:8000/v1/completions', |
| data=payload, |
| headers={'Content-Type': 'application/json'}, |
| ) |
| try: |
| with urllib.request.urlopen(req, timeout=60) as resp: |
| result = json.loads(resp.read().decode('utf-8')) |
| results.append(result) |
| print(f' ✓ Prompt {len(results)}: {len(result[\"choices\"][0][\"text\"])} chars') |
| except Exception as e: |
| errors.append(str(e)) |
| print(f' ✗ Error: {e}') |
| |
| duration = time.perf_counter() - start |
| throughput = len(results) / max(duration, 0.001) |
| |
| print(f' Completed: {len(results)}/{len(prompts)} in {duration:.1f}s') |
| print(f' Throughput: {throughput:.2f} prompts/sec') |
| |
| metrics = { |
| 'status': 'succeeded', |
| 'failure_type': None, |
| 'scenario': 'recovered', |
| 'exit_code': 0, |
| 'model': 'Qwen/Qwen2.5-7B-Instruct', |
| 'max_model_len': 4096, |
| 'gpu_memory_utilization': 0.9, |
| 'timestamp': time.time(), |
| 'memory_pressure': False, |
| 'batch_size': 8, |
| 'available_memory_mb': 192000, |
| 'estimated_memory_mb': 45000, |
| 'throughput_items_per_sec': round(throughput, 3), |
| 'total_prompts': len(prompts), |
| 'successful': len(results), |
| 'errors': len(errors), |
| 'duration_sec': round(duration, 3), |
| } |
| with open('replaylab/runs/gpu_recovered/metrics.json', 'w') as f: |
| json.dump(metrics, f, indent=2) |
| |
| artifact = { |
| 'status': 'succeeded', |
| 'cause': None, |
| 'summary': f'Inference succeeded: {len(results)}/{len(prompts)} prompts in {duration:.1f}s.', |
| 'recommendation': 'Keep max_model_len=4096 and gpu_memory_utilization=0.9 for stable serving.', |
| 'sample_output': results[0]['choices'][0]['text'][:200] if results else '', |
| } |
| with open('replaylab/runs/gpu_recovered/artifact.json', 'w') as f: |
| json.dump(artifact, f, indent=2) |
| " |
|
|
| |
| kill $VLLM_PID 2>/dev/null || true |
| wait $VLLM_PID 2>/dev/null || true |
| echo " ✅ Recovered scenario complete" |
|
|
| |
| rocm-smi --json > replaylab/runs/gpu_evidence/rocm_smi_after_recovery.json 2>/dev/null || true |
| rocm-smi > replaylab/runs/gpu_evidence/rocm_smi_after_recovery.txt |
|
|
| |
| echo "" |
| echo "[5/6] Saving final GPU evidence..." |
| rocm-smi --showmeminfo vram > replaylab/runs/gpu_evidence/vram_final.txt |
| cat /proc/driver/amdgpu/version > replaylab/runs/gpu_evidence/amdgpu_version.txt 2>/dev/null || true |
|
|
| |
| echo "" |
| echo "[6/6] Done!" |
| echo "=========================================" |
| echo "Results saved in replaylab/runs/" |
| echo " gpu_oom/ - Failed OOM scenario" |
| echo " gpu_recovered/ - Successful recovery" |
| echo " gpu_evidence/ - GPU telemetry snapshots" |
| echo "=========================================" |
| echo "" |
| echo "Next: commit and push results" |
| echo " git add -A" |
| echo " git commit -m 'Add real MI300X experiment results'" |
| echo " git push origin main" |
|
|