---
version: "3.9"

# Darwin-60B-DUO — full-stack launcher
# Spins up:
#   - vllm-darwin  (Darwin-28B-REASON, GPU 0, port 8021)
#   - vllm-awaxis  (AWAXIS-Think-31B,  GPU 1, port 8022)
#   - gateway      (FastAPI orchestrator, port 8000 exposed)
#
# The gateway talks to the two vLLM servers over the Compose network
# (http://vllm-darwin:8021/v1 and http://vllm-awaxis:8022/v1). Ports 8021
# and 8022 are ALSO published on the host below; remove those `ports:`
# entries if only the gateway should be reachable from outside.
#
# Single-GPU collocation:
#   Set CUDA_VISIBLE_DEVICES=0 for both vllm-* and lower
#   --gpu-memory-utilization to 0.45 each (FP8 totals ~30GB on 80GB GPU).

services:
  vllm-darwin:
    # NOTE(review): `latest` is a moving tag; consider pinning a release
    # tag for reproducible deployments.
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-darwin
    runtime: nvidia
    environment:
      # Selects the GPU this server runs on. This only works because the
      # reservation below exposes all host GPUs to the container.
      - CUDA_VISIBLE_DEVICES=0
      - VLLM_DP_MASTER_PORT=45011
      - HF_HOME=/root/.cache/huggingface
      # Passed through from the host environment; empty when unset.
      - HF_TOKEN=${HF_TOKEN:-}
    # Compose shell-splits a string command, so the JSON value of
    # --limit-mm-per-prompt must be single-quoted; otherwise its double
    # quotes are stripped and the value is no longer valid JSON.
    command: >-
      --model FINAL-Bench/Darwin-28B-REASON
      --served-model-name darwin-28r
      --host 0.0.0.0
      --port 8021
      --tensor-parallel-size 1
      --max-model-len 16384
      --dtype bfloat16
      --quantization fp8
      --trust-remote-code
      --enforce-eager
      --limit-mm-per-prompt '{"image":0,"video":0}'
      --gpu-memory-utilization 0.85
    volumes:
      # Shared Hugging Face cache (matches HF_HOME above).
      - hf_cache:/root/.cache/huggingface
    ports:
      - "8021:8021"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # `all` rather than `1`: a count of 1 exposes a single GPU
              # (index 0) to every container, which defeats per-service
              # GPU selection through CUDA_VISIBLE_DEVICES.
              count: all
              capabilities: [gpu]
    healthcheck:
      # Healthy once the OpenAI-compatible API answers. With a 20s interval
      # and 60 retries the service has roughly 20 minutes to come up.
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8021/v1/models"]
      interval: 20s
      timeout: 5s
      retries: 60

  vllm-awaxis:
    # NOTE(review): `latest` is a moving tag; consider pinning a release
    # tag for reproducible deployments.
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-awaxis
    runtime: nvidia
    environment:
      # Selects the GPU this server runs on. This only works because the
      # reservation below exposes all host GPUs to the container.
      - CUDA_VISIBLE_DEVICES=1
      - VLLM_DP_MASTER_PORT=45012
      - HF_HOME=/root/.cache/huggingface
      # Passed through from the host environment; empty when unset.
      - HF_TOKEN=${HF_TOKEN:-}
    # Compose shell-splits a string command, so the JSON value of
    # --limit-mm-per-prompt must be single-quoted; otherwise its double
    # quotes are stripped and the value is no longer valid JSON.
    command: >-
      --model Anserwise/AWAXIS-Think-31B
      --served-model-name awaxis-31b
      --host 0.0.0.0
      --port 8022
      --tensor-parallel-size 1
      --max-model-len 16384
      --dtype bfloat16
      --quantization fp8
      --trust-remote-code
      --enforce-eager
      --limit-mm-per-prompt '{"image":0,"video":0}'
      --gpu-memory-utilization 0.85
    volumes:
      # Shared Hugging Face cache (matches HF_HOME above).
      - hf_cache:/root/.cache/huggingface
    ports:
      - "8022:8022"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # `all` rather than `1`: a count of 1 exposes a single GPU
              # (index 0) to every container, so CUDA_VISIBLE_DEVICES=1
              # would point at a device that does not exist in here.
              count: all
              capabilities: [gpu]
    healthcheck:
      # Healthy once the OpenAI-compatible API answers. With a 20s interval
      # and 60 retries the service has roughly 20 minutes to come up.
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8022/v1/models"]
      interval: 20s
      timeout: 5s
      retries: 60

  gateway:
    image: python:3.11-slim
    container_name: darwin-60b-duo-gateway
    working_dir: /app
    # Installs the gateway's dependencies on every start, then launches the
    # server pointed at both vLLM backends. All lines share one indentation
    # level so the folded scalar collapses into a single bash -c command.
    command: >-
      bash -c "pip install -q -r requirements.txt &&
      python server.py
      --host 0.0.0.0
      --port 8000
      --darwin-url http://vllm-darwin:8021/v1
      --awaxis-url http://vllm-awaxis:8022/v1"
    volumes:
      # Gateway sources are bind-mounted; the path is relative to this file.
      - ../gateway:/app
    ports:
      - "8000:8000"
    depends_on:
      # Start only after both model servers pass their healthchecks.
      vllm-darwin:
        condition: service_healthy
      vllm-awaxis:
        condition: service_healthy
    restart: unless-stopped

volumes:
  hf_cache:
    driver: local