# Darwin-60B-DUO / docker / docker-compose.yml
# SeaWolf-AI's picture
# Initial release — Darwin-60B-DUO (Hybrid-A: Route 70% / Split-Refine 20% / Ensemble V_1 10%)
# c2bfdba verified
# Compose file format version.
# NOTE(review): recent Docker Compose releases appear to treat the top-level
# `version` key as obsolete and ignore it — confirm whether any older
# docker-compose tooling still needs it before removing.
version: "3.9"
# Darwin-60B-DUO — full-stack launcher
# Spins up:
#   - vllm-darwin (Darwin-28B-REASON, GPU 0, port 8021; also published on the host)
#   - vllm-awaxis (AWAXIS-Think-31B, GPU 1, port 8022; also published on the host)
#   - gateway     (FastAPI orchestrator, port 8000, published on the host)
#
# Single-GPU collocation:
#   Set CUDA_VISIBLE_DEVICES=0 for both vllm-* services and lower
#   --gpu-memory-utilization to 0.45 each (the FP8 weights are roughly
#   30 GB per model, so both engines fit on one 80 GB GPU).
services:
  # vLLM OpenAI-compatible server for FINAL-Bench/Darwin-28B-REASON,
  # served under the model name `darwin-28r` on port 8021.
  vllm-darwin:
    # NOTE(review): `latest` is a moving tag; consider pinning a vLLM release
    # that has been validated with the flags below.
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-darwin
    runtime: nvidia
    environment:
      # Index of the GPU this engine uses among the GPUs reserved under
      # `deploy` below.
      - CUDA_VISIBLE_DEVICES=0
      - VLLM_DP_MASTER_PORT=45011
      - HF_HOME=/root/.cache/huggingface
      # Expands to an empty string when HF_TOKEN is not set on the host.
      - HF_TOKEN=${HF_TOKEN:-}
    # List form on purpose: a string command is shell-split by Compose, which
    # strips the double quotes from the JSON passed to --limit-mm-per-prompt
    # and leaves vLLM with an unparsable value. Every item is quoted because
    # Compose requires command entries to be strings.
    command:
      - "--model"
      - "FINAL-Bench/Darwin-28B-REASON"
      - "--served-model-name"
      - "darwin-28r"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8021"
      - "--tensor-parallel-size"
      - "1"
      - "--max-model-len"
      - "16384"
      - "--dtype"
      - "bfloat16"
      - "--quantization"
      - "fp8"
      - "--trust-remote-code"
      - "--enforce-eager"
      - "--limit-mm-per-prompt"
      - '{"image":0,"video":0}'
      - "--gpu-memory-utilization"
      - "0.85"
    volumes:
      # Hugging Face cache (matches HF_HOME); the same named volume is
      # mounted by vllm-awaxis.
      - hf_cache:/root/.cache/huggingface
    ports:
      # NOTE(review): this publishes the engine on every host interface.
      # Bind to 127.0.0.1 or drop the mapping if only the gateway should
      # reach it.
      - "8021:8021"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # `count: 1` would expose only the first GPU to each container,
              # so both engines would land on the same card and
              # CUDA_VISIBLE_DEVICES=1 would match no device. Reserve all
              # GPUs and let CUDA_VISIBLE_DEVICES pick one per engine.
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8021/v1/models"]
      interval: 20s
      timeout: 5s
      retries: 60
      # Failed probes during this window do not count against `retries`,
      # which leaves room for a slow first start (e.g. weight download).
      start_period: 10m

  # vLLM OpenAI-compatible server for Anserwise/AWAXIS-Think-31B,
  # served under the model name `awaxis-31b` on port 8022.
  vllm-awaxis:
    # NOTE(review): `latest` is a moving tag; consider pinning a vLLM release
    # that has been validated with the flags below.
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-awaxis
    runtime: nvidia
    environment:
      # Index of the GPU this engine uses among the GPUs reserved under
      # `deploy` below.
      - CUDA_VISIBLE_DEVICES=1
      - VLLM_DP_MASTER_PORT=45012
      - HF_HOME=/root/.cache/huggingface
      # Expands to an empty string when HF_TOKEN is not set on the host.
      - HF_TOKEN=${HF_TOKEN:-}
    # List form for the same reason as vllm-darwin: it keeps the JSON value
    # of --limit-mm-per-prompt intact instead of letting Compose shell-split
    # it.
    command:
      - "--model"
      - "Anserwise/AWAXIS-Think-31B"
      - "--served-model-name"
      - "awaxis-31b"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8022"
      - "--tensor-parallel-size"
      - "1"
      - "--max-model-len"
      - "16384"
      - "--dtype"
      - "bfloat16"
      - "--quantization"
      - "fp8"
      - "--trust-remote-code"
      - "--enforce-eager"
      - "--limit-mm-per-prompt"
      - '{"image":0,"video":0}'
      - "--gpu-memory-utilization"
      - "0.85"
    volumes:
      # Hugging Face cache (matches HF_HOME); the same named volume is
      # mounted by vllm-darwin.
      - hf_cache:/root/.cache/huggingface
    ports:
      # NOTE(review): this publishes the engine on every host interface.
      # Bind to 127.0.0.1 or drop the mapping if only the gateway should
      # reach it.
      - "8022:8022"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # All GPUs are reserved so that CUDA_VISIBLE_DEVICES=1 above
              # can select the second card; `count: 1` would expose only
              # the first one.
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8022/v1/models"]
      interval: 20s
      timeout: 5s
      retries: 60
      # Failed probes during this window do not count against `retries`,
      # which leaves room for a slow first start (e.g. weight download).
      start_period: 10m

  # Gateway that runs server.py from the bind-mounted ../gateway directory
  # and is given the URLs of both vLLM engines.
  gateway:
    image: python:3.11-slim
    container_name: darwin-60b-duo-gateway
    working_dir: /app
    # Installs the requirements on every start, then launches the server on
    # port 8000 with both engine endpoints (Compose service names as hosts).
    command:
      - "bash"
      - "-c"
      - >-
        pip install -q -r requirements.txt &&
        python server.py --host 0.0.0.0 --port 8000
        --darwin-url http://vllm-darwin:8021/v1
        --awaxis-url http://vllm-awaxis:8022/v1
    volumes:
      # Gateway sources, mounted from the directory next to this one.
      - ../gateway:/app
    ports:
      - "8000:8000"
    # Start only after both engines pass their health checks.
    depends_on:
      vllm-darwin:
        condition: service_healthy
      vllm-awaxis:
        condition: service_healthy
    restart: unless-stopped
# Named volume mounted by both vLLM services at /root/.cache/huggingface.
volumes:
  hf_cache:
    driver: local