Darwin-60B-DUO / docker /docker-compose.yml

Initial release — Darwin-60B-DUO (Hybrid-A: Route 70% / Split-Refine 20% / Ensemble V_1 10%)

c2bfdba verified 5 days ago

3.11 kB

	# Compose file format version.
	# NOTE(review): Docker Compose v2 reportedly ignores this key and warns that
	# it is obsolete — confirm the minimum supported Compose release before
	# removing it.
	version: "3.9"

	# Darwin-60B-DUO — full-stack launcher
	# Spins up:
	# - vllm-darwin (Darwin-28B-REASON, GPU 0, port 8021 internal)
	# - vllm-awaxis (AWAXIS-Think-31B, GPU 1, port 8022 internal)
	# - gateway (FastAPI orchestrator, port 8000 exposed)
	#
	# Single-GPU collocation:
	# Set CUDA_VISIBLE_DEVICES=0 on both vllm-* services and lower
	# --gpu-memory-utilization to 0.45 on each (each model's FP8 weights are
	# roughly 30 GB, so both fit on one 80 GB GPU).

	services:

	# vllm-darwin — vLLM OpenAI-compatible server for
	# FINAL-Bench/Darwin-28B-REASON, served as "darwin-28r" on port 8021.
	vllm-darwin:
	  # NOTE(review): `latest` is a moving tag; pin a tested vLLM release if
	  # reproducible deployments matter.
	  image: vllm/vllm-openai:latest
	  container_name: darwin-60b-duo-vllm-darwin
	  runtime: nvidia
	  environment:
	    # Selects which of the GPUs exposed to the container vLLM uses.
	    - CUDA_VISIBLE_DEVICES=0
	    - VLLM_DP_MASTER_PORT=45011
	    - HF_HOME=/root/.cache/huggingface
	    # Empty when HF_TOKEN is not set in the invoking environment.
	    - HF_TOKEN=${HF_TOKEN:-}
	  # List (exec) form on purpose: a string command is split shell-style by
	  # Compose, which strips the double quotes from the JSON passed to
	  # --limit-mm-per-prompt and leaves vLLM with unparseable input.
	  # Numeric values are quoted because every list item must be a string.
	  command:
	    - --model
	    - FINAL-Bench/Darwin-28B-REASON
	    - --served-model-name
	    - darwin-28r
	    - --host
	    - 0.0.0.0
	    - --port
	    - "8021"
	    - --tensor-parallel-size
	    - "1"
	    - --max-model-len
	    - "16384"
	    - --dtype
	    - bfloat16
	    - --quantization
	    - fp8
	    - --trust-remote-code
	    - --enforce-eager
	    - --limit-mm-per-prompt
	    - '{"image":0,"video":0}'
	    - --gpu-memory-utilization
	    - "0.85"
	  volumes:
	    # Named volume so downloaded weights survive container re-creation.
	    - hf_cache:/root/.cache/huggingface
	  ports:
	    # NOTE(review): the file header calls this port "internal", but this
	    # mapping publishes it on every host interface. Use
	    # "127.0.0.1:8021:8021" or `expose` if outside access is not intended.
	    - "8021:8021"
	  deploy:
	    resources:
	      reservations:
	        devices:
	          # Expose all host GPUs and let CUDA_VISIBLE_DEVICES pick one.
	          # A `count: N` reservation hands the container the first N host
	          # GPUs, renumbered from 0, so it cannot target a specific GPU.
	          - driver: nvidia
	            count: all
	            capabilities: [gpu]
	  healthcheck:
	    # Healthy once the OpenAI-compatible model listing responds.
	    # 60 retries x 20s leaves about 20 minutes for download and load.
	    test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8021/v1/models"]
	    interval: 20s
	    timeout: 5s
	    retries: 60

	# vllm-awaxis — vLLM OpenAI-compatible server for
	# Anserwise/AWAXIS-Think-31B, served as "awaxis-31b" on port 8022.
	vllm-awaxis:
	  # NOTE(review): `latest` is a moving tag; pin a tested vLLM release if
	  # reproducible deployments matter.
	  image: vllm/vllm-openai:latest
	  container_name: darwin-60b-duo-vllm-awaxis
	  runtime: nvidia
	  environment:
	    # Selects the second GPU; this only works when the container can see
	    # at least two GPUs (see the device reservation below).
	    - CUDA_VISIBLE_DEVICES=1
	    - VLLM_DP_MASTER_PORT=45012
	    - HF_HOME=/root/.cache/huggingface
	    # Empty when HF_TOKEN is not set in the invoking environment.
	    - HF_TOKEN=${HF_TOKEN:-}
	  # List (exec) form on purpose: a string command is split shell-style by
	  # Compose, which strips the double quotes from the JSON passed to
	  # --limit-mm-per-prompt and leaves vLLM with unparseable input.
	  # Numeric values are quoted because every list item must be a string.
	  command:
	    - --model
	    - Anserwise/AWAXIS-Think-31B
	    - --served-model-name
	    - awaxis-31b
	    - --host
	    - 0.0.0.0
	    - --port
	    - "8022"
	    - --tensor-parallel-size
	    - "1"
	    - --max-model-len
	    - "16384"
	    - --dtype
	    - bfloat16
	    - --quantization
	    - fp8
	    - --trust-remote-code
	    - --enforce-eager
	    - --limit-mm-per-prompt
	    - '{"image":0,"video":0}'
	    - --gpu-memory-utilization
	    - "0.85"
	  volumes:
	    # Named volume so downloaded weights survive container re-creation.
	    - hf_cache:/root/.cache/huggingface
	  ports:
	    # NOTE(review): the file header calls this port "internal", but this
	    # mapping publishes it on every host interface. Use
	    # "127.0.0.1:8022:8022" or `expose` if outside access is not intended.
	    - "8022:8022"
	  deploy:
	    resources:
	      reservations:
	        devices:
	          # Expose all host GPUs and let CUDA_VISIBLE_DEVICES pick one.
	          # With `count: 1` the container only receives the first host
	          # GPU (as index 0), so CUDA_VISIBLE_DEVICES=1 matches no device.
	          - driver: nvidia
	            count: all
	            capabilities: [gpu]
	  healthcheck:
	    # Healthy once the OpenAI-compatible model listing responds.
	    # 60 retries x 20s leaves about 20 minutes for download and load.
	    test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8022/v1/models"]
	    interval: 20s
	    timeout: 5s
	    retries: 60

	# gateway — runs server.py from the bind-mounted ../gateway directory and
	# points it at the two vLLM backends by their Compose service names.
	gateway:
	  image: python:3.11-slim
	  container_name: darwin-60b-duo-gateway
	  restart: unless-stopped
	  working_dir: /app
	  # Start only after both backends report healthy.
	  depends_on:
	    vllm-awaxis:
	      condition: service_healthy
	    vllm-darwin:
	      condition: service_healthy
	  volumes:
	    - ../gateway:/app
	  ports:
	    - "8000:8000"
	  # Exec form: bash receives the folded text below as one -c script.
	  # Requirements are installed on every container start.
	  command:
	    - bash
	    - "-c"
	    - >-
	      pip install -q -r requirements.txt &&
	      python server.py --host 0.0.0.0 --port 8000
	      --darwin-url http://vllm-darwin:8021/v1
	      --awaxis-url http://vllm-awaxis:8022/v1

	# Named volumes. hf_cache is mounted by both vllm-* services at
	# /root/.cache/huggingface, the path their HF_HOME points to.
	volumes:
	  hf_cache:
	    driver: local