File size: 3,111 Bytes
c2bfdba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
version: "3.9"

# Darwin-60B-DUO — full-stack launcher
# Spins up:
#   - vllm-darwin   (Darwin-28B-REASON,   GPU 0, port 8021 internal)
#   - vllm-awaxis   (AWAXIS-Think-31B,    GPU 1, port 8022 internal)
#   - gateway       (FastAPI orchestrator, port 8000 exposed)
#
# Single-GPU collocation:
#   Set CUDA_VISIBLE_DEVICES=0 for both vllm-* services and lower
#   --gpu-memory-utilization to 0.45 each (FP8 weights are roughly 30GB per
#   model, so both fit on a single 80GB GPU).

services:

  # Darwin-28B-REASON backend: OpenAI-compatible vLLM server listening on
  # port 8021, reached by the gateway as http://vllm-darwin:8021/v1.
  vllm-darwin:
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-darwin
    runtime: nvidia
    environment:
      # Picks which of the GPUs exposed to the container this server uses.
      - CUDA_VISIBLE_DEVICES=0
      - VLLM_DP_MASTER_PORT=45011
      - HF_HOME=/root/.cache/huggingface
      - HF_TOKEN=${HF_TOKEN:-}
    # List (exec) form on purpose: Compose shell-splits a string command,
    # which strips the double quotes from the JSON passed to
    # --limit-mm-per-prompt and leaves vLLM with invalid JSON. In list form
    # every item reaches the entrypoint verbatim.
    command:
      - "--model"
      - "FINAL-Bench/Darwin-28B-REASON"
      - "--served-model-name"
      - "darwin-28r"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8021"
      - "--tensor-parallel-size"
      - "1"
      - "--max-model-len"
      - "16384"
      - "--dtype"
      - "bfloat16"
      - "--quantization"
      - "fp8"
      - "--trust-remote-code"
      - "--enforce-eager"
      # Disable image/video inputs; single-quoted so the JSON stays intact.
      - "--limit-mm-per-prompt"
      - '{"image":0,"video":0}'
      - "--gpu-memory-utilization"
      - "0.85"
    volumes:
      # Shared model cache so weights are downloaded only once.
      - hf_cache:/root/.cache/huggingface
    ports:
      # Loopback only: this backend is meant to be internal. The gateway
      # talks to it over the Compose network and does not need this mapping;
      # it is kept for local debugging from the host.
      - "127.0.0.1:8021:8021"
    deploy:
      resources:
        reservations:
          devices:
            # Expose every GPU and let CUDA_VISIBLE_DEVICES above choose one.
            # With `count: 1` the container would only ever see a single
            # device, so re-pinning via CUDA_VISIBLE_DEVICES could not work.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      # Healthy once the OpenAI-compatible API answers. 60 retries x 20s
      # leaves about 20 minutes for model download and load.
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8021/v1/models"]
      interval: 20s
      timeout: 5s
      retries: 60

  # AWAXIS-Think-31B backend: OpenAI-compatible vLLM server listening on
  # port 8022, reached by the gateway as http://vllm-awaxis:8022/v1.
  vllm-awaxis:
    image: vllm/vllm-openai:latest
    container_name: darwin-60b-duo-vllm-awaxis
    runtime: nvidia
    environment:
      # Picks which of the GPUs exposed to the container this server uses.
      # Index 1 only exists if the container can see at least two GPUs; see
      # the device reservation below.
      - CUDA_VISIBLE_DEVICES=1
      - VLLM_DP_MASTER_PORT=45012
      - HF_HOME=/root/.cache/huggingface
      - HF_TOKEN=${HF_TOKEN:-}
    # List (exec) form on purpose: Compose shell-splits a string command,
    # which strips the double quotes from the JSON passed to
    # --limit-mm-per-prompt and leaves vLLM with invalid JSON. In list form
    # every item reaches the entrypoint verbatim.
    command:
      - "--model"
      - "Anserwise/AWAXIS-Think-31B"
      - "--served-model-name"
      - "awaxis-31b"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8022"
      - "--tensor-parallel-size"
      - "1"
      - "--max-model-len"
      - "16384"
      - "--dtype"
      - "bfloat16"
      - "--quantization"
      - "fp8"
      - "--trust-remote-code"
      - "--enforce-eager"
      # Disable image/video inputs; single-quoted so the JSON stays intact.
      - "--limit-mm-per-prompt"
      - '{"image":0,"video":0}'
      - "--gpu-memory-utilization"
      - "0.85"
    volumes:
      # Shared model cache so weights are downloaded only once.
      - hf_cache:/root/.cache/huggingface
    ports:
      # Loopback only: this backend is meant to be internal. The gateway
      # talks to it over the Compose network and does not need this mapping;
      # it is kept for local debugging from the host.
      - "127.0.0.1:8022:8022"
    deploy:
      resources:
        reservations:
          devices:
            # Must expose more than one GPU: with `count: 1` the container
            # sees a single device (index 0 inside the container), so
            # CUDA_VISIBLE_DEVICES=1 would match nothing and vLLM would start
            # without a GPU. Expose all and let the env var choose.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      # Healthy once the OpenAI-compatible API answers. 60 retries x 20s
      # leaves about 20 minutes for model download and load.
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8022/v1/models"]
      interval: 20s
      timeout: 5s
      retries: 60

  # FastAPI orchestrator: the only service meant to be reached by clients.
  # Starts after both vLLM backends report healthy.
  gateway:
    image: python:3.11-slim
    container_name: darwin-60b-duo-gateway
    working_dir: /app
    # List form with a single `bash -c` script. Every line of the folded
    # scalar shares one indent so it collapses to ONE shell line. In a `>`
    # scalar, more-indented continuation lines keep their newlines; bash
    # would then start server.py without the URL flags and try to run
    # `--darwin-url ...` as a separate command.
    # `exec` replaces the shell with Python so stop signals reach the server.
    command:
      - bash
      - "-c"
      - >-
        pip install -q -r requirements.txt &&
        exec python server.py --host 0.0.0.0 --port 8000
        --darwin-url http://vllm-darwin:8021/v1
        --awaxis-url http://vllm-awaxis:8022/v1
    volumes:
      # Gateway sources (server.py, requirements.txt) are bind-mounted from
      # the sibling directory rather than baked into an image.
      - ../gateway:/app
    ports:
      - "8000:8000"
    depends_on:
      vllm-darwin:
        condition: service_healthy
      vllm-awaxis:
        condition: service_healthy
    restart: unless-stopped

# Named volume mounted at /root/.cache/huggingface (HF_HOME) by both vLLM
# services, so downloaded model files survive container re-creation.
volumes:
  hf_cache:
    driver: "local"