0xSero committed on
Commit 5463b14 · verified · 1 Parent(s): 9105958

Update README.md

Files changed (1):
  1. README.md (+18 −14)
README.md CHANGED
@@ -66,20 +66,24 @@ The model correctly recalled all embedded facts from a long context:
 ### vLLM (Recommended)

 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 vllm serve 0xSero/GLM-4.6-REAP-218B-A32B-W4A16-AutoRound \
-  --host 0.0.0.0 --port 8000 \
-  --tensor-parallel-size 4 --pipeline-parallel-size 2 \
-  --quantization auto-round \
-  --kv-cache-dtype fp8 \
-  --max-model-len 180000 \
-  --gpu-memory-utilization 0.82 \
-  --block-size 32 \
-  --max-num-seqs 12 \
-  --max-num-batched-tokens 8192 \
-  --swap-space 32 \
-  --enable-expert-parallel \
-  --disable-custom-all-reduce \
-  --disable-log-requests
+vllm serve /GLM-4.6-REAP-218B-A32B-W4A16-AutoRound \
+  --host 0.0.0.0 --port 8000 \
+  --tensor-parallel-size 4 --pipeline-parallel-size 2 \
+  --quantization auto-round \
+  --kv-cache-dtype fp8 \
+  --max-model-len 200000 \
+  --gpu-memory-utilization 0.88 \
+  --cpu-offload-gb 4 \
+  --block-size 32 \
+  --max-num-seqs 8 \
+  --max-num-batched-tokens 8192 \
+  --swap-space 32 \
+  --enable-expert-parallel \
+  --enable-prefix-caching \
+  --enable-chunked-prefill \
+  --disable-custom-all-reduce \
+  --disable-log-requests \
+  --trust-remote-code
 ```

 ### SGLang
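
For reference, the updated serve command exposes vLLM's standard OpenAI-compatible API on port 8000. A minimal smoke-test sketch, assuming the server from the new command is running locally and the model is registered under the path passed to `vllm serve` (no `--served-model-name` override; the prompt is a placeholder):

```bash
# Confirm the server is up and see the registered model name
curl http://localhost:8000/v1/models

# Send a short chat completion request; "model" must match the name from /v1/models
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "/GLM-4.6-REAP-218B-A32B-W4A16-AutoRound",
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 32
      }'
```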