curryandsun committed
Commit 4f2c4fd · verified · 1 Parent(s): 7235240

Update README.md

Files changed (1):
  1. README.md +9 -16
README.md CHANGED
@@ -124,14 +124,14 @@ print("*" * 30)
 
 #### Environment Preparation
 
-We will later submit our model to the official SGLang release; for now, prepare the environment with the following steps:
+We have submitted a [PR](https://github.com/sgl-project/sglang/pull/10917) to the official SGLang repository; until it is merged, prepare the environment as follows. First, install the community release of SGLang and the required packages:
 ```shell
-pip3 install sgl-kernel==0.3.9.post2 vllm==0.10.2
+pip install sglang==0.5.2 sgl-kernel==0.3.9.post2 vllm==0.10.2 torch==2.8.0 torchvision==0.23.0 torchao
 ```
 
-Then you should install our sglang whl package:
+Then install our sglang wheel package:
 ```shell
 pip install https://raw.githubusercontent.com/inclusionAI/Ring-V2/main/hybrid_linear/whls/sglang-0.5.2-py3-none-any.whl --no-deps --force-reinstall
 ```
 
 #### Run Inference
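The next hunk's `@@` context shows only the tail of the `python -m sglang.launch_server \` command; the full invocation lies outside this diff. A minimal launch sketch, where `--model-path`, `--port`, and `--trust-remote-code` are real SGLang options but the README's exact flags are an assumption:

```shell
# Sketch: launch the SGLang server for the model used throughout this README.
# ${PORT} matches the placeholder used by the curl example below.
python -m sglang.launch_server \
    --model-path inclusionAI/Ring-mini-linear-2.0 \
    --port ${PORT} \
    --trust-remote-code
```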
@@ -153,7 +153,7 @@ python -m sglang.launch_server \
 ```shell
 curl -s http://localhost:${PORT}/v1/chat/completions \
   -H "Content-Type: application/json" \
-  -d '{"model": "auto", "messages": [{"role": "user", "content": "What is the capital of France?"}]}'
+  -d '{"model": "auto", "temperature": 0.6, "messages": [{"role": "user", "content": "Give me a short introduction to large language models."}]}'
 ```
 
 More usage examples can be found [here](https://docs.sglang.ai/basic_usage/send_request.html).
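Since the `/v1/chat/completions` route above is OpenAI-compatible, the same request can be issued from Python. A sketch assuming the `openai` client package, which the README does not itself mention:

```python
# Sketch: OpenAI-compatible chat request against the local SGLang server.
# Assumes `pip install openai`; 30000 is SGLang's default port, used here
# only as a fallback when $PORT is unset.
import os
from openai import OpenAI

client = OpenAI(
    base_url=f"http://localhost:{os.environ.get('PORT', '30000')}/v1",
    api_key="EMPTY",  # the local server does not validate the key
)
response = client.chat.completions.create(
    model="auto",
    temperature=0.6,
    messages=[{"role": "user", "content": "Give me a short introduction to large language models."}],
)
print(response.choices[0].message.content)
```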
@@ -169,7 +169,7 @@ pip install torch==2.7.0 torchvision==0.22.0
 
 Then install our vLLM wheel package:
 ```shell
-pip install https://raw.githubusercontent.com/inclusionAI/Ring-V2/main/hybrid_linear/whls/vllm-0.8.5+cuda12_8_gcc10_2_1-cp310-cp310-linux_x86_64.whl --no-deps --force-reinstall
+pip install https://media.githubusercontent.com/media/inclusionAI/Ring-V2/refs/heads/main/hybrid_linear/whls/vllm-0.8.5%2Bcuda12_8_gcc10_2_1-cp310-cp310-linux_x86_64.whl --no-deps --force-reinstall
 ```
 
 #### Offline Inference
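A quick sanity check that the patched wheel, not a plain PyPI build, is the one being imported; the expected local-version suffix is an assumption read off the wheel filename:

```shell
# Sketch: verify the installed vLLM build; expect a version like
# 0.8.5+cuda12_8_gcc10_2_1 rather than a bare 0.8.5.
python -c "import vllm; print(vllm.__version__)"
```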
@@ -180,12 +180,11 @@ from vllm import LLM, SamplingParams
 
 tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ring-mini-linear-2.0")
 
-sampling_params = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=16384)
+sampling_params = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=8192)
 
-llm = LLM(model="inclusionAI/Ring-mini-linear-2.0", dtype='bfloat16', enable_prefix_caching=False, max_num_seqs=128)
+llm = LLM(model="inclusionAI/Ring-mini-linear-2.0", dtype='bfloat16', enable_prefix_caching=False)
 prompt = "Give me a short introduction to large language models."
 messages = [
-    {"role": "system", "content": "You are Ling, an assistant created by inclusionAI"},
     {"role": "user", "content": prompt}
 ]
 
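This hunk shows only the changed fragments of the offline-inference script; its first and last lines (`from vllm import LLM, SamplingParams`, `outputs = llm.generate([text], sampling_params)`) appear only in the `@@` context lines. A self-contained sketch of the full flow, where the `apply_chat_template` step is filled in with the standard `transformers` API as an assumption, not code visible in this diff:

```python
# Sketch: end-to-end vLLM offline inference, assembled from the fragments
# visible in this hunk plus an assumed chat-template step.
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ring-mini-linear-2.0")

sampling_params = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=8192)
llm = LLM(model="inclusionAI/Ring-mini-linear-2.0", dtype="bfloat16", enable_prefix_caching=False)

prompt = "Give me a short introduction to large language models."
messages = [{"role": "user", "content": prompt}]

# Assumed step: render the chat messages into a single prompt string.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

outputs = llm.generate([text], sampling_params)
print(outputs[0].outputs[0].text)
```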
@@ -200,14 +199,8 @@ outputs = llm.generate([text], sampling_params)
 #### Online Inference
 ```shell
 vllm serve inclusionAI/Ring-mini-linear-2.0 \
-    --tensor-parallel-size 2 \
-    --pipeline-parallel-size 1 \
+    --tensor-parallel-size 1 \
     --gpu-memory-utilization 0.90 \
-    --max-num-seqs 512 \
     --no-enable-prefix-caching
 ```
 
-
-For more information, please see our [GitHub](https://github.com/inclusionAI/Ring-V2/blob/main/hybrid_linear/README.md).
-
-## Citation
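Once `vllm serve` is up, it exposes the same OpenAI-compatible endpoint. A request sketch assuming vLLM's default port 8000, since no `--port` flag appears in the diff:

```shell
# Sketch: chat request against the vLLM server; port 8000 is vLLM's
# default and an assumption here.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "inclusionAI/Ring-mini-linear-2.0", "temperature": 0.6, "messages": [{"role": "user", "content": "Give me a short introduction to large language models."}]}'
```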
 