Update README.md
README.md

The previous Environment Preparation, Offline Inference, and Online Inference instructions (which installed a vLLM wheel from https://media.githubusercontent.com/media/inclusionAI/Ring-V2/…) are replaced by the updated section below.

#### Environment Preparation

Since the Pull Request (PR) has not been submitted to the vLLM community at this stage, please prepare the environment by following the steps below.

First, create a Conda environment with Python 3.10 and CUDA 12.8:
```shell
conda create -n vllm python=3.10
conda activate vllm
```

Next, install our vLLM wheel package:
```shell
pip install https://media.githubusercontent.com/media/zheyishine/vllm_whl/refs/heads/main/vllm-0.8.5.post2.dev28%2Bgd327eed71.cu128-cp310-cp310-linux_x86_64.whl --force-reinstall
```

Finally, after vLLM is installed, install a compatible version of transformers:
```shell
pip install transformers==4.51.1
```

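Optionally, a quick check that the intended builds are active (a small sketch, assuming the standard `__version__` attributes of both packages):
```python
import vllm
import transformers

# Should report the 0.8.5.post2 dev build installed from the wheel above
print(vllm.__version__)
# Should report 4.51.1
print(transformers.__version__)
```
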
#### Offline Inference
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

if __name__ == '__main__':
    tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ring-mini-linear-2.0", trust_remote_code=True)

    sampling_params = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=1024)

    # use `max_num_seqs=1` when running without concurrency
    llm = LLM(model="inclusionAI/Ring-mini-linear-2.0", dtype='auto', enable_prefix_caching=False, max_num_seqs=128)

    prompt = "Give me a short introduction to large language models."
    messages = [
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    outputs = llm.generate([text], sampling_params)
    for output in outputs:
        print(output.outputs[0].text)
```

#### Online Inference
```shell
vllm serve inclusionAI/Ring-mini-linear-2.0 \
    --tensor-parallel-size 1 \
    --pipeline-parallel-size 1 \
    --gpu-memory-utilization 0.90 \
    --max-num-seqs 128 \
    --no-enable-prefix-caching \
    --api-key your-api-key
```
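
The server exposes an OpenAI-compatible API. A minimal client sketch, assuming the default local endpoint `http://localhost:8000/v1`, the `openai` Python package, and the same key passed via `--api-key`:
```python
from openai import OpenAI

# Assumes `vllm serve` above is running locally on its default port (8000)
# and that "your-api-key" matches the --api-key value.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="your-api-key")

response = client.chat.completions.create(
    model="inclusionAI/Ring-mini-linear-2.0",
    messages=[{"role": "user", "content": "Give me a short introduction to large language models."}],
    temperature=0.6,
    max_tokens=1024,
)
print(response.choices[0].message.content)
```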