"""Stream one chat completion from a local OpenAI-compatible server.

The generated text is echoed to stdout as it arrives.  Once the stream ends,
time-to-first-token (TTFT), token counts and prefill/decode throughput are
reported on stderr so they do not mix with the model output.
"""

import json
import sys
import time
from typing import Optional

URL = "http://127.0.0.1:8080/v1/chat/completions"

request_body = {
    "messages": [
        {
            "role": "user",
            "content": "用三句话解释大语言模型的推理过程。",
        }
    ],
    "temperature": 0.2,
    "max_tokens": 100,
    "stream": True,
    # Ask the server to send a chunk carrying token usage.
    "stream_options": {"include_usage": True},
    # NOTE(review): looks like the llama.cpp server extension that attaches a
    # "timings" object to streamed chunks -- confirm against the target server.
    "timings_per_token": True,
}


def parse_sse_data(line: str) -> Optional[str]:
    """Return the payload of an SSE ``data`` line, or ``None`` for other lines.

    The single space after the colon is optional in the event-stream format,
    so both ``data: x`` and ``data:x`` yield ``"x"``.
    """
    if not line.startswith("data:"):
        return None
    return line.removeprefix("data:").removeprefix(" ")


def extract_content(event: dict) -> Optional[str]:
    """Return the text delta of the first choice in *event*, if there is one.

    Chunks with no choices, or whose ``delta`` is missing or null, yield
    ``None`` instead of raising.
    """
    choices = event.get("choices") or []
    if not choices:
        return None
    delta = choices[0].get("delta") or {}
    return delta.get("content")


def _format_rate(label: str, value: object) -> str:
    """Format a tokens-per-second figure, tolerating a missing/null value."""
    if isinstance(value, (int, float)):
        return f"{label}: {value:.1f} tok/s"
    return f"{label}: n/a"


def format_stats(
    ttft_ms: Optional[float],
    usage: Optional[dict],
    timings: Optional[dict],
) -> list[str]:
    """Build the report lines for whichever statistics are available.

    Args:
        ttft_ms: Time to first token in milliseconds, or ``None`` if no
            content was received.
        usage: The ``usage`` object from the stream, or ``None``.
        timings: The ``timings`` object from the stream, or ``None``.

    Returns:
        The lines to print, in order: TTFT, prompt tokens, output tokens,
        prefill rate, decode rate.  Sections whose input is absent are
        omitted rather than raising.
    """
    lines = []
    if ttft_ms is not None:
        lines.append(f"TTFT: {ttft_ms:.1f} ms")
    if usage:
        lines.append(f"Prompt tokens: {usage['prompt_tokens']}")
        lines.append(f"Output tokens: {usage['completion_tokens']}")
    if timings:
        lines.append(_format_rate("Prefill", timings.get("prompt_per_second")))
        lines.append(_format_rate("Decode", timings.get("predicted_per_second")))
    return lines


def stream_completion(
    url: str, body: dict
) -> tuple[Optional[float], Optional[dict], Optional[dict]]:
    """POST *body* to *url*, echo streamed text, and collect statistics.

    Returns:
        ``(ttft_ms, usage, timings)``; each element is ``None`` when the
        stream did not provide it.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        json.JSONDecodeError: If a ``data`` payload is not valid JSON.
    """
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines where httpx is not installed.
    import httpx

    first_token_at = None
    usage = None
    timings = None

    with httpx.Client(timeout=120.0) as client:
        # Start the clock after the client exists so that client construction
        # is not counted as model latency.
        request_started = time.perf_counter()
        with client.stream("POST", url, json=body) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                data = parse_sse_data(line)
                if data is None:
                    continue
                if data == "[DONE]":
                    break

                event = json.loads(data)

                content = extract_content(event)
                if content:
                    if first_token_at is None:
                        first_token_at = time.perf_counter()
                    print(content, end="", flush=True)

                # Keep the most recent non-empty value of each: usage and
                # timings are not guaranteed to arrive in the same chunk.
                if event.get("timings"):
                    timings = event["timings"]
                if event.get("usage"):
                    usage = event["usage"]

    ttft_ms = None
    if first_token_at is not None:
        ttft_ms = (first_token_at - request_started) * 1000
    return ttft_ms, usage, timings


def main() -> None:
    """Run the request and print the statistics report to stderr."""
    ttft_ms, usage, timings = stream_completion(URL, request_body)
    print()  # terminate the streamed text, which is printed without newlines
    for stat_line in format_stats(ttft_ms, usage, timings):
        print(stat_line, file=sys.stderr)


if __name__ == "__main__":
    main()