Spaces:

build-small-hackathon
/

open-cortex

Running

open-cortex / scripts /stream_chat.py

feat(main): stream metrics fetch

828386c 6 days ago

1.92 kB

	import json
	import sys
	import time

	import httpx


	# Chat-completions endpoint of a server listening on the loopback interface.
	URL = "http://127.0.0.1:8080/v1/chat/completions"

	# The single user turn.  The prompt is Chinese for: "Explain the inference
	# process of large language models in three sentences."
	_user_message = {
	    "role": "user",
	    "content": "用三句话解释大语言模型的推理过程。",
	}

	# JSON body POSTed to URL.
	request_body = {
	    "messages": [_user_message],
	    "temperature": 0.2,
	    "max_tokens": 100,
	    # Ask for a streamed response, with a usage object included in the stream.
	    "stream": True,
	    "stream_options": {"include_usage": True},
	    # NOTE(review): presumably a llama.cpp-server extension that attaches a
	    # `timings` object to streamed chunks -- confirm against the server in use.
	    "timings_per_token": True,
	}

	def parse_sse_data(line: str) -> "str | None":
	    """Return the payload of a server-sent-event ``data`` line.

	    Lines carrying any other field, comments and blank separators yield
	    ``None``.  One optional space after the colon is stripped, so both
	    ``data: x`` and ``data:x`` produce ``"x"``.
	    """
	    if not line.startswith("data:"):
	        return None
	    return line.removeprefix("data:").removeprefix(" ")


	def extract_content(event: dict) -> "str | None":
	    """Return the non-empty text delta of a streamed chunk, else ``None``.

	    Only the first entry of ``choices`` is inspected.  A missing or ``null``
	    ``choices``, ``delta`` or ``content`` is treated as "no text".
	    """
	    choices = event.get("choices") or []
	    if not choices:
	        return None
	    delta = choices[0].get("delta") or {}
	    return delta.get("content") or None


	def format_metrics(
	    ttft_ms: "float | None",
	    usage: "dict | None",
	    timings: "dict | None",
	) -> list[str]:
	    """Build the report lines for whichever measurements are available.

	    Args:
	        ttft_ms: Time to first token in milliseconds, or ``None`` when no
	            text was received.
	        usage: The ``usage`` object from the stream, or ``None``.
	        timings: The ``timings`` object from the stream, or ``None``.

	    Returns:
	        The lines to print, in a fixed order: TTFT, prompt tokens, output
	        tokens, prefill rate, decode rate.  Unavailable entries are omitted
	        instead of raising.
	    """
	    lines = []

	    if ttft_ms is not None:
	        lines.append(f"TTFT: {ttft_ms:.1f} ms")

	    if usage:
	        lines.append(f"Prompt tokens: {usage['prompt_tokens']}")
	        lines.append(f"Output tokens: {usage['completion_tokens']}")

	    # Not every server reports timings, so each rate is optional.
	    rates = (
	        ("Prefill", "prompt_per_second"),
	        ("Decode", "predicted_per_second"),
	    )
	    for label, key in rates:
	        rate = (timings or {}).get(key)
	        if rate is not None:
	            lines.append(f"{label}: {rate:.1f} tok/s")

	    return lines


	def stream_chat(url: str, body: dict) -> tuple:
	    """POST ``body`` to ``url`` and echo the streamed answer to stdout.

	    Text deltas are printed as they arrive, flushed each time so the output
	    appears incrementally.  Reading stops at the ``[DONE]`` sentinel or when
	    the stream ends.

	    Returns:
	        ``(ttft_ms, usage, timings)``: the delay between sending the request
	        and the first text delta in milliseconds, and the most recent
	        ``usage`` and ``timings`` objects seen in the stream.  Each element
	        is ``None`` when the stream never supplied it.

	    Raises:
	        Whatever ``response.raise_for_status()`` raises for an error status,
	        and ``json.JSONDecodeError`` for a payload that is not valid JSON.
	    """
	    started = time.perf_counter()
	    first_token_at = None
	    usage = None
	    timings = None

	    with httpx.Client(timeout=120.0) as client:
	        with client.stream("POST", url, json=body) as response:
	            response.raise_for_status()

	            for line in response.iter_lines():
	                data = parse_sse_data(line)
	                if not data:
	                    # Not a data line, or a data line with an empty payload.
	                    continue
	                if data == "[DONE]":
	                    break

	                event = json.loads(data)

	                content = extract_content(event)
	                if content:
	                    if first_token_at is None:
	                        first_token_at = time.perf_counter()
	                    print(content, end="", flush=True)

	                # Keep usage and timings separately: the chunk carrying
	                # `usage` is not guaranteed to carry `timings` as well.
	                usage = event.get("usage") or usage
	                timings = event.get("timings") or timings

	    ttft_ms = None
	    if first_token_at is not None:
	        ttft_ms = (first_token_at - started) * 1000
	    return ttft_ms, usage, timings


	def main() -> None:
	    """Stream one completion to stdout, then report metrics on stderr."""
	    ttft_ms, usage, timings = stream_chat(URL, request_body)

	    # Terminate the streamed answer, which was printed without newlines.
	    print()

	    for line in format_metrics(ttft_ms, usage, timings):
	        print(line, file=sys.stderr)


	if __name__ == "__main__":
	    main()