# eu-ai-act-chatgpt-mcp/modal/gpt_oss_inference.py
"""
GPT-OSS Model Deployment on Modal with vLLM
This script deploys OpenAI's GPT-OSS models (20B or 120B) on Modal.com
with vLLM for efficient inference.
Usage:
# First time setup - pre-download model weights (run once, takes ~5-10 min)
modal run gpt_oss_inference.py::download_model
# Test the server locally
modal run gpt_oss_inference.py
# Deploy to production
modal deploy gpt_oss_inference.py
Performance Tips:
1. Run download_model first to cache weights in the volume
2. Reduce MAX_MODEL_LEN for faster startup (8k-16k is sufficient for most use cases)
3. Keep FAST_BOOT=True for cheaper GPUs (A10G, L4)
4. Increase SCALEDOWN_WINDOW to reduce cold starts during demos
Based on: https://modal.com/docs/examples/gpt_oss_inference
"""
import json
import time
from datetime import datetime, timezone
from typing import Any
import aiohttp
import modal
# =============================================================================
# Container Image Configuration
# =============================================================================
# Enable HF Transfer for faster model downloads (5-10x faster)
vllm_image = (
modal.Image.from_registry(
"nvidia/cuda:12.8.1-devel-ubuntu22.04",
add_python="3.12",
)
.entrypoint([])
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # Enable fast downloads
.uv_pip_install(
"vllm==0.11.0",
"huggingface_hub[hf_transfer]==0.35.0",
"flashinfer-python==0.3.1",
)
)
# =============================================================================
# Model Configuration
# =============================================================================
# Choose the model size - 20B is faster, 120B has more capabilities
MODEL_NAME = "openai/gpt-oss-20b" # or "openai/gpt-oss-120b"
MODEL_REVISION = "d666cf3b67006cf8227666739edf25164aaffdeb"
# =============================================================================
# GPU Configuration - CHOOSE YOUR GPU TIER
# =============================================================================
#
# Modal GPU Pricing (approximate, per hour):
# ┌─────────────┬──────────┬────────────────────────────────────────────┐
# │ GPU │ Price/hr │ Notes │
# ├─────────────┼──────────┼────────────────────────────────────────────┤
# │ T4 (16GB) │ ~$0.25 │ ❌ Too small for GPT-OSS │
# │ L4 (24GB) │ ~$0.59 │ ⚠️ Tight fit, may work with 20B │
# │ A10G (24GB) │ ~$0.76 │ ✅ Good balance for 20B model │
# │ A100 40GB │ ~$1.79 │ ✅ Comfortable for 20B │
# │ A100 80GB │ ~$2.78 │ ✅ Works for both 20B and 120B │
# │ H100 (80GB) │ ~$3.95 │ ✅ Best performance, both models │
# └─────────────┴──────────┴────────────────────────────────────────────┘
#
# GPT-OSS 20B with MXFP4 quantization needs ~10-15GB VRAM
# GPT-OSS 120B needs an 80GB GPU (A100 80GB or H100)
# Choose your GPU - uncomment the one you want to use:
GPU_CONFIG = "A100-40GB" # ~$0.76/hr - RECOMMENDED for budget (works with 20B)
# GPU_CONFIG = "L4" # ~$0.59/hr - Cheapest option (may be tight)
# GPU_CONFIG = "A100" # ~$1.79/hr - More headroom (40GB version)
# GPU_CONFIG = "H100" # ~$3.95/hr - Maximum performance
# =============================================================================
# Volume Configuration for Caching
# =============================================================================
# Cache for HuggingFace model weights
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
# Cache for vLLM compilation artifacts
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
# =============================================================================
# Performance Configuration
# =============================================================================
MINUTES = 60  # seconds per minute, for readable timeouts
# FAST_BOOT = True: Faster startup but slower inference
# FAST_BOOT = False: Slower startup but faster inference (recommended for production)
FAST_BOOT = True # Use True for cheaper GPUs to reduce startup memory
# CUDA graph capture sizes for optimized inference
CUDA_GRAPH_CAPTURE_SIZES = [1, 2, 4, 8, 16, 24, 32]
# Data type configuration
# NOTE: GPT-OSS uses MXFP4 quantization which REQUIRES bfloat16 - float16 is NOT supported
# The Marlin kernel warning on A10G/L4 is expected and can be ignored
USE_FLOAT16 = False # Must be False for GPT-OSS (MXFP4 only supports bfloat16)
# Maximum model length (context window) - SIGNIFICANTLY REDUCED for faster startup
# The KV cache allocation is proportional to context length, so smaller = much faster startup
# For EU AI Act assessments, 8k-16k tokens is more than enough
# GPT-OSS 20B supports up to 128k tokens, but we only need ~8k for our use case
MAX_MODEL_LEN = 16384 # 16k tokens - sufficient for compliance assessments, 4x faster startup
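# Back-of-envelope view of why context length matters (illustrative only; the
# exact savings depend on the model config and on vLLM's memory profiling):
#   per-token KV bytes       ~ 2 (K and V) x num_layers x num_kv_heads x head_dim x 2 (bf16)
#   worst-case KV / sequence ~ per-token bytes x MAX_MODEL_LEN
# Shrinking MAX_MODEL_LEN shrinks the worst-case per-sequence KV footprint that
# vLLM has to plan for, which is what makes startup faster on smaller GPUs.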
# Server configuration
VLLM_PORT = 8000
N_GPU = 1 # Number of GPUs for tensor parallelism
MAX_INPUTS = 50 # Reduced for smaller GPUs
# Keep container warm longer to avoid cold starts (costs more but faster response)
# For hackathon demo: 10 minutes to reduce cold starts during presentation
SCALEDOWN_WINDOW = 10 * MINUTES # Increased for demo stability
# =============================================================================
# Modal App Definition
# =============================================================================
app = modal.App("gpt-oss-vllm-inference")
# Select GPU based on GPU_CONFIG
_GPU_MAP = {
"T4": "T4",
"L4": "L4",
"A10G": "A10G",
"A100": "A100:40GB",
"A100-80GB": "A100:80GB",
"H100": "H100",
}
SELECTED_GPU = _GPU_MAP.get(GPU_CONFIG, "A10G")
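# Note: to request more than one GPU, Modal accepts a count suffix such as
# "H100:2"; if you do that, also bump N_GPU so vLLM's tensor parallelism matches.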
# =============================================================================
# Pre-download Model Weights (reduces warm start time significantly)
# =============================================================================
@app.function(
image=vllm_image,
volumes={"/root/.cache/huggingface": hf_cache_vol},
timeout=30 * MINUTES,
)
def download_model():
"""
Pre-download the model weights to the volume cache.
Run this once with: modal run gpt_oss_inference.py::download_model
This will cache the weights and make subsequent starts much faster.
"""
from huggingface_hub import snapshot_download
print(f"📥 Downloading model weights for {MODEL_NAME}...")
print(f" Revision: {MODEL_REVISION}")
    snapshot_download(
        MODEL_NAME,
        revision=MODEL_REVISION,
        # Download into the default HF hub cache (backed by the mounted volume).
        # A manual local_dir would bypass the hub cache layout, and vLLM would
        # re-download the weights at startup instead of reusing this cache.
    )
print("✅ Model weights downloaded and cached!")
print(" Future container starts will use the cached weights.")
@app.function(
image=vllm_image,
gpu=SELECTED_GPU,
scaledown_window=SCALEDOWN_WINDOW,
timeout=30 * MINUTES,
volumes={
"/root/.cache/huggingface": hf_cache_vol,
"/root/.cache/vllm": vllm_cache_vol,
},
)
@modal.concurrent(max_inputs=MAX_INPUTS)
@modal.web_server(port=VLLM_PORT, startup_timeout=30 * MINUTES)
def serve():
"""Start the vLLM server with GPT-OSS model."""
import subprocess
cmd = [
"vllm",
"serve",
"--uvicorn-log-level=info",
MODEL_NAME,
"--revision",
MODEL_REVISION,
"--served-model-name",
"llm", # Serve model as "llm" - this is what clients expect
"--host",
"0.0.0.0",
"--port",
str(VLLM_PORT),
]
# enforce-eager disables both Torch compilation and CUDA graph capture
# default is no-enforce-eager. see the --compilation-config flag for tighter control
cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
if not FAST_BOOT: # CUDA graph capture is only used with `--no-enforce-eager`
cmd += [
"-O.cudagraph_capture_sizes="
+ str(CUDA_GRAPH_CAPTURE_SIZES).replace(" ", "")
]
    # Data type selection: GPT-OSS's MXFP4 quantization only supports bfloat16,
    # so USE_FLOAT16 stays False here; the float16 branch is kept for other models.
if USE_FLOAT16:
cmd += ["--dtype", "float16"]
else:
cmd += ["--dtype", "bfloat16"]
# Limit context length to speed up startup and reduce memory allocation
cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
    # Custom all-reduce only matters for multi-GPU tensor parallelism; skip it on a single GPU
if N_GPU == 1:
cmd += ["--disable-custom-all-reduce"]
# Enable prefix caching for faster subsequent requests
cmd += ["--enable-prefix-caching"]
# Trust remote code for GPT-OSS models
cmd += ["--trust-remote-code"]
    # Let vLLM auto-detect the best weight loading format
    cmd += ["--load-format", "auto"]
    # Tensor parallelism splits large matrix multiplications across GPUs (a no-op with N_GPU = 1)
    cmd += ["--tensor-parallel-size", str(N_GPU)]
    # Additional tweaks for smaller GPUs
    # Disable periodic logging of serving statistics (reduces log noise)
    cmd += ["--disable-log-stats"]
    # Allow KV cache blocks to spill to CPU memory under pressure on smaller GPUs
    cmd += ["--swap-space", "4"]  # 4 GiB of CPU swap space per GPU
print(f"Starting vLLM server with command: {' '.join(cmd)}")
subprocess.Popen(" ".join(cmd), shell=True)
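    # Note: @modal.web_server waits (up to startup_timeout) for this port to start
    # accepting connections before routing traffic, so Popen can return immediately
    # while vLLM finishes loading the model in the background.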
# =============================================================================
# Local Test Entrypoint
# =============================================================================
@app.local_entrypoint()
async def test(test_timeout=30 * MINUTES, user_content=None, twice=True):
"""
Test the deployed server with a sample prompt.
Args:
test_timeout: Maximum time to wait for server health
user_content: Custom prompt to send (default: SVD explanation)
twice: Whether to send a second request
"""
url = serve.get_web_url()
system_prompt = {
"role": "system",
"content": f"""You are ChatModal, a large language model trained by Modal.
Knowledge cutoff: 2024-06
Current date: {datetime.now(timezone.utc).date()}
Reasoning: low
# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.""",
}
if user_content is None:
user_content = "Explain what the Singular Value Decomposition is."
messages = [ # OpenAI chat format
system_prompt,
{"role": "user", "content": user_content},
]
async with aiohttp.ClientSession(base_url=url) as session:
print(f"Running health check for server at {url}")
async with session.get("/health", timeout=test_timeout - 1 * MINUTES) as resp:
up = resp.status == 200
assert up, f"Failed health check for server at {url}"
print(f"Successful health check for server at {url}")
print(f"Sending messages to {url}:", *messages, sep="\n\t")
await _send_request(session, "llm", messages)
if twice:
messages[0]["content"] += "\nTalk like a pirate, matey."
print(f"Re-sending messages to {url}:", *messages, sep="\n\t")
await _send_request(session, "llm", messages)
async def _send_request(
session: aiohttp.ClientSession, model: str, messages: list
) -> None:
"""Send a streaming request to the vLLM server."""
# `stream=True` tells an OpenAI-compatible backend to stream chunks
payload: dict[str, Any] = {"messages": messages, "model": model, "stream": True}
headers = {"Content-Type": "application/json", "Accept": "text/event-stream"}
t = time.perf_counter()
async with session.post(
"/v1/chat/completions", json=payload, headers=headers, timeout=10 * MINUTES
) as resp:
        resp.raise_for_status()
        async for raw in resp.content:
# extract new content and stream it
line = raw.decode().strip()
if not line or line == "data: [DONE]":
continue
if line.startswith("data: "): # SSE prefix
line = line[len("data: ") :]
chunk = json.loads(line)
assert (
chunk["object"] == "chat.completion.chunk"
) # or something went horribly wrong
delta = chunk["choices"][0]["delta"]
if "content" in delta:
print(delta["content"], end="") # print the content as it comes in
elif "reasoning_content" in delta:
print(delta["reasoning_content"], end="")
elif not delta:
print()
else:
raise ValueError(f"Unsupported response delta: {delta}")
print("")
print(f"Time to Last Token: {time.perf_counter() - t:.2f} seconds")
# =============================================================================
# Utility Functions
# =============================================================================
def get_endpoint_url() -> str:
"""Get the deployed endpoint URL."""
return serve.get_web_url()
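# Example client (a hedged sketch, not used by the deployment itself): once the
# app is deployed, the vLLM server speaks the OpenAI chat-completions API, so any
# OpenAI-compatible client can call it. Assumes the `openai` package is installed
# on the caller's machine; the function name and prompt below are illustrative.
def example_chat_request(base_url: str, prompt: str) -> str:
    """Send one non-streaming chat request to the deployed server."""
    from openai import OpenAI  # local-only dependency, not needed in the container

    client = OpenAI(base_url=f"{base_url}/v1", api_key="EMPTY")  # vLLM ignores the key by default
    response = client.chat.completions.create(
        model="llm",  # matches --served-model-name in serve()
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content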
if __name__ == "__main__":
print("Run this script with Modal:")
print(" modal run gpt_oss_inference.py # Test the server")
print(" modal deploy gpt_oss_inference.py # Deploy to production")