CloudOrAPI / app.py
3v324v23's picture
Add Gradio app comparing cloud GPU vs API costs
862c42d
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Model presets: label -> (input $ per 1M tokens, output $ per 1M tokens).
# The "Custom" entry maps to None so the current price fields are kept.
MODEL_PRESETS = {
    "DeepSeek V4 — OpenRouter (~90% cache)": (0.041, 0.87),
    "Claude Sonnet 4.6": (3.0, 15.0),
    "Claude Haiku 4.5": (1.0, 5.0),
    "Custom": None,
}

# Cloud presets: label -> rows of [GPU config name, $ per hour].
# The "Custom" entry maps to None so the current table is kept.
CLOUD_PRESETS = {
    "GMI Cloud": [
        ["H200 × 8", 20.8],
        ["B200 × 8", 32.0],
        ["GB200 × 4", 32.0],
    ],
    "Custom": None,
}

# Initial widget values, also restored by the reset button.
DEFAULT_MODEL = "DeepSeek V4 — OpenRouter (~90% cache)"
DEFAULT_CLOUD = "GMI Cloud"
DEFAULT_IN_K = 64.0   # input tokens per request, in thousands
DEFAULT_OUT_K = 4.0   # output tokens per request, in thousands
DEFAULT_RPS = 1.0     # planned requests per second

# Chart palette: GPU lines cycle through GPU_COLORS; the workload marker
# uses WORKLOAD_COLOR.
GPU_COLORS = [
    "#2E86DE",
    "#10AC84",
    "#EE5253",
    "#8854D0",
    "#F79F1F",
    "#576574",
]
WORKLOAD_COLOR = "#9b59b6"
def cost_per_request(in_k: float, out_k: float, in_price: float, out_price: float) -> float:
    """Return the API cost, in dollars, of a single request.

    Args:
        in_k: Input tokens per request, in thousands.
        out_k: Output tokens per request, in thousands.
        in_price: Input price in dollars per one million tokens.
        out_price: Output price in dollars per one million tokens.

    Returns:
        Dollar cost of one request.
    """
    input_tokens = in_k * 1000
    output_tokens = out_k * 1000
    # Prices are quoted per million tokens, so scale the sum down once.
    price_weighted_tokens = input_tokens * in_price + output_tokens * out_price
    return price_weighted_tokens / 1_000_000
def parse_gpus(df):
    """Extract valid ``(name, hourly_rate)`` pairs from the GPU table.

    Args:
        df: Either a ``pandas.DataFrame`` or a list of rows. In both cases
            the first cell of a row is the config name and the second is
            the hourly price. ``None`` and empty input yield an empty list.

    Returns:
        A list of ``(name, hourly)`` tuples, in input order, where ``name``
        is a non-empty stripped string and ``hourly`` is a finite float
        greater than zero. Rows that do not meet those conditions are
        skipped silently.
    """
    if isinstance(df, pd.DataFrame):
        # Missing cells are deliberately NOT filled with 0 here: doing so
        # would turn a blank name into the literal config name "0".
        rows = df.values.tolist()
    else:
        rows = df or []

    gpus = []
    for row in rows:
        if not row or len(row) < 2:
            continue
        raw_name, raw_rate = row[0], row[1]

        # None / NaN / pd.NA all mean "no name entered".
        name_missing = pd.api.types.is_scalar(raw_name) and pd.isna(raw_name)
        name = "" if name_missing else str(raw_name).strip()

        try:
            hourly = float(raw_rate)
        except (TypeError, ValueError):
            continue

        # The chained comparison is False for NaN, so it rejects NaN, inf,
        # zero and negative rates in one check.
        if not name or not (0 < hourly < float("inf")):
            continue
        gpus.append((name, hourly))
    return gpus
def compute(in_price, out_price, in_k, out_k, gpu_df, planned_rps):
    """Recompute every output from the current input values.

    Args:
        in_price: Input price in dollars per one million tokens.
        out_price: Output price in dollars per one million tokens.
        in_k: Input tokens per request, in thousands.
        out_k: Output tokens per request, in thousands.
        gpu_df: GPU table (DataFrame or list of rows) handed to
            ``parse_gpus``.
        planned_rps: Planned requests per second.

    Returns:
        A 4-tuple ``(headline, break_df, workload_df, fig)``: the headline
        markdown string, the break-even table, the workload comparison
        table, and the chart. When the per-request cost is not positive or
        no valid GPU row exists, both tables are empty (columns only) and
        the figure is the placeholder from ``_empty_figure``.
    """
    # A cleared numeric field is delivered as None; treat it as 0 so the
    # arithmetic below cannot raise TypeError and the empty state is shown
    # when no usable price remains. Non-None values are passed through
    # untouched.
    in_price, out_price, in_k, out_k, planned_rps = (
        0.0 if value is None else value
        for value in (in_price, out_price, in_k, out_k, planned_rps)
    )

    cpr = cost_per_request(in_k, out_k, in_price, out_price)
    gpus = parse_gpus(gpu_df)
    headline = _headline(cpr, in_k, out_k, in_price, out_price)

    if cpr <= 0 or not gpus:
        empty_break = pd.DataFrame(
            columns=["GPU config", "$/hour", "Break-even req/hr", "Break-even RPS"]
        )
        empty_workload = pd.DataFrame(columns=["Option", "$ / hour", "vs API"])
        return headline, empty_break, empty_workload, _empty_figure()

    # Break-even: the request rate at which the API bill equals the GPU
    # rental price.
    break_rows = []
    max_rps = 0.0
    for name, hourly in gpus:
        rph = hourly / cpr
        rps = rph / 3600
        max_rps = max(max_rps, rps)
        break_rows.append({
            "GPU config": name,
            "$/hour": f"${hourly:,.2f}",
            "Break-even req/hr": f"{rph:,.0f}",
            "Break-even RPS": f"{rps:,.3f}",
        })
    break_df = pd.DataFrame(break_rows)

    # Workload comparison: API bill at the planned rate versus each flat
    # GPU hourly price.
    api_hourly = planned_rps * 3600 * cpr
    workload_rows = [{
        "Option": "API",
        "$ / hour": f"${api_hourly:,.2f}",
        "vs API": "—",
    }]
    for name, hourly in gpus:
        diff = hourly - api_hourly
        if abs(diff) < 0.005:
            # Below half a cent the two-decimal display would show equal
            # amounts, so call it break-even.
            note = "break-even"
        elif diff < 0:
            note = f"−${abs(diff):,.2f}/hr cheaper than API"
        else:
            note = f"+${diff:,.2f}/hr pricier than API"
        workload_rows.append({
            "Option": name,
            "$ / hour": f"${hourly:,.2f}",
            "vs API": note,
        })
    workload_df = pd.DataFrame(workload_rows)

    # Extend the x-axis past both the furthest break-even point and the
    # planned workload, with a small floor so the axis is never degenerate.
    x_max = max(max_rps * 1.6, planned_rps * 1.3, 0.1)
    fig = _build_figure(cpr, gpus, x_max, planned_rps)
    return headline, break_df, workload_df, fig
def _headline(cpr, in_k, out_k, in_price, out_price):
return (
f"### API cost per request: **${cpr:,.6f}** \n"
f"_({int(in_k * 1000):,} in × ${in_price}/1M + {int(out_k * 1000):,} out × ${out_price}/1M)_"
)
def _empty_figure():
    """Return a placeholder figure shown when there is nothing to plot.

    The figure has no traces, only a centered annotation asking the user
    to provide usable inputs.
    """
    hint = {
        "text": "Set positive values for tokens, prices, and at least one GPU row.",
        "xref": "paper",
        "yref": "paper",
        "x": 0.5,
        "y": 0.5,
        "showarrow": False,
    }
    placeholder = go.Figure()
    placeholder.update_layout(template="plotly_white", height=480, annotations=[hint])
    return placeholder
def _build_figure(cpr, gpus, x_max, planned_rps):
    """Build the chart of API cost against flat GPU hourly prices.

    Args:
        cpr: API cost per request in dollars.
        gpus: Non-empty list of ``(name, hourly)`` pairs.
        x_max: Upper bound of the requests-per-second axis.
        planned_rps: Planned requests per second, marked on the chart.

    Returns:
        A plotly figure containing, in order: the API cost line, one dashed
        horizontal line per GPU (plus a break-even marker when it falls
        inside the x range), and the workload marker with its dotted
        vertical guide.
    """
    sample_count = 200
    rps_axis = [x_max * step / (sample_count - 1) for step in range(sample_count)]
    api_curve = [rate * 3600 * cpr for rate in rps_axis]

    figure = go.Figure()
    figure.add_trace(
        go.Scatter(
            x=rps_axis,
            y=api_curve,
            mode="lines",
            name="API cost",
            line={"color": "#222f3e", "width": 3},
            hovertemplate="RPS: %{x:.3f}<br>API $/hr: $%{y:,.2f}<extra></extra>",
        )
    )

    # Height of the workload guide line: the tallest thing on the chart
    # plus some headroom.
    priciest_gpu = max(price for _, price in gpus)
    y_top = max(api_curve[-1], priciest_gpu) * 1.18

    palette_size = len(GPU_COLORS)
    for index, (label, hourly_price) in enumerate(gpus):
        shade = GPU_COLORS[index % palette_size]

        # Flat rental price across the whole x range.
        figure.add_trace(
            go.Scatter(
                x=[0, x_max],
                y=[hourly_price, hourly_price],
                mode="lines",
                name=f"{label} (${hourly_price:.2f}/hr)",
                line={"color": shade, "width": 2, "dash": "dash"},
                hovertemplate=f"{label}<br>$/hr: ${hourly_price:,.2f}<extra></extra>",
            )
        )

        # Break-even rate: where the API line crosses this GPU's price.
        even_rph = hourly_price / cpr
        even_rps = even_rph / 3600
        if even_rps > x_max:
            continue
        crossing_hover = (
            f"{label} break-even<br>"
            f"RPS: {even_rps:.3f}<br>"
            f"req/hr: {even_rph:,.0f}<br>"
            f"$/hr: ${hourly_price:,.2f}<extra></extra>"
        )
        figure.add_trace(
            go.Scatter(
                x=[even_rps],
                y=[hourly_price],
                mode="markers+text",
                marker={
                    "color": shade,
                    "size": 11,
                    "line": {"color": "white", "width": 2},
                },
                text=[f" {even_rps:.3f} RPS"],
                textposition="middle right",
                textfont={"color": shade, "size": 12},
                showlegend=False,
                hovertemplate=crossing_hover,
            )
        )

    # Planned workload: dotted vertical guide, label, and a diamond on the
    # API line at that rate.
    workload_api_cost = planned_rps * 3600 * cpr
    figure.add_shape(
        type="line",
        x0=planned_rps,
        x1=planned_rps,
        y0=0,
        y1=y_top,
        line={"color": WORKLOAD_COLOR, "width": 2, "dash": "dot"},
    )
    figure.add_annotation(
        x=planned_rps,
        y=y_top,
        text=f"your workload: {planned_rps:.2f} RPS",
        showarrow=False,
        font={"color": WORKLOAD_COLOR, "size": 12},
        yshift=8,
    )
    figure.add_trace(
        go.Scatter(
            x=[planned_rps],
            y=[workload_api_cost],
            mode="markers",
            marker={
                "color": WORKLOAD_COLOR,
                "size": 11,
                "symbol": "diamond",
                "line": {"color": "white", "width": 2},
            },
            name="Your workload (on API)",
            hovertemplate=(
                f"At {planned_rps:.2f} RPS<br>"
                f"API $/hr: ${workload_api_cost:,.2f}<extra></extra>"
            ),
        )
    )

    figure.update_layout(
        template="plotly_white",
        height=480,
        margin={"l": 60, "r": 30, "t": 70, "b": 50},
        xaxis={"title": "Requests per second", "range": [0, x_max]},
        yaxis={"title": "$ / hour", "rangemode": "tozero"},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "left",
            "x": 0,
        },
        title={
            "text": "Cloud GPU $/hr vs API $/hr — where lines cross is break-even",
            "font": {"size": 14},
        },
    )
    return figure
def apply_model_preset(preset_name, cur_in, cur_out):
    """Return the ``(input, output)`` prices for the chosen model preset.

    Args:
        preset_name: Key looked up in ``MODEL_PRESETS``.
        cur_in: Current input price, returned unchanged when the preset
            has no prices attached.
        cur_out: Current output price, returned unchanged likewise.

    Returns:
        A 2-tuple of prices: the preset's own, or the current ones when
        the lookup yields ``None`` (unknown name or a ``None`` entry).
    """
    prices = MODEL_PRESETS.get(preset_name)
    if prices is not None:
        return prices[0], prices[1]
    return cur_in, cur_out
def apply_cloud_preset(preset_name, cur_df):
    """Return the GPU table rows for the chosen cloud preset.

    Args:
        preset_name: Key looked up in ``CLOUD_PRESETS``.
        cur_df: Current table value, returned unchanged when the preset
            has no rows attached.

    Returns:
        The preset's rows, or ``cur_df`` when the lookup yields ``None``
        (unknown name or a ``None`` entry).
    """
    preset_rows = CLOUD_PRESETS.get(preset_name)
    return cur_df if preset_rows is None else preset_rows
def reset_all():
    """Return the default value for every resettable widget.

    Returns:
        An 8-tuple: model preset name, cloud preset name, input price,
        output price, input tokens (k), output tokens (k), GPU table rows,
        and planned requests per second.
    """
    default_prices = MODEL_PRESETS[DEFAULT_MODEL]
    default_gpu_rows = CLOUD_PRESETS[DEFAULT_CLOUD]
    return (
        DEFAULT_MODEL,
        DEFAULT_CLOUD,
        default_prices[0],
        default_prices[1],
        DEFAULT_IN_K,
        DEFAULT_OUT_K,
        default_gpu_rows,
        DEFAULT_RPS,
    )
# UI layout: inputs in the left column, results in the right column.
with gr.Blocks(title="Cloud bills vs API bills") as demo:
    gr.Markdown(
        """
# Cloud bills vs API bills
At what request rate does renting GPUs beat paying per token?
Drag the **Your workload** slider to see live cost at your planned scale.
"""
    )

    with gr.Row():
        # Left column: pricing, request shape and GPU rates.
        with gr.Column(scale=1):
            gr.Markdown("### Model & API pricing")
            model_preset = gr.Dropdown(
                choices=list(MODEL_PRESETS),
                value=DEFAULT_MODEL,
                label="Model preset",
                info="Pick a preset or switch to Custom to enter your own prices.",
            )
            in_price = gr.Number(
                value=MODEL_PRESETS[DEFAULT_MODEL][0],
                label="Input $ / 1M tokens",
                precision=4,
                info="Effective input price (post-cache for OpenRouter-style providers).",
            )
            out_price = gr.Number(
                value=MODEL_PRESETS[DEFAULT_MODEL][1],
                label="Output $ / 1M tokens",
                precision=4,
            )

            gr.Markdown("### Request shape")
            in_tokens_k = gr.Slider(
                minimum=1,
                maximum=256,
                value=DEFAULT_IN_K,
                step=1,
                label="Input tokens / request (k)",
                info="64 means 64,000 tokens. Slide for typical context size.",
            )
            out_tokens_k = gr.Slider(
                minimum=0.1,
                maximum=32,
                value=DEFAULT_OUT_K,
                step=0.1,
                label="Output tokens / request (k)",
                info="4 means 4,000 tokens.",
            )

            gr.Markdown("### Cloud GPU rates")
            cloud_preset = gr.Dropdown(
                choices=list(CLOUD_PRESETS),
                value=DEFAULT_CLOUD,
                label="Cloud provider preset",
                info="Edit the table below to match your contract.",
            )
            gpu_df = gr.Dataframe(
                value=CLOUD_PRESETS[DEFAULT_CLOUD],
                headers=["Config", "$ / hour"],
                datatype=["str", "number"],
                column_count=(2, "fixed"),
                row_count=(3, "dynamic"),
                interactive=True,
            )
            reset_btn = gr.Button("↺ Reset to defaults", variant="secondary", size="sm")

        # Right column: workload slider, result tables and chart.
        with gr.Column(scale=2):
            gr.Markdown("### Your workload")
            planned_rps = gr.Slider(
                minimum=0,
                maximum=5,
                value=DEFAULT_RPS,
                step=0.05,
                label="Planned requests / second",
                info="What scale do you expect to run at? The dotted line on the chart marks this point.",
            )
            workload_table = gr.Dataframe(
                headers=["Option", "$ / hour", "vs API"],
                interactive=False,
                wrap=True,
            )

            gr.Markdown("### Break-even points")
            headline = gr.Markdown()
            break_table = gr.Dataframe(
                headers=["GPU config", "$/hour", "Break-even req/hr", "Break-even RPS"],
                interactive=False,
                wrap=True,
            )
            chart = gr.Plot()

    # Event wiring. The order of `inputs` matches compute()'s parameters and
    # the order of `outputs` matches its return tuple.
    inputs = [in_price, out_price, in_tokens_k, out_tokens_k, gpu_df, planned_rps]
    outputs = [headline, break_table, workload_table, chart]

    # Any input change recomputes all outputs.
    for component in inputs:
        component.change(compute, inputs=inputs, outputs=outputs)

    # Presets only write into the price fields / GPU table.
    model_preset.change(
        apply_model_preset,
        inputs=[model_preset, in_price, out_price],
        outputs=[in_price, out_price],
    )
    cloud_preset.change(
        apply_cloud_preset,
        inputs=[cloud_preset, gpu_df],
        outputs=[gpu_df],
    )

    # The order here matches the tuple returned by reset_all().
    reset_outputs = [
        model_preset,
        cloud_preset,
        in_price,
        out_price,
        in_tokens_k,
        out_tokens_k,
        gpu_df,
        planned_rps,
    ]
    reset_btn.click(reset_all, outputs=reset_outputs).then(
        compute, inputs=inputs, outputs=outputs
    )

    # Populate the outputs once when the page loads.
    demo.load(compute, inputs=inputs, outputs=outputs)

if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())