LocateAnything / index.html
fix-bot
ui: default OCR task to slow inference mode
3a01738
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NVIDIA LocateAnything - Fast Vision-Language Grounding</title>
<!-- Premium Google Fonts -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Outfit:wght@500;600;700;800;900&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
<!-- Tailwind CSS CDN -->
<script src="https://cdn.tailwindcss.com"></script>
<script>
// Tailwind Play CDN theme extension: registers the three font stacks loaded from
// Google Fonts plus the "nvidia" and "dark" color palettes used by the utility
// classes in this page. The block scope keeps the helper constants out of the
// global scope of this classic (non-module) script.
{
  const fontStacks = {
    sans: ['Inter', 'sans-serif'],
    outfit: ['Outfit', 'sans-serif'],
    mono: ['Fira Code', 'monospace'],
  };

  // Brand greens; "light" and "brand" currently share the same value.
  const nvidiaPalette = {
    light: '#76b900',
    brand: '#76b900',
    dark: '#5c9000',
    hover: '#87d300',
  };

  // Near-black surface shades, darkest at 400.
  const darkPalette = {
    50: '#222222',
    100: '#1a1a1a',
    200: '#121212',
    300: '#0a0a0a',
    400: '#050505',
  };

  tailwind.config = {
    theme: {
      extend: {
        fontFamily: fontStacks,
        colors: {
          nvidia: nvidiaPalette,
          dark: darkPalette,
        },
      },
    },
  };
}
</script>
<style>
body {
background-color: #050505;
background-image:
radial-gradient(circle at 10% 20%, rgba(118, 185, 0, 0.08) 0%, transparent 45%),
radial-gradient(circle at 90% 80%, rgba(99, 102, 241, 0.05) 0%, transparent 45%);
background-attachment: fixed;
}
/* NVIDIA-style Carbon Triangle Grid Pattern */
.carbon-grid {
background-image:
linear-gradient(30deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
linear-gradient(150deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
linear-gradient(30deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
linear-gradient(150deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
linear-gradient(60deg, #171717 25%, transparent 25.5%, transparent 75%, #171717 75.5%, #171717),
linear-gradient(60deg, #171717 25%, transparent 25.5%, transparent 75%, #171717 75.5%, #171717);
background-size: 80px 140px;
background-position: 0 0, 0 0, 40px 70px, 40px 70px, 0 0, 40px 70px;
}
/* Glassmorphism Styles */
.glass-panel {
background: rgba(18, 18, 18, 0.65);
backdrop-filter: blur(20px);
-webkit-backdrop-filter: blur(20px);
border: 1px solid rgba(255, 255, 255, 0.04);
box-shadow: 0 24px 64px 0 rgba(0, 0, 0, 0.7);
}
.glass-panel-interactive {
transition: all 0.4s cubic-bezier(0.16, 1, 0.3, 1);
}
.glass-panel-interactive:hover {
border-color: rgba(118, 185, 0, 0.25);
box-shadow: 0 30px 80px 0 rgba(118, 185, 0, 0.08);
transform: translateY(-2px);
}
/* SAM 3 Style Glassmorphic Float Input */
.sam-input-bar {
background: rgba(255, 255, 255, 0.06);
backdrop-filter: blur(25px);
-webkit-backdrop-filter: blur(25px);
border: 1px solid rgba(255, 255, 255, 0.08);
box-shadow: 0 16px 40px rgba(0, 0, 0, 0.5);
transition: all 0.3s cubic-bezier(0.16, 1, 0.3, 1);
}
.sam-input-bar:focus-within {
background: rgba(255, 255, 255, 0.09);
border-color: rgba(118, 185, 0, 0.6);
box-shadow: 0 20px 48px rgba(118, 185, 0, 0.15);
}
/* Hexagonal Glowing Border for Media Workspace (NVIDIA GTC Keynote Style) */
.gtc-polygon-wrapper {
position: relative;
background: #0f1218;
border: 1px solid rgba(118, 185, 0, 0.15);
box-shadow: 0 0 50px rgba(0, 0, 0, 0.8);
overflow: hidden;
clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
}
.gtc-polygon-wrapper::before {
content: '';
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
border: 2px solid #76b900;
pointer-events: none;
clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
opacity: 0.8;
box-shadow: inset 0 0 20px rgba(118, 185, 0, 0.3);
}
.gtc-neon-border {
position: absolute;
top: -2px;
left: -2px;
right: -2px;
bottom: -2px;
background: linear-gradient(135deg, #76b900, #3f6200, #76b900);
z-index: 0;
pointer-events: none;
opacity: 0.95;
clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
}
.gtc-inner-box {
position: relative;
background: #080a0e;
z-index: 10;
height: 100%;
clip-path: polygon(8.1% 0.1%, 99.9% 0.1%, 99.9% 91.9%, 91.9% 99.9%, 0.1% 99.9%, 0.1% 8.1%);
}
/* Pill Buttons styling */
.pill-btn-green {
background-color: #76b900;
transition: all 0.3s cubic-bezier(0.16, 1, 0.3, 1);
}
.pill-btn-green:hover {
background-color: #87d300;
box-shadow: 0 0 24px rgba(118, 185, 0, 0.45);
transform: translateY(-1px);
}
.pill-btn-green:active {
transform: translateY(1px);
}
/* Custom Scrollbar */
::-webkit-scrollbar {
width: 6px;
height: 6px;
}
::-webkit-scrollbar-track {
background: #0a0a0a;
}
::-webkit-scrollbar-thumb {
background: #222;
border-radius: 3px;
}
::-webkit-scrollbar-thumb:hover {
background: #333;
}
/* Pulse loaders.
   The keyframes are deliberately NOT named "pulse": Tailwind emits its own
   "@keyframes pulse" for the animate-pulse utility used elsewhere on this page,
   and when two keyframe sets share a name only the last one in the cascade is
   applied, so one animation would silently replace the other. */
.dot-pulse {
  animation: dot-pulse 1.4s infinite ease-in-out;
}
@keyframes dot-pulse {
  0%, 100% { opacity: 0.3; transform: scale(0.9); }
  50% { opacity: 1; transform: scale(1.1); }
}
.drop-zone-active {
border-color: #76b900 !important;
background: rgba(118, 185, 0, 0.04) !important;
}
/* Detection overlay tag pop-in (restored from previous demo) */
@keyframes det-pop {
0% { opacity: 0; transform: translateY(10px) scale(0.88); }
60% { opacity: 1; transform: translateY(-2px) scale(1.03); }
100% { opacity: 1; transform: translateY(0) scale(1); }
}
.det-tag-pop {
opacity: 0;
animation: det-pop 0.38s cubic-bezier(0.16, 1, 0.3, 1) forwards;
}
.det-count-pop {
animation: det-pop 0.35s cubic-bezier(0.16, 1, 0.3, 1) forwards;
}
/* Detected overlays: fixed height, internal scroll */
.detection-scroll {
min-height: 0;
overflow-y: auto;
overscroll-behavior: contain;
scroll-behavior: smooth;
scrollbar-width: thin;
scrollbar-color: rgba(118, 185, 0, 0.45) rgba(0, 0, 0, 0.2);
mask-image: linear-gradient(to bottom, black 88%, transparent 100%);
-webkit-mask-image: linear-gradient(to bottom, black 88%, transparent 100%);
}
.detection-scroll::-webkit-scrollbar {
width: 5px;
}
.detection-scroll::-webkit-scrollbar-thumb {
background: rgba(118, 185, 0, 0.45);
border-radius: 999px;
}
</style>
</head>
<body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
<!-- NVIDIA Brand Navigation Header (Transparent dark blur) -->
<nav class="bg-black/40 backdrop-blur-md sticky top-0 z-50 px-6 py-3.5 border-b border-white/5 shadow-lg">
<div class="max-w-[1600px] mx-auto flex items-center justify-between">
<!-- Official Styled NVIDIA Brand Text Logo -->
<a href="#" class="flex items-center gap-1.5 select-none group">
<svg class="h-6 w-6 text-nvidia-brand transition-transform duration-500 group-hover:rotate-180" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5">
<path stroke-linecap="round" stroke-linejoin="round" d="M9 3v2m6-2v2M9 19v2m6-2v2M5 9H3m2 6H3m18-6h-2m2 6h-2M7 19h10a2 2 0 002-2V7a2 2 0 00-2-2H7a2 2 0 00-2 2v10a2 2 0 002 2z" />
</svg>
<span class="font-outfit text-[22px] font-black tracking-tighter text-white">
NVIDIA <span class="font-light tracking-wide text-slate-400">LocateAnything</span>
</span>
</a>
<span class="px-3 py-1 text-xs font-semibold rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 flex items-center gap-1.5 font-mono">
<span class="h-1.5 w-1.5 rounded-full bg-nvidia-brand animate-pulse"></span>
ZeroGPU Server
</span>
</div>
</nav>
<!-- MAIN MINIMAL LAYOUT CONTAINER -->
<main class="max-w-[1600px] mx-auto px-4 sm:px-6 lg:px-8 pt-8 lg:pt-10 space-y-8">
<!-- Giant Showcase Container (SAM 3 Full-Bleed Style) -->
<div class="relative w-full rounded-[32px] overflow-hidden border border-white/5 bg-[#080a0e] shadow-2xl h-[580px] lg:h-[640px] flex select-none">
<!-- 1. Dedicated Media Canvas (offset right of the control panel on desktop) -->
<div class="absolute inset-y-0 right-0 left-0 lg:left-[440px] z-0 flex items-center justify-center bg-black/40 lg:border-l lg:border-white/10">
<!-- Drop Zone (Initially shown) -->
<div id="drop-zone" class="absolute inset-0 border-none rounded-none bg-transparent flex flex-col items-center justify-center p-4 text-center cursor-pointer transition-all z-10">
<div id="upload-prompt" class="space-y-3 opacity-60 hover:opacity-100 transition-opacity">
<div class="inline-flex h-12 w-12 rounded-full bg-white/5 items-center justify-center text-slate-300">
<svg class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2">
<path stroke-linecap="round" stroke-linejoin="round" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12" />
</svg>
</div>
<div>
<p class="text-xs font-bold text-slate-200">Drag &amp; drop your file here</p>
<p class="text-[10px] text-slate-500 mt-1">or click to browse local folders</p>
</div>
</div>
<!-- Dynamic Preview Media -->
<img id="preview-image" src="" alt="Input Preview" class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl z-20 border border-white/5">
<video id="preview-video" src="" controls class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl z-20 border border-white/5"></video>
<!-- File input: transparent and stretched over the whole drop zone so a click anywhere opens the picker.
     It has no visible label, so aria-label supplies its accessible name. -->
<input type="file" id="media-file-input" accept="image/*,video/*" aria-label="Upload an image or video" class="absolute inset-0 opacity-0 cursor-pointer z-30">
</div>
<!-- Inference Output Zone -->
<div class="absolute inset-0 pointer-events-none flex items-center justify-center z-20">
<img id="output-image" src="" alt="Inference Output" class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl pointer-events-auto border border-white/5">
<video id="output-video" src="" controls class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl pointer-events-auto border border-white/5"></video>
</div>
<!-- Processing Overlays -->
<div id="processing-overlay" class="absolute inset-0 bg-black/85 backdrop-blur-sm hidden flex-col items-center justify-center gap-4 z-40">
<div class="flex gap-1.5">
<span class="dot-pulse inline-block h-3 w-3 rounded-full bg-nvidia-brand" style="animation-delay: 0s;"></span>
<span class="dot-pulse inline-block h-3 w-3 rounded-full bg-emerald-400" style="animation-delay: 0.2s;"></span>
<span class="dot-pulse inline-block h-3 w-3 rounded-full bg-emerald-300" style="animation-delay: 0.4s;"></span>
</div>
<div class="text-center space-y-1">
<p id="processing-status" class="text-[11px] font-bold tracking-widest text-slate-200 uppercase">Executing Model...</p>
<p class="text-[9px] text-slate-500 uppercase tracking-wider font-mono">ZeroGPU Queue Active</p>
</div>
</div>
</div>
<!-- 2. Left Control Panel (Title, simple selectors, accordion, and action buttons) -->
<div class="absolute left-6 top-8 bottom-8 z-30 flex flex-col justify-between w-[380px] max-w-[calc(100%-3rem)] pointer-events-none">
<!-- Main Header Overlay text -->
<div class="space-y-3 pt-4 pointer-events-auto bg-gradient-to-b from-[#080a0e]/90 via-[#080a0e]/60 to-transparent p-4 rounded-2xl">
<span class="text-[9px] font-bold text-nvidia-brand uppercase tracking-widest block font-mono">AI Research from NVIDIA</span>
<h1 class="font-outfit text-3xl sm:text-5xl font-black tracking-tight text-white leading-none">
Locate<span class="text-nvidia-brand font-light">Anything</span>
</h1>
<p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
</p>
<p class="text-[9px] text-slate-500 max-w-sm leading-relaxed border-l-2 border-nvidia-brand/30 pl-2.5">
Note: inputs larger than 1K are auto-resized in this Space demo. For full-resolution inference, download the weights and run locally.
</p>
</div>
<!-- Setup Glass Card Controls -->
<div class="glass-panel rounded-2xl p-4 space-y-4 pointer-events-auto max-w-xs shadow-2xl">
<div class="grid grid-cols-2 gap-3">
<!-- Media type toggle: two buttons presented as a named group.
     The caption is a span referenced by aria-labelledby (a <label> must point at a single
     form control, which a button pair is not). type="button" makes the buttons inert if
     this markup is ever placed inside a form. -->
<div class="space-y-1">
  <span id="media-type-label" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Media Type</span>
  <div role="group" aria-labelledby="media-type-label" class="grid grid-cols-2 gap-0.5 bg-black/40 p-0.5 rounded-lg border border-white/5 text-center">
    <button type="button" id="media-type-image" class="py-1 rounded-md font-semibold text-[9px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10">
      Image
    </button>
    <button type="button" id="media-type-video" class="py-1 rounded-md font-semibold text-[9px] text-slate-400 hover:text-slate-200 transition-all">
      Video
    </button>
  </div>
</div>
<!-- Task Selector -->
<div class="space-y-1">
<label for="task-type" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Task Type</label>
<select id="task-type" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200 font-semibold">
<option value="Detection">Detection</option>
<option value="Grounding">Grounding</option>
<option value="OCR">OCR</option>
<option value="GUI">GUI</option>
<option value="Pointing">Pointing</option>
</select>
</div>
</div>
<!-- Advanced parameters sliders (Collapsible details inside the left overlay) -->
<details class="group border-t border-white/5 pt-3">
<summary class="list-none flex justify-between items-center cursor-pointer select-none text-[8px] font-bold text-slate-400 tracking-wider uppercase hover:text-slate-200 transition-colors">
<span>⚙️ Advanced parameters</span>
<svg class="h-3 w-3 transform group-open:rotate-180 transition-transform text-slate-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 9l-7 7-7-7" />
</svg>
</summary>
<div class="space-y-3 pt-3">
<!-- Inference Mode Selection -->
<div class="space-y-1">
<label for="inference-mode" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Inference Mode</label>
<select id="inference-mode" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200">
<option value="hybrid">Hybrid</option>
<option value="fast">Fast</option>
<option value="slow">Slow</option>
</select>
</div>
<!-- Short side resize cap -->
<div class="space-y-1">
<label for="short-size" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Resize Cap (px)</label>
<input type="number" id="short-size" placeholder="Auto-Cap (1024)" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200 font-mono">
</div>
<!-- Temp -->
<div class="space-y-1">
<div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
<span>Temperature</span>
<span id="temp-val" class="font-mono text-nvidia-brand">0.7</span>
</div>
<input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
</div>
<!-- Top P -->
<div class="space-y-1">
<div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
<span>Top P</span>
<span id="topp-val" class="font-mono text-nvidia-brand">0.9</span>
</div>
<input type="range" id="topp" min="0.05" max="1.0" step="0.05" value="0.9" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
</div>
<!-- Top K -->
<div class="space-y-1">
<div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
<span>Top K</span>
<span id="topk-val" class="font-mono text-nvidia-brand">20</span>
</div>
<input type="range" id="topk" min="1" max="100" step="1" value="20" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
</div>
<!-- Max video frames (always rendered; dimmed and disabled by default, presumably enabled when Video mode is selected) -->
<div id="video-frames-wrapper" class="space-y-1 opacity-50 pointer-events-none transition-opacity duration-300">
<div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
<span>Max Video Frames</span>
<span id="frames-val" class="font-mono text-nvidia-brand">4</span>
</div>
<input type="range" id="max-frames" min="1" max="10" step="1" value="4" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand" disabled>
</div>
</div>
</details>
<!-- Quick Start Guide -->
<details class="group border-t border-white/5 pt-3" open>
<summary class="list-none flex justify-between items-center cursor-pointer select-none text-[8px] font-bold text-nvidia-brand tracking-wider uppercase hover:text-nvidia-hover transition-colors">
<span>📖 How to Use</span>
<svg class="h-3 w-3 transform group-open:rotate-180 transition-transform text-slate-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 9l-7 7-7-7" />
</svg>
</summary>
<ol class="space-y-1.5 pt-2.5 text-[9px] text-slate-400 leading-relaxed list-decimal list-inside marker:text-nvidia-brand/70">
<li>Upload an <strong class="text-slate-300">Image</strong> or <strong class="text-slate-300">Video</strong>, or pick a Quick Sandbox example below.</li>
<li>Choose a <strong class="text-slate-300">Task Type</strong>: Detection · Grounding · OCR · GUI · Pointing.</li>
<li>Enter <strong class="text-slate-300">Categories</strong> in the search bar (comma-separated, e.g. <code class="text-nvidia-brand/80">car, person</code>).</li>
<li>Optionally tune <strong class="text-slate-300">Advanced parameters</strong> above (mode, resize, temperature, etc.).</li>
<li>Click <strong class="text-nvidia-brand">Run Inference</strong> or press <kbd class="px-1 py-0.5 rounded bg-white/5 border border-white/10 text-[8px]">Enter</kbd> in the search bar.</li>
</ol>
</details>
</div>
<!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
<div class="pointer-events-auto pt-2 max-w-xs">
<button id="run-btn" class="pill-btn-green w-full py-3 px-6 rounded-full text-black font-extrabold text-sm flex items-center justify-center gap-2 select-none shadow-2xl">
<span id="btn-icon">🧠</span>
<span id="btn-text">Run Inference</span>
</button>
</div>
</div>
<!-- 3. Floating Categories Search Bar (bottom-center of the dedicated image zone) -->
<div class="absolute bottom-6 left-0 right-0 lg:left-[440px] z-30 flex flex-col items-center gap-2 px-6 pointer-events-none">
<!-- Categories search bar.
     The text field has no visible label and the clear control is icon-only, so both get an
     aria-label; the SVG glyphs are decorative and hidden from assistive technology. -->
<div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full max-w-md pointer-events-auto">
  <svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5" aria-hidden="true">
    <path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
  </svg>
  <input type="text" id="categories" value="car, bus, person, potted plant" placeholder="Describe objects to locate..." aria-label="Categories to locate, comma-separated" class="bg-transparent border-none outline-none focus:outline-none w-full text-slate-100 placeholder-slate-600 font-semibold text-xs">
  <button type="button" id="clear-search-btn" aria-label="Clear categories" class="text-slate-500 hover:text-white transition-colors p-0.5 rounded-full hover:bg-white/5 shrink-0">
    <svg class="h-3.5 w-3.5" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5" aria-hidden="true">
      <path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
    </svg>
  </button>
</div>
<p class="text-[9px] text-slate-500 text-center leading-relaxed pointer-events-none px-1 max-w-md">
Comma-separated targets · supports English &amp; Chinese · press <span class="text-slate-400">Enter</span> to run
</p>
</div>
<!-- Floating Workspace Status (top-right of the image zone) -->
<div class="absolute top-4 right-4 z-30 bg-black/60 backdrop-blur px-2.5 py-1 rounded-lg border border-white/10 text-[9px] text-slate-400 font-mono select-none pointer-events-none">
status: <span id="workspace-status" class="text-slate-200 font-semibold">No Media Loaded</span>
</div>
</div>
<!-- Shelf Section (Examples and Log metrics placed directly below the giant showcase) -->
<div class="grid grid-cols-1 lg:grid-cols-12 gap-6 items-start">
<!-- Left: Examples Library Shelf (Col Span: 5) -->
<div class="lg:col-span-5 space-y-4">
<div class="glass-panel rounded-2xl p-5 space-y-4">
<span class="text-[9px] font-bold text-slate-400 uppercase tracking-widest block font-mono">🖼️ Interactive Quick Sandbox</span>
<div class="grid grid-cols-4 gap-3">
<!-- Card 1 -->
<div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Book" data-category="book" data-task="Detection" data-mode="hybrid" data-asset="assets/book.jpg">
<div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/book.jpg');"></div>
<span class="text-[9px] font-semibold text-slate-300 block truncate">Book</span>
</div>
<!-- Card 2 -->
<div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Sushi" data-category="sushi" data-task="Detection" data-mode="hybrid" data-asset="assets/sweet.jpg">
<div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/sweet.jpg');"></div>
<span class="text-[9px] font-semibold text-slate-300 block truncate">Sushi</span>
</div>
<!-- Card 3 -->
<div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Person" data-category="person" data-task="Detection" data-mode="hybrid" data-asset="assets/person.jpg">
<div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/person.jpg');"></div>
<span class="text-[9px] font-semibold text-slate-300 block truncate">People</span>
</div>
<!-- Card 4 -->
<div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="OCR" data-category="text" data-task="OCR" data-mode="slow" data-asset="assets/ocr.jpg">
<div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/ocr.jpg');"></div>
<span class="text-[9px] font-semibold text-slate-300 block truncate">OCR</span>
</div>
</div>
</div>
<!-- Text Prompt logs -->
<div class="glass-panel rounded-2xl p-4 text-[10px] text-slate-500 font-mono flex justify-between items-center select-none bg-black/40">
<span class="truncate block">compiled: <span id="raw-prompt-preview" class="text-slate-400"></span></span>
</div>
</div>
<!-- Right: Performance Metrics & Tag draw overlays (Col Span: 7) -->
<div class="lg:col-span-7 space-y-4">
<div class="glass-panel rounded-2xl p-5 space-y-4">
<div class="grid grid-cols-1 sm:grid-cols-12 gap-4 items-start">
<!-- Performance Statistics Metrics Console (Grid: 5) -->
<div class="sm:col-span-5 bg-black/60 rounded-xl p-4 border border-white/5 font-mono text-[10px] text-slate-300 space-y-2 leading-normal h-[168px]">
<div class="text-nvidia-brand font-bold border-b border-white/5 pb-1 mb-1.5 uppercase tracking-widest text-[9px] font-mono">📊 Metrics Log</div>
<div class="flex justify-between"><span class="text-slate-500">Status:</span> <span id="meta-status" class="text-emerald-500 font-semibold">Idle</span></div>
<div class="flex justify-between"><span class="text-slate-500">Tokens/Frames:</span> <span id="meta-tokens">-</span></div>
<div class="flex justify-between"><span class="text-slate-500">Detections:</span> <span id="meta-boxes">-</span></div>
<div class="flex justify-between"><span class="text-slate-500">TPS / BPS:</span> <span><span id="meta-tps">-</span> / <span id="meta-bps">-</span></span></div>
<div class="flex justify-between"><span class="text-slate-500">Time:</span> <span id="meta-time">-</span></div>
</div>
<!-- Tag drawer box list (Grid: 7) -->
<div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col h-[168px] overflow-hidden">
<div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
<span>🎯 Detected Target Overlays</span>
<span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
</div>
<div id="detection-tags-wrapper" class="detection-scroll flex-1 flex flex-col gap-1.5 pt-1 text-[10px] text-slate-500">
<div id="detection-empty-hint" class="space-y-1.5 leading-relaxed">
<p>Run inference to populate detected targets here — each result will pop in one by one.</p>
<p class="text-[9px] text-slate-600">Adjustable: Task Type · Categories · Inference Mode · Resize Cap · Temperature · Top P/K · Max Video Frames</p>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Full-width Decoding Trace (always visible, no nested scroll) -->
<div id="rich-trace-section" class="glass-panel rounded-2xl p-5">
<div id="rich-trace-log" class="text-[10px]">
<div class="rounded-xl border border-dashed border-white/10 bg-black/30 p-6 text-center text-[10px] text-slate-500 leading-relaxed">
<p class="text-slate-400 font-semibold mb-1">Decoding Trace</p>
<p>Run inference to watch model tokens pop in here — ref labels, box coords, and stats shown in full without scrolling sideways.</p>
</div>
</div>
</div>
</main>
<!-- Gradio client connection & app runtime logic -->
<script type="module">
import { client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
// State variables (mutable, module-level; shared by the handlers in this script)
// Currently selected media kind; starts as "Image" and is reassigned by setMediaType().
let selectedMediaType = "Image";
// Starts as null; presumably holds the user's chosen/uploaded file — it is not assigned in the visible part of this script, TODO confirm.
let activeFile = null;
// Cached Gradio client; assigned by getClient() after the first successful connection.
let clientInstance = null;
// Cache elements
// Every lookup below runs once at module evaluation; getElementById returns null
// when the id is missing, so each constant is only as valid as the markup above.
const mediaTypeImageBtn = document.getElementById("media-type-image");
const mediaTypeVideoBtn = document.getElementById("media-type-video");
const videoFramesWrapper = document.getElementById("video-frames-wrapper");
const taskTypeSelect = document.getElementById("task-type");
const categoriesInput = document.getElementById("categories");
const clearSearchBtn = document.getElementById("clear-search-btn");
const inferenceModeSelect = document.getElementById("inference-mode");
const rawPromptPreview = document.getElementById("raw-prompt-preview");
// Advanced Controls Elements (each slider is paired with the span that displays its value)
const tempSlider = document.getElementById("temp");
const tempVal = document.getElementById("temp-val");
const toppSlider = document.getElementById("topp");
const toppVal = document.getElementById("topp-val");
const topkSlider = document.getElementById("topk");
const topkVal = document.getElementById("topk-val");
const shortSizeInput = document.getElementById("short-size");
const maxFramesSlider = document.getElementById("max-frames");
const maxFramesVal = document.getElementById("frames-val");
// Workspace Preview elements
const dropZone = document.getElementById("drop-zone");
const uploadPrompt = document.getElementById("upload-prompt");
const previewImage = document.getElementById("preview-image");
const previewVideo = document.getElementById("preview-video");
const fileInput = document.getElementById("media-file-input");
const workspaceStatus = document.getElementById("workspace-status");
// Output result elements
// NOTE(review): no element with id "output-empty" appears in the markup above this
// script, so outputEmpty is null unless that element is defined elsewhere — confirm
// before dereferencing it.
const outputEmpty = document.getElementById("output-empty");
const outputImage = document.getElementById("output-image");
const outputVideo = document.getElementById("output-video");
// Overlay and run button
const runBtn = document.getElementById("run-btn");
const btnText = document.getElementById("btn-text");
const btnIcon = document.getElementById("btn-icon");
const processingOverlay = document.getElementById("processing-overlay");
const processingStatus = document.getElementById("processing-status");
// Logging & Trace elements
const metaStatus = document.getElementById("meta-status");
const metaTokens = document.getElementById("meta-tokens");
const metaBoxes = document.getElementById("meta-boxes");
const metaTps = document.getElementById("meta-tps");
const metaBps = document.getElementById("meta-bps");
const metaTime = document.getElementById("meta-time");
const detectionTagsWrapper = document.getElementById("detection-tags-wrapper");
const detectionCountBadge = document.getElementById("detection-count-badge");
const richTraceLog = document.getElementById("rich-trace-log");
// Idle-state markup for the decoding-trace panel (#rich-trace-log). It repeats the
// static placeholder already present inside that element in the page markup, so the
// two copies must be kept in sync when the wording changes.
const TRACE_PLACEHOLDER_HTML = `
<div class="rounded-xl border border-dashed border-white/10 bg-black/30 p-6 text-center text-[10px] text-slate-500 leading-relaxed">
<p class="text-slate-400 font-semibold mb-1">Decoding Trace</p>
<p>Run inference to watch model tokens pop in here — ref labels, box coords, and stats shown in full without scrolling sideways.</p>
</div>`;
/** Restores the idle placeholder in the decoding-trace panel. */
function setTracePlaceholder() {
  setTraceHtml(TRACE_PLACEHOLDER_HTML);
}

/** Shows the pulsing "building" message in the decoding-trace panel. */
function setTraceProcessing() {
  setTraceHtml('<p class="text-slate-400 animate-pulse p-4 text-center">Building decoding trace...</p>');
}

/**
 * Replaces the content of the decoding-trace panel.
 *
 * @param {string} [html] Markup to display; any falsy value shows the idle placeholder instead.
 *
 * NOTE(review): the markup is assigned through innerHTML without sanitizing —
 * confirm callers only pass trusted, server-generated HTML.
 */
function setTraceHtml(html) {
  const markup = html ? html : TRACE_PLACEHOLDER_HTML;
  richTraceLog.innerHTML = markup;
}
/**
 * Returns the shared Gradio client, connecting to this page's own origin on first use.
 *
 * A successful connection is cached in `clientInstance` and reused by later calls.
 * When connecting fails the error is logged, the user is alerted, and the still-unset
 * `clientInstance` is returned, so callers must handle a falsy result.
 *
 * @returns {Promise<*>} The cached client, or the unset value after a failed attempt.
 */
async function getClient() {
  // Fast path: a previous call already connected.
  if (clientInstance) {
    return clientInstance;
  }
  try {
    clientInstance = await client(window.location.origin);
  } catch (connectionError) {
    console.error("Gradio Server connection failed:", connectionError);
    alert("Could not connect to Gradio backend. Ensure the server is active.");
  }
  return clientInstance;
}
/**
 * Wires up the live UI feedback:
 *  - each advanced-parameter slider mirrors its value into its readout span,
 *  - the clear button empties and refocuses the categories field,
 *  - task and category changes refresh the "compiled" prompt preview,
 *  - selecting the OCR task switches the inference mode to "slow".
 * The prompt preview is also rendered once immediately.
 */
function setupLiveUpdaters() {
  // Rebuilds the prompt preview from the current task and categories.
  function triggerPromptUpdate() {
    const task = taskTypeSelect.value;
    const cat = categoriesInput.value;
    rawPromptPreview.textContent = generateRawPromptText(task, cat);
  }

  // Slider -> readout pairs, wired in the same order as they appear in the panel.
  const sliderReadouts = [
    [tempSlider, tempVal],
    [toppSlider, toppVal],
    [topkSlider, topkVal],
    [maxFramesSlider, maxFramesVal],
  ];
  for (const [slider, readout] of sliderReadouts) {
    slider.addEventListener("input", (event) => {
      readout.textContent = event.target.value;
    });
  }

  // Clear search categories button
  clearSearchBtn.addEventListener("click", () => {
    categoriesInput.value = "";
    categoriesInput.focus();
    triggerPromptUpdate();
  });

  taskTypeSelect.addEventListener("change", () => {
    // OCR defaults to slow (standard AR decoding) for best text accuracy
    const isOcr = taskTypeSelect.value === "OCR";
    if (isOcr) {
      inferenceModeSelect.value = "slow";
    }
    triggerPromptUpdate();
  });

  categoriesInput.addEventListener("input", triggerPromptUpdate);

  // Show the preview for the initial field values.
  triggerPromptUpdate();
}
/**
 * Builds the prompt text shown in the "compiled:" preview (per the original
 * comment, this mirrors the Python-side prompt builder).
 *
 * @param {string} taskType One of "Detection", "Grounding", "OCR", "GUI", "Pointing";
 *   anything else uses the Detection wording.
 * @param {string} category Comma-separated targets. A falsy value is replaced by
 *   "objects"; entries are trimmed, empty entries dropped, and the rest joined with "</c>".
 * @returns {string} The prompt sentence for the task. The OCR prompt ignores the categories.
 */
function generateRawPromptText(taskType, category) {
  const rawCategories = category ? category : "objects";
  const cats = rawCategories
    .split(",")
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0)
    .join("</c>");

  if (taskType === "Grounding") {
    return `Locate all the instances that match the following description: ${cats}.`;
  }
  if (taskType === "OCR") {
    return "Detect all the text in box format.";
  }
  if (taskType === "GUI") {
    return `Locate the region that matches the following description: ${cats}.`;
  }
  if (taskType === "Pointing") {
    return `Point to: ${cats}.`;
  }
  // "Detection" and every unrecognized task share the same wording.
  return `Locate all the instances that matches the following description: ${cats}.`;
}
/**
 * Formats a detection's coordinates for display.
 *
 * @param {{coords?: Array}} det Detection record; `coords` may be missing or empty.
 * @returns {string} The coordinates joined with ", ", with finite numbers rounded to
 *   the nearest integer and other values left as they are; "" when there are none.
 */
function formatDetectionCoords(det) {
  const values = det.coords ? det.coords : [];
  if (values.length) {
    return values
      .map((value) => (Number.isFinite(value) ? Math.round(value) : value))
      .join(", ");
  }
  return "";
}
// Renders one tag card per detection into the detection tags wrapper.
//   detections: array of objects read for `type`, `label`, `frame` and
//               `coords`; a non-array value is treated as an empty list.
// Cards are appended one at a time, 80 ms apart, and the count badge is
// updated 120 ms after the last card is scheduled.
//
// Because the cards are added from timers, a later render or any other code
// that replaces the wrapper's content could otherwise receive leftover cards
// and a leftover count from this pass. Every timer therefore checks
// `isStale()` before touching the DOM.
function renderDetectionTags(detections) {
  const items = Array.isArray(detections) ? detections : [];

  // Unique token for this render pass; a newer call replaces it, which
  // invalidates all timers still pending from this one.
  const runToken = {};
  renderDetectionTags.activeRun = runToken;

  // Most recently appended card. If it is no longer a child of the wrapper,
  // someone else has replaced the wrapper's content in the meantime.
  let lastCard = null;
  const isStale = () => {
    if (renderDetectionTags.activeRun !== runToken) return true;
    return lastCard
      ? lastCard.parentNode !== detectionTagsWrapper
      : detectionTagsWrapper.firstChild !== null;
  };

  detectionTagsWrapper.innerHTML = "";
  detectionCountBadge.textContent = "0";
  detectionCountBadge.classList.remove("det-count-pop");

  if (!items.length) {
    detectionTagsWrapper.innerHTML = '<p class="text-slate-500">No objects matched the given categories.</p>';
    return;
  }

  // Animate count badge after tags finish popping in
  const countDelay = items.length * 80 + 120;
  setTimeout(() => {
    if (isStale()) return;
    detectionCountBadge.textContent = items.length;
    detectionCountBadge.classList.add("det-count-pop");
  }, countDelay);

  items.forEach((det, idx) => {
    setTimeout(() => {
      if (isStale()) return;

      const card = document.createElement("div");
      card.className = "det-tag-pop flex items-center justify-between gap-2 px-2 py-1.5 rounded-lg bg-nvidia-brand/8 border border-nvidia-brand/20 hover:border-nvidia-brand/40 transition-colors";
      card.style.animationDelay = "0s";

      const labelWrap = document.createElement("div");
      labelWrap.className = "flex items-center gap-1.5 min-w-0";

      const typeBadge = document.createElement("span");
      typeBadge.className = "shrink-0 px-1 py-0.5 rounded text-[7px] font-bold uppercase tracking-wider bg-black/40 text-nvidia-brand border border-nvidia-brand/25";
      typeBadge.textContent = det.type || "box";

      const label = document.createElement("span");
      label.className = "font-bold uppercase tracking-wider text-[9px] text-nvidia-brand truncate";
      // Apply the "object" fallback in both branches so a frame-tagged
      // detection without a label never renders the text "undefined".
      const labelText = det.label || "object";
      label.textContent = det.frame ? `[F${det.frame}] ${labelText}` : labelText;

      labelWrap.appendChild(typeBadge);
      labelWrap.appendChild(label);

      const coords = document.createElement("span");
      coords.className = "shrink-0 font-mono text-[8px] text-slate-500";
      const coordStr = formatDetectionCoords(det);
      coords.textContent = coordStr ? `[${coordStr}]` : "";

      card.appendChild(labelWrap);
      card.appendChild(coords);
      detectionTagsWrapper.appendChild(card);
      lastCard = card;

      // Keep the newest card visible when the wrapper scrolls.
      detectionTagsWrapper.scrollTop = detectionTagsWrapper.scrollHeight;
    }, idx * 80);
  });
}
// Restores the detection panel to its idle state: the count badge goes back
// to "0" without its pop animation class, and the tag wrapper shows the
// "run inference" hint together with the list of adjustable settings.
function resetDetectionTagsPlaceholder() {
  const badge = detectionCountBadge;
  const hintMarkup = `
<div id="detection-empty-hint" class="space-y-1.5 leading-relaxed">
<p>Run inference to populate detected targets here — each result will pop in one by one.</p>
<p class="text-[9px] text-slate-600">Adjustable: Task Type · Categories · Inference Mode · Resize Cap · Temperature · Top P/K · Max Video Frames</p>
</div>`;

  badge.classList.remove("det-count-pop");
  badge.textContent = "0";
  detectionTagsWrapper.innerHTML = hintMarkup;
}
// Switch workspace input styles without clearing.
//   type: "Image" selects image mode; any other value selects video mode.
// Records the choice in `selectedMediaType`, restyles the two toggle buttons,
// hides and disables the max-frames control in image mode (shows and enables
// it in video mode), narrows the file picker's accepted types, and refreshes
// the workspace status line. The currently loaded file is left untouched.
function setMediaType(type) {
  const ACTIVE_BUTTON = "py-1.5 rounded-lg font-semibold text-[10px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10";
  const IDLE_BUTTON = "py-1.5 rounded-lg font-semibold text-[10px] text-slate-400 hover:text-slate-200 transition-all";
  const imageMode = type === "Image";

  selectedMediaType = type;

  mediaTypeImageBtn.className = imageMode ? ACTIVE_BUTTON : IDLE_BUTTON;
  mediaTypeVideoBtn.className = imageMode ? IDLE_BUTTON : ACTIVE_BUTTON;

  // The frame-count control only applies to video input.
  for (const cls of ["hidden", "opacity-50", "pointer-events-none"]) {
    videoFramesWrapper.classList.toggle(cls, imageMode);
  }
  maxFramesSlider.disabled = imageMode;

  fileInput.accept = imageMode ? "image/*" : "video/*";

  const loadedLabel = imageMode ? "Image Loaded" : "Video Loaded";
  workspaceStatus.textContent = activeFile ? loadedLabel : "No Media Loaded";
}
// Reset elements: drops the active file, hides and empties the input
// previews and the output image/video, shows the upload prompt and the
// empty-output placeholder again, and resets the detection panel.
function clearWorkspace() {
  // Previews may point at blob: URLs created with URL.createObjectURL.
  // Those keep the underlying data alive until revoked, so release them
  // before the src attributes are overwritten.
  [previewImage, previewVideo].forEach((el) => {
    const currentSrc = el.getAttribute("src") || "";
    if (currentSrc.startsWith("blob:")) {
      URL.revokeObjectURL(currentSrc);
    }
  });

  activeFile = null;

  previewImage.src = "";
  previewImage.classList.add("hidden");
  previewVideo.src = "";
  previewVideo.classList.add("hidden");
  uploadPrompt.classList.remove("hidden");

  if (outputEmpty) outputEmpty.classList.remove("hidden");
  outputImage.src = "";
  outputImage.classList.add("hidden");
  outputVideo.src = "";
  outputVideo.classList.add("hidden");

  workspaceStatus.textContent = "Workspace Cleared";
  resetDetectionTagsPlaceholder();
}
// Drag and drop utilities: wires the drop zone highlight, dropped files and
// the file picker to handleFileImport. Only the first file of a drop or a
// selection is imported.
function setupDragDrop() {
  // Highlight the drop zone while something is dragged over it.
  ['dragenter', 'dragover'].forEach(eventName => {
    dropZone.addEventListener(eventName, (e) => {
      e.preventDefault();
      dropZone.classList.add('drop-zone-active');
    }, false);
  });

  // Remove the highlight when the drag leaves or ends in a drop;
  // preventDefault stops the browser's own handling of the dropped file.
  ['dragleave', 'drop'].forEach(eventName => {
    dropZone.addEventListener(eventName, (e) => {
      e.preventDefault();
      dropZone.classList.remove('drop-zone-active');
    }, false);
  });

  dropZone.addEventListener('drop', (e) => {
    const dt = e.dataTransfer;
    const file = dt && dt.files ? dt.files[0] : null;
    if (file) handleFileImport(file);
  });

  fileInput.addEventListener('change', (e) => {
    const file = e.target.files[0];
    if (file) handleFileImport(file);
    // Clear the picker so choosing the same file again (for example after
    // the workspace was cleared) still fires a change event. The File object
    // already handed to handleFileImport stays usable.
    e.target.value = "";
  });
}
// Display imported media.
//   file: a File whose MIME type starts with "image/" or "video/".
// Switches the workspace to the matching media type, stores the file as the
// active file and shows it in the corresponding preview element. Files of
// any other type are rejected with a status message and leave the workspace
// (including the upload prompt) exactly as it was.
function handleFileImport(file) {
  const mime = (file && file.type) || "";
  const isImage = mime.startsWith("image/");
  const isVideo = mime.startsWith("video/");

  if (!isImage && !isVideo) {
    // Hiding the upload prompt here would leave an empty workspace with
    // nothing to click on, so bail out before touching the layout.
    const shownName = file && file.name ? file.name : "unknown";
    workspaceStatus.textContent = `Unsupported file type: ${shownName}`;
    return;
  }

  uploadPrompt.classList.add("hidden");

  if (isImage) {
    setMediaType("Image");
    activeFile = file;
    const reader = new FileReader();
    reader.onload = (e) => {
      // Reading is asynchronous; skip the update if another file was loaded
      // or the workspace was cleared while this one was being read.
      if (activeFile !== file) return;
      previewImage.src = e.target.result;
      previewImage.classList.remove("hidden");
      previewVideo.classList.add("hidden");
    };
    reader.readAsDataURL(file);
    workspaceStatus.textContent = `Image Loaded: ${file.name}`;
  } else {
    setMediaType("Video");
    activeFile = file;
    // Release the blob: URL of the video being replaced, if any, so its
    // data is not kept alive for the lifetime of the page.
    const previousSrc = previewVideo.getAttribute("src") || "";
    if (previousSrc.startsWith("blob:")) {
      URL.revokeObjectURL(previousSrc);
    }
    previewVideo.src = URL.createObjectURL(file);
    previewVideo.classList.remove("hidden");
    previewImage.classList.add("hidden");
    workspaceStatus.textContent = `Video Loaded: ${file.name}`;
  }
}
// Utility to fetch a preloaded example asset and convert it to a File
// Downloads `url` and wraps the response body in a File named `filename`,
// using the MIME type reported by the downloaded blob.
// Resolves to the File on success, or to null when the request fails or the
// server answers with a non-2xx status; the failure is logged to the console.
async function loadExampleFromAsset(url, filename) {
  try {
    const response = await fetch(url);
    // fetch() only rejects on network errors. Without this check a 404/500
    // error page would be packaged as if it were the example media.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} while fetching ${url}`);
    }
    const blob = await response.blob();
    return new File([blob], filename, { type: blob.type });
  } catch (err) {
    console.error("Failed to load example asset:", err);
    return null;
  }
}
// Initialize preloaded examples click actions.
// Each ".example-card" carries its configuration in data attributes
// (data-type, data-name, data-category, data-task, data-mode, data-asset).
// Clicking a card clears the workspace, applies those settings to the
// controls, downloads the asset and shows it in the matching preview.
function setupExamples() {
  // Sequence number of the most recent card click. The asset download is
  // asynchronous, so a slower earlier click must not overwrite the result of
  // a newer one once it finally resolves.
  let latestRequestId = 0;

  document.querySelectorAll(".example-card").forEach(card => {
    card.addEventListener("click", async () => {
      const requestId = ++latestRequestId;

      const type = card.getAttribute("data-type");
      const name = card.getAttribute("data-name");
      const category = card.getAttribute("data-category");
      const task = card.getAttribute("data-task");
      const mode = card.getAttribute("data-mode");
      const assetPath = card.getAttribute("data-asset"); // e.g. "assets/book.jpg"

      clearWorkspace();
      workspaceStatus.textContent = `Loading ${name} example...`;

      // Set parameters
      taskTypeSelect.value = task;
      categoriesInput.value = category;
      inferenceModeSelect.value = mode;

      // Trigger live prompt update.
      // NOTE(review): the task-type change handler sets the inference mode to
      // "slow" when the task is OCR, so an OCR card's data-mode assigned just
      // above is overwritten by this dispatch. Confirm that is intended.
      taskTypeSelect.dispatchEvent(new Event("change"));

      // Setup Media type
      setMediaType(type);

      // Fetch asset file with robust absolute URL resolution (works in iframe)
      const ext = type === "Image" ? "jpg" : "mp4";
      const resolvedAssetUrl = new URL(assetPath, window.location.href).href;
      console.log("Fetching example from:", resolvedAssetUrl);

      const file = await loadExampleFromAsset(resolvedAssetUrl, `${name.toLowerCase()}.${ext}`);

      // Another card was clicked while this asset was downloading; that
      // click owns the workspace now, so drop this result.
      if (requestId !== latestRequestId) return;

      if (file) {
        activeFile = file;
        uploadPrompt.classList.add("hidden");
        if (type === "Image") {
          previewImage.src = URL.createObjectURL(file);
          previewImage.classList.remove("hidden");
          previewVideo.classList.add("hidden");
          workspaceStatus.textContent = `Example Image Loaded: ${name}`;
        } else {
          previewVideo.src = URL.createObjectURL(file);
          previewVideo.classList.remove("hidden");
          previewImage.classList.add("hidden");
          workspaceStatus.textContent = `Example Video Loaded: ${name}`;
        }
      } else {
        workspaceStatus.textContent = `Failed to load ${name} example`;
      }
    });
  });
}
// Execution logic: sends the active media file and the current control
// values to the "/run_inference" endpoint through the Gradio client, then
// renders the annotated output, the statistics, the detection tags and the
// decoding trace. While the request is in flight the run button is disabled
// and the processing overlay is shown; both are restored in every outcome.
// Failures are logged, shown in the detection panel and reported via alert().
async function executeInference() {
  // The Enter-key shortcut calls this function directly, bypassing the
  // disabled run button, so ignore requests while a run is in progress.
  if (runBtn.disabled) return;

  if (!activeFile) {
    alert("Please upload a media file (Image or Video) or select an example first.");
    return;
  }

  // Snapshot the inputs now so the request and the result handling agree on
  // one file and media type even if the globals change during an await.
  const mediaFile = activeFile;
  const mediaType = selectedMediaType;

  // Set loading state
  runBtn.disabled = true;
  btnText.textContent = "⏳ Queueing Request...";
  btnIcon.textContent = "🔒";
  processingOverlay.classList.remove("hidden");
  processingStatus.textContent = "Waiting for Gradio queue...";

  // Clean outputs
  if (outputEmpty) outputEmpty.classList.add("hidden");
  outputImage.classList.add("hidden");
  outputVideo.classList.add("hidden");
  setTraceProcessing();
  metaStatus.textContent = "Processing...";
  metaStatus.className = "text-yellow-500 font-semibold";
  detectionTagsWrapper.innerHTML = '<p class="text-slate-400 animate-pulse">Processing objects in backend...</p>';
  detectionCountBadge.textContent = "0";
  detectionCountBadge.classList.remove("det-count-pop");

  try {
    const clientInstance = await getClient();
    if (!clientInstance) {
      throw new Error("Unable to create Gradio Client instance.");
    }

    // Handle file parameter wrapping using Gradio client handle_file
    const wrappedFile = handle_file(mediaFile);
    const imageFile = (mediaType === "Image") ? wrappedFile : null;
    const videoFile = (mediaType === "Video") ? wrappedFile : null;

    // Collect configuration values (explicit radix keeps parsing decimal)
    const taskType = taskTypeSelect.value;
    const category = categoriesInput.value;
    const modelMode = inferenceModeSelect.value;
    const temp = parseFloat(tempSlider.value);
    const topp = parseFloat(toppSlider.value);
    const topk = parseInt(topkSlider.value, 10);
    const shortSize = shortSizeInput.value ? parseInt(shortSizeInput.value, 10) : null;
    const maxVideoFrames = parseInt(maxFramesSlider.value, 10);

    processingStatus.textContent = "Running Vision Model (duration-locked)...";

    // Execute predictions using named parameters object matching app.py signature
    const result = await clientInstance.predict("/run_inference", {
      input_type: mediaType,
      image_file: imageFile,
      video_file: videoFile,
      task_type: taskType,
      category: category,
      model_mode: modelMode,
      temp: temp,
      top_p: topp,
      top_k: topk,
      short_size: shortSize,
      question_override: null,
      max_video_frames: maxVideoFrames
    });
    console.log("Inference complete. API outputs:", result);

    // Unpack result values
    const [outImageObj, outVideoObj, meta] = result.data;
    // A missing meta object is reported as a backend failure instead of
    // surfacing as an opaque TypeError.
    if (!meta || !meta.success) {
      throw new Error((meta && meta.error) || "Backend returned processing failure.");
    }

    if (mediaType === "Image" && outImageObj) {
      // Process image result
      outputImage.src = outImageObj.url;
      outputImage.classList.remove("hidden");
      outputVideo.classList.add("hidden");
    } else if (mediaType === "Video" && outVideoObj) {
      // Process video result
      outputVideo.src = outVideoObj.url;
      outputVideo.classList.remove("hidden");
      outputImage.classList.add("hidden");
    } else if (outputEmpty) {
      // Success without any media: show the placeholder rather than
      // leaving the output panel blank.
      outputEmpty.classList.remove("hidden");
    }

    // Render metrics logs
    metaStatus.textContent = "Success";
    metaStatus.className = "text-emerald-500 font-semibold";
    const stats = meta.stats || {};
    metaTokens.textContent = stats.num_tokens || stats.total_frames || "-";
    metaBoxes.textContent = stats.num_boxes || stats.processed_frames || "-";
    metaTps.textContent = stats.tps || "-";
    metaBps.textContent = stats.bps || "-";
    metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";

    // Render detection tags with staggered pop-in animation
    renderDetectionTags(meta.detections || []);

    // Render decoding trace (token-by-token pop animation from previous version)
    setTraceHtml(meta.html);
  } catch (err) {
    console.error("Execution failed:", err);
    const message = err && err.message ? err.message : String(err);
    metaStatus.textContent = "Error";
    metaStatus.className = "text-red-500 font-semibold";

    // The message can contain text returned by the backend, so insert it as
    // text content rather than as HTML.
    const failureNote = document.createElement("span");
    failureNote.className = "text-red-400";
    failureNote.textContent = `Failed: ${message}`;
    detectionTagsWrapper.innerHTML = "";
    detectionTagsWrapper.appendChild(failureNote);

    setTracePlaceholder();
    alert(`Inference failed: ${message}`);
    if (outputEmpty) outputEmpty.classList.remove("hidden");
  } finally {
    // Restore UI state
    runBtn.disabled = false;
    btnText.textContent = "Run Inference";
    btnIcon.textContent = "🧠";
    processingOverlay.classList.add("hidden");
  }
}
// Add event listeners on load: media-type toggles, the run button, the
// Enter-key shortcut in the categories input, then the live prompt updaters,
// drag-and-drop handling and the example cards.
document.addEventListener("DOMContentLoaded", () => {
  // Switching to a different media type also clears the workspace;
  // clicking the already-selected type does nothing.
  const bindMediaToggle = (button, mediaType) => {
    button.addEventListener("click", () => {
      if (selectedMediaType === mediaType) return;
      setMediaType(mediaType);
      clearWorkspace();
    });
  };
  bindMediaToggle(mediaTypeImageBtn, "Image");
  bindMediaToggle(mediaTypeVideoBtn, "Video");

  runBtn.addEventListener("click", executeInference);

  // Bind enter key press in Categories float bar input
  categoriesInput.addEventListener("keydown", (event) => {
    if (event.key !== "Enter") return;
    event.preventDefault();
    executeInference();
  });

  setupLiveUpdaters();
  setupDragDrop();
  setupExamples();
});
</script>
</body>
</html>