#!/usr/bin/env python3
# Copyright (c) 2025 Delanoe Pirard / Aedelon - Apache 2.0
"""
Full Benchmark Suite for Depth Anything 3
Tests ALL optimization combinations for each device (CPU, MPS, CUDA).
Optimizations tested:
- Preprocessing: CPU (PIL) vs GPU (NVJPEG on CUDA)
- Attention: SDPA (Flash Attention) vs Manual
Usage:
python benchmarks/full_benchmark.py # Best device only
python benchmarks/full_benchmark.py -d all # All devices
python benchmarks/full_benchmark.py -d cuda # CUDA only
python benchmarks/full_benchmark.py --quick # Quick mode
"""
import argparse
import gc
import logging
import os
import shutil
import sys
import time
import warnings
from dataclasses import dataclass
from typing import Dict, List
# Suppress ALL logging before the heavy imports below
logging.disable(logging.CRITICAL)
os.environ["DA3_LOG_LEVEL"] = "ERROR"
warnings.filterwarnings("ignore")
import numpy as np
import torch
from PIL import Image
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
# Suppress depth_anything_3 logger specifically
logging.getLogger("depth_anything_3").disabled = True
logging.getLogger("dinov2").disabled = True
# ============================================================================
# STYLES
# ============================================================================
class Style:
CYAN = "\033[96m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
BOLD = "\033[1m"
DIM = "\033[2m"
RESET = "\033[0m"
def colored(text, color, bold=False):
prefix = Style.BOLD if bold else ""
return f"{prefix}{color}{text}{Style.RESET}"
# ============================================================================
# UTILITIES
# ============================================================================
def cleanup():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
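# CUDA and MPS execute kernels asynchronously; the benchmarks below call
# sync_device() before reading time.perf_counter() so they measure completed
# work rather than kernel launch overhead.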
def sync_device(device):
if device.type == "cuda":
torch.cuda.synchronize()
elif device.type == "mps":
torch.mps.synchronize()
def get_available_devices() -> List[torch.device]:
"""Get all available devices for benchmarking."""
devices = [torch.device("cpu")]
if torch.backends.mps.is_available():
devices.append(torch.device("mps"))
if torch.cuda.is_available():
devices.append(torch.device("cuda"))
return devices
def get_device_name(device: torch.device) -> str:
"""Get human-readable device name."""
if device.type == "cuda":
return torch.cuda.get_device_name(device)
elif device.type == "mps":
return "Apple Silicon (MPS)"
else:
import platform
return f"CPU ({platform.processor() or 'Unknown'})"
# ============================================================================
# DATA CLASSES
# ============================================================================
@dataclass
class BenchmarkResult:
"""Single benchmark result."""
mean_ms: float
std_ms: float
fps: float
@classmethod
def from_times(cls, times: List[float], batch_size: int = 1):
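        # `times` holds per-run durations in milliseconds, e.g.
        # from_times([12.5, 12.3, 12.7], batch_size=4) -> mean ~12.5 ms,
        # throughput ~320 img/s (batch_size * 1000 / mean_ms).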
mean_ms = np.mean(times)
std_ms = np.std(times)
fps = 1000 / mean_ms * batch_size
return cls(mean_ms=mean_ms, std_ms=std_ms, fps=fps)
@dataclass
class OptimizationConfig:
"""Configuration for a specific optimization combination."""
name: str
preprocessing: str # "cpu" or "gpu"
attention: str # "sdpa" or "manual"
description: str
@property
def short_name(self) -> str:
prep = "GPU" if self.preprocessing == "gpu" else "CPU"
attn = "SDPA" if self.attention == "sdpa" else "Manual"
return f"{prep}+{attn}"
# ============================================================================
# BENCHMARK FUNCTIONS
# ============================================================================
def get_optimization_configs(device: torch.device) -> List[OptimizationConfig]:
"""Get all valid optimization configurations for a device."""
configs = []
if device.type == "cuda":
# CUDA: All 4 combinations
configs = [
OptimizationConfig("gpu_sdpa", "gpu", "sdpa", "GPU Decode (NVJPEG) + SDPA (Flash)"),
OptimizationConfig("gpu_manual", "gpu", "manual", "GPU Decode (NVJPEG) + Manual Attn"),
OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA (Flash)"),
OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"),
]
elif device.type == "mps":
# MPS: CPU preprocessing is better, 2 combinations
configs = [
OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA"),
OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"),
]
else:
# CPU: 2 combinations
configs = [
OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "SDPA Attention"),
OptimizationConfig("cpu_manual", "cpu", "manual", "Manual Attention"),
]
return configs
def benchmark_preprocessing_detailed(device: torch.device, runs: int = 5) -> Dict:
"""Benchmark preprocessing in detail."""
from depth_anything_3.utils.io.input_processor import InputProcessor
from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
results = {}
temp_dir = "temp_bench_preproc"
sizes = [
("720p", 1280, 720),
("1080p", 1920, 1080),
("4K", 3840, 2160),
]
os.makedirs(temp_dir, exist_ok=True)
try:
cpu_proc = InputProcessor()
gpu_proc = None
if device.type == "cuda":
gpu_proc = GPUInputProcessor(device=device)
for name, w, h in sizes:
results[name] = {}
# Create test files
files = []
pil_imgs = []
for i in range(4):
img = Image.new("RGB", (w, h), color=(100 + i*10, 150, 200))
fpath = f"{temp_dir}/{name}_{i}.jpg"
img.save(fpath, quality=95)
files.append(fpath)
pil_imgs.append(img.copy())
# CPU benchmark
cleanup()
for _ in range(2):
cpu_proc(image=pil_imgs, process_res=518, num_workers=8)
times = []
for _ in range(runs):
start = time.perf_counter()
cpu_proc(image=pil_imgs, process_res=518, num_workers=8)
times.append((time.perf_counter() - start) * 1000)
results[name]["cpu"] = BenchmarkResult.from_times(times, batch_size=4)
# GPU benchmark (NVJPEG for CUDA)
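            # File paths (not PIL images) are passed so NVJPEG can decode the
            # JPEG bytes directly on the GPU; PIL inputs are already CPU-decoded.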
if gpu_proc and gpu_proc.use_gpu:
cleanup()
for _ in range(2):
gpu_proc(image=files, process_res=518, num_workers=1)
sync_device(device)
times = []
for _ in range(runs):
sync_device(device)
start = time.perf_counter()
gpu_proc(image=files, process_res=518, num_workers=1)
sync_device(device)
times.append((time.perf_counter() - start) * 1000)
results[name]["gpu"] = BenchmarkResult.from_times(times, batch_size=4)
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
return results
def benchmark_attention_detailed(device: torch.device, runs: int = 10) -> Dict:
"""Benchmark attention backends in detail."""
from depth_anything_3.model.dinov2.layers import Attention
results = {}
dtype = torch.float16 if device.type == "cuda" else torch.float32
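    # fp16 on CUDA enables the Flash Attention SDPA kernel (which requires
    # half precision); CPU and MPS are benchmarked in fp32.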
configs = [
("ViT-S (518px)", 384, 6, 529),
("ViT-L (518px)", 1024, 16, 529),
("ViT-L (770px)", 1024, 16, 1156),
]
for name, dim, heads, seq_len in configs:
results[name] = {}
x = torch.randn(1, seq_len, dim, device=device, dtype=dtype)
for backend in ["sdpa", "manual"]:
cleanup()
attn = Attention(dim=dim, num_heads=heads, attn_backend=backend).to(device, dtype)
attn.eval()
            # Warmup: the first calls trigger kernel selection/compilation and
            # are excluded from timing
with torch.no_grad():
for _ in range(3):
attn(x)
sync_device(device)
# Benchmark
times = []
with torch.no_grad():
for _ in range(runs):
sync_device(device)
start = time.perf_counter()
attn(x)
sync_device(device)
times.append((time.perf_counter() - start) * 1000)
results[name][backend] = BenchmarkResult.from_times(times)
del attn
return results
def benchmark_inference_matrix(
device: torch.device,
models: List[str],
runs: int = 3,
) -> Dict:
"""Benchmark all optimization combinations for inference."""
from depth_anything_3.api import DepthAnything3
results = {}
temp_dir = "temp_bench_infer"
configs = get_optimization_configs(device)
os.makedirs(temp_dir, exist_ok=True)
# Create test images (720p)
img_paths = []
pil_imgs = []
for i in range(4):
img = Image.new("RGB", (1280, 720), color=(100 + i*20, 150, 200))
path = f"{temp_dir}/test_{i}.jpg"
img.save(path, quality=95)
img_paths.append(path)
pil_imgs.append(img.copy())
try:
for model_name in models:
results[model_name] = {}
for config in configs:
cleanup()
# Set attention backend
os.environ["DA3_ATTENTION_BACKEND"] = config.attention
# Load model fresh (to apply attention backend)
model = DepthAnything3(
model_name=model_name,
device=device,
use_cache=False,
)
# Choose input based on preprocessing
if config.preprocessing == "gpu" and device.type == "cuda":
test_input = img_paths[:1] # File paths for NVJPEG
else:
test_input = pil_imgs[:1] # PIL for CPU preprocessing
# Warmup
for _ in range(3):
model.inference(test_input, process_res=518)
sync_device(device)
# Benchmark
times = []
for _ in range(runs):
sync_device(device)
start = time.perf_counter()
model.inference(test_input, process_res=518)
sync_device(device)
times.append((time.perf_counter() - start) * 1000)
results[model_name][config.name] = {
"result": BenchmarkResult.from_times(times, batch_size=1),
"config": config,
}
del model
cleanup()
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
return results
# ============================================================================
# DISPLAY FUNCTIONS
# ============================================================================
def print_header(title: str):
"""Print section header."""
print()
print(colored("═" * 70, Style.CYAN))
print(colored("β•‘", Style.CYAN) + colored(f" {title}", Style.BOLD).center(77) + colored("β•‘", Style.CYAN))
print(colored("═" * 70, Style.CYAN))
def print_subheader(title: str):
"""Print subsection header."""
print()
print(colored(f"β–Ά {title}", Style.YELLOW, bold=True))
print(colored("─" * 70, Style.DIM))
def format_speedup(speedup: float) -> str:
"""Format speedup with color."""
if speedup >= 1.5:
return colored(f"{speedup:.2f}x", Style.GREEN, bold=True)
elif speedup >= 1.1:
return colored(f"{speedup:.2f}x", Style.GREEN)
elif speedup >= 0.95:
return f"{speedup:.2f}x"
else:
return colored(f"{speedup:.2f}x", Style.RED)
def print_preprocessing_results(results: Dict, device: torch.device):
"""Print preprocessing benchmark results."""
print_subheader("PREPROCESSING (4 images batch)")
has_gpu = any("gpu" in r for r in results.values())
if has_gpu:
print(f" {'Resolution':<12} {'CPU (PIL)':<14} {'GPU (NVJPEG)':<14} {'Speedup':<10}")
print(f" {'-'*50}")
for name, data in results.items():
cpu_ms = data["cpu"].mean_ms
if "gpu" in data:
gpu_ms = data["gpu"].mean_ms
speedup = cpu_ms / gpu_ms
print(f" {name:<12} {cpu_ms:>8.1f} ms {gpu_ms:>8.1f} ms {format_speedup(speedup)}")
else:
print(f" {name:<12} {cpu_ms:>8.1f} ms {'N/A':<14}")
else:
print(f" {'Resolution':<12} {'CPU (PIL)':<14}")
print(f" {'-'*30}")
for name, data in results.items():
cpu_ms = data["cpu"].mean_ms
print(f" {name:<12} {cpu_ms:>8.1f} ms")
# Summary
if has_gpu:
speedups = []
for data in results.values():
if "gpu" in data:
speedups.append(data["cpu"].mean_ms / data["gpu"].mean_ms)
if speedups:
avg = np.mean(speedups)
print()
print(f" {colored('β†’', Style.GREEN)} GPU preprocessing avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster")
def print_attention_results(results: Dict, device: torch.device):
"""Print attention benchmark results."""
print_subheader("ATTENTION (per layer forward pass)")
print(f" {'Config':<18} {'SDPA':<12} {'Manual':<12} {'Speedup':<10}")
print(f" {'-'*52}")
for name, data in results.items():
sdpa_ms = data["sdpa"].mean_ms
manual_ms = data["manual"].mean_ms
speedup = manual_ms / sdpa_ms
print(f" {name:<18} {sdpa_ms:>6.3f} ms {manual_ms:>6.3f} ms {format_speedup(speedup)}")
# Summary
speedups = [d["manual"].mean_ms / d["sdpa"].mean_ms for d in results.values()]
avg = np.mean(speedups)
print()
print(f" {colored('β†’', Style.GREEN)} SDPA avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster than manual")
# Check Flash SDP
if device.type == "cuda":
from torch.backends.cuda import flash_sdp_enabled
if flash_sdp_enabled():
print(f" {colored('β†’', Style.GREEN)} Flash Attention: {colored('ENABLED', Style.GREEN, bold=True)} (PyTorch native)")
def print_inference_matrix(results: Dict, device: torch.device):
"""Print inference benchmark matrix."""
print_subheader("END-TO-END INFERENCE (720p input, batch=1)")
configs = get_optimization_configs(device)
# Header
header = f" {'Model':<12}"
for cfg in configs:
header += f" {cfg.short_name:<14}"
header += " Best"
print(header)
print(f" {'-'*(14 + 15*len(configs) + 6)}")
# Results per model
for model_name, model_results in results.items():
row = f" {model_name:<12}"
best_fps = 0
best_config = None
worst_fps = float('inf')
for cfg in configs:
if cfg.name in model_results:
result = model_results[cfg.name]["result"]
fps = result.fps
row += f" {fps:>6.1f} img/s "
if fps > best_fps:
best_fps = fps
best_config = cfg
if fps < worst_fps:
worst_fps = fps
else:
row += f" {'N/A':<14}"
# Best indicator
if best_config:
row += f" {colored(best_config.short_name, Style.GREEN, bold=True)}"
print(row)
# Summary
print()
print(f" {Style.DIM}Legend: GPU=NVJPEG decode, CPU=PIL decode, SDPA=Flash Attention{Style.RESET}")
def print_device_summary(
device: torch.device,
preproc_results: Dict,
attn_results: Dict,
infer_results: Dict,
):
"""Print summary for a device."""
print()
print(colored("─" * 70, Style.CYAN))
print(colored(f" {device.type.upper()} - OPTIMIZATION SUMMARY", Style.BOLD))
print(colored("─" * 70, Style.CYAN))
# Best configuration
if infer_results:
print()
print(f" {colored('Best configuration per model:', Style.CYAN)}")
for model_name, model_results in infer_results.items():
if not model_results:
continue
best_name = max(model_results.keys(), key=lambda k: model_results[k]["result"].fps)
best = model_results[best_name]
worst_name = min(model_results.keys(), key=lambda k: model_results[k]["result"].fps)
worst = model_results[worst_name]
speedup = best["result"].fps / worst["result"].fps if worst["result"].fps > 0 else 1
print(f" {model_name:<12} {colored(best['config'].description, Style.GREEN)}")
print(f" {'':<12} {best['result'].fps:.1f} img/s ({speedup:.1f}x vs worst)")
# Recommendations
print()
print(f" {colored('Recommendations:', Style.CYAN)}")
if device.type == "cuda":
print(f" βœ“ Use {colored('GPU preprocessing (NVJPEG)', Style.GREEN)} for file inputs")
print(f" βœ“ {colored('SDPA (Flash Attention)', Style.GREEN)} is enabled by default")
print(f" βœ“ Pass file paths (not PIL images) to leverage NVJPEG")
elif device.type == "mps":
print(f" βœ“ Use {colored('CPU preprocessing', Style.GREEN)} (faster than GPU on MPS)")
print(f" βœ“ {colored('SDPA', Style.GREEN)} provides moderate speedup")
else:
print(f" βœ“ {colored('SDPA', Style.GREEN)} provides speedup over manual attention")
print(f" β—‹ Consider using GPU (CUDA/MPS) for better performance")
# ============================================================================
# MAIN
# ============================================================================
def main():
parser = argparse.ArgumentParser(
description="DA3 Full Benchmark - Test all optimization combinations",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python benchmarks/full_benchmark.py # Best device only
python benchmarks/full_benchmark.py -d all # All devices
python benchmarks/full_benchmark.py -d cuda # CUDA only
python benchmarks/full_benchmark.py --quick # Quick mode (fewer runs)
python benchmarks/full_benchmark.py --models da3-small da3-large
"""
)
parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)")
parser.add_argument("--skip-preprocessing", action="store_true", help="Skip preprocessing benchmark")
parser.add_argument("--skip-attention", action="store_true", help="Skip attention benchmark")
parser.add_argument("--skip-inference", action="store_true", help="Skip inference benchmark")
parser.add_argument("-d", "--device", type=str, default="auto",
choices=["auto", "cpu", "mps", "cuda", "all"],
help="Device to benchmark (default: auto)")
parser.add_argument("--models", nargs="+", default=None,
help="Models to benchmark (default: all)")
args = parser.parse_args()
# Configure runs
runs_preproc = 3 if args.quick else 5
runs_attn = 5 if args.quick else 10
runs_infer = 2 if args.quick else 4
# Determine models
if args.models:
models = args.models
elif args.quick:
models = ["da3-small", "da3-large"]
else:
models = ["da3-small", "da3-base", "da3-large"]
# Determine devices
available_devices = get_available_devices()
if args.device == "auto":
        devices_to_test = [available_devices[-1]]  # Best available (list is ordered cpu, mps, cuda)
elif args.device == "all":
devices_to_test = available_devices
else:
requested = torch.device(args.device)
if requested in available_devices:
devices_to_test = [requested]
else:
print(f"Error: Device '{args.device}' not available.")
print(f"Available: {[d.type for d in available_devices]}")
return
# Main header
print()
print(colored("β•”" + "═" * 68 + "β•—", Style.CYAN))
print(colored("β•‘", Style.CYAN) + colored(" DEPTH ANYTHING 3 - FULL BENCHMARK", Style.BOLD).center(77) + colored("β•‘", Style.CYAN))
print(colored("β•‘", Style.CYAN) + colored(" All Optimization Combinations", Style.DIM).center(77) + colored("β•‘", Style.CYAN))
print(colored("β•š" + "═" * 68 + "╝", Style.CYAN))
print(f"\n {Style.DIM}PyTorch{Style.RESET} : {colored(torch.__version__, Style.CYAN)}")
print(f" {Style.DIM}Models{Style.RESET} : {colored(', '.join(models), Style.CYAN)}")
print(f" {Style.DIM}Mode{Style.RESET} : {colored('Quick' if args.quick else 'Full', Style.CYAN)}")
print(f"\n {Style.DIM}Available devices:{Style.RESET}")
for d in available_devices:
status = colored("●", Style.GREEN) if d in devices_to_test else colored("β—‹", Style.DIM)
print(f" {status} {d.type.upper():<6} {get_device_name(d)}")
all_results = {}
# Run benchmarks for each device
for device in devices_to_test:
device_name = get_device_name(device)
all_results[device.type] = {}
print_header(f"{device.type.upper()} - {device_name}")
# 1. Preprocessing
preproc_results = {}
if not args.skip_preprocessing and device.type != "cpu":
preproc_results = benchmark_preprocessing_detailed(device, runs=runs_preproc)
all_results[device.type]["preprocessing"] = preproc_results
print_preprocessing_results(preproc_results, device)
elif device.type == "cpu":
print_subheader("PREPROCESSING")
print(f" {Style.DIM}Skipped (CPU only - no GPU comparison){Style.RESET}")
# 2. Attention
attn_results = {}
if not args.skip_attention:
attn_results = benchmark_attention_detailed(device, runs=runs_attn)
all_results[device.type]["attention"] = attn_results
print_attention_results(attn_results, device)
# 3. Inference Matrix
infer_results = {}
if not args.skip_inference:
infer_results = benchmark_inference_matrix(device, models, runs=runs_infer)
all_results[device.type]["inference"] = infer_results
print_inference_matrix(infer_results, device)
# Device Summary
print_device_summary(device, preproc_results, attn_results, infer_results)
cleanup()
# Cross-device comparison
if len(devices_to_test) > 1 and not args.skip_inference:
print_header("CROSS-DEVICE COMPARISON")
# Find common model
common_model = models[-1] # Usually largest tested
print()
print(f" {colored(f'{common_model} (best config per device):', Style.CYAN)}")
print(f" {'Device':<10} {'Config':<30} {'Performance':<15}")
print(f" {'-'*55}")
base_fps = None
for device in devices_to_test:
if device.type in all_results and "inference" in all_results[device.type]:
infer = all_results[device.type]["inference"].get(common_model, {})
if infer:
best_name = max(infer.keys(), key=lambda k: infer[k]["result"].fps)
best = infer[best_name]
fps = best["result"].fps
                    if base_fps is None:
                        # First device with results is the baseline
                        base_fps = fps
                        speedup_str = "(baseline)"
                    else:
                        speedup_str = f"({fps / base_fps:.1f}x)"
print(f" {device.type.upper():<10} {best['config'].description:<30} {fps:>5.1f} img/s {speedup_str}")
# Final summary
print()
print(colored("═" * 70, Style.CYAN))
print(colored("β•‘", Style.CYAN) + colored(" BENCHMARK COMPLETE", Style.BOLD).center(77) + colored("β•‘", Style.CYAN))
print(colored("═" * 70, Style.CYAN))
print()
if __name__ == "__main__":
main()