""" |
|
|
GPU Preprocessing Benchmark |
|
|
|
|
|
Compares CPU vs GPU preprocessing performance across different image sizes. |
|
|
Measures: |
|
|
- Preprocessing time only |
|
|
- Total inference time (preprocessing + model forward) |
|
|
- Memory usage |
|
|
- Speedup percentages |
|
|
""" |

import os
import shutil
import time
from typing import List, Tuple

import numpy as np
import torch
from PIL import Image

from depth_anything_3.utils.io.input_processor import InputProcessor
from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor


def create_test_files(
    sizes: List[Tuple[int, int, str]],
    count: int = 4,
    temp_dir: str = "temp_bench_imgs",
) -> Tuple[List[List[str]], str]:
    """Create test image files on disk.

    Args:
        sizes: List of (width, height, description) tuples
        count: Number of images per size
        temp_dir: Directory to save images

    Returns:
        A tuple of (image path batches, one batch per size; path to the
        temp directory).
    """
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    batches = []
    for w, h, _ in sizes:
        batch = []
        for i in range(count):
            img = Image.new("RGB", (w, h), color=(i * 50, 100, 150))
            fname = f"{temp_dir}/{w}x{h}_{i}.jpg"
            # quality=95 with subsampling=0 writes 4:4:4 JPEGs, so CPU and GPU
            # decoders operate on the same full-resolution chroma data.
            img.save(fname, quality=95, subsampling=0)
            batch.append(fname)
        batches.append(batch)
    return batches, temp_dir


def benchmark_gpu_decode_files(
    processor,
    image_paths: List[str],
    process_res: int = 504,
    warmup_runs: int = 2,
    benchmark_runs: int = 10,
    num_workers: int = 8,
) -> float:
    """Benchmark GPU decoding (from file paths)."""
    # Warmup runs are discarded so one-time costs (CUDA context creation,
    # kernel compilation, filesystem cache) do not skew the measurement.
    for _ in range(warmup_runs):
        processor(
            image=image_paths,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )

    times = []
    for _ in range(benchmark_runs):
        # CUDA kernels launch asynchronously; synchronize before and after
        # the timed region so wall-clock time covers the actual GPU work.
        if hasattr(processor, "device") and processor.device.type == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()
        processor(
            image=image_paths,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )
        if hasattr(processor, "device") and processor.device.type == "cuda":
            torch.cuda.synchronize()

        times.append(time.perf_counter() - start)

    return float(np.mean(times))
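

# "GPU decode" hands the processor file paths rather than decoded PIL images,
# so JPEG decoding can happen on-device (main() reports an NVJPEG + Kornia
# backend when CUDA is available) instead of through PIL on the CPU first.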


def create_test_images(sizes: List[Tuple[int, int]], count: int = 4) -> List[List[Image.Image]]:
    """Create in-memory test images for each size.

    Args:
        sizes: List of (width, height) tuples
        count: Number of images per size

    Returns:
        List of image batches, one per size
    """
    batches = []
    for w, h in sizes:
        batch = [Image.new("RGB", (w, h), color=(i * 50, 100, 150)) for i in range(count)]
        batches.append(batch)
    return batches


def benchmark_hybrid(
    processor,
    images: List[Image.Image],
    process_res: int = 504,
    warmup_runs: int = 2,
    benchmark_runs: int = 10,
    num_workers: int = 8,
    device=torch.device("cuda"),
) -> float:
    """Benchmark hybrid preprocessing (CPU resize -> GPU normalize)."""
    # Standard ImageNet normalization constants.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    for _ in range(warmup_runs):
        imgs_cpu, _, _ = processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
            perform_normalization=False,
        )
        imgs_gpu = imgs_cpu.to(device, non_blocking=True).float() / 255.0
        _ = InputProcessor.normalize_tensor(imgs_gpu, mean=mean, std=std)

    times = []
    for _ in range(benchmark_runs):
        if device.type == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()

        # CPU handles decode + resize; normalization runs on the GPU.
        imgs_cpu, _, _ = processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
            perform_normalization=False,
        )
        imgs_gpu = imgs_cpu.to(device, non_blocking=True).float() / 255.0
        _ = InputProcessor.normalize_tensor(imgs_gpu, mean=mean, std=std)

        if device.type == "cuda":
            torch.cuda.synchronize()

        times.append(time.perf_counter() - start)

    return float(np.mean(times))
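

# The hybrid path above is standard ImageNet normalization executed on the
# GPU. A minimal standalone sketch of that step, assuming
# InputProcessor.normalize_tensor applies (x - mean) / std per channel
# (torchvision-style):
#
#   x = imgs_cpu.to(device).float() / 255.0  # uint8 -> float in [0, 1]
#   mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
#   std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)
#   x = (x - mean) / std  # per-channel normalize, shape (N, 3, H, W)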


def benchmark_preprocessing(
    processor,
    images: List[Image.Image],
    process_res: int = 504,
    warmup_runs: int = 2,
    benchmark_runs: int = 10,
    num_workers: int = 8,
) -> float:
    """Benchmark preprocessing performance.

    Args:
        processor: InputProcessor or GPUInputProcessor instance
        images: List of test images
        process_res: Processing resolution
        warmup_runs: Number of warmup runs to discard
        benchmark_runs: Number of benchmark runs to average
        num_workers: Number of parallel workers (for the CPU processor)

    Returns:
        Average preprocessing time in seconds
    """
    for _ in range(warmup_runs):
        processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )

    times = []
    for _ in range(benchmark_runs):
        if hasattr(processor, "device") and processor.device.type == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()
        processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )
        if hasattr(processor, "device") and processor.device.type == "cuda":
            torch.cuda.synchronize()

        times.append(time.perf_counter() - start)

    return float(np.mean(times))


def print_results_table(results: List[dict]):
    """Pretty-print benchmark results as a table."""
    print("\n" + "=" * 90)
    print("GPU PREPROCESSING BENCHMARK RESULTS")
    print("=" * 90)
    print(
        f"{'Image Size':<15} {'CPU Time':<12} {'GPU Time':<12} "
        f"{'Hybrid Time':<12} {'GPU Decode':<12} {'Best Method':<15}"
    )
    print("-" * 90)

    for result in results:
        size_str = f"{result['width']}x{result['height']}"
        cpu_time = f"{result['cpu_time'] * 1000:.2f} ms"
        gpu_time = f"{result['gpu_time'] * 1000:.2f} ms"
        hybrid_time = f"{result['hybrid_time'] * 1000:.2f} ms"
        gpu_decode_time = f"{result['gpu_decode_time'] * 1000:.2f} ms"

        times = [
            result["cpu_time"],
            result["gpu_time"],
            result["hybrid_time"],
            result["gpu_decode_time"],
        ]
        labels = ["CPU", "GPU", "Hybrid", "GPU Decode"]
        best = labels[int(np.argmin(times))]

        print(
            f"{size_str:<15} {cpu_time:<12} {gpu_time:<12} "
            f"{hybrid_time:<12} {gpu_decode_time:<12} {best:<15}"
        )

    print("=" * 90 + "\n")
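

# A small helper for turning two of the reported times into a speedup
# percentage (a sketch for downstream analysis; nothing in this script
# calls it):
def percent_speedup(baseline_s: float, other_s: float) -> float:
    """Return how much faster ``other_s`` is than ``baseline_s``, in percent.

    Example: baseline 10 ms, other 5 ms -> 100.0 (2x, i.e. +100%).
    """
    return (baseline_s / other_s - 1.0) * 100.0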


def main():
    """Run the full benchmark."""
    print("\n" + "=" * 100)
    print("INITIALIZING GPU PREPROCESSING BENCHMARK")
    print("=" * 100)

    if torch.cuda.is_available():
        device_name = "cuda"
        device_info = torch.cuda.get_device_name(0)
        print(f"✓ GPU Device: {device_info}")
        print("✓ GPU preprocessing: ENABLED (NVJPEG + Kornia)")
    elif torch.backends.mps.is_available():
        device_name = "mps"
        device_info = "Apple MPS"
        print(f"✓ GPU Device: {device_info}")
        print("ℹ GPU preprocessing: DISABLED on MPS (CPU is faster on Apple Silicon)")
        print(" → GPUInputProcessor will use the CPU path automatically")
        print(" → GPU reserved for model inference (5-10x speedup there)")
    else:
        print("⚠ No GPU available - benchmark will compare CPU vs CPU (no speedup expected)")
        device_name = "cpu"
        device_info = "CPU only"

    device = torch.device(device_name)

    cpu_proc = InputProcessor()
    gpu_proc = GPUInputProcessor(device=device_name)
    print(f"✓ Processors initialized: CPU vs {device_name.upper()}")

    test_sizes = [
        (640, 480, "Small (VGA)"),
        (1280, 720, "Medium (HD)"),
        (1920, 1080, "Large (Full HD)"),
        (3840, 2160, "XLarge (4K)"),
    ]

    process_res = 504
    num_images = 4
    num_workers = 8

    print(f"✓ Test config: {num_images} images per batch, process_res={process_res}, num_workers={num_workers}")
    print(f"✓ Testing {len(test_sizes)} image sizes: {', '.join(desc for _, _, desc in test_sizes)}")

    print("\nGenerating test images (PIL & files)...")
    image_batches_pil = create_test_images([(w, h) for w, h, _ in test_sizes], count=num_images)
    image_batches_files, temp_dir = create_test_files(test_sizes, count=num_images)
    print("✓ Test images generated")

    print("\nRunning benchmarks (this may take a minute)...\n")
    results = []

    try:
        for (w, h, desc), imgs_pil, imgs_files in zip(test_sizes, image_batches_pil, image_batches_files):
            print(f"Benchmarking {desc} ({w}x{h})...", end=" ", flush=True)

            cpu_time = benchmark_preprocessing(cpu_proc, imgs_pil, process_res, num_workers=num_workers)
            gpu_time = benchmark_preprocessing(gpu_proc, imgs_pil, process_res, num_workers=num_workers)
            hybrid_time = benchmark_hybrid(cpu_proc, imgs_pil, process_res, num_workers=num_workers, device=device)
            gpu_decode_time = benchmark_gpu_decode_files(gpu_proc, imgs_files, process_res, num_workers=num_workers)

            results.append({
                "width": w,
                "height": h,
                "description": desc,
                "cpu_time": cpu_time,
                "gpu_time": gpu_time,
                "hybrid_time": hybrid_time,
                "gpu_decode_time": gpu_decode_time,
            })

            best_time = min(cpu_time, gpu_time, hybrid_time, gpu_decode_time)
            if best_time == gpu_decode_time:
                win = "GPU Decode"
            elif best_time == hybrid_time:
                win = "Hybrid"
            elif best_time == gpu_time:
                win = "GPU"
            else:
                win = "CPU"

            print(f"✓ Best: {win}")

        print_results_table(results)

        if device_name == "cuda":
            print("\nGPU Memory Usage:")
            print(f"  Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.1f} MB")
            print(f"  Cached: {torch.cuda.memory_reserved(0) / 1024**2:.1f} MB")

    finally:
        # Always remove the on-disk test images, even if a benchmark fails.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            print(f"\n✓ Cleaned up temp directory: {temp_dir}")


if __name__ == "__main__":
    main()