# Hugging Face Space: WolfDavid/blip-captioner (status: Sleeping) | File size: 12,149 Bytes | revision a388160

"""
BLIP Image Captioner — HF Space

Real image-to-text captioning using Salesforce's BLIP model.
"""

from __future__ import annotations

import time
from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# ═══════════════════════════════════════════════════════════════════
# Model loading
# ═══════════════════════════════════════════════════════════════════

# Checkpoint identifier handed to both from_pretrained() calls in load_model().
MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Module-level cache for the model and its processor. Both stay None until
# load_model() fills them in, so importing this file does not load any weights.
_model: Optional[BlipForConditionalGeneration] = None
_processor: Optional[BlipProcessor] = None


def load_model():
    """Populate the module-level BLIP model/processor cache if it is empty.

    The first call instantiates the processor and then the model from
    ``MODEL_NAME`` and switches the model to eval mode. Later calls see a
    non-``None`` ``_model`` and do nothing.
    """
    global _model, _processor

    # Only the model is checked: it is assigned last, so a non-None model
    # implies the processor assignment above it has already run.
    if _model is None:
        _processor = BlipProcessor.from_pretrained(MODEL_NAME)
        _model = BlipForConditionalGeneration.from_pretrained(
            MODEL_NAME, torch_dtype=torch.float32
        )
        _model.eval()


# ═══════════════════════════════════════════════════════════════════
# Caption generation
# ═══════════════════════════════════════════════════════════════════

def caption_image(
    image: Image.Image,
    prompt: str,
    max_length: int,
    num_beams: int,
):
    """Generate a caption for an image, optionally conditioned on a prompt.

    Args:
        image: PIL image to caption, or ``None`` when nothing was uploaded.
        prompt: Optional text prefix. ``None``, empty, or whitespace-only
            values select unconditional captioning.
        max_length: Upper bound on newly generated tokens. Coerced with
            ``int`` (UI sliders may deliver floats) and clamped to >= 1.
        num_beams: Beam-search width. Coerced with ``int`` and clamped to
            >= 1.

    Returns:
        A ``(caption, latency)`` tuple of strings. ``latency`` is formatted
        as ``"<n> ms"`` and covers preprocessing plus generation; model
        loading and token decoding are outside the timed region. When
        ``image`` is ``None`` a placeholder message and ``"0 ms"`` are
        returned without loading the model.
    """
    if image is None:
        return "_Upload an image to get a caption._", "0 ms"

    load_model()

    image = image.convert("RGB")
    prompt = (prompt or "").strip()
    beams = max(1, int(num_beams))
    new_tokens = max(1, int(max_length))

    start = time.perf_counter()

    if prompt:
        inputs = _processor(image, prompt, return_tensors="pt")
    else:
        inputs = _processor(image, return_tensors="pt")

    with torch.inference_mode():
        output_ids = _model.generate(
            **inputs,
            max_new_tokens=new_tokens,
            num_beams=beams,
            # early_stopping only applies to beam search; requesting it with
            # a single beam makes transformers emit a config warning.
            early_stopping=beams > 1,
        )

    latency_ms = (time.perf_counter() - start) * 1000
    caption = _processor.decode(output_ids[0], skip_special_tokens=True)

    return caption, f"{latency_ms:.0f} ms"


# ═══════════════════════════════════════════════════════════════════
# Multiple captions (variety sampling)
# ═══════════════════════════════════════════════════════════════════

def generate_multiple_captions(image: Image.Image, n: int = 3):
    """Generate several captions for one image using different beam widths.

    Args:
        image: PIL image to caption, or ``None`` when nothing was uploaded.
        n: Number of captions to produce. Caption ``i`` (0-based) is
            generated with beam width ``2 * i + 1``, so the default
            ``n=3`` uses widths 1, 3 and 5. Values below 1 are treated as 1.

    Returns:
        A Markdown string: a header line with the total elapsed time in
        milliseconds followed by one bullet per beam width. When ``image``
        is ``None`` a placeholder message is returned without loading the
        model.
    """
    if image is None:
        return "_Upload an image first._"

    # Odd widths 1, 3, 5, ... -- one per requested caption.
    beam_widths = range(1, 2 * max(1, int(n)), 2)

    load_model()
    image = image.convert("RGB")

    start = time.perf_counter()
    # The image is preprocessed once and reused for every beam width.
    inputs = _processor(image, return_tensors="pt")

    captions = []
    with torch.inference_mode():
        for beams in beam_widths:
            output_ids = _model.generate(
                **inputs,
                max_new_tokens=50,
                num_beams=beams,
                # early_stopping only applies to beam search; skip it for
                # the single-beam pass to avoid a transformers warning.
                early_stopping=beams > 1,
            )
            cap = _processor.decode(output_ids[0], skip_special_tokens=True)
            captions.append((beams, cap))

    latency_ms = (time.perf_counter() - start) * 1000

    lines = [f"**Generated in {latency_ms:.0f} ms:**\n"]
    lines.extend(f"- **Beams={beams}:** {cap}" for beams, cap in captions)
    return "\n".join(lines)


# ═══════════════════════════════════════════════════════════════════
# Gradio UI
# ═══════════════════════════════════════════════════════════════════

# The whole UI is declared at module level, so `demo` exists as soon as this
# file is imported; the `__main__` guard at the bottom only launches it.
# Components appear on the page in the order (and nesting) they are created.
with gr.Blocks(title="BLIP Image Captioner", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # BLIP Image Captioner

        Generate natural-language descriptions for any image using
        **Salesforce's BLIP** (Bootstrapping Language-Image Pre-training).

        Runs on HF's free CPU tier. First request loads the model (~20s),
        subsequent captions generate in a few seconds.

        > Try uploading a photo of a person, scene, object, or activity.
        > You can optionally provide a **prompt prefix** to condition
        > the caption (e.g., "a photograph of" or "a painting of").
        """
    )

    with gr.Tabs():
        # ─────────────────────────────────────────────────────────
        # Tab 1 — Single Caption
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Single Caption"):
            with gr.Row():
                # Left column: inputs (image, prompt, generation sliders, button).
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    prompt_input = gr.Textbox(
                        label="Optional Prompt Prefix",
                        placeholder="e.g., 'a photograph of' (leave blank for unconditional)",
                    )
                    with gr.Row():
                        # Forwarded to caption_image as `max_length`.
                        max_length = gr.Slider(
                            minimum=20,
                            maximum=100,
                            step=5,
                            value=50,
                            label="Max Caption Length",
                        )
                        # Forwarded to caption_image as `num_beams`.
                        num_beams = gr.Slider(
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=5,
                            label="Beam Search Width",
                        )
                    caption_btn = gr.Button(
                        "Generate Caption",
                        variant="primary",
                        size="lg",
                    )

                # Right column: read-only outputs.
                with gr.Column(scale=1):
                    caption_output = gr.Textbox(
                        label="Generated Caption",
                        lines=3,
                        interactive=False,
                    )
                    latency_output = gr.Textbox(
                        label="Latency",
                        interactive=False,
                    )

            # The four inputs map positionally onto caption_image's parameters
            # (image, prompt, max_length, num_beams); its two-element return
            # value fills the caption and latency boxes in that order.
            caption_btn.click(
                caption_image,
                inputs=[image_input, prompt_input, max_length, num_beams],
                outputs=[caption_output, latency_output],
            )

            # Each example row supplies an image URL and a prompt prefix for
            # the two inputs listed below.
            gr.Examples(
                examples=[
                    ["https://images.unsplash.com/photo-1574158622682-e40e69881006?w=640", ""],
                    ["https://images.unsplash.com/photo-1552053831-71594a27632d?w=640", ""],
                    ["https://images.unsplash.com/photo-1502920917128-1aa500764cbd?w=640", "a photograph of"],
                ],
                inputs=[image_input, prompt_input],
            )

        # ─────────────────────────────────────────────────────────
        # Tab 2 — Variety Comparison
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Variety Comparison"):
            gr.Markdown(
                """
                Generate **multiple captions** with different beam search
                widths to see how the model's output varies. Higher beam
                width tends to produce more grammatical but sometimes
                blander captions.
                """
            )
            with gr.Row():
                with gr.Column(scale=1):
                    image_input_var = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    variety_btn = gr.Button(
                        "Generate 3 Captions",
                        variant="primary",
                        size="lg",
                    )
                with gr.Column(scale=1):
                    variety_output = gr.Markdown()

            # Only the image is passed, so generate_multiple_captions runs with
            # its default `n`; the Markdown string it returns is rendered by
            # variety_output.
            variety_btn.click(
                generate_multiple_captions,
                inputs=[image_input_var],
                outputs=[variety_output],
            )

        # ─────────────────────────────────────────────────────────
        # Tab 3 — About
        # ─────────────────────────────────────────────────────────
        # Static informational text only; no event handlers are attached here.
        with gr.Tab("About"):
            gr.Markdown(
                """
                ## Model

                **Name:** [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)

                **Paper:** [BLIP: Bootstrapping Language-Image Pre-training](https://arxiv.org/abs/2201.12086)
                (Li et al., 2022)

                **Architecture:** ViT-base vision encoder + BERT-base
                language decoder with cross-attention. Pre-trained on
                a large corpus of image-caption pairs from the web with
                a self-filtering approach (CapFilt) to clean noisy data.

                **Parameters:** ~250M (base variant)

                **Training data:** COCO, Visual Genome, SBU Captions,
                Conceptual Captions, Conceptual 12M

                ## Why BLIP?

                Pre-BLIP vision-language models typically fell into two
                camps: **understanding** models (CLIP) or **generation**
                models (image captioning). BLIP unifies both by training
                a single model that can do:

                1. **Image-text contrastive learning** (like CLIP)
                2. **Image-text matching** (binary classification)
                3. **Image-grounded text generation** (captioning)

                The "Bootstrapping" in the name refers to the CapFilt
                training procedure — using the model itself to filter
                and generate synthetic captions to improve the training
                data.

                ## Limitations

                - Base model (not large) — favors speed over quality
                - Trained on English-language captions only
                - May miss nuance or details in complex scenes
                - Can struggle with rare objects or unusual scenes

                ## Tech Stack

                - **transformers** — model loading and inference
                - **torch** — tensor backend (CPU on HF free tier)
                - **Pillow** — image processing
                - **Gradio** — UI

                ---
                **Source:** [github.com/wolfwdavid/ai-tools-collection](https://github.com/wolfwdavid/ai-tools-collection)
                &nbsp;|&nbsp;
                **HF Profile:** [@WolfDavid](https://huggingface.co/WolfDavid)
                """
            )


# Launch the app only when this file is run as a script; importing the module
# just builds `demo` without starting anything.
if __name__ == "__main__":
    demo.launch()