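"""CPU-based image captioning demo: encodes an image with CLIP, prefixes the
projected image feature to a Phi-3 language model via a linear projection, and
serves the result through a Gradio UI."""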
import gradio as gr
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM

# Set device to CPU and default dtype to float32
DEVICE = torch.device("cpu")
torch.set_default_dtype(torch.float32)

# Load CLIP model and processor
try:
    clip_model = CLIPModel.from_pretrained(
        "openai/clip-vit-base-patch32",
        torch_dtype=torch.float32
    ).to(DEVICE)
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
except Exception as e:
    raise Exception(f"Error loading CLIP model or processor: {str(e)}")

# Load language model and tokenizer
def load_model():
    try:
        # model_name = "distilgpt2"  # lighter alternative for constrained CPUs
        model_name = "microsoft/phi-3-mini-4k-instruct"
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        # Set pad token if not defined
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id
        model.eval()
        return model, tokenizer
    except Exception as e:
        raise Exception(f"Error loading language model: {str(e)}")

# Caption generation logic
def generate_caption(image, model, tokenizer):
    try:
        # Ensure the input is an RGB PIL image
        if not isinstance(image, Image.Image):
            try:
                image = Image.fromarray(image)  # e.g. a NumPy array from Gradio
            except Exception:
                return "Error: Input must be a valid image."
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Encode the image with CLIP (512-dim feature for ViT-B/32)
        image_inputs = clip_processor(images=image, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            image_embedding = clip_model.get_image_features(**image_inputs).to(torch.float32)

        # Prepare the text prompt
        prompt = "[IMG] Caption this image:"
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"].to(DEVICE)
        attention_mask = inputs["attention_mask"].to(DEVICE)

        # Project the CLIP feature into the language model's embedding space.
        # NOTE: this Linear layer is created fresh (randomly initialized) on each
        # call; no trained projection weights are loaded here.
        projection = torch.nn.Linear(512, model.config.hidden_size).to(DEVICE)
        with torch.no_grad():
            image_embedding_projected = projection(image_embedding)

        # Prepend the projected image embedding to the text embeddings
        text_embedding = model.get_input_embeddings()(input_ids)
        fused_embedding = torch.cat([image_embedding_projected.unsqueeze(1), text_embedding], dim=1)
        attention_mask = torch.cat([
            torch.ones(input_ids.size(0), 1, device=DEVICE),
            attention_mask
        ], dim=1)

        # Generate the caption
        with torch.no_grad():
            generated_ids = model.generate(
                inputs_embeds=fused_embedding,
                attention_mask=attention_mask,
                max_new_tokens=50,
                min_length=10,
                num_beams=3,  # kept small for CPU speed
                repetition_penalty=1.2,
                do_sample=False
            )
        caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return caption.strip()
    except Exception as e:
        return f"Error generating caption: {str(e)}"

# Load model/tokenizer
model, tokenizer = load_model()

# Wrapper for the Gradio callback
def gradio_caption(image):
    if image is None:
        return "Please upload an image."
    return generate_caption(image, model, tokenizer)

# Reusable UI component builders
def create_image_input():
    return gr.Image(
        type="pil",
        label="Upload an Image",
        sources=["upload"]
    )

def create_caption_output():
    return gr.Textbox(
        label="Generated Caption",
        lines=2,
        placeholder="Caption will appear here..."
    )

# Build UI
interface = gr.Interface(
    fn=gradio_caption,
    inputs=create_image_input(),
    outputs=create_caption_output(),
    title="Image Captioning with Fine-Tuned MultiModalModel (Epoch 0)",
    description=(
        "Upload an image to generate a caption using a fine-tuned multimodal model based on Phi-3 and CLIP. "
        "The weights from Epoch_0 are used here, but the model may not generate accurate captions due to limited training."
    )
)

interface.launch()
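
# Optional: a minimal command-line smoke test (a sketch; the filename
# "example.jpg" is illustrative, not part of the project). To use it, comment
# out interface.launch() above and uncomment the lines below to caption a
# single local image without starting the Gradio UI.
# if __name__ == "__main__":
#     test_image = Image.open("example.jpg")
#     print(generate_caption(test_image, model, tokenizer))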