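"""CPU-based image captioning demo: encodes an image with CLIP, prefixes the
projected image feature to a Phi-3 language model via a linear projection, and
serves the result through a Gradio UI."""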
import gradio as gr
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM

# Set device to CPU and default dtype to float32
DEVICE = torch.device("cpu")
torch.set_default_dtype(torch.float32)

# Load CLIP model and processor
try:
    clip_model = CLIPModel.from_pretrained(
        "openai/clip-vit-base-patch32",
        torch_dtype=torch.float32
    ).to(DEVICE)
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
except Exception as e:
    raise Exception(f"Error loading CLIP model or processor: {str(e)}")

# Load language model and tokenizer
def load_model():
    try:
        # model_name = "distilgpt2"  # lighter alternative for constrained CPUs
        model_name = "microsoft/phi-3-mini-4k-instruct"
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        # Set pad token if not defined
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id
        model.eval()
        return model, tokenizer
    except Exception as e:
        raise Exception(f"Error loading language model: {str(e)}")

# Caption generation logic
def generate_caption(image, model, tokenizer):
    try:
        # Ensure the input is an RGB PIL image
        if not isinstance(image, Image.Image):
            try:
                image = Image.fromarray(image)  # e.g. a NumPy array from Gradio
            except Exception:
                return "Error: Input must be a valid image."
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Encode the image with CLIP (512-dim feature for ViT-B/32)
        image_inputs = clip_processor(images=image, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            image_embedding = clip_model.get_image_features(**image_inputs).to(torch.float32)

        # Prepare the text prompt
        prompt = "[IMG] Caption this image:"
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"].to(DEVICE)
        attention_mask = inputs["attention_mask"].to(DEVICE)

        # Project the CLIP feature into the language model's embedding space.
        # NOTE: this Linear layer is created fresh (randomly initialized) on each
        # call; no trained projection weights are loaded here.
        projection = torch.nn.Linear(512, model.config.hidden_size).to(DEVICE)
        with torch.no_grad():
            image_embedding_projected = projection(image_embedding)

        # Prepend the projected image embedding to the text embeddings
        text_embedding = model.get_input_embeddings()(input_ids)
        fused_embedding = torch.cat([image_embedding_projected.unsqueeze(1), text_embedding], dim=1)
        attention_mask = torch.cat([
            torch.ones(input_ids.size(0), 1, device=DEVICE),
            attention_mask
        ], dim=1)

        # Generate the caption
        with torch.no_grad():
            generated_ids = model.generate(
                inputs_embeds=fused_embedding,
                attention_mask=attention_mask,
                max_new_tokens=50,
                min_length=10,
                num_beams=3,  # kept small for CPU speed
                repetition_penalty=1.2,
                do_sample=False
            )
        caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return caption.strip()
    except Exception as e:
        return f"Error generating caption: {str(e)}"

# Load model/tokenizer
model, tokenizer = load_model()

# Wrapper for the Gradio callback
def gradio_caption(image):
    if image is None:
        return "Please upload an image."
    return generate_caption(image, model, tokenizer)

# Reusable UI component builders
def create_image_input():
    return gr.Image(
        type="pil",
        label="Upload an Image",
        sources=["upload"]
    )

def create_caption_output():
    return gr.Textbox(
        label="Generated Caption",
        lines=2,
        placeholder="Caption will appear here..."
    )

# Build UI
interface = gr.Interface(
    fn=gradio_caption,
    inputs=create_image_input(),
    outputs=create_caption_output(),
    title="Image Captioning with Fine-Tuned MultiModalModel (Epoch 0)",
    description=(
        "Upload an image to generate a caption using a fine-tuned multimodal model based on Phi-3 and CLIP. "
        "The weights from Epoch_0 are used here, but the model may not generate accurate captions due to limited training."
    )
)

interface.launch()
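
# Optional: a minimal command-line smoke test (a sketch; the filename
# "example.jpg" is illustrative, not part of the project). To use it, comment
# out interface.launch() above and uncomment the lines below to caption a
# single local image without starting the Gradio UI.
# if __name__ == "__main__":
#     test_image = Image.open("example.jpg")
#     print(generate_caption(test_image, model, tokenizer))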