import torch
from PIL import Image
import requests
from io import BytesIO
import gradio as gr
import os
import sys
import time
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

print("Starting InternVL2 with Llama3-76B initialization...")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Set up environment for CUDA: cap allocator block size to reduce fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# Check GPU availability
def check_gpu():
    if not torch.cuda.is_available():
        print("CUDA is not available. This application requires GPU acceleration.")
        return False
    try:
        # Test the GPU with a simple tensor operation
        test_tensor = torch.rand(10, device="cuda")
        _ = test_tensor + test_tensor
        print(f"GPU is available: {torch.cuda.get_device_name(0)}")
        return True
    except Exception as e:
        print(f"Error initializing GPU: {str(e)}")
        return False

# Global flag for GPU availability
USE_GPU = check_gpu()
# Import transformers components
try:
    # Per the OpenGVLab/InternViT-6B-224px model card, preprocessing uses
    # CLIPImageProcessor (rather than a generic AutoProcessor)
    from transformers import AutoModel, CLIPImageProcessor
    HAS_TRANSFORMERS = True
    print("Successfully imported transformers")
except ImportError as e:
    print(f"Error importing transformers: {str(e)}")
    HAS_TRANSFORMERS = False
# Initialize models
internvit_model = None
llama_model = None
processor = None
def load_models():
    global internvit_model, llama_model, processor
    if not USE_GPU or not HAS_TRANSFORMERS:
        print("Cannot load models without a GPU and transformers")
        return False
    try:
        print("Loading InternViT-6B model for visual feature extraction...")
        # Following the OpenGVLab/InternVL instructions for InternViT-6B:
        # the checkpoint ships custom modeling code, so trust_remote_code=True
        # is required, and bfloat16 keeps the 6B encoder within GPU memory
        processor = CLIPImageProcessor.from_pretrained("OpenGVLab/InternViT-6B-224px")
        internvit_model = AutoModel.from_pretrained(
            "OpenGVLab/InternViT-6B-224px",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        ).to("cuda").eval()
        print("InternViT-6B model loaded successfully!")
        # For demonstration purposes, we only extract visual features for now.
        # In a real implementation, we would load Llama3-76B here.
        print("Note: Llama3-76B model loading is commented out for this demonstration")
        # llama_model = ...
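        # A hypothetical sketch of what that load could look like (the
        # checkpoint id, AutoModelForCausalLM, and device_map choice are
        # assumptions for illustration, not the official InternVL2 recipe):
        # from transformers import AutoModelForCausalLM
        # llama_model = AutoModelForCausalLM.from_pretrained(
        #     "<llama3-76b-checkpoint>",   # placeholder id, left unresolved
        #     torch_dtype=torch.bfloat16,
        #     device_map="auto",           # shard the 76B weights across GPUs
        # )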
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False

# Load models on startup
MODELS_LOADED = load_models()
def process_image(image_path, sample_url=None):
    """Extract visual features from an image using InternViT-6B."""
    if not MODELS_LOADED:
        return "Models failed to load. Please check the logs."
    # Load the image
    if sample_url and image_path is None:
        # Fetch the sample image if nothing was uploaded
        response = requests.get(sample_url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        print(f"Loaded sample image from URL: {sample_url}")
    elif isinstance(image_path, str):
        # Path on disk
        image = Image.open(image_path)
    else:
        # Gradio passes a PIL image directly when type="pil"
        image = image_path
    if image is None:
        return "No image provided"
    try:
        # Start timing
        start_time = time.time()
        # Run the image through the visual encoder
        print("Processing image through InternViT-6B...")
        inputs = processor(images=image, return_tensors="pt")
        if USE_GPU:
            # Match the encoder's device and dtype (bfloat16 on GPU)
            inputs = {k: v.to(device="cuda", dtype=internvit_model.dtype)
                      for k, v in inputs.items()}
        with torch.no_grad():
            outputs = internvit_model(**inputs)
        # Extract image features
        image_features = outputs.last_hidden_state
        pooled_output = outputs.pooler_output
        # In a real implementation, we would pass these features to Llama3-76B.
        # For now, we just return info about the extracted features.
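        # InternVL-style models map encoder outputs into the LLM's embedding
        # space before generation. A hedged sketch of that step (the linear
        # projector and its hidden size are assumptions for illustration):
        # llm_hidden_size = 8192  # hypothetical LLM width
        # projector = torch.nn.Linear(image_features.shape[-1], llm_hidden_size)
        # visual_tokens = projector(image_features.float())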
        feature_info = f"""
Image successfully processed through InternViT-6B:
- Last hidden state shape: {image_features.shape}
- Pooled output shape: {pooled_output.shape}

In a complete implementation, these visual features would be passed to Llama3-76B
for generating text responses about the image.

Note: This is a demonstration of visual feature extraction only.
"""
        # Calculate elapsed time
        elapsed = time.time() - start_time
        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."
    except Exception as e:
        return f"Error processing image: {str(e)}"
# Set up the Gradio interface
def create_interface():
    with gr.Blocks(title="InternVL2 with Llama3-76B") as demo:
        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
        gr.Markdown("## Using InternViT-6B for visual feature extraction")

        # System status
        status = "✅ Ready" if MODELS_LOADED else "❌ Models failed to load"
        gr.Markdown(f"### System Status: {status}")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil", label="Upload Image")
                sample_btn = gr.Button("Use Sample Image")
            with gr.Column():
                output_text = gr.Textbox(label="Results", lines=10)

        # Process button
        process_btn = gr.Button("Extract Visual Features")
        process_btn.click(
            fn=process_image,
            inputs=[input_image],
            outputs=output_text,
        )

        # Sample image button logic
        sample_image_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"

        def use_sample():
            return process_image(None, sample_image_url)

        sample_btn.click(
            fn=use_sample,
            inputs=[],
            outputs=output_text,
        )
        # Add some explanation
        gr.Markdown("""
        ## About This Demo

        This demonstration shows how to use InternViT-6B for visual feature extraction,
        following the instructions from the OpenGVLab/InternVL GitHub repository.

        The application extracts visual features from the input image that would typically
        be passed to a language model like Llama3-76B. In a complete implementation,
        these features would be used to generate text responses about the image.
        """)
    return demo
# Main entry point
if __name__ == "__main__":
    demo = create_interface()
    # Bind to 0.0.0.0 so the app is reachable from outside the container
    # (required for hosted environments such as Hugging Face Spaces)
    demo.launch(share=False, server_name="0.0.0.0")