Spaces: Running on Zero
import os
import argparse

import gradio as gr
import torch
import numpy as np
from PIL import Image

try:
    from spaces import GPU
except ImportError:
    # No-op fallback so the same code runs locally, outside Hugging Face Spaces,
    # where the `spaces` package (and its ZeroGPU decorator) is unavailable
    def GPU(func):
        return func

from inference import GenerativeInferenceModel, get_inference_configs
# Choose the default port: on Hugging Face Spaces, respect the PORT environment
# variable; locally, fall back to 8861
if "SPACE_ID" in os.environ:
    default_port = int(os.environ.get("PORT", 7860))
else:
    default_port = 8861  # Local default port

# Parse command line arguments
parser = argparse.ArgumentParser(description='Run Generative Inference Demo')
parser.add_argument('--port', type=int, default=default_port, help='Port to run the server on')
args = parser.parse_args()

# Create model directories if they don't exist
os.makedirs("models", exist_ok=True)
os.makedirs("stimuli", exist_ok=True)
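# Note: on Spaces the PORT environment variable (when set) takes precedence over
# the 7860 fallback; passing --port explicitly overrides both.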
# Initialize model
model = GenerativeInferenceModel()

# Example images and their parameters, with values taken from the research
examples = [
    {
        "image": os.path.join("stimuli", "Kanizsa_square.jpg"),
        "name": "Kanizsa Square",
        "wiki": "https://en.wikipedia.org/wiki/Kanizsa_triangle",
        "papers": [
            "[Gestalt Psychology](https://en.wikipedia.org/wiki/Gestalt_psychology)",
            "[Neural Mechanisms](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer4",  # last layer
            "initial_noise": 0.1,
            "diffusion_noise": 0.003,
            "step_size": 0.5,
            "iterations": 50,
            "epsilon": 0.5
        }
    },
    {
        "image": os.path.join("stimuli", "face_vase.png"),
        "name": "Rubin's Face-Vase (Object Prior)",
        "wiki": "https://en.wikipedia.org/wiki/Rubin_vase",
        "papers": [
            "[Figure-Ground Perception](https://en.wikipedia.org/wiki/Figure-ground_(perception))",
            "[Bistable Perception](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer4",  # last layer
            "initial_noise": 0.7,
            "diffusion_noise": 0.005,
            "step_size": 1.0,
            "iterations": 50,
            "epsilon": 1.0
        }
    },
    {
        "image": os.path.join("stimuli", "figure_ground.png"),
        "name": "Figure-Ground Illusion",
        "wiki": "https://en.wikipedia.org/wiki/Figure-ground_(perception)",
        "papers": [
            "[Gestalt Principles](https://en.wikipedia.org/wiki/Gestalt_psychology)",
            "[Perceptual Organization](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer3",
            "initial_noise": 0.5,
            "diffusion_noise": 0.005,
            "step_size": 0.8,
            "iterations": 50,
            "epsilon": 0.8
        }
    },
    {
        "image": os.path.join("stimuli", "Neon_Color_Circle.jpg"),
        "name": "Neon Color Spreading",
        "wiki": "https://en.wikipedia.org/wiki/Neon_color_spreading",
        "papers": [
            "[Color Assimilation](https://doi.org/10.1016/j.visres.2000.200.1)",
            "[Perceptual Filling-in](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer3",
            "initial_noise": 0.5,
            "diffusion_noise": 0.003,
            "step_size": 1.0,
            "iterations": 50,
            "epsilon": 1.0
        }
    },
    {
        "image": os.path.join("stimuli", "EhresteinSingleColor.png"),
        "name": "Ehrenstein Illusion",
        "wiki": "https://en.wikipedia.org/wiki/Ehrenstein_illusion",
        "papers": [
            "[Subjective Contours](https://doi.org/10.1016/j.visres.2000.200.1)",
            "[Neural Processing](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer3",
            "initial_noise": 0.5,
            "diffusion_noise": 0.005,
            "step_size": 0.8,
            "iterations": 50,
            "epsilon": 0.8
        }
    },
    {
        "image": os.path.join("stimuli", "Confetti_illusion.png"),
        "name": "Confetti Illusion",
        "wiki": "https://en.wikipedia.org/wiki/Optical_illusion",
        "papers": [
            "[Color Perception](https://doi.org/10.1016/j.visres.2000.200.1)",
            "[Context Effects](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer3",
            "initial_noise": 0.7,
            "diffusion_noise": 0.01,
            "step_size": 1.0,
            "iterations": 50,
            "epsilon": 1.0
        }
    },
    {
        "image": os.path.join("stimuli", "CornsweetBlock.png"),
        "name": "Cornsweet Illusion",
        "wiki": "https://en.wikipedia.org/wiki/Cornsweet_illusion",
        "papers": [
            "[Brightness Perception](https://doi.org/10.1016/j.visres.2000.200.1)",
            "[Edge Effects](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer3",
            "initial_noise": 0.5,
            "diffusion_noise": 0.005,
            "step_size": 0.8,
            "iterations": 50,
            "epsilon": 0.8
        }
    },
    {
        "image": os.path.join("stimuli", "GroupingByContinuity.png"),
        "name": "Grouping by Continuity",
        "wiki": "https://en.wikipedia.org/wiki/Principles_of_grouping",
        "papers": [
            "[Gestalt Principles](https://en.wikipedia.org/wiki/Gestalt_psychology)",
            "[Visual Organization](https://doi.org/10.1016/j.tics.2003.08.003)"
        ],
        "method": "ReverseDiffusion",
        "reverse_diff": {
            "model": "resnet50_robust",
            "layer": "layer3",
            "initial_noise": 0.1,
            "diffusion_noise": 0.005,
            "step_size": 0.4,
            "iterations": 100,
            "epsilon": 0.4
        }
    }
]
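# Optional sanity check (a small sketch; assumes the stimuli ship with the repo):
# warn about missing example images instead of failing when the page renders.
for _ex in examples:
    if not os.path.exists(_ex["image"]):
        print(f"Warning: example stimulus not found: {_ex['image']}")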
# On a ZeroGPU Space, the `spaces.GPU` decorator must wrap the function that does
# the GPU work; locally it is the no-op fallback defined above
@GPU
def run_inference(image, model_type, inference_type, eps_value, num_iterations,
                  initial_noise=0.05, diffusion_noise=0.3, step_size=0.8, model_layer="layer3"):
    # Convert eps to float
    eps = float(eps_value)

    # Load the inference configuration for the selected method
    config = get_inference_configs(inference_type=inference_type, eps=eps, n_itr=int(num_iterations))

    # ReverseDiffusion-specific parameters
    if inference_type == "ReverseDiffusion":
        config['initial_inference_noise_ratio'] = float(initial_noise)
        config['diffusion_noise_ratio'] = float(diffusion_noise)
        config['step_size'] = float(step_size)
        config['top_layer'] = model_layer

    # Run generative inference
    result = model.inference(image, model_type, config)

    # Extract results; support both the old (output_image, all_steps) tuple
    # and the new dictionary return format
    if isinstance(result, tuple):
        output_image, all_steps = result
    else:
        output_image = result['final_image']
        all_steps = result['steps']

    # Convert each step tensor (C, H, W, values in [0, 1]) to a PIL image,
    # clamping before the uint8 conversion so out-of-range values don't wrap
    frames = []
    for step_image in all_steps:
        step_pil = Image.fromarray(
            (step_image.clamp(0, 1).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
        frames.append(step_pil)

    # Convert the final output image to PIL
    final_image = Image.fromarray(
        (output_image.clamp(0, 1).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))

    # Return the final inferred image and the animation frames
    return final_image, frames
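# Minimal headless usage sketch (assumes the Kanizsa stimulus and the model
# weights are available locally); the step frames can also be saved as a GIF:
#
#   img = Image.open(os.path.join("stimuli", "Kanizsa_square.jpg"))
#   final, frames = run_inference(img, "resnet50_robust", "ReverseDiffusion",
#                                 eps_value=0.5, num_iterations=50,
#                                 initial_noise=0.1, diffusion_noise=0.003,
#                                 step_size=0.5, model_layer="layer4")
#   final.save("kanizsa_inferred.png")
#   frames[0].save("kanizsa_steps.gif", save_all=True,
#                  append_images=frames[1:], duration=100, loop=0)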
# Helper function to apply an example's parameters to the UI components
def apply_example(example):
    return [
        example["image"],
        "resnet50_robust",                           # Model type
        example["method"],                           # Inference type
        example["reverse_diff"]["epsilon"],          # Epsilon value
        example["reverse_diff"]["iterations"],       # Number of iterations
        example["reverse_diff"]["initial_noise"],    # Initial noise ratio
        example["reverse_diff"]["diffusion_noise"],  # Diffusion noise ratio
        example["reverse_diff"]["step_size"],        # Step size
        example["reverse_diff"]["layer"]             # Model layer
    ]
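# Note: Gradio assigns returned values to output components positionally, so the
# order above must match the `outputs=` list wired to each load button below.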
# Define the interface
with gr.Blocks(title="Generative Inference Demo") as demo:
    gr.Markdown("# Generative Inference Demo")
    gr.Markdown("This demo showcases how neural networks can perceive visual illusions through generative inference.")

    # Main processing interface
    with gr.Row():
        with gr.Column(scale=1):
            # Inputs
            image_input = gr.Image(label="Input Image", type="pil")
            with gr.Row():
                model_choice = gr.Dropdown(
                    choices=["resnet50_robust", "standard_resnet50"],
                    value="resnet50_robust",
                    label="Model"
                )
                inference_type = gr.Dropdown(
                    choices=["ReverseDiffusion", "IncreaseConfidence"],
                    value="ReverseDiffusion",
                    label="Inference Method"
                )
            with gr.Row():
                eps_slider = gr.Slider(minimum=0.01, maximum=3.0, value=0.5, step=0.01,
                                       label="Epsilon (Perturbation Size)")
                # Maximum of 100 so the Grouping by Continuity example
                # (100 iterations) fits within the slider's range
                iterations_slider = gr.Slider(minimum=1, maximum=100, value=50, step=1,
                                              label="Number of Iterations")
            with gr.Row():
                initial_noise_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.05, step=0.01,
                                                 label="Initial Noise Ratio")
                diffusion_noise_slider = gr.Slider(minimum=0.0, maximum=0.05, value=0.01, step=0.001,
                                                   label="Diffusion Noise Ratio")
            with gr.Row():
                step_size_slider = gr.Slider(minimum=0.01, maximum=2.0, value=0.5, step=0.01,
                                             label="Step Size")
                layer_choice = gr.Dropdown(
                    choices=["all", "conv1", "bn1", "relu", "maxpool",
                             "layer1", "layer2", "layer3", "layer4", "avgpool"],
                    value="all",
                    label="Model Layer"
                )
            run_button = gr.Button("Run Inference", variant="primary")
        with gr.Column(scale=2):
            # Outputs
            output_image = gr.Image(label="Final Inferred Image")
            output_frames = gr.Gallery(label="Inference Steps", columns=5, rows=2)
    # Examples section with integrated explanations
    gr.Markdown("## Visual Illusion Examples")
    gr.Markdown("Select an illusion to load its parameters and see how generative inference reveals perceptual effects.")

    # For each example, create a row with the image and explanation side by side
    for i, ex in enumerate(examples):
        with gr.Row():
            # Left column: the stimulus image and a button that loads its parameters
            with gr.Column(scale=1):
                example_img = gr.Image(value=ex["image"], type="filepath", label=ex["name"])
                load_btn = gr.Button("Load Parameters", variant="primary")
                # Bind ex at definition time (via the lambda default) so each
                # button loads its own example rather than the last one in the loop
                load_btn.click(
                    fn=lambda ex=ex: apply_example(ex),
                    outputs=[
                        image_input, model_choice, inference_type,
                        eps_slider, iterations_slider,
                        initial_noise_slider, diffusion_noise_slider,
                        step_size_slider, layer_choice
                    ]
                )
            # Right column: the explanation
            with gr.Column(scale=2):
                gr.Markdown(f"### {ex['name']}")
                gr.Markdown(f"[Read more on Wikipedia]({ex['wiki']})")
                gr.Markdown("**Previous Explanations:**")
                papers_list = "\n".join(f"- {paper}" for paper in ex["papers"])
                gr.Markdown(papers_list)
                gr.Markdown("**Research Parameters:**")
                params_md = f"""
                - **Method**: {ex['method']}
                - **Model Layer**: {ex['reverse_diff']['layer']}
                - **Initial Noise**: {ex['reverse_diff']['initial_noise']}
                - **Diffusion Noise**: {ex['reverse_diff']['diffusion_noise']}
                - **Step Size**: {ex['reverse_diff']['step_size']}
                - **Iterations**: {ex['reverse_diff']['iterations']}
                - **Epsilon**: {ex['reverse_diff']['epsilon']}
                """
                gr.Markdown(params_md)
        if i < len(examples) - 1:  # No separator after the last example
            gr.Markdown("---")

    # Event handler for the main inference
    run_button.click(
        fn=run_inference,
        inputs=[
            image_input, model_choice, inference_type,
            eps_slider, iterations_slider,
            initial_noise_slider, diffusion_noise_slider,
            step_size_slider, layer_choice
        ],
        outputs=[output_image, output_frames]
    )
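    # run_inference returns (final_image, frames), which map positionally onto
    # [output_image, output_frames]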
    # About section
    gr.Markdown("""
    ## About Generative Inference
    Generative inference is a technique that reveals how neural networks perceive visual stimuli. This demo primarily uses the ReverseDiffusion method.

    ### ReverseDiffusion
    Starts from a noisy version of the input image and guides the optimization to match the features of the noisy image.
    Inspired by diffusion models, this approach reveals different aspects of visual processing.

    ### IncreaseConfidence
    Optimizes the network's activations to increase its classification confidence, enhancing the
    features that the network associates with its preferred interpretation.

    ### Parameters
    - **Initial Noise Ratio**: Amount of noise added to the image at the start
    - **Diffusion Noise Ratio**: Amount of noise added at each optimization step
    - **Step Size**: Learning rate of the optimization process
    - **Number of Iterations**: Number of optimization steps to perform
    - **Model Layer**: The ResNet50 layer from which features are extracted
    - **Epsilon**: Size of the perturbation allowed during optimization

    Different layers capture different levels of abstraction: earlier layers represent low-level features
    like edges and textures, while later layers represent higher-level features and object parts.
    """)
# Launch the demo
if __name__ == "__main__":
    print(f"Starting server on port {args.port}")
    demo.launch(
        server_name="0.0.0.0",
        server_port=args.port,
        share=False,
        debug=True
    )
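# To run locally on a custom port (assuming this file is saved as app.py, the
# Hugging Face Spaces convention, with dependencies and stimuli in place):
#
#   python app.py --port 8861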