Thanks to Ai2 for creating the model; it's a small example.

#4
by aifeifei798 - opened
import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForImageTextToText
import pyautogui
import os

# 1. Configuration
# Hugging Face model id for the Molmo GUI-pointing model.
MODEL_PATH = "allenai/MolmoPoint-GUI-8B"

# PyAutoGUI Safety: Move mouse to any corner of the screen to abort the script
# (raises pyautogui.FailSafeException, which the main loop's handler will catch).
pyautogui.FAILSAFE = True

# 2. Initialize Model & Processor (Loads only once)
# FIX: the first print was an f-string with no placeholders (ruff F541);
# a plain string literal produces the identical output.
print("--- Initializing AI Vision Agent ---")
print(f"Loading model from {MODEL_PATH}... please wait.")

# trust_remote_code=True is required because this checkpoint ships custom
# processor/model code on the Hub; use_fast selects the fast tokenizer path.
processor = AutoProcessor.from_pretrained(
    MODEL_PATH, trust_remote_code=True, use_fast=True
)
# bfloat16 halves memory vs. fp32; device_map="auto" lets accelerate place
# the weights on whatever devices are available (GPU first, CPU fallback).
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

print("✅ System Ready! The model is now watching your screen.")
print("=" * 60)

# 3. Main Interaction Loop: read a description, screenshot, locate, click.
while True:
    try:
        # Get user command
        user_prompt = input("\n👉 Enter what you want to click (or 'q' to quit): ")

        if user_prompt.lower() in ["q", "exit", "quit"]:
            print("👋 Shutting down AI Agent. Goodbye!")
            break

        # Ignore blank input rather than wasting an inference pass.
        if not user_prompt.strip():
            continue

        print(f"📸 Capturing screen and searching for: '{user_prompt}'...")

        # Step A: Take a real-time screenshot
        # We convert to RGB to ensure compatibility with the model
        screen_img = pyautogui.screenshot().convert("RGB")

        # Step B: Prepare model inputs as a chat-style message (text + image).
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    {"type": "image", "image": screen_img},
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            padding=True,
            return_pointing_metadata=True,  # needed below by extract_image_points
        )

        # Metadata is not a tensor, so pop it before moving tensors to the device.
        metadata = inputs.pop("metadata")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Step C: Model Inference
        # FIX: autocast previously hard-coded "cuda", but device_map="auto"
        # may have placed the model on CPU (or another accelerator), making
        # the autocast context wrong there. Use the model's actual device type.
        with torch.inference_mode(), torch.autocast(
            device_type=model.device.type, dtype=torch.bfloat16
        ):
            output = model.generate(
                **inputs,
                logits_processor=model.build_logit_processor_from_inputs(inputs),
                max_new_tokens=256,
                use_cache=True,
            )

        # Step D: Process Output & Extract Coordinates.
        # Decode only the newly generated tokens (skip the prompt prefix).
        generated_tokens = output[:, inputs["input_ids"].size(1) :]
        generated_text = processor.post_process_image_text_to_text(
            generated_tokens,
            skip_special_tokens=False,  # special tokens carry the pointing info
            clean_up_tokenization_spaces=False,
        )[0]

        points = model.extract_image_points(
            generated_text,
            metadata["token_pooling"],
            metadata["subpatch_mapping"],
            metadata["image_sizes"],
        )

        # Step E: Execute Action on the first returned point, if any.
        if points:
            # points[0] format: [object_id, img_idx, x_pixel, y_pixel]
            target_x, target_y = points[0][2], points[0][3]

            print(f"🎯 Target found at: (X: {target_x:.1f}, Y: {target_y:.1f})")

            # Smoothly move the mouse to the target
            pyautogui.moveTo(target_x, target_y, duration=0.5)

            # Perform the click
            pyautogui.click()
            print("✅ Action performed: Clicked!")

            # Optional: Save a visual confirmation for debugging
            # draw = ImageDraw.Draw(screen_img)
            # draw.ellipse([target_x-10, target_y-10, target_x+10, target_y+10], outline="red", width=5)
            # screen_img.save("last_action_debug.png")
        else:
            print("❌ Model could not find that element. Try a different description.")

    except KeyboardInterrupt:
        print("\n👋 User interrupted. Exiting...")
        break
    except Exception as e:
        # Keep the interactive loop alive on per-iteration failures,
        # but surface the cause to the user.
        print(f"⚠️ An error occurred: {e}")

Full walkthrough: https://lab.feimatrix.com/building-an-autonomous-ai-vision-agent-the-complete-guide/

Sign up or log in to comment