Thanks to Ai2 for creating the model; this is a small example.
(Discussion #4, opened by aifeifei798.)
import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForImageTextToText
import pyautogui
import os
# 1. Configuration
# Hugging Face repo id of the GUI-pointing model to load.
MODEL_PATH = "allenai/MolmoPoint-GUI-8B"

# PyAutoGUI safety: slam the mouse into any screen corner to abort the script.
pyautogui.FAILSAFE = True

# 2. Initialize model & processor (loads only once per run).
print("--- Initializing AI Vision Agent ---")
print(f"Loading model from {MODEL_PATH}... please wait.")

processor = AutoProcessor.from_pretrained(
    MODEL_PATH, trust_remote_code=True, use_fast=True
)
# device_map="auto" lets accelerate place weights on the available device(s);
# bfloat16 halves memory versus fp32.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

print("System ready! The model is now watching your screen.")
print("=" * 60)
# 3. Main interaction loop: prompt -> screenshot -> model pointing -> mouse click.
while True:
    try:
        # Get the user's natural-language description of the click target.
        user_prompt = input("\nEnter what you want to click (or 'q' to quit): ")
        if user_prompt.lower() in ["q", "exit", "quit"]:
            print("Shutting down AI Agent. Goodbye!")
            break
        if not user_prompt.strip():
            continue

        print(f"Capturing screen and searching for: '{user_prompt}'...")

        # Step A: take a real-time screenshot.
        # Convert to RGB to ensure compatibility with the model.
        screen_img = pyautogui.screenshot().convert("RGB")

        # Step B: prepare model inputs (chat-style message with text + image).
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    {"type": "image", "image": screen_img},
                ],
            }
        ]
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            padding=True,
            return_pointing_metadata=True,
        )
        # Pointing metadata is consumed by extract_image_points below,
        # not by generate() — pop it before moving tensors to the device.
        metadata = inputs.pop("metadata")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Step C: model inference.
        # NOTE(review): autocast("cuda") assumes a CUDA device is present —
        # confirm behavior on CPU-only hosts before relying on this.
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            output = model.generate(
                **inputs,
                logits_processor=model.build_logit_processor_from_inputs(inputs),
                max_new_tokens=256,
                use_cache=True,
            )

        # Step D: decode only the newly generated tokens, then extract
        # pixel coordinates from the model's pointing output.
        generated_tokens = output[:, inputs["input_ids"].size(1):]
        generated_text = processor.post_process_image_text_to_text(
            generated_tokens,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
        )[0]
        points = model.extract_image_points(
            generated_text,
            metadata["token_pooling"],
            metadata["subpatch_mapping"],
            metadata["image_sizes"],
        )

        # Step E: execute the click action.
        if points:
            # points[0] format: [object_id, img_idx, x_pixel, y_pixel]
            target_x, target_y = points[0][2], points[0][3]
            print(f"Target found at: (X: {target_x:.1f}, Y: {target_y:.1f})")
            # Smoothly move the mouse to the target, then click.
            pyautogui.moveTo(target_x, target_y, duration=0.5)
            pyautogui.click()
            print("Action performed: Clicked!")
            # Optional: save a visual confirmation for debugging.
            # draw = ImageDraw.Draw(screen_img)
            # draw.ellipse([target_x-10, target_y-10, target_x+10, target_y+10], outline="red", width=5)
            # screen_img.save("last_action_debug.png")
        else:
            print("Model could not find that element. Try a different description.")
    except KeyboardInterrupt:
        print("\nUser interrupted. Exiting...")
        break
    except Exception as e:
        # Broad catch is deliberate: keep the interactive loop alive on any
        # per-iteration failure and report it instead of crashing.
        print(f"An error occurred: {e}")
More details: https://lab.feimatrix.com/building-an-autonomous-ai-vision-agent-the-complete-guide/