---
license: cc-by-nc-4.0
language:
- en
---
# Isaac-0.2-1B by Perceptron

Introducing the 1B-parameter variant of Isaac-0.2, the hybrid-reasoning vision-language model.

This release brings major upgrades: optional reasoning via thinking traces, perceptive tool calling (including our new Focus system), stronger grounding, better OCR, better desktop use, and improved structured output, all while remaining fast, compact, and deployable.

## Extending the efficient frontier of perception

Isaac 0.2 extends what we started with Isaac 0.1: small models that outperform systems 10× larger on visual reasoning and perception tasks, all running on commodity GPUs or edge devices.
From robotics to media search to industrial inspection, Isaac 0.2 delivers high-accuracy perception without the heavy compute footprint.

![image](https://cdn-uploads.huggingface.co/production/uploads/65526dfffb76980adeffa369/yQl-9BAxLud6hhK8gCKLt.png)

## What's New in Isaac 0.2

* **Reasoning via Thinking Traces**: Short, structured reasoning traces improve multi-step decisions, small-object understanding, and ambiguous spatial tasks.

* **Perceptive Tool Calling + Focus (Zoom & Crop)**: Isaac 0.2 can trigger tool calls to focus (i.e., zoom and crop) and re-query the model on a smaller region, dramatically improving fine-grained perception; a sketch of the host-side crop step follows this list.

* **Structured Outputs**: More reliable structured output generation for consistent JSON and predictable downstream integration.

* **Complex OCR**: Improved text recognition across cluttered, low-resolution, or distorted regions, enabling accurate extraction from documents, diagrams, labels, screens, and dense real-world scenes.

* **Desktop Use**: Better performance on everyday desktop and mobile workflows such as UI understanding and navigation, making Isaac faster and more capable for agentic use cases.

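The Focus tool-call format itself is not reproduced in this card. As a rough, hypothetical sketch of the zoom-and-crop step on the host side (the helper name, padding logic, and box layout below are our assumptions, not Isaac's actual tool interface):

```python
from PIL import Image


def focus_crop(image: Image.Image, box: tuple[int, int, int, int], pad: float = 0.1) -> Image.Image:
    """Crop the region `box` = (x0, y0, x1, y1) from `image` with a small padding margin.

    Hypothetical helper for illustration only: the real Focus tool call emitted by
    Isaac 0.2 is handled by the serving stack. After cropping, the smaller image is
    simply sent back to the model together with the original question.
    """
    x0, y0, x1, y1 = box
    w, h = x1 - x0, y1 - y0
    x0 = max(0, int(x0 - pad * w))
    y0 = max(0, int(y0 - pad * h))
    x1 = min(image.width, int(x1 + pad * w))
    y1 = min(image.height, int(y1 + pad * h))
    return image.crop((x0, y0, x1, y1))
```

Re-querying the cropped region then works exactly like the Usage example below, with the crop passed in place of the full image.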

## Performance Benchmarks

![image](https://cdn-uploads.huggingface.co/production/uploads/65526dfffb76980adeffa369/scKXlSu474L4r8-I6Ahau.png)

## Chatting with Isaac in 🤗 Transformers

Learn more at our [Huggingface Example Repo](https://github.com/perceptron-ai-inc/perceptron/tree/main/huggingface), where we demo extracting and rendering points.

```bash
pip install perceptron
```

### Usage

```python
from transformers import AutoModelForCausalLM, AutoProcessor
from transformers.image_utils import load_image
from transformers.utils.import_utils import is_torch_cuda_available


def document_to_messages(document: list[dict]):
    """Convert a list of typed items into chat messages plus the loaded images.

    Each image becomes an `<image>` placeholder in the message stream; the
    processor pairs the placeholders with the images passed alongside the text.
    """
    messages, images = [], []
    for item in document:
        if not (content := item.get("content")):
            continue
        role = item.get("role", "user")
        if item.get("type") == "image":
            images.append(load_image(content))
            messages.append({"role": role, "content": "<image>"})
        elif item.get("type") == "text":
            messages.append({"role": role, "content": content})
    return messages, images


# Load model/processor from the checkpoint
checkpoint_path = "PerceptronAI/Isaac-0.2-1B-Preview"
processor = AutoProcessor.from_pretrained(checkpoint_path, trust_remote_code=True)
device, dtype = ("cuda", "bfloat16") if is_torch_cuda_available() else ("cpu", "float32")
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    trust_remote_code=True,
    vision_attn_implementation="flash_attention_2",
    dtype=dtype,
).to(device=device)

# The <hint>BOX</hint> item requests grounded bounding boxes in the answer.
document = [
    {
        "type": "text",
        "content": "<hint>BOX</hint>",
        "role": "user",
    },
    {
        "type": "image",
        "content": "https://raw.githubusercontent.com/perceptron-ai-inc/perceptron/refs/heads/main/huggingface/assets/example.webp",
        "role": "user",
    },
    {
        "type": "text",
        "content": "Determine whether it is safe to cross the street. Look for signage and moving traffic.",
        "role": "user",
    },
]

# Prepare inputs for generation
messages, images = document_to_messages(document)
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=text, images=images, return_tensors="pt")

# Greedy generation
generated_ids = model.generate(
    tensor_stream=inputs["tensor_stream"].to(device),
    max_new_tokens=256,
    do_sample=False,
)
generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
print(f"\nOutput: {generated_text}")
```
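
For interactive experiments it can help to fold the preprocessing and generation steps above into a single call. The wrapper below is our own sketch (not part of the `perceptron` package) and assumes `processor`, `model`, `device`, and `document_to_messages` from the Usage example are already defined:

```python
def ask(document: list[dict], max_new_tokens: int = 256) -> str:
    """Run one query end to end, reusing the objects created in the Usage example."""
    messages, images = document_to_messages(document)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=text, images=images, return_tensors="pt")
    generated_ids = model.generate(
        tensor_stream=inputs["tensor_stream"].to(device),
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )
    return processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)


# Example: re-run the same query; swap the image item in `document` for a crop
# (see the Focus sketch above) to re-ask about a smaller region.
print(ask(document))
```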