File size: 2,905 Bytes
088dc17
 
 
 
 
 
 
bbddc92
088dc17
 
 
 
 
 
 
 
 
bbddc92
 
 
 
088dc17
 
 
 
 
 
 
bbddc92
 
 
 
088dc17
 
 
 
 
 
bbddc92
 
 
 
 
 
 
 
 
 
 
088dc17
 
bbddc92
 
 
 
 
088dc17
 
 
bbddc92
088dc17
 
bbddc92
088dc17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from typing import Dict, List, Any
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import torch
import base64
import io


class EndpointHandler:
    """Request handler that runs a Pix2Struct model on a base64-encoded image.

    The processor and model are loaded once in ``__init__``. Each call decodes
    the image from the request, pairs it with a header prompt, runs generation
    and returns the decoded text.
    """

    # Generation length used when the request does not set ``max_new_tokens``.
    DEFAULT_MAX_NEW_TOKENS = 512

    # Keys that may carry the header prompt. They are looked up in this order,
    # first inside ``parameters`` and then at the top level of the request.
    _HEADER_KEYS = ("header", "text", "prompt")

    def __init__(self, path: str = ""):
        """Load the processor and model from ``path`` and move the model to the device.

        Args:
            path: Location passed to ``from_pretrained`` for both the
                processor and the model.
        """
        self.processor = Pix2StructProcessor.from_pretrained(path)
        self.model = Pix2StructForConditionalGeneration.from_pretrained(path)

        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

        # Default prompt for DePlot
        self.default_header = "Generate underlying data table of the figure below:"

    def _resolve_header(self, data: Dict[str, Any], parameters: Dict[str, Any]) -> Any:
        """Return the first non-empty header prompt found, else the default header."""
        for source in (parameters, data):
            for key in self._HEADER_KEYS:
                candidate = source.get(key)
                if candidate:
                    return candidate
        return self.default_header

    @classmethod
    def _parse_max_new_tokens(cls, raw: Any) -> int:
        """Turn the requested ``max_new_tokens`` into a positive int.

        ``None`` (missing or explicit null) selects ``DEFAULT_MAX_NEW_TOKENS``.

        Raises:
            ValueError: If the value cannot be converted to a positive integer.
        """
        if raw is None:
            return cls.DEFAULT_MAX_NEW_TOKENS
        try:
            max_new_tokens = int(raw)
        except (TypeError, ValueError) as e:
            raise ValueError(
                f"'max_new_tokens' must be a positive integer, got {raw!r}"
            ) from e
        if max_new_tokens <= 0:
            raise ValueError(
                f"'max_new_tokens' must be a positive integer, got {raw!r}"
            )
        return max_new_tokens

    @staticmethod
    def _decode_image(payload: str) -> "Image.Image":
        """Decode a base64 string (optionally a ``data:`` URI) into an RGB image.

        Raises:
            ValueError: If the payload cannot be base64-decoded or opened as
                an image. The underlying exception is chained as the cause.
        """
        payload = payload.strip()
        # Data URIs look like "data:image/png;base64,<data>"; only the part
        # after the first comma is the base64 payload.
        if payload.startswith("data:"):
            _, _, payload = payload.partition(",")
        try:
            image_bytes = base64.b64decode(payload)
            return Image.open(io.BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            raise ValueError(f"Failed to decode base64 image: {e}") from e

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Called on every request.

        Args:
            data: Dictionary containing:
                - inputs: base64 encoded image string (a ``data:`` URI is
                  also accepted)
                - parameters (optional): dict with:
                    - header: text prompt for the model (default: DePlot prompt);
                      ``text`` and ``prompt`` are accepted as aliases, and the
                      same keys are also read from the top level of ``data``
                    - max_new_tokens: max generation length (default: 512)

        Returns:
            List with a single dict whose ``generated_text`` key holds the
            decoded model output.

        Raises:
            ValueError: If ``parameters`` is not a dict, ``max_new_tokens`` is
                not a positive integer, ``inputs`` is not a string, or the
                image cannot be decoded.
        """
        # An explicit ``"parameters": null`` must behave like a missing key.
        parameters = data.get("parameters") or {}
        if not isinstance(parameters, dict):
            raise ValueError("Expected 'parameters' to be a dictionary")

        # Validate the cheap request fields before doing any image work.
        header_text = self._resolve_header(data, parameters)
        max_new_tokens = self._parse_max_new_tokens(parameters.get("max_new_tokens"))

        inputs = data.get("inputs")
        if not isinstance(inputs, str):
            raise ValueError("Expected base64 encoded image string in 'inputs'")
        image = self._decode_image(inputs)

        # The processor is given the image together with the header text.
        model_inputs = self.processor(
            images=image,
            text=header_text,
            return_tensors="pt",
        ).to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            predictions = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
            )

        output_text = self.processor.decode(
            predictions[0],
            skip_special_tokens=True,
        )

        return [{"generated_text": output_text}]