"""
LongCLIP processor for preprocessing images and text.

This module provides a processor that combines image and text preprocessing
for LongCLIP models.
"""

from typing import List, Optional, Union

from transformers import BatchEncoding, CLIPImageProcessor, CLIPTokenizer
from transformers.processing_utils import ProcessorMixin


class LongCLIPProcessor(ProcessorMixin):
    """
    Processor for LongCLIP that combines image and text preprocessing.

    This processor wraps CLIPImageProcessor and CLIPTokenizer to provide
    a unified interface for preprocessing inputs for LongCLIP models.

    Args:
        image_processor (CLIPImageProcessor): Image processor for preprocessing images.
        tokenizer (CLIPTokenizer): Tokenizer for preprocessing text.

    Attributes:
        image_processor_class (str): Name of the image processor class.
        tokenizer_class (str): Name of the tokenizer class.

    Example:
        ```python
        >>> from long_clip_hf import LongCLIPProcessor
        >>> from transformers import CLIPImageProcessor, CLIPTokenizer
        >>> from PIL import Image
        >>>
        >>> # Initialize processor
        >>> image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
        >>>
        >>> # Process inputs
        >>> image = Image.open("path/to/image.jpg")
        >>> text = "a photo of a cat"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", padding=True, max_length=248)
        >>>
        >>> # inputs contains 'input_ids', 'attention_mask', and 'pixel_values'
        ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = "CLIPTokenizer"

    def __init__(
        self,
        image_processor: Optional[CLIPImageProcessor] = None,
        tokenizer: Optional[CLIPTokenizer] = None,
        **kwargs,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[str, List[str], None] = None,
        images=None,
        return_tensors: Optional[str] = "pt",
        padding: Union[bool, str] = True,
        max_length: Optional[int] = 248,
        truncation: Optional[bool] = True,
        **kwargs,
    ):
        """
        Preprocess text and images for a LongCLIP model.

        Args:
            text (str, List[str], optional): Text or list of texts to process.
            images: Image or list of images to process. Can be PIL Image, numpy array, or tensor.
            return_tensors (str, optional): Type of tensors to return ('pt' for PyTorch).
            padding (bool or str, optional): Padding strategy. Defaults to True.
            max_length (int, optional): Maximum sequence length. Defaults to 248,
                LongCLIP's maximum text length (standard CLIP is capped at 77).
            truncation (bool, optional): Whether to truncate sequences. Defaults to True.
            **kwargs: Additional keyword arguments.

        Returns:
            BatchEncoding: Dictionary containing processed inputs with keys:
                - input_ids: Tokenized text (if text provided)
                - attention_mask: Attention mask for text (if text provided)
                - pixel_values: Processed images (if images provided)
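
        Raises:
            ValueError: If neither `text` nor `images` is provided.

        Example:
            A minimal sketch of text-only and image-only calls, with
            ``processor`` built as in the class docstring:

            >>> text_only = processor(text="a photo of a cat")  # input_ids, attention_mask
            >>> image_only = processor(images=image)            # pixel_values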
        """
        if text is None and images is None:
            raise ValueError("You have to specify either `text` or `images`.")

        # Process text
        if text is not None:
            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_length,
                truncation=truncation,
                **kwargs,
            )
        else:
            text_inputs = {}

        # Process images
        if images is not None:
            image_inputs = self.image_processor(
                images,
                return_tensors=return_tensors,
            )
        else:
            image_inputs = {}

        # Combine text and image features into a single BatchEncoding,
        # matching the documented return type
        return BatchEncoding(data={**text_inputs, **image_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        Decode token IDs back to text.

        This method is forwarded to the tokenizer's batch_decode method.
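
        Example (a sketch, with ``inputs`` as returned by ``__call__``):

            >>> captions = processor.batch_decode(inputs["input_ids"], skip_special_tokens=True)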
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Decode token IDs back to text.

        This method is forwarded to the tokenizer's decode method.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Get the names of model inputs.

        Returns:
            List[str]: List of input names.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
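

# A minimal, self-contained usage sketch (not part of the class API). It assumes
# local or cached access to the "openai/clip-vit-base-patch32" checkpoint and an
# image at the hypothetical path "path/to/image.jpg".
if __name__ == "__main__":
    from PIL import Image

    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)

    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=Image.open("path/to/image.jpg"),
    )
    # Expect input_ids/attention_mask padded to a common length (<= 248) and
    # pixel_values of shape (1, 3, 224, 224) for this checkpoint.
    print({name: tuple(tensor.shape) for name, tensor in inputs.items()})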