File size: 10,910 Bytes
fc0ff8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import os.path
from typing import List

from PIL import Image
import torch
import torch.nn.functional as F

from open_flamingo.eval.eval_model import BaseEvalModel
from open_flamingo.src.factory import create_model_and_transforms
from contextlib import suppress
from open_flamingo.eval.models.utils import unwrap_model, get_label
from torchvision.transforms import transforms


# adversarial eval model
# adapted from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/eval/models/open_flamingo.py

class EvalModelAdv(BaseEvalModel):
    """OpenFlamingo adversarial model evaluation.

    Unlike the upstream eval model, image preprocessing is split into an
    unnormalized stage (``image_processor_no_norm``) and the final
    normalization (``normalizer``), which is applied inside the forward
    helpers instead. This lets callers hold and perturb images in raw
    pixel space (e.g. for adversarial attacks) and have normalization
    happen as part of the differentiable forward pass.

    Attributes:
      model (nn.Module): Underlying Torch model.
      tokenizer (transformers.PreTrainedTokenizer): Tokenizer for model.
      device: Index of GPU to use, or the string "cpu".
      adversarial (bool): When True, gradients are kept enabled in
        get_logits so attacks can backpropagate to the image input.
    """

    def __init__(self, model_args, adversarial):
        # Fail fast if any required configuration key is missing.
        assert (
            "vision_encoder_path" in model_args
            and "lm_path" in model_args
            and "checkpoint_path" in model_args
            and "lm_tokenizer_path" in model_args
            and "cross_attn_every_n_layers" in model_args
            and "vision_encoder_pretrained" in model_args
            and "precision" in model_args
        ), "OpenFlamingo requires vision_encoder_path, lm_path, device, checkpoint_path, lm_tokenizer_path, cross_attn_every_n_layers, vision_encoder_pretrained, and precision arguments to be specified"

        # Use the given GPU index only when it is non-negative; otherwise CPU.
        self.device = (
            model_args["device"]
            if ("device" in model_args and model_args["device"] >= 0)
            else "cpu"
        )
        self.model_args = model_args
        # autocast context factory and explicit cast dtype, both derived
        # from the requested precision string (see helpers at module end)
        self.autocast = get_autocast(model_args["precision"])
        self.cast_dtype = get_cast_dtype(model_args["precision"])

        if model_args["vision_encoder_pretrained"] != "openai":
            # load openai weights first - as we save only the visual weights, it doesn't work to load the full model;
            # the custom vision-encoder state dict is loaded further below
            vision_encoder_pretrained_ = "openai"
        else:
            vision_encoder_pretrained_ = model_args["vision_encoder_pretrained"]

        (
            self.model,
            image_processor,
            self.tokenizer,
        ) = create_model_and_transforms(
            model_args["vision_encoder_path"],
            vision_encoder_pretrained_,
            model_args["lm_path"],
            model_args["lm_tokenizer_path"],
            cross_attn_every_n_layers=int(model_args["cross_attn_every_n_layers"]),
            compute_all_grads=adversarial,
        )
        # Split the preprocessing pipeline: everything except the last
        # transform (assumed to be the Normalize step — see layout note
        # below) runs up-front; normalization is applied at forward time.
        self.image_processor_no_norm = transforms.Compose(image_processor.transforms[:-1])
        self.normalizer = image_processor.transforms[-1]
        del image_processor  # make sure we don't use it by accident
        self.adversarial = adversarial
        # image processor layout without the final Normalize (9B model, probably same for others):
            # Compose(
            #   Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
            #   CenterCrop(size=(224, 224))
            #   <function _convert_to_rgb at 0x7fb90724ee80>
            #   ToTensor()
            # )

        if model_args["vision_encoder_pretrained"] != "openai":
            print("Loading non-openai vision encoder weights")
            # The checkpoint here holds only the vision encoder's state dict.
            self.model.vision_encoder.load_state_dict(torch.load(model_args["vision_encoder_pretrained"], map_location=self.device))


        checkpoint = torch.load(model_args["checkpoint_path"], map_location=self.device)
        if "model_state_dict" in checkpoint:
            checkpoint = checkpoint["model_state_dict"]
            # strip the "module." prefix added by DistributedDataParallel wrapping
            checkpoint = {k.replace("module.", ""): v for k, v in checkpoint.items()}
        # NOTE(review): strict=False silently tolerates missing/unexpected keys
        self.model.load_state_dict(checkpoint, strict=False)
        self.model.to(self.device, dtype=self.cast_dtype)
        self.model.eval()
        # left-padding so that generation continues directly after the prompt
        self.tokenizer.padding_side = "left"

    def _prepare_images(self, batch: List[List[torch.Tensor]], preprocessor=None) -> torch.Tensor:
        """Preprocess images and stack them. Returns unnormed images.

        Examples with fewer images than the batch maximum are zero-padded.

        Args:
            batch: A list of lists of images.
            preprocessor: If specified, use this preprocessor instead of the default.

        Returns:
            A Tensor of shape
            (batch_size, images_per_example, frames, channels, height, width).
        """
        images_per_example = max(len(x) for x in batch)
        batch_images = None
        for iexample, example in enumerate(batch):
            for iimage, image in enumerate(example):
                # default path skips normalization (see __init__)
                preprocessed = self.image_processor_no_norm(image) if not preprocessor else preprocessor(image)

                if batch_images is None:
                    # allocate lazily once the per-image shape is known
                    batch_images = torch.zeros(
                        (len(batch), images_per_example, 1) + preprocessed.shape,
                        dtype=preprocessed.dtype,
                    )
                batch_images[iexample, iimage, 0] = preprocessed
        return batch_images

    def get_outputs(
        self,
        batch_text: List[str],
        batch_images: torch.Tensor,
        min_generation_length: int,
        max_generation_length: int,
        num_beams: int,
        length_penalty: float,
    ) -> List[str]:
        """Generate text continuations for a batch of prompts and images.

        Args:
            batch_text: Prompt strings (tokenized with left padding).
            batch_images: Unnormalized image tensor, already shaped as by
                _prepare_images; normalization happens here.
            min_generation_length / max_generation_length: Bounds on the
                number of newly generated tokens.
            num_beams, length_penalty: Beam-search parameters.

        Returns:
            Decoded generated text, with the prompt tokens stripped.
        """
        encodings = self.tokenizer(
            batch_text,
            padding="longest",
            truncation=True,
            return_tensors="pt",
            max_length=2000,
        )
        input_ids = encodings["input_ids"]
        attention_mask = encodings["attention_mask"]

        with torch.inference_mode():
            with self.autocast():
                # batch_images arrive pre-processed but unnormalized;
                # normalization is applied after moving to the device
                x_vis = batch_images.to(
                        self.device, dtype=self.cast_dtype, non_blocking=True
                    )
                x_vis = self.normalizer(x_vis)
                outputs = unwrap_model(self.model).generate(
                    x_vis,
                    input_ids.to(self.device, non_blocking=True),
                    attention_mask=attention_mask.to(
                        self.device, dtype=self.cast_dtype, non_blocking=True
                    ),
                    min_new_tokens=min_generation_length,
                    max_new_tokens=max_generation_length,
                    num_beams=num_beams,
                    length_penalty=length_penalty,
                )

        # keep only newly generated tokens (drop the echoed prompt)
        outputs = outputs[:, len(input_ids[0]) :]

        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

    def get_logits(
        self,
        lang_x: torch.Tensor,
        vision_x_unnorm: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        past_key_values: torch.Tensor = None,
        clear_conditioned_layers: bool = False,
        labels: torch.Tensor = None,
    ):
        """Run a forward pass; normalizes the vision input internally.

        When self.adversarial is True, inference_mode is disabled so that
        gradients can flow back to vision_x_unnorm (needed for attacks).
        """
        with torch.inference_mode(not self.adversarial):
            with self.autocast():
                outputs = self.model(
                    vision_x=self.normalizer(vision_x_unnorm),
                    lang_x=lang_x,
                    labels=labels,
                    attention_mask=attention_mask.bool(),
                    clear_conditioned_layers=clear_conditioned_layers,
                    past_key_values=past_key_values,
                    use_cache=(past_key_values is not None),
                )
        return outputs

    def __call__(self, vision_x_unnorm):
        """Per-example loss as a function of the unnormalized images.

        Language inputs must have been cached via set_inputs beforehand.
        Presumably this is the objective an attack loop optimizes over the
        image tensor — TODO confirm against the caller.
        """
        assert self.lang_x is not None
        assert self.attention_mask is not None
        assert self.labels is not None
        outputs = self.get_logits(
            self.lang_x,
            vision_x_unnorm=vision_x_unnorm,
            attention_mask=self.attention_mask,
            past_key_values=self.past_key_values,
            clear_conditioned_layers=True,
            labels=None  # labels are considered below
        )
        logits = outputs.logits
        # per-example (unreduced over the batch) summed cross-entropy
        loss_expanded = compute_loss(logits, self.labels)
        return loss_expanded

    def set_inputs(
        self,
        batch_text: List[str],
        past_key_values: torch.Tensor = None,
        to_device: bool = False,
    ):
        """Tokenize and cache the language inputs used by __call__.

        Stores lang_x, labels (derived via get_label with mode="colon"),
        attention_mask and past_key_values on self; optionally moves them
        to self.device.
        """
        encodings = self.tokenizer(
            batch_text,
            padding="longest",
            truncation=True,
            return_tensors="pt",
            max_length=2000,
        )
        self.lang_x = encodings["input_ids"]
        labels = get_label(lang_x=self.lang_x, tokenizer=self.tokenizer, mode="colon")
        self.labels = labels
        self.attention_mask = encodings["attention_mask"]
        self.past_key_values = past_key_values
        if to_device:
            self.lang_x = self.lang_x.to(self.device)
            self.attention_mask = self.attention_mask.to(self.device)
            self.labels = self.labels.to(self.device)
            if self.past_key_values is not None:
                self.past_key_values = self.past_key_values.to(self.device)


    def encode_vision_x(self, image_tensor: torch.Tensor):
        # Thin wrapper: pre-encode vision features on the underlying model.
        unwrap_model(self.model)._encode_vision_x(image_tensor.to(self.device))

    def uncache_media(self):
        # Thin wrapper: drop any cached media on the underlying model.
        unwrap_model(self.model).uncache_media()

    def cache_media(self, input_ids, vision_x):
        # Thin wrapper: cache media/vision features on the underlying model.
        unwrap_model(self.model).cache_media(input_ids=input_ids, vision_x=vision_x)

    def get_vqa_prompt(self, question, answer=None) -> str:
        """Build the VQA prompt; appends the answer + <|endofchunk|> if given.

        Colons are stripped from the answer, presumably because get_label
        (mode="colon") locates the target by the colon — verify against it.
        """
        if answer and ":" in answer:
            answer = answer.replace(":", "")
        return f"<image>Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}"

    def get_caption_prompt(self, caption=None) -> str:
        """Build the captioning prompt; appends caption + <|endofchunk|> if given.

        Colons are stripped from the caption for the same reason as in
        get_vqa_prompt.
        """
        if caption and ":" in caption:
            caption = caption.replace(":", "")
        return f"<image>Output:{caption if caption is not None else ''}{'<|endofchunk|>' if caption is not None else ''}"

def compute_loss(logits, labels):
    """Per-example summed next-token cross-entropy.

    Args:
        logits: Tensor of shape (batch, seq_len, vocab_size).
        labels: Tensor of shape (batch, seq_len) with token ids; positions
            equal to -100 are ignored by F.cross_entropy.

    Returns:
        Tensor of shape (batch,) holding the summed loss per example
        (not averaged over tokens).
    """
    bs = logits.shape[0]
    # Shift labels left by one along the sequence dim so logits[:, t]
    # predicts labels[:, t+1]. The previous code rolled the *flattened*
    # tensor (torch.roll without `dims`), which wrapped the next example's
    # first token into each row's last slot; that slot is masked to -100
    # below either way, so restricting the roll to the last dim is
    # behavior-identical and avoids the accidental cross-example wrap.
    labels = torch.roll(labels, shifts=-1, dims=-1)
    labels[:, -1] = -100  # no target exists for the final position
    loss_expanded = F.cross_entropy(
        logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1),
        reduction='none'  # keep per-token losses so we can sum per example
    )
    loss_expanded = loss_expanded.view(bs, -1).sum(-1)
    return loss_expanded

def get_cast_dtype(precision: str):
    """Map a precision string to the torch dtype used for explicit casts.

    Returns None for full-precision / bf16-autocast modes (no explicit
    cast is applied). Raises ValueError for unrecognized precision strings.
    """
    dtype_by_precision = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
        "float16": torch.float16,
        "fp32": None,
        "float32": None,
        "amp_bf16": None,
    }
    try:
        return dtype_by_precision[precision]
    except KeyError:
        raise ValueError(f"Unknown precision {precision}") from None


def get_autocast(precision):
    """Return a context-manager factory matching the precision string.

    "amp" uses plain CUDA autocast; "amp_bfloat16"/"amp_bf16" use CUDA
    autocast pinned to bfloat16; every other precision gets contextlib's
    suppress, i.e. a no-op context manager.
    """
    if precision == "amp":
        return torch.cuda.amp.autocast
    if precision in ("amp_bfloat16", "amp_bf16"):
        # amp_bfloat16 is more stable than amp float16 for clip training
        return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16)
    return suppress