Spaces:

TaliDror
/

AAS2F

Running on Zero

App Files Files Community

TaliDror committed 8 days ago

Commit

efd5117

1 Parent(s): 780f1aa

updated _make_causal_mask and _expand_mask to newer transformer version

Browse files

Files changed (1) hide show

external/arc2face/models.py +41 -50

external/arc2face/models.py CHANGED Viewed

@@ -2,58 +2,36 @@ import torch
 from transformers import CLIPTextModel
 from typing import Any, Callable, Dict, Optional, Tuple, Union, List
 from transformers.modeling_outputs import BaseModelOutputWithPooling
-#from transformers.models.clip.modeling_clip import _make_causal_mask, _expand_mask
-try:
-    from transformers.models.clip.modeling_clip import _make_causal_mask, _expand_mask
-except ImportError:
-    # transformers >=4.47 removed these internal helpers from modeling_clip.
-    # Reimplement them directly from the transformers 4.34 source so the mask
-    # format (additive, shape [bsz,1,tgt,src]) matches what CLIPEncoder expects.
-    def _make_causal_mask(input_ids_shape, dtype, device, past_key_values_length=0):
-        bsz, tgt_len = input_ids_shape
-        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
-        mask_cond = torch.arange(tgt_len, device=device)
-        mask.masked_fill_(mask_cond < (mask_cond + 1).view(tgt_len, 1), 0)
-        mask = mask.to(dtype)
-        if past_key_values_length > 0:
-            mask = torch.cat(
-                [torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1
-            )
-        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-    def _expand_mask(mask, dtype, tgt_len=None):
-        bsz, src_len = mask.shape
-        tgt_len = tgt_len if tgt_len is not None else src_len
-        expanded = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-        inverted = 1.0 - expanded
-        return inverted.masked_fill(inverted.to(torch.bool), torch.finfo(dtype).min)
 class CLIPTextModelWrapper(CLIPTextModel):
     # Adapted from https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/clip/modeling_clip.py#L812
     # Modified to accept precomputed token embeddings "input_token_embs" as input or calculate them from input_ids and return them.
-    # Supports both transformers <=4.46 (self.text_model sub-attribute) and >=4.47 (flat structure, no text_model).
     def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        input_token_embs: Optional[torch.Tensor] = None,
-        return_token_embs: Optional[bool] = False,
     ) -> Union[Tuple, torch.Tensor, BaseModelOutputWithPooling]:
-        # In transformers <=4.46 the transformer lives in self.text_model;
-        # in >=4.47 it was inlined directly onto CLIPTextModel (flat structure).
-        tm = getattr(self, 'text_model', self)
         if return_token_embs:
-            return tm.embeddings.token_embedding(input_ids)
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         if input_ids is None:
             raise ValueError("You have to specify input_ids")
@@ -61,13 +39,19 @@ class CLIPTextModelWrapper(CLIPTextModel):
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_shape[-1])
-        hidden_states = tm.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=input_token_embs)
-        causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
         if attention_mask is not None:
-            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
-        encoder_outputs = tm.encoder(
             inputs_embeds=hidden_states,
             attention_mask=attention_mask,
             causal_attention_mask=causal_attention_mask,
@@ -77,18 +61,25 @@ class CLIPTextModelWrapper(CLIPTextModel):
         )
         last_hidden_state = encoder_outputs[0]
-        last_hidden_state = tm.final_layer_norm(last_hidden_state)
-        eos_token_id = getattr(tm, 'eos_token_id', self.config.eos_token_id)
-        if eos_token_id == 2:
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                 input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
             ]
         else:
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
-                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == eos_token_id)
                 .int()
                 .argmax(dim=-1),
             ]
@@ -101,4 +92,4 @@ class CLIPTextModelWrapper(CLIPTextModel):
             pooler_output=pooled_output,
             hidden_states=encoder_outputs.hidden_states,
             attentions=encoder_outputs.attentions,
-        )

 from transformers import CLIPTextModel
 from typing import Any, Callable, Dict, Optional, Tuple, Union, List
 from transformers.modeling_outputs import BaseModelOutputWithPooling
+# from transformers.models.clip.modeling_clip import _make_causal_mask, _expand_mask
+from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask, \
+    _prepare_4d_attention_mask  # transformers 4.36.0
 class CLIPTextModelWrapper(CLIPTextModel):
     # Adapted from https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/clip/modeling_clip.py#L812
     # Modified to accept precomputed token embeddings "input_token_embs" as input or calculate them from input_ids and return them.
     def forward(
+            self,
+            input_ids: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.Tensor] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            input_token_embs: Optional[torch.Tensor] = None,
+            return_token_embs: Optional[bool] = False,
     ) -> Union[Tuple, torch.Tensor, BaseModelOutputWithPooling]:
         if return_token_embs:
+            return self.text_model.embeddings.token_embedding(input_ids)
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.text_model.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.text_model.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.text_model.config.use_return_dict
         if input_ids is None:
             raise ValueError("You have to specify input_ids")
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_shape[-1])
+        hidden_states = self.text_model.embeddings(input_ids=input_ids, position_ids=position_ids,
+                                                   inputs_embeds=input_token_embs)
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = _create_4d_causal_attention_mask(input_shape, hidden_states.dtype,
+                                                                 device=hidden_states.device)
+        # expand attention_mask
         if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+        encoder_outputs = self.text_model.encoder(
             inputs_embeds=hidden_states,
             attention_mask=attention_mask,
             causal_attention_mask=causal_attention_mask,
         )
         last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.text_model.final_layer_norm(last_hidden_state)
+        if self.text_model.eos_token_id == 2:
+            # The `eos_token_id` was incorrect before PR #24773: let's keep what has been done here.
+            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
+            # ------------------------------------------------------------
+            # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+            # take features from the eot embedding (eot_token is the highest number in each sequence)
+            # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                 input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
             ]
         else:
+            # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
             pooled_output = last_hidden_state[
                 torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+                    # We need to get the first position of `eos_token_id` value (`pad_token_ids` might be equal to `eos_token_id`)
+                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.text_model.eos_token_id)
                 .int()
                 .argmax(dim=-1),
             ]
             pooler_output=pooled_output,
             hidden_states=encoder_outputs.hidden_states,
             attentions=encoder_outputs.attentions,
+        )