Update modeling_minicpm.py
modeling_minicpm.py (+49 -14)
@@ -27,7 +27,7 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
+from transformers import LlamaTokenizer
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import (
@@ -35,6 +35,7 @@ from transformers.modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
     _prepare_4d_causal_attention_mask_for_sdpa,
+    _prepare_4d_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
@@ -320,9 +321,6 @@ class MiniCPMAttention(nn.Module):
         self.rope_theta = config.rope_theta
 
         self.is_causal = config.is_causal
-
-        logger.info(f"self.is_causal = {self.is_causal}")
-
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
@@ -1049,17 +1047,29 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         elif self._use_sdpa and not output_attentions:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
+            if self.is_causal:
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
+            else:
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    attention_mask,
+                    inputs_embeds.dtype,
+                )
         else:
             # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-            )
+            if self.is_causal:
+                attention_mask = _prepare_4d_causal_attention_mask(
+                    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+                )
+            else:
+                attention_mask = _prepare_4d_attention_mask(
+                    attention_mask,
+                    inputs_embeds.dtype,
+                )
 
         # embed positions
         hidden_states = inputs_embeds
@@ -1119,7 +1129,6 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
             attentions=all_self_attns,
         )
 
-
 class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
@@ -1335,6 +1344,32 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         return response, history
 
 
+
+
+class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0, token_ids_1 = None
+    ):
+        """
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return super().build_inputs_with_special_tokens(token_ids_0)
+        bos = [self.bos_token_id]
+        sep = [self.eos_token_id]
+        return bos + token_ids_0 + sep + token_ids_1
+
 @add_start_docstrings(
     """
     The MiniCPM Model transformer with a sequence classification head on top (linear layer).
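The functional core of the change is the attention-mask hunk: mask construction in MiniCPMModel now branches on self.is_causal, building a causal 4D mask for decoder-style generation and a plain padding mask for bidirectional use such as reranking. The sketch below is not part of the patch; it uses the eager-path helpers from transformers.modeling_attn_mask_utils (the SDPA variants may return None as an optimization) with invented toy sizes, just to show what the two branches produce.

# Illustrative sketch only (not from the commit); relies on private helpers
# present in recent transformers releases, toy shapes are invented.
import torch
from transformers.modeling_attn_mask_utils import (
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)

batch_size, seq_length, hidden_size = 1, 4, 8
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)  # 2D padding mask, no padding
inputs_embeds = torch.zeros(batch_size, seq_length, hidden_size)
past_key_values_length = 0

# is_causal=True path: lower-triangular 4D mask; future positions get the dtype minimum.
causal_4d = _prepare_4d_causal_attention_mask(
    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
)

# is_causal=False path: only padding is masked; with no padding the mask is all zeros.
bidirectional_4d = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

print(causal_4d.shape, bidirectional_4d.shape)  # both (1, 1, 4, 4)
print(bool((causal_4d == 0).all()))             # False: the upper triangle is masked
print(bool((bidirectional_4d == 0).all()))      # True: every token attends to every token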
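The other substantive addition is MiniCPMRerankerLLamaTokenizer, which only changes how sequence pairs are packed: a query/document pair becomes `<s> A </s> B`, while single sequences fall through to LlamaTokenizer's default handling. Below is a minimal sketch of the resulting layout, assuming the usual LLaMA convention of bos_token_id=1 and eos_token_id=2; the token IDs are invented for illustration.

# Illustrative sketch only: the IDs and the bos/eos values (1, 2) are assumptions,
# not taken from a real MiniCPM checkpoint.
bos_token_id, eos_token_id = 1, 2
query_ids = [3923, 374, 279]   # hypothetical IDs for a tokenized query    (A)
doc_ids = [7927, 11741, 82]    # hypothetical IDs for a tokenized document (B)

# What build_inputs_with_special_tokens produces when token_ids_1 is given:
pair_input_ids = [bos_token_id] + query_ids + [eos_token_id] + doc_ids
print(pair_input_ids)  # [1, 3923, 374, 279, 2, 7927, 11741, 82]  ->  <s> A </s> B

# When token_ids_1 is None, the call defers to LlamaTokenizer's own
# build_inputs_with_special_tokens, i.e. the tokenizer's configured BOS/EOS handling.

Note that the override appends no trailing `</s>` after the second segment; only the separator between A and B is special.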