add tokenizer; reformat

Browse files

Files changed (4) hide show

configuration_aria.py +6 -3
modeling_aria.py +98 -46
tokenization_aria.py +163 -0
tokenizer_config.json +11 -0

configuration_aria.py CHANGED Viewed

@@ -36,15 +36,18 @@ class AriaConfig(PretrainedConfig):
         self.return_dict = return_dict
         if self.intermediate_size % self.hidden_size != 0:
-            raise ValueError("The intermediate size needs to be divisible by hidden size.")
         if self.hidden_size % self.num_attention_heads != 0:
-            raise ValueError("The hidden size needs to be divisible by the number of attention heads.")
     @property
     def ff_mult(self):
         return self.intermediate_size // self.hidden_size
 __all__ = ["AriaConfig"]

         self.return_dict = return_dict
         if self.intermediate_size % self.hidden_size != 0:
+            raise ValueError(
+                "The intermediate size needs to be divisible by hidden size."
+            )
         if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size needs to be divisible by the number of attention heads."
+            )
     @property
     def ff_mult(self):
         return self.intermediate_size // self.hidden_size
 __all__ = ["AriaConfig"]

modeling_aria.py CHANGED Viewed

@@ -1,7 +1,6 @@
 # This is lightly adapted from https://github.com/EleutherAI/aria/blob/main/aria/model.py
-from dataclasses import dataclass
-from typing import Optional, Union, Tuple, List
 import torch
 import torch.utils.checkpoint
@@ -13,7 +12,10 @@ from transformers import Cache, DynamicCache, StaticCache
 from transformers.utils import logging
 from transformers.generation import GenerationMixin
 from transformers.modeling_utils import PreTrainedModel
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from .configuration_aria import AriaConfig
@@ -94,7 +96,7 @@ class AriaBlock(nn.Module):
         self.norm2 = nn.LayerNorm(self.d_model)
     def forward(
-        self,
         x: torch.Tensor,
         attention_mask: torch.Tensor,
         freqs_cis: torch.Tensor,
@@ -104,13 +106,17 @@ class AriaBlock(nn.Module):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.Tensor] = None
     ):
-        attn_output, attn_weights, present = self._att_block(self.norm1(x), attention_mask, freqs_cis,
-                                               past_key_values=past_key_values,
-                                               use_cache=use_cache,
-                                               output_attentions=output_attentions,
-                                               cache_position=cache_position)
         x = x + attn_output
         x = x + self._ff_block(self.norm2(x))
@@ -131,7 +137,7 @@ class AriaBlock(nn.Module):
         past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
-        cache_position: Optional[torch.Tensor] = None
     ):
         batch_size, seq_len, _ = x.shape
         mixed_qkv = self.mixed_qkv(x)
@@ -139,12 +145,8 @@ class AriaBlock(nn.Module):
         # Reshape for rotary embeddings
         # Need contiguous for q, k since in-place RoPE cannot be applied on a view
-        xq = xq.reshape(
-            batch_size, seq_len, self.n_heads, self.d_head
-        ).contiguous()
-        xk = xk.reshape(
-            batch_size, seq_len, self.n_heads, self.d_head
-        ).contiguous()
         xv = xv.view(batch_size, seq_len, self.n_heads, self.d_head)
         # apply_rotary_post_emb expects: (b_sz, s_len, n_head, d_head)
@@ -154,9 +156,9 @@ class AriaBlock(nn.Module):
         if past_key_values is not None:
             cache_kwargs = {
-                #"sin": sin,
-                #"cos": cos,
-                #"partial_rotation_size": self.rotary_ndims,
                 "cache_position": cache_position,
             }
             xk, xv = past_key_values.update(xk, xv, self.layer_idx, cache_kwargs)
@@ -179,10 +181,7 @@ class AriaBlock(nn.Module):
         return self.att_proj_linear(out), att, past_key_values
     def _ff_block(self, x: torch.Tensor):
-        return self.ff_down_proj(
-            F.silu(self.ff_gate_proj(x)) * self.ff_up_proj(x)
-        )
 class AriaModel(AriaPreTrainedModel):
@@ -237,15 +236,27 @@ class AriaModel(AriaPreTrainedModel):
             torch.tensor: Model outputs with shape (batch_size, seq_len,
                 d_model).
         """
-        output_attentions = output_attentions if output_attentions is not None else self.model_config.output_attentions
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.model_config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.model_config.use_return_dict
         use_cache = use_cache if use_cache is not None else self.model_config.use_cache
         if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
         if self.gradient_checkpointing and self.training:
             if use_cache:
@@ -272,21 +283,32 @@ class AriaModel(AriaPreTrainedModel):
         seq_length = inputs_embeds.shape[1]
         if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
         hidden_states = inputs_embeds
         causal_mask = self._update_causal_mask(
-            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
         )
         if self.freqs_cis is None:
             self.freqs_cis = precompute_freqs_cis(
                 seq_len=self.model_config.max_position_embeddings,
-                n_elem=self.model_config.hidden_size // self.model_config.num_attention_heads,
                 base=500000,
                 dtype=hidden_states.dtype,
             ).to(input_ids.device)
@@ -326,7 +348,9 @@ class AriaModel(AriaPreTrainedModel):
             for layer in self.encode_layers:
                 if output_hidden_states:
                     all_hidden_states = all_hidden_states + (hidden_states,)
-                outputs = layer(hidden_states, causal_mask, freqs_cis=freqs_cis, **kwargs)
                 hidden_states = outputs[0]
                 if use_cache is True:
                     next_decoder_cache = outputs[1]
@@ -342,7 +366,11 @@ class AriaModel(AriaPreTrainedModel):
             next_cache = next_cache.to_legacy_cache()
         if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
@@ -367,11 +395,17 @@ class AriaModel(AriaPreTrainedModel):
         # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
         # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
         # to infer the attention mask.
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         using_static_cache = isinstance(past_key_values, StaticCache)
         # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if self.model_config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
             if AttentionMaskConverter._ignore_causal_mask_sdpa(
                 attention_mask,
                 inputs_embeds=input_tensor,
@@ -412,7 +446,9 @@ class AriaModel(AriaPreTrainedModel):
             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
             # Details: https://github.com/pytorch/pytorch/issues/110213
             min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
         return causal_mask
@@ -434,20 +470,30 @@ class AriaModel(AriaPreTrainedModel):
         else:
             min_dtype = torch.finfo(dtype).min
             causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
             )
             if sequence_length != 1:
                 causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
             causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
             if attention_mask is not None:
-                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                 mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                    padding_mask, min_dtype
                 )
         return causal_mask
@@ -483,7 +529,11 @@ class AriaForCausalLM(AriaPreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.Tensor] = None,
     ):
         """Forward pass of Transformer decoder with LM head."""
-        return_dict = return_dict if return_dict is not None else self.model_config.use_return_dict
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
@@ -507,7 +557,9 @@ class AriaForCausalLM(AriaPreTrainedModel, GenerationMixin):
             shift_logits = lm_logits[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss()
-            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1))
         if not return_dict:
             output = (lm_logits,) + outputs[1:]

 # This is lightly adapted from https://github.com/EleutherAI/aria/blob/main/aria/model.py
+from typing import Optional, Union, Tuple
 import torch
 import torch.utils.checkpoint
 from transformers.utils import logging
 from transformers.generation import GenerationMixin
 from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from .configuration_aria import AriaConfig
         self.norm2 = nn.LayerNorm(self.d_model)
     def forward(
+        self,
         x: torch.Tensor,
         attention_mask: torch.Tensor,
         freqs_cis: torch.Tensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
     ):
+        attn_output, attn_weights, present = self._att_block(
+            self.norm1(x),
+            attention_mask,
+            freqs_cis,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
         x = x + attn_output
         x = x + self._ff_block(self.norm2(x))
         past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
     ):
         batch_size, seq_len, _ = x.shape
         mixed_qkv = self.mixed_qkv(x)
         # Reshape for rotary embeddings
         # Need contiguous for q, k since in-place RoPE cannot be applied on a view
+        xq = xq.reshape(batch_size, seq_len, self.n_heads, self.d_head).contiguous()
+        xk = xk.reshape(batch_size, seq_len, self.n_heads, self.d_head).contiguous()
         xv = xv.view(batch_size, seq_len, self.n_heads, self.d_head)
         # apply_rotary_post_emb expects: (b_sz, s_len, n_head, d_head)
         if past_key_values is not None:
             cache_kwargs = {
+                # "sin": sin,
+                # "cos": cos,
+                # "partial_rotation_size": self.rotary_ndims,
                 "cache_position": cache_position,
             }
             xk, xv = past_key_values.update(xk, xv, self.layer_idx, cache_kwargs)
         return self.att_proj_linear(out), att, past_key_values
     def _ff_block(self, x: torch.Tensor):
+        return self.ff_down_proj(F.silu(self.ff_gate_proj(x)) * self.ff_up_proj(x))
 class AriaModel(AriaPreTrainedModel):
             torch.tensor: Model outputs with shape (batch_size, seq_len,
                 d_model).
         """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.model_config.output_attentions
+        )
         output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.model_config.output_hidden_states
+        )
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.model_config.use_return_dict
         )
         use_cache = use_cache if use_cache is not None else self.model_config.use_cache
         if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
         if self.gradient_checkpointing and self.training:
             if use_cache:
         seq_length = inputs_embeds.shape[1]
         if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length() if past_key_values is not None else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + seq_length,
+                device=inputs_embeds.device,
+            )
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
         hidden_states = inputs_embeds
         causal_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            past_key_values,
+            output_attentions,
         )
         if self.freqs_cis is None:
             self.freqs_cis = precompute_freqs_cis(
                 seq_len=self.model_config.max_position_embeddings,
+                n_elem=self.model_config.hidden_size
+                // self.model_config.num_attention_heads,
                 base=500000,
                 dtype=hidden_states.dtype,
             ).to(input_ids.device)
             for layer in self.encode_layers:
                 if output_hidden_states:
                     all_hidden_states = all_hidden_states + (hidden_states,)
+                outputs = layer(
+                    hidden_states, causal_mask, freqs_cis=freqs_cis, **kwargs
+                )
                 hidden_states = outputs[0]
                 if use_cache is True:
                     next_decoder_cache = outputs[1]
             next_cache = next_cache.to_legacy_cache()
         if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_attentions]
+                if v is not None
+            )
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
         # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
         # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
         # to infer the attention mask.
+        past_seen_tokens = (
+            past_key_values.get_seq_length() if past_key_values is not None else 0
+        )
         using_static_cache = isinstance(past_key_values, StaticCache)
         # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if (
+            self.model_config._attn_implementation == "sdpa"
+            and not using_static_cache
+            and not output_attentions
+        ):
             if AttentionMaskConverter._ignore_causal_mask_sdpa(
                 attention_mask,
                 inputs_embeds=input_tensor,
             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
             # Details: https://github.com/pytorch/pytorch/issues/110213
             min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(
+                causal_mask, min_dtype
+            )
         return causal_mask
         else:
             min_dtype = torch.finfo(dtype).min
             causal_mask = torch.full(
+                (sequence_length, target_length),
+                fill_value=min_dtype,
+                dtype=dtype,
+                device=device,
             )
             if sequence_length != 1:
                 causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(
+                target_length, device=device
+            ) > cache_position.reshape(-1, 1)
             causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
             if attention_mask is not None:
+                causal_mask = (
+                    causal_mask.clone()
+                )  # copy to contiguous memory for in-place edit
                 mask_length = attention_mask.shape[-1]
+                padding_mask = (
+                    causal_mask[:, :, :, :mask_length]
+                    + attention_mask[:, None, None, :]
                 )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[
+                    :, :, :, :mask_length
+                ].masked_fill(padding_mask, min_dtype)
         return causal_mask
         cache_position: Optional[torch.Tensor] = None,
     ):
         """Forward pass of Transformer decoder with LM head."""
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.model_config.use_return_dict
+        )
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
             shift_logits = lm_logits[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)
+            )
         if not return_dict:
             output = (lm_logits,) + outputs[1:]

tokenization_aria.py ADDED Viewed

	@@ -0,0 +1,163 @@

+from typing import TYPE_CHECKING, List, Optional, Tuple
+from transformers.tokenization_utils import PreTrainedTokenizer, BatchEncoding
+from transformers.utils import logging, TensorType, to_py_obj
+try:
+    from ariautils.midi import MidiDict
+    from ariautils.tokenizer import AbsTokenizer
+    from ariautils.tokenizer._base import Token
+except ImportError:
+    raise ImportError(
+        "ariautils is not installed. Please try `pip install git+https://github.com/EleutherAI/aria-utils.git`."
+    )
+if TYPE_CHECKING:
+    pass
+logger = logging.get_logger(__name__)
+class AriaTokenizer(PreTrainedTokenizer):
+    """Hugging Face tokenizer wrapper around the ``ariautils`` ``AbsTokenizer``.
+
+    The Aria tokenizer is NOT a BPE tokenizer. A MIDI file is first converted
+    to a ``MidiDict`` (despite the name it is not a single dict; it is closer
+    to a list of "notes") which represents a sequence of notes, stops, etc.
+    The tokenizer then maps that ``MidiDict`` to discrete indices according to
+    a hard-coded rule.
+
+    For a FIM finetuned model, we also follow a simple FIM format to guide a
+    piece of music to a (possibly very different) suffix according to the
+    prompts:
+    <GUIDANCE-START> ... <GUIDANCE-END> <S> <PROMPT-START> ... <PROMPT-END>
+    This way, we expect a continuation that connects PROMPT and GUIDANCE.
+    """
+    # The vocabulary comes from AbsTokenizer itself, so no vocab files are declared.
+    vocab_files_names = {}
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        **kwargs,
+    ):
+        """Create the backing ``AbsTokenizer`` and register its special tokens.
+
+        Args:
+            add_bos_token: Stored on the instance as ``self.add_bos_token``.
+            add_eos_token: Stored on the instance as ``self.add_eos_token``.
+            clean_up_tokenization_spaces: Accepted in the signature but not
+                used or forwarded by this constructor.
+            use_default_system_prompt: Stored on the instance and forwarded to
+                the parent constructor.
+            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
+        """
+        # Built before super().__init__() -- presumably because the parent
+        # constructor may query the vocabulary; keep this ordering.
+        self._tokenizer = AbsTokenizer()
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        # Special tokens are taken from the backend rather than from arguments.
+        super().__init__(
+            bos_token=self._tokenizer.bos_tok,
+            eos_token=self._tokenizer.eos_tok,
+            unk_token=self._tokenizer.unk_tok,
+            pad_token=self._tokenizer.pad_tok,
+            use_default_system_prompt=use_default_system_prompt,
+            **kwargs,
+        )
+    def __getstate__(self):
+        """Return an empty state dict; no instance state is serialized."""
+        return {}
+    def __setstate__(self, d):
+        """Restoring from serialized state is not supported.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError()
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size reported by the backing tokenizer."""
+        return self._tokenizer.vocab_size
+    def get_vocab(self):
+        """Return the backing tokenizer's token-to-id mapping (not a copy)."""
+        return self._tokenizer.tok_to_id
+    def tokenize(self, midi_dict: MidiDict, **kwargs) -> List[Token]:
+        """Convert one ``MidiDict`` to tokens by calling the backing tokenizer.
+
+        ``**kwargs`` is accepted for signature compatibility and ignored.
+        """
+        return self._tokenizer(midi_dict)
+    def _tokenize(self, midi_dict: MidiDict, **kwargs) -> List[Token]:
+        """Same as :meth:`tokenize`; ``**kwargs`` is ignored."""
+        return self._tokenizer(midi_dict)
+    def __call__(
+        self,
+        midi_dicts: MidiDict | list[MidiDict],
+        padding: bool = False,
+        max_length: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        return_tensors: str | TensorType | None = None,
+        return_attention_mask: bool | None = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """Encode one or more ``MidiDict`` objects into a ``BatchEncoding``.
+
+        The parent ``__call__`` cannot be reused because the inputs are
+        ``MidiDict`` objects rather than strings, so this is a deliberate
+        reimplementation supporting only a limited set of useful arguments.
+        It is not meant to be mixed with string-based tokenizer APIs.
+
+        Args:
+            midi_dicts: A single ``MidiDict`` or a list of them.
+            padding: If true, right-pad every sequence with ``pad_token_id``
+                to the longest (post-truncation) sequence in the batch.
+            max_length: If given, truncate each sequence to this many ids.
+            pad_to_multiple_of: If given (and ``padding`` is true), round the
+                padded length up to the next multiple of this value. A length
+                that is already a multiple is left unchanged.
+            return_tensors: Passed to ``BatchEncoding`` as ``tensor_type``.
+            return_attention_mask: The mask is included unless this is
+                explicitly ``False``.
+            **kwargs: Accepted for signature compatibility and ignored.
+
+        Returns:
+            A ``BatchEncoding`` with ``"input_ids"`` and, unless disabled,
+            ``"attention_mask"`` (``True`` for real tokens, ``False`` for
+            padding), matching ``model_input_names``.
+        """
+        if isinstance(midi_dicts, MidiDict):
+            midi_dicts = [midi_dicts]
+        all_tokens: list[list[int]] = []
+        # TODO: if we decide to optimize batched tokenization on ariautils using some compiled backend, we can change this loop accordingly.
+        for md in midi_dicts:
+            tokens = self._tokenizer.encode(self._tokenizer.tokenize(md))
+            if max_length is not None:
+                tokens = tokens[:max_length]
+            # Copy so that padding below never mutates a list owned by the backend.
+            all_tokens.append(list(tokens))
+        all_attn_masks: list[list[int]] = [[True] * len(ids) for ids in all_tokens]
+        if padding:
+            target_len = max((len(ids) for ids in all_tokens), default=0)
+            if pad_to_multiple_of is not None:
+                # Ceiling division: an exact multiple must not gain an extra block.
+                target_len = -(-target_len // pad_to_multiple_of) * pad_to_multiple_of
+            for ids, attn_mask in zip(all_tokens, all_attn_masks):
+                # Compute the pad amount once, BEFORE extending `ids`; otherwise
+                # the mask would see an already-padded length and get no padding.
+                n_pad = target_len - len(ids)
+                ids.extend([self.pad_token_id] * n_pad)
+                attn_mask.extend([False] * n_pad)
+        encoded = {"input_ids": all_tokens}
+        if return_attention_mask is not False:
+            # Key must match `model_input_names` ("attention_mask", singular).
+            encoded["attention_mask"] = all_attn_masks
+        return BatchEncoding(encoded, tensor_type=return_tensors)
+    def decode(self, token_ids: List[Token], **kwargs) -> MidiDict:
+        """Convert a sequence of ids back into a ``MidiDict``.
+
+        ``token_ids`` is first passed through ``to_py_obj``; ``**kwargs`` is
+        ignored.
+        """
+        token_ids = to_py_obj(token_ids)
+        return self._tokenizer.detokenize(self._tokenizer.decode(token_ids))
+    def batch_decode(
+        self, token_ids_list: List[List[Token]], **kwargs
+    ) -> List[MidiDict]:
+        """Decode each id sequence with :meth:`decode` and return the results as a list."""
+        # Can we simply yield (without breaking all HF wrappers)?
+        return [self.decode(token_ids) for token_ids in token_ids_list]
+    def encode_from_file(self, filename: str, **kwargs) -> BatchEncoding:
+        """Load one MIDI file via ``MidiDict.from_midi`` and encode it with ``__call__``."""
+        midi_dict = MidiDict.from_midi(filename)
+        return self(midi_dict, **kwargs)
+    def encode_from_files(self, filenames: list[str], **kwargs) -> BatchEncoding:
+        """Load several MIDI files via ``MidiDict.from_midi`` and encode them as one batch."""
+        midi_dicts = [MidiDict.from_midi(file) for file in filenames]
+        return self(midi_dicts, **kwargs)
+    def _convert_token_to_id(self, token: Token):
+        """Converts a token (tuple or str) into an id, falling back to the unk token's id."""
+        return self._tokenizer.tok_to_id.get(
+            token, self._tokenizer.tok_to_id[self.unk_token]
+        )
+    def _convert_id_to_token(self, index: int):
+        """Converts an index (integer) into a token (tuple or str), falling back to the unk token."""
+        return self._tokenizer.id_to_tok.get(index, self.unk_token)
+    def convert_tokens_to_string(self, tokens: List[Token]) -> MidiDict:
+        """Converts a sequence of tokens into a single MidiDict."""
+        return self._tokenizer.detokenize(tokens)
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """Saving a vocabulary is not supported.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError()

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_aria.AriaTokenizer",
+      null
+    ]
+  },
+  "tokenizer_class": "AriaTokenizer"
+}