# Initial demo of Lina-speech (pardi-speech)
# Author: Mehdi Lakbar — commit 56cfa73
from typing import Any
import torch
from transformers.cache_utils import Cache, _static_cache_update
class StaticCache(Cache):
    """
    Static Cache class to be used with `torch.compile(model)` and `torch.export()`.

    Pre-allocates one zero-filled key tensor and one value tensor per layer with shape
    `(max_batch_size, num_key_value_heads, max_cache_len, head_dim)` and updates them
    in place, so the cache tensors keep fixed data pointers across decoding steps.

    Parameters:
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new
            instance must be instantiated if a smaller batch size is used. If you are
            manually setting the batch size, make sure to take into account the number
            of beams if you are running beam search.
        head_dim (`int`):
            Dimension of each attention head. Some models define a custom `head_dim`
            different from `config.hidden_size // config.num_attention_heads`.
        num_key_value_heads (`int`):
            Number of key/value heads per layer.
        num_hidden_layers (`int`):
            Number of layers to pre-allocate key/value buffers for.
        max_cache_len (`int`, *optional*):
            The maximum sequence length with which the model will be used. Although
            optional in the signature, a concrete int is required here: it is used
            directly to size the cache tensors, so passing `None` fails at allocation.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. If you're using more
            than 1 computation device, you should pass the `layer_device_map` argument
            instead.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            The default `dtype` to use when initializing the layer.
        layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]`, *optional*):
            Mapping between the layers and its device. This is required when you are
            manually initializing the cache and the model is split between different
            gpus. You can know which layers mapped to which device by checking the
            associated device_map: `model.hf_device_map`.

    Example:

    ```python
    >>> # Leave empty space for 10 new tokens, which can be used when calling forward
    >>> # iteratively 10 times to generate.
    >>> max_generated_length = inputs.input_ids.shape[1] + 10
    >>> past_key_values = StaticCache(
    ...     max_batch_size=1, head_dim=64, num_key_value_heads=8,
    ...     num_hidden_layers=16, max_cache_len=max_generated_length,
    ... )
    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    ```
    """

    is_compileable = True

    def __init__(
        self,
        max_batch_size: int,
        head_dim: int,
        num_key_value_heads: int,
        num_hidden_layers: int,
        max_cache_len: int | None = None,
        device: torch.device | str | None = None,
        dtype: torch.dtype = torch.float32,
        layer_device_map: dict[int, str | torch.device | int] | None = None,
    ) -> None:
        super().__init__()
        self.max_batch_size = max_batch_size
        self.max_cache_len = max_cache_len
        # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
        self.head_dim = head_dim
        self._dtype = dtype
        self.num_key_value_heads = num_key_value_heads
        self.num_hidden_layers = num_hidden_layers
        self.key_cache: list[torch.Tensor] = []
        self.value_cache: list[torch.Tensor] = []
        # Note: There will be significant perf decrease if switching to use 5D tensors instead.
        cache_shape = (
            self.max_batch_size,
            self.num_key_value_heads,
            self.max_cache_len,
            self.head_dim,
        )
        device = torch.device(device) if device is not None else None
        for idx in range(self.num_hidden_layers):
            # A per-layer device mapping takes precedence over the global `device`.
            if layer_device_map is not None:
                layer_device = layer_device_map[idx]
            else:
                layer_device = device
            new_layer_key_cache = torch.zeros(
                cache_shape, dtype=self._dtype, device=layer_device
            )
            new_layer_value_cache = torch.zeros(
                cache_shape, dtype=self._dtype, device=layer_device
            )
            # Note: `mark_static_address` is used to tag the cache as a fixed data pointer,
            # preventing compiled graph breaks when updating the cache.
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the
                `cache_position` input to know where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        if cache_kwargs is None:
            cache_kwargs = {}
        # Cast incoming states to the pre-allocated buffers' dtype so the in-place
        # write in `_static_cache_update` does not fail on a dtype mismatch.
        key_states = key_states.to(self.key_cache[layer_idx].dtype)
        value_states = value_states.to(self.value_cache[layer_idx].dtype)
        return _static_cache_update(
            self.key_cache[layer_idx],
            self.value_cache[layer_idx],
            key_states,
            value_states,
            cache_kwargs.get("cache_position"),
        )

    def get_seq_length(self, layer_idx: int | None = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # NOTE: despite the `int` annotation, `.sum()` yields a 0-dim integer tensor;
        # it compares equal to a Python int, which is what callers rely on here.
        # TODO: deprecate this function in favor of `cache_position`
        return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()

    def get_max_cache_shape(self) -> int | None:
        """Returns the maximum cache length the buffers were allocated with."""
        return self.max_cache_len

    def reset(self):
        """Resets the cache values while preserving the objects"""
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            self.key_cache[layer_idx].zero_()
            self.value_cache[layer_idx].zero_()

    def get_mask_sizes(
        self, cache_position: torch.Tensor, layer_idx: int
    ) -> tuple[int, int]:
        """
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        """
        # A static cache is always full-length with no offset.
        kv_length = self.get_max_cache_shape()
        return kv_length, 0
class Cache:
"""
A cache used for storing hidden states produced by flash linear attention models.
It stores the states of each layer as the tensor of shape `[batch_size, key_dim, value_dim]`.
"""
is_compileable = True
def __init__(self, seen_tokens: int = 0) -> Cache:
super().__init__()
self.states: list[dict[str, Any]] = []
self._seen_tokens = seen_tokens # Used in `generate` to keep tally of how many tokens the cache has seen
def __getitem__(self, layer_idx: int) -> dict[str, Any]:
if layer_idx < len(self):
return self.states[layer_idx]
else:
raise KeyError(
f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}"
)
def __iter__(self):
for state in self.states:
yield state
def __len__(self):
return len(self.states)
def update(
self,
recurrent_state: torch.Tensor | None = None,
attn_state: tuple[torch.Tensor, torch.Tensor] | None = None,
conv_state: tuple[torch.Tensor] | None = None,
ffn_state: torch.Tensor | None = None,
layer_idx: int = 0,
offset: int | None = 1,
cache_kwargs: dict | None = None,
):
"""
Updates the cache with the new `recurrent_state`/`attn_state`/`conv_state` for the layer `layer_idx`.
Args:
recurrent_state (`torch.Tensor`, `optional`):
The new recurrent state to cache.
attn_state (`Tuple[torch.Tensor, torch.Tensor]`, `optional`):
The new attention key/value states to cache.
conv_state (`Tuple[torch.Tensor]`, `optional`):
The new convolution state to cache.
layer_idx (`int`, defaults to 0):
The index of the layer to cache the states for.
offset (`int`, `optional`, defaults to 1):
The number of new tokens being processed.
cache_kwargs (`Dict[str, Any]`, `optional`):
Additional arguments for the cache subclass.
Return:
Dictionary of the updated state.
"""
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += offset
if attn_state is not None:
input_size = attn_state[0].shape[-2]
window_size = cache_kwargs.get("window_size", None)
if not isinstance(attn_state, Tuple) or len(attn_state) != 2:
raise ValueError(
"`attn_state` must be a tuple of two tensors for key/value states"
)
if len(self.states) <= layer_idx:
if attn_state is not None:
if window_size is not None and input_size > window_size:
attn_state = (
attn_state[0][..., -window_size:, :].contiguous(),
attn_state[1][..., -window_size:, :].contiguous(),
)
state = dict(
recurrent_state=recurrent_state,
attn_state=attn_state,
conv_state=conv_state,
ffn_state=ffn_state,
)
self.states.append(state)
else:
state = self.states[layer_idx]
if recurrent_state is not None:
state["recurrent_state"] = recurrent_state
if attn_state is not None:
key_state, value_state = state["attn_state"]
if window_size is not None and key_state.shape[-2] == window_size:
# DO NOT allocate new memory if the cache is full
# roll the key/value states to the left by `input_size`
key_state = key_state.roll(-input_size, -2)
value_state = value_state.roll(-input_size, -2)
# replace the last `input_size` tokens with the new key/value states
key_state[..., -input_size:, :] = attn_state[0]
value_state[..., -input_size:, :] = attn_state[1]
attn_state = (key_state, value_state)
else:
attn_state = (
torch.cat([key_state, attn_state[0]], -2),
torch.cat([value_state, attn_state[1]], -2),
)
state["attn_state"] = attn_state
if conv_state is not None:
state["conv_state"] = conv_state
if ffn_state is not None:
state["ffn_state"] = ffn_state
return state
def get_seq_length(self, layer_idx: int | None = 0) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
if len(self.states) <= layer_idx:
return 0
return self._seen_tokens
def get_max_length(self) -> int | None:
"""Returns the maximum sequence length of the cached states. Cache does not have a maximum length."""
return None
def to_legacy_cache(self) -> tuple:
return tuple(self.states)
@classmethod
@torch.compiler.disable
def from_legacy_cache(
cls, past_key_values: tuple | None = None, seen_tokens: int = 0
) -> Cache:
"""Converts a cache in the legacy cache format into an equivalent `Cache`."""
cache = cls(seen_tokens)
if isinstance(past_key_values, list):
for layer_idx in range(len(past_key_values)):
cache.states.append(past_key_values[layer_idx])
return cache