|
|
import torch
import torch.nn.functional as F
from transformers.generation.stopping_criteria import (
    MaxLengthCriteria,
    StoppingCriteriaList,
)
from typing import Union, List
from .eva_cache import EvaStaticCacheForTriton
from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd


class MultibyteEosTokenCriteria:
    """
    A simple stopping criterion that halts generation whenever an
    end-of-sequence token appears among the last `new_tokens` tokens.

    Adapted from
    https://github.com/huggingface/transformers/blob/main/src/transformers/generation/stopping_criteria.py#L446
    By default, callers pass `model.generation_config.eos_token_id`.

    Args:
        eos_token_ids (`Union[int, List[int]]`):
            The id(s) of the *end-of-sequence* token.
    """

    def __init__(self, eos_token_ids: Union[int, List[int]]):
        if isinstance(eos_token_ids, int):
            eos_token_ids = [eos_token_ids]
        self.eos_token_ids = eos_token_ids

    def __call__(self, input_ids: torch.LongTensor, new_tokens: int) -> bool:
        # Only the tokens appended in the current iteration need checking.
        current_input_len = input_ids.shape[-1]
        new_token_ids = input_ids[:, current_input_len - new_tokens:]
        for eos_token_id in self.eos_token_ids:
            if torch.any(new_token_ids == eos_token_id):
                return True
        return False
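
# Illustrative usage: only the tokens appended in the current iteration are
# scanned, so earlier occurrences of the EOS id do not trigger a stop.
#
#     criteria = MultibyteEosTokenCriteria(eos_token_ids=1)
#     criteria(torch.tensor([[5, 7, 9, 1]]), new_tokens=2)  # True: 1 is in [9, 1]
#     criteria(torch.tensor([[5, 1, 7, 9]]), new_tokens=2)  # False: [7, 9] has no 1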


def build_tree(spec):
    """
    Expand a per-depth branching spec into an explicit list of tree paths.

    `spec[d - 1][i]` gives the number of children of the i-th node at depth d
    (in creation order); nodes without an entry get no children. Each returned
    node is the tuple of child indices along its path from the root.
    """
    nodes_at_depth = []
    nodes_at_depth.append([()])  # depth 0: the (virtual) root

    for d in range(1, len(spec) + 1):
        prev_nodes = nodes_at_depth[d - 1]
        spec_list = spec[d - 1]
        current_nodes = []
        for node_idx, node in enumerate(prev_nodes):
            if node_idx < len(spec_list):
                num_children = spec_list[node_idx]
            else:
                num_children = 0
            for child_idx in range(num_children):
                new_node = node + (child_idx,)
                current_nodes.append(new_node)
        nodes_at_depth.append(current_nodes)

    # Flatten across depths, dropping the empty root tuple.
    all_nodes = [node for depth_nodes in nodes_at_depth for node in depth_nodes if node]
    return all_nodes
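
# Example: with a root that has 2 children, each of which has one child,
#     build_tree([[2], [1, 1]])  ->  [(0,), (1,), (0, 0), (1, 0)]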


evabyte_7b_95 = build_tree(
    [
        [10],
        [10, 8, 2, 2, 1, 1],
        [10, 4, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 1],
        [8, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1],
        [6, 2, 1, 1],
        [4, 2, 1, 1],
        [4, 2, 1],
    ]
)
evabyte_7b_31 = build_tree(
    [
        [4],
        [3, 2, 1, 1],
        [3, 2, 1, 1],
        [2, 1, 1],
        [2, 1],
        [2, 1],
        [2, 1],
    ]
)
# Number of top candidates drawn from each prediction head.
TOPK = 10
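
# Quick sanity check: the numeric suffix of each tree name is its number of
# candidate nodes.
assert len(evabyte_7b_95) == 95
assert len(evabyte_7b_31) == 31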


def pad_path(path, length, pad_value=-2):
    """
    Pad the given path list with a specific value up to a specified length.

    Parameters:
    - path (list): The original list that needs padding.
    - length (int): The desired length of the padded list.
    - pad_value (optional, default=-2): The value to use for padding.

    Returns:
    - list: A new list based on the original path but padded to the desired length.

    Example:
    >>> pad_path([1, 2, 3], 5)
    [1, 2, 3, -2, -2]

    Note:
    If the given path is already longer than the specified length,
    then no padding occurs, and the original path is returned.
    """
    return path + [pad_value] * (length - len(path))


def reset_past_key_values(passed_key_values):
    """
    Resets the current lengths in the passed key-values to zero.

    This function is designed to be used during the evaluation of a baseline model.
    It iterates through each layer's key-values and sets their current lengths to zero,
    effectively resetting their state.

    Args:
    - passed_key_values (list of torch.Tensor): Contains past hidden states and past attention values for each layer.

    Returns:
    - passed_key_values (list of torch.Tensor): Updated past hidden states and past attention values with reset lengths.
    """
    for i in range(len(passed_key_values)):
        for j in range(2):
            passed_key_values[i][j].current_length.fill_(0)
    return passed_key_values


def get_nucleus_one_token(logit, temperature, top_p):
    """
    Performs token sampling based on the nucleus (top-p) sampling method.

    This function selects a token from a given logit distribution using the nucleus sampling strategy.
    It allows for more controlled and diverse generation compared to traditional top-k sampling.

    Args:
        logit (torch.Tensor): The logits from a language model output, expected to be a 2D tensor (BxC).
        temperature (float): A temperature parameter to control the randomness in sampling.
            Higher values increase diversity, lower values make selections more deterministic.
        top_p (float): The cumulative probability threshold for nucleus sampling.
            It controls the size of the set of high-probability tokens to consider for sampling.

    Returns:
        torch.Tensor: A tensor containing the indices of the sampled tokens.
    """
    if top_p >= 1:
        return torch.multinomial(F.softmax(logit / temperature, dim=-1), 1)
    logit = logit / temperature
    probs = torch.softmax(logit, dim=-1)
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cum_probs = torch.cumsum(sorted_probs, dim=-1)
    sorted_indices_to_remove = cum_probs > top_p
    # Shift the mask right by one so the first token that crosses top_p is kept.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    # Scatter the mask from sorted order back to vocabulary order.
    indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
    logit[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logit, dim=-1), 1)
    return sampled_tokens
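
# Worked example: with probabilities (0.5, 0.3, 0.2) and top_p = 0.7, the
# cumulative sums are (0.5, 0.8, 1.0); after the one-position shift only the
# last token is masked, so sampling draws from (0.5, 0.3) renormalized.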


def get_typical_one_token(logit, temperature, posterior_threshold, posterior_alpha):
    """
    Implements token sampling based on the typical sampling method.

    This function selects a token from a given logit distribution using the typical sampling strategy,
    aiming to balance between diversity and likelihood in a more nuanced way compared to traditional methods.

    Args:
        logit (torch.Tensor): The logits from a language model output, expected to be a 2D tensor.
        temperature (float): A parameter to control the randomness in sampling.
            Higher values increase diversity, lower values make selections more deterministic.
        posterior_threshold (float): A threshold to decide the lower bound of probabilities to be considered for sampling.
        posterior_alpha (float): A scaling factor applied to the entropy-based adaptive threshold.

    Returns:
        torch.Tensor: A tensor containing the indices of the sampled tokens.
    """
    logit = logit / temperature
    probs = torch.softmax(logit, dim=-1)
    entropy = -torch.sum(
        probs * torch.log(probs + 1e-5), dim=-1
    )
    # Keep tokens whose probability clears min(posterior_threshold,
    # exp(-entropy) * posterior_alpha); the cutoff adapts to how peaked the
    # distribution is.
    threshold = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    indices_to_remove = probs < threshold.unsqueeze(-1)
    logit[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logit, dim=-1), 1)
    return sampled_tokens
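
# Worked example: for a sharply peaked distribution the entropy is near 0, so
# the adaptive cutoff exp(-entropy) * posterior_alpha stays close to
# posterior_alpha; for a flat distribution the entropy is large, the cutoff
# decays toward 0, and nearly all tokens remain eligible.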


def generate_medusa_buffers(medusa_choices, device="cuda"):
    """
    Generate buffers for the Medusa structure based on the provided choices.

    Parameters:
    - medusa_choices (list): A nested list representing the tree in the Medusa structure.
    - device (str): Device to which the tensors should be moved. Default is "cuda".

    Returns:
    - dict: A dictionary containing buffers related to the Medusa structure.
    """
    # Sort the choices by depth, then lexicographically, and reserve slot 0
    # for the root (the token produced by the base LM head).
    sorted_medusa_choices = sorted(medusa_choices, key=lambda x: (len(x), x))
    medusa_len = len(sorted_medusa_choices) + 1

    # Count the number of choices at each depth.
    depth_counts = [0] * max([len(path) for path in sorted_medusa_choices])
    for path in sorted_medusa_choices:
        depth_counts[len(path) - 1] += 1

    # Build the tree attention mask: every node attends to the root and to its
    # own ancestors.
    medusa_attn_mask = torch.eye(medusa_len, medusa_len)
    medusa_attn_mask[:, 0] = 1
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            # Depth-1 nodes only attend to the root.
            if len(cur_medusa_choice) == 1:
                continue
            ancestor_idx = []
            for c in range(len(cur_medusa_choice) - 1):
                ancestor_idx.append(sorted_medusa_choices.index(cur_medusa_choice[:c + 1]) + 1)
            medusa_attn_mask[j + start + 1, ancestor_idx] = 1
        start += depth_counts[i]

    # Tree indices map each node into the flattened candidate list
    # (1 root token followed by TOPK candidates per head).
    medusa_tree_indices = torch.zeros(medusa_len, dtype=torch.long)
    medusa_tree_indices[0] = 0
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            medusa_tree_indices[start + j + 1] = cur_medusa_choice[-1] + TOPK * i + 1
        start += depth_counts[i]

    # Position ids: each node's position offset equals its depth in the tree.
    medusa_position_ids = torch.zeros(medusa_len, dtype=torch.long)
    start = 0
    for i in range(len(depth_counts)):
        medusa_position_ids[start + 1: start + depth_counts[i] + 1] = i + 1
        start += depth_counts[i]

    # Retrieval indices: one row per root-to-leaf path, padded with -2
    # (-1 after the +1 shift) so every row has the same length.
    retrieve_indices_nest = []
    retrieve_paths = []
    for i in range(len(sorted_medusa_choices)):
        cur_medusa_choice = sorted_medusa_choices[-i - 1]
        retrieve_indice = []
        if cur_medusa_choice in retrieve_paths:
            continue
        else:
            for c in range(len(cur_medusa_choice)):
                retrieve_indice.append(sorted_medusa_choices.index(cur_medusa_choice[:c + 1]))
                retrieve_paths.append(cur_medusa_choice[:c + 1])
        retrieve_indices_nest.append(retrieve_indice)
    max_length = max([len(x) for x in retrieve_indices_nest])
    retrieve_indices = [pad_path(path, max_length) for path in retrieve_indices_nest]
    retrieve_indices = torch.tensor(retrieve_indices, dtype=torch.long)
    retrieve_indices = retrieve_indices + 1
    retrieve_indices = torch.cat(
        [torch.zeros((retrieve_indices.shape[0], 1), dtype=torch.long), retrieve_indices],
        dim=1,
    )

    # Aggregate the buffers and move them to the target device.
    medusa_buffers = {
        "medusa_attn_mask": medusa_attn_mask.unsqueeze(0).unsqueeze(0),
        "tree_indices": medusa_tree_indices,
        "medusa_position_ids": medusa_position_ids.unsqueeze(0),
        "retrieve_indices": retrieve_indices,
    }

    medusa_buffers = {
        k: v.clone().to(device)
        if isinstance(v, torch.Tensor)
        else torch.tensor(v, device=device)
        for k, v in medusa_buffers.items()
    }
    return medusa_buffers
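
# Illustrative output for a tiny tree (two depth-1 choices, one depth-2
# choice), traced by hand through the construction above with TOPK = 10:
#
#     buf = generate_medusa_buffers([(0,), (1,), (0, 0)], device="cpu")
#     buf["medusa_attn_mask"].shape   # (1, 1, 4, 4)
#     buf["tree_indices"]             # tensor([0, 1, 2, 11])
#     buf["medusa_position_ids"]      # tensor([[0, 1, 1, 2]])
#     buf["retrieve_indices"]         # tensor([[0, 1, 3], [0, 2, -1]])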


def generate_candidates(
    medusa_logits,
    logits,
    tree_indices,
    retrieve_indices,
    temperature=0,
    posterior_threshold=0.3,
    posterior_alpha=0.09,
    top_p=0.8,
    sampling='typical',
    fast=False,
):
    """
    Generate tree-structured candidate token ids from the base-model logits
    and the Medusa-head logits.

    Returns:
    - tree_candidate_ids: candidates laid out flat in tree order, (1, medusa_len).
    - unflattened_candidate_ids: candidates regrouped per root-to-leaf path,
      (num_paths, max_depth + 1).
    """
    # Greedy decoding (or fast mode) takes the argmax for the next token;
    # otherwise sample it with the requested strategy.
    if temperature == 0 or fast:
        candidates_ids = torch.argmax(logits[:, -1]).unsqueeze(0)
    else:
        if sampling == 'typical':
            candidates_ids = get_typical_one_token(logits[:, -1], temperature, posterior_threshold, posterior_alpha).squeeze(0)
        elif sampling == 'nucleus':
            candidates_ids = get_nucleus_one_token(logits[:, -1], temperature, top_p).squeeze(0)
        else:
            raise NotImplementedError

    # Top-k candidates from each Medusa head.
    candidates_medusa_ids = torch.topk(medusa_logits[:, 0, -1], TOPK, dim=-1).indices

    # Combine the base-model token with the Medusa-head candidates.
    candidate_ids = torch.cat([candidates_ids, candidates_medusa_ids.view(-1)], dim=-1)

    # Map the flat candidate list onto the tree layout.
    tree_candidate_ids = candidate_ids[tree_indices]

    # Append a zero entry so the -1 padding in retrieve_indices gathers a
    # dummy token instead of wrapping around.
    tree_candidate_ids_ext = torch.cat(
        [
            tree_candidate_ids,
            torch.zeros((1), dtype=torch.long, device=tree_candidate_ids.device)
        ],
        dim=0
    )

    # Regroup the tree candidates into per-path rows for verification.
    unflattened_candidate_ids = tree_candidate_ids_ext[retrieve_indices]

    # Add a batch dimension for the forward pass.
    tree_candidate_ids = tree_candidate_ids.unsqueeze(0)

    return tree_candidate_ids, unflattened_candidate_ids


def get_nucleus_posterior_mask(logits, candidates, temperature, top_p):
    """
    Generates a posterior mask for token candidates using nucleus (top-p) sampling.

    This function applies nucleus sampling to a set of logits, and then generates a mask indicating
    which candidate tokens are selected. It adapts the sampling strategy to accommodate for
    temperature scaling and cumulative probability thresholding.

    Args:
        logits (torch.Tensor): A tensor of logits from a language model output.
        candidates (torch.Tensor): A tensor of candidate tokens to compare against sampled tokens.
        temperature (float): A parameter to scale the logits, controlling randomness in sampling.
        top_p (float): The cumulative probability threshold for nucleus sampling.

    Returns:
        torch.Tensor: A posterior mask indicating which candidate tokens match the sampled tokens.
    """
    # Position i scores candidates[:, i + 1], so the last position is dropped;
    # (samples, positions) is then flattened into one batch for sampling.
    logits = logits[:, :-1] / temperature
    n_samples, n_tokens = logits.shape[0], logits.shape[1]
    logits = logits.view(n_samples * n_tokens, -1)
    if top_p >= 1:
        sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
        sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
        posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
        return posterior_mask

    probs = F.softmax(logits, dim=-1)
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cum_probs = torch.cumsum(sorted_probs, dim=-1)
    sorted_indices_to_remove = cum_probs > top_p
    # Shift the mask right by one so the first token that crosses top_p is kept.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    # Scatter the mask from sorted order back to vocabulary order.
    indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
    logits[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
    sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
    posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
    return posterior_mask


def get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha):
    """
    Generates a posterior mask for token candidates using typical sampling.

    Args:
        logits (torch.Tensor): A tensor of logits from a language model output.
        candidates (torch.Tensor): A tensor of candidate tokens to compare against sampled tokens.
        temperature (float): A parameter to scale the logits, controlling randomness in sampling.
        posterior_threshold (float): The minimum threshold for probabilities to be considered in sampling.
        posterior_alpha (float): A scaling factor applied to the entropy-based adaptive threshold.

    Returns:
        torch.Tensor: A posterior mask indicating which candidate tokens match the sampled tokens.
    """
    # Position i scores candidates[:, i + 1], so the last position is dropped.
    logits = logits[:, :-1] / temperature
    n_samples, n_tokens = logits.shape[0], logits.shape[1]
    logits = logits.view(n_samples * n_tokens, -1)
    probs = F.softmax(logits, dim=-1)
    entropy = -torch.sum(
        probs * torch.log(probs + 1e-5), dim=-1
    )
    threshold = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    indices_to_remove = probs < threshold.unsqueeze(-1)
    logits[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
    sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
    posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
    return posterior_mask


def evaluate_posterior(
    logits,
    candidates,
    temperature,
    posterior_threshold=0.3,
    posterior_alpha=0.09,
    top_p=0.8,
    sampling='typical',
    fast=True,
):
    """
    Select the best candidate path and how many of its tokens to accept,
    given the verifier logits at every candidate position.
    """
    if logits.shape[1] <= 1:
        return torch.tensor(0, dtype=torch.long, device=candidates.device), 0

    if temperature == 0:
        # Greedy verification: a candidate token is accepted iff it equals the
        # argmax prediction at the previous position.
        posterior_mask = (
            candidates[:, 1:] == torch.argmax(logits[:, :-1], dim=-1)
        ).int()
        # cumprod keeps only the leading run of accepted tokens per path.
        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
        accept_length = candidates_accept_length.max().item()
        if accept_length == 0:
            # Default to the first candidate path if none are accepted.
            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
        return best_candidate, accept_length
    elif sampling == 'typical':
        if fast:
            posterior_prob = torch.softmax(logits[:, :-1] / temperature, dim=-1)
            candidates_prob = torch.gather(
                posterior_prob, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
            ).squeeze(-1)
            posterior_entropy = -torch.sum(
                posterior_prob * torch.log(posterior_prob + 1e-5), dim=-1
            )
            threshold = torch.minimum(
                torch.ones_like(posterior_entropy) * posterior_threshold,
                torch.exp(-posterior_entropy) * posterior_alpha,
            )
            posterior_mask = candidates_prob > threshold
            candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)

            accept_length = candidates_accept_length.max().item()
            if accept_length == 0:
                # Default to the first candidate path if none are accepted.
                best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
            else:
                best_candidates = torch.where(candidates_accept_length == accept_length)[0]
                # Break ties by the accumulated log-likelihood of each path.
                likelihood = torch.sum(
                    torch.log(candidates_prob[best_candidates, :accept_length]), dim=-1
                )
                best_candidate = best_candidates[torch.argmax(likelihood)]
            return best_candidate, accept_length

        posterior_mask = get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha)
        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
        accept_length = candidates_accept_length.max().item()
        if accept_length == 0:
            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
        return best_candidate, accept_length
    elif sampling == 'nucleus':
        assert top_p < 1.0 + 1e-6, "top_p should be between 0 and 1"
        posterior_mask = get_nucleus_posterior_mask(logits, candidates, temperature, top_p)
        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
        accept_length = candidates_accept_length.max().item()
        if accept_length == 0:
            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
        return best_candidate, accept_length
    else:
        raise NotImplementedError
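
# Greedy acceptance, illustrated: if a candidate row is [t0, a, b, c] and the
# verifier's argmax continuations after t0, a, b are [a, b, x], the
# element-wise mask is [1, 1, 0]; the cumulative product keeps the leading run
# of matches, giving accept_length = 2 for that row.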


def update_inference_inputs(
    input_ids,
    medusa_logits,
    logits,
    candidate_ids,
    best_candidate,
    accept_length,
):
    # Append the accepted tokens (the base token plus `accept_length`
    # verified continuations) to the running sequence.
    input_ids = torch.cat(
        [
            input_ids,
            candidate_ids[None, best_candidate, : accept_length + 1]
        ],
        dim=-1
    )
    # Carry over the logits at the last accepted position; they seed the next
    # round of candidate generation.
    logits = logits[
        None, best_candidate, accept_length: accept_length + 1
    ]
    medusa_logits = medusa_logits[
        :, None, best_candidate, accept_length: accept_length + 1
    ]

    new_token = accept_length + 1
    return input_ids, medusa_logits, logits, new_token


def split_logits(full_logits):
    # The first prediction slot is the base LM head; the rest are Medusa heads.
    logits = full_logits[..., 0, :]
    medusa_logits = full_logits[..., 1:, :].permute(2, 0, 1, 3)
    return medusa_logits, logits
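
# Shape contract (assuming the base head occupies slot 0): full_logits is
# (batch, seq, num_heads + 1, vocab); split_logits returns logits of shape
# (batch, seq, vocab) and medusa_logits of shape (num_heads, batch, seq, vocab).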


class MultiByteDecodingMixin:
    def multi_byte_pred_update_cache(
        self,
        past_key_values,
        retrieve_indices,
        best_candidate,
        new_tokens,
    ):
        prev_window_len = past_key_values.get_past_window_pos(0)
        # Positions of the accepted path's tokens inside the tree-sized KV
        # block that sits after the previously cached window tokens.
        select_indices = (
            retrieve_indices[best_candidate, : new_tokens] + prev_window_len
        )
        for layer_idx in range(self.config.num_hidden_layers):
            past_key_values.update_past_len(new_tokens, layer_idx)

            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            # Gather the KV entries of the accepted path...
            tgt_window_k = past_window_k[..., select_indices, :]
            tgt_window_v = past_window_v[..., select_indices, :]

            # ...and compact them so they sit contiguously after the previous
            # window tokens.
            dst_window_k = past_window_k[..., prev_window_len: prev_window_len + new_tokens, :]
            dst_window_v = past_window_v[..., prev_window_len: prev_window_len + new_tokens, :]

            dst_window_k.copy_(tgt_window_k, non_blocking=True)
            dst_window_v.copy_(tgt_window_v, non_blocking=True)

            # If the window fills up, dump one full window into chunk-level
            # RFAs and shift any overflow tokens to the front of the window.
            new_window_len = prev_window_len + new_tokens
            if new_window_len >= self.config.window_size:
                assert new_window_len < 2 * self.config.window_size

                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()

                _window_len = new_window_len - self.config.window_size
                if _window_len > 0:
                    new_window_k = past_window_k[..., self.config.window_size: new_window_len, :]
                    new_window_v = past_window_v[..., self.config.window_size: new_window_len, :]

                    _dst_window_k = past_window_k[..., : _window_len, :]
                    _dst_window_v = past_window_v[..., : _window_len, :]

                    _dst_window_k.copy_(new_window_k, non_blocking=True)
                    _dst_window_v.copy_(new_window_v, non_blocking=True)

                past_key_values.past_window_pos[layer_idx] = _window_len
            else:
                dump_k = None
                dump_v = None
                past_key_values.past_window_pos[layer_idx] = new_window_len

            if dump_k is not None and dump_v is not None:
                rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                    dump_k, dump_v,
                    self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                    self.model.layers[layer_idx].self_attn.adaptive_phi,
                    None,
                    self.model.layers[layer_idx].self_attn.head_dim_scaling,
                    self.model.layers[layer_idx].self_attn.chunk_size
                )
                rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                    rfa_k, rfa_v, layer_idx
                )
        return past_key_values
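
    # Worked example of the rollover above (say window_size = 8): if 6 tokens
    # were cached and 5 candidates are accepted, new_window_len = 11; the
    # first 8 positions are dumped into chunk-level RFAs, the remaining 3
    # tokens are shifted to the front of the window, and past_window_pos
    # becomes 3.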


    def _multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
        self,
        past_key_values,
    ):
        prev_window_len = past_key_values.get_past_window_pos(0)
        for layer_idx in range(self.config.num_hidden_layers):
            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            # Nothing to dump unless the prefill exactly filled the window.
            dump_k = None
            dump_v = None
            new_window_len = prev_window_len
            if new_window_len == self.config.window_size:
                # Dump the full window into chunk-level RFAs and start the
                # next window from scratch.
                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()
                past_key_values.past_window_pos[layer_idx] = 0

            if dump_k is not None and dump_v is not None:
                rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                    dump_k, dump_v,
                    self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                    self.model.layers[layer_idx].self_attn.adaptive_phi,
                    None,
                    self.model.layers[layer_idx].self_attn.head_dim_scaling,
                    self.model.layers[layer_idx].self_attn.chunk_size
                )
                rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                    rfa_k, rfa_v, layer_idx
                )
        return past_key_values


    def multi_byte_pred_update_attn_mask(
        self,
        last_iter_new_tokens,
        tree_candidate_ids,
        past_attn_mask,
        medusa_attn_mask,
        past_key_values,
    ):
        batch_size, tree_candidate_len = tree_candidate_ids.shape
        seen_tokens = past_key_values.get_seq_length()
        # Multi-byte decoding assumes a non-empty prefix, and each iteration
        # accepts fewer tokens than one attention window.
        assert seen_tokens > 0
        assert last_iter_new_tokens < self.config.window_size

        if past_attn_mask is not None and seen_tokens < self.config.window_size:
            # Still inside the first window: extend the cached mask by the
            # tokens accepted in the previous iteration.
            past_attn_mask = torch.cat(
                [
                    past_attn_mask,
                    torch.ones(
                        [batch_size, 1, tree_candidate_len, last_iter_new_tokens],
                        dtype=torch.bool,
                        device=self.device
                    )
                ],
                dim=-1
            )
        else:
            # Rebuild the mask: each fully dumped window is represented by its
            # chunk-level RFA slots, and the current window contributes one
            # slot per token.
            chunks_per_window = int(self.config.window_size // self.config.chunk_size)
            window_tokens = seen_tokens % self.config.window_size
            num_windows_seen_so_far = seen_tokens // self.config.window_size
            attn_mask_len = num_windows_seen_so_far * chunks_per_window + window_tokens
            past_attn_mask = torch.ones(
                (batch_size, 1, tree_candidate_len, attn_mask_len),
                dtype=torch.bool,
                device=self.device
            )

        # The full mask lets every tree candidate attend to the cached past
        # plus its own ancestors within the tree.
        tree_attn_mask = torch.cat(
            [
                past_attn_mask,
                medusa_attn_mask.to(torch.bool)
            ],
            dim=-1
        )
        return tree_attn_mask, past_attn_mask
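
    # Worked example (say window_size = 2048, chunk_size = 256): after
    # seen_tokens = 5000, the rebuilt mask covers 2 * 8 = 16 RFA slots for the
    # two dumped windows plus 5000 - 4096 = 904 in-window tokens, i.e. a key
    # length of 920 before the tree mask is appended.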


    @torch.no_grad()
    def multi_byte_generate(
        self,
        input_ids,
        attention_mask=None,
        temperature=0.0,
        max_length=None,
        max_new_tokens=None,
        stopping_criteria=None,
        posterior_threshold=0.09,
        posterior_alpha=0.3,
        top_p=0.8,
        sampling='typical',
        fast=True,
        do_sample=False,
        medusa_choices=None,
        return_acc_lengths=False,
    ):
        # Fast (argmax-only) verification is only valid for greedy decoding.
        if do_sample or temperature > 0.0:
            fast = False

        # Resolve the target length: max_new_tokens takes precedence; fall
        # back to the model's maximum context if neither is given.
        if max_new_tokens is not None:
            max_length = max_new_tokens + input_ids.shape[-1]
        elif max_length is None:
            max_length = getattr(self.config, "max_position_embeddings", 32768)

        # Assemble stopping criteria: EOS, max length, and any user-supplied ones.
        eos_stop_criteria = MultibyteEosTokenCriteria(self.generation_config.eos_token_id)
        stop_criteria = StoppingCriteriaList()
        if max_length is not None:
            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
            stop_criteria.append(
                MaxLengthCriteria(
                    max_length=max_length,
                    max_position_embeddings=max_position_embeddings,
                )
            )
        if stopping_criteria is not None and len(stopping_criteria) > 0:
            stop_criteria.extend(stopping_criteria)

        assert input_ids.shape[0] == 1, "Only support batch size 1 for now"
        assert attention_mask is None, "Only support attention mask None for now"

        input_ids = input_ids.clone()
        position_ids = torch.arange(0, input_ids.shape[1], device=self.device, dtype=int).reshape(1, -1)

        # Build the tree buffers (attention mask, tree/retrieve indices,
        # position ids) for the chosen candidate tree.
        if medusa_choices is None:
            medusa_choices = evabyte_7b_95
        medusa_buffers = generate_medusa_buffers(
            medusa_choices, device=self.device
        )

        # Static KV cache sized to one window plus headroom for tree candidates.
        past_key_values = EvaStaticCacheForTriton(
            input_ids.shape[0],
            self.config.num_attention_heads,
            self.config.window_size + 256,
            self.config.hidden_size // self.config.num_attention_heads,
            self.config.num_hidden_layers,
            self.lm_head.weight.dtype,
            self.lm_head.weight.device,
        )

        # Prefill: one regular forward pass over the prompt.
        full_logits, past_key_values = self.forward(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            use_cache=True,
            past_key_values=past_key_values,
            return_all_pred_logits=True,
            multibyte_decoding=False,
        )

        # If the prefill exactly filled the window, dump it into chunk-level
        # RFAs before decoding starts.
        past_key_values = self._multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
            past_key_values
        )
        medusa_logits, logits = split_logits(full_logits)

        past_attn_mask = None
        last_iter_new_tokens = 0
        max_iters = 32768
        if return_acc_lengths:
            acc_lengths = []
        for _ in range(max_iters):
            # 1) Propose: turn the current base/Medusa logits into a tree of
            #    candidate continuations.
            tree_candidate_ids, unflattened_candidate_ids = generate_candidates(
                medusa_logits,
                logits,
                medusa_buffers["tree_indices"],
                medusa_buffers["retrieve_indices"],
                temperature=temperature,
                posterior_alpha=posterior_alpha,
                posterior_threshold=posterior_threshold,
                top_p=top_p,
                sampling=sampling,
                fast=fast,
            )

            # 2) Build the attention mask so every candidate sees the cached
            #    past plus its own ancestors, and offset the position ids.
            medusa_attn_mask, past_attn_mask = self.multi_byte_pred_update_attn_mask(
                last_iter_new_tokens,
                tree_candidate_ids,
                past_attn_mask,
                medusa_buffers["medusa_attn_mask"],
                past_key_values,
            )
            medusa_position_ids = medusa_buffers["medusa_position_ids"] + input_ids.shape[1]

            # 3) Verify: score the whole candidate tree in a single forward
            #    pass and regroup the logits per root-to-leaf path.
            tree_full_logits, past_key_values = self.forward(
                tree_candidate_ids,
                past_key_values=past_key_values,
                attention_mask=medusa_attn_mask,
                position_ids=medusa_position_ids,
                return_all_pred_logits=True,
                multibyte_decoding=True,
            )
            _medusa_logits, _logits = split_logits(tree_full_logits)
            medusa_logits = _medusa_logits[..., 0, medusa_buffers["retrieve_indices"], :]
            logits = _logits[..., 0, medusa_buffers["retrieve_indices"], :]

            # 4) Accept: pick the best path and its accepted prefix length,
            #    trimming candidates so accepted tokens never spill past the
            #    end of the current attention window.
            tree_depth = unflattened_candidate_ids.shape[-1]
            if tree_depth + past_key_values.get_past_window_pos(0) > self.config.window_size:
                max_acc_len = self.config.window_size - past_key_values.get_past_window_pos(0)
                _trimmed_unflattened_candidate_ids = unflattened_candidate_ids[:, :max_acc_len]
                _trimmed_logits = logits[:, :max_acc_len]
            else:
                _trimmed_unflattened_candidate_ids = unflattened_candidate_ids
                _trimmed_logits = logits
            best_candidate, accept_length = evaluate_posterior(
                _trimmed_logits,
                _trimmed_unflattened_candidate_ids,
                temperature,
                posterior_threshold,
                posterior_alpha,
                top_p=top_p,
                sampling=sampling,
                fast=fast
            )

            # 5) Commit: append the accepted tokens and slice out the logits
            #    that seed the next iteration.
            input_ids, medusa_logits, logits, last_iter_new_tokens = update_inference_inputs(
                input_ids,
                medusa_logits,
                logits,
                unflattened_candidate_ids,
                best_candidate,
                accept_length,
            )

            # 6) Compact the KV cache along the accepted path and roll the
            #    window if it filled up.
            past_key_values = self.multi_byte_pred_update_cache(
                past_key_values,
                medusa_buffers["retrieve_indices"],
                best_candidate,
                last_iter_new_tokens,
            )

            if return_acc_lengths:
                acc_lengths.append(last_iter_new_tokens)
            if stop_criteria(input_ids, None) or eos_stop_criteria(input_ids, last_iter_new_tokens):
                if return_acc_lengths:
                    return input_ids, acc_lengths
                else:
                    return input_ids
        if return_acc_lengths:
            return input_ids, acc_lengths
        else:
            return input_ids
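
# Illustrative use of the mixin (hypothetical `model` object exposing it):
#
#     output_ids = model.multi_byte_generate(
#         input_ids,                     # (1, prompt_len) tensor of byte ids
#         max_new_tokens=256,
#         temperature=0.0,               # greedy verification path
#         medusa_choices=evabyte_7b_31,  # smaller tree, cheaper per step
#     )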