| | from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaAttention, LlamaRotaryEmbedding |
| | from transformers.models.llama.configuration_llama import LlamaConfig |
| | import torch |
| |
|
| |
|
class CodeLlamaConfig(LlamaConfig):
    """Llama configuration that always carries a float ``rope_theta``.

    ``rope_theta`` falls back to 10000.0 and is replaced by the ``rope_theta``
    keyword argument when one is supplied and can be converted to ``float``.
    """

    def __init__(self, **kwargs):
        """Build the config, forwarding every keyword to ``LlamaConfig``.

        Args:
            **kwargs: Options passed through unchanged to the parent
                constructor. A truthy ``rope_theta`` entry is additionally
                converted to ``float`` and stored on the instance.
        """
        super().__init__(**kwargs)
        # Assigned after the parent constructor so the fallback is in place
        # regardless of what the parent did with the keyword.
        self.rope_theta = 10000.0

        requested = kwargs.get("rope_theta")
        # Falsy values (missing, None, 0) keep the fallback.
        if requested:
            try:
                self.rope_theta = float(requested)
            except (TypeError, ValueError, OverflowError):
                # Only conversion failures are tolerated; anything else is a
                # genuine bug and should propagate instead of being hidden.
                print("Could not set rope theta properly, ensure it is a number")
            else:
                print(f"Rope theta set to {self.rope_theta}")
| |
|
| | |
class CodeLlamaNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """Rotary embedding whose cos/sin cache is built from a configurable base.

    NOTE(review): ``scaling_factor`` is stored on the instance, but the cache
    builder in this class never reads it. Confirm whether a scaling step is
    expected here.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=1000000.0, device=None, scaling_factor=1.0):
        """Store the scaling settings, then defer to the parent constructor.

        Args:
            dim: Forwarded to the parent as its first positional argument.
            max_position_embeddings: Forwarded to the parent.
            base: Base of the inverse-frequency power; also forwarded to the
                parent.
            device: Forwarded to the parent.
            scaling_factor: Kept as ``self.scaling_factor`` only.
        """
        # Set before the parent constructor runs; presumably the parent calls
        # ``_set_cos_sin_cache``, which reads ``self.base`` (TODO confirm).
        self.scaling_factor = scaling_factor
        self.base = base
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild ``inv_freq`` and the cosine/sine tables for ``seq_len`` positions.

        Args:
            seq_len: Number of positions to cache; recorded as
                ``self.max_seq_len_cached``.
            device: Device on which the tables are built.
            dtype: Dtype the cached cosine/sine tables are cast to.
        """
        self.max_seq_len_cached = seq_len

        # ``self.dim`` is not assigned in this class; presumably the parent
        # constructor sets it (TODO confirm).
        exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
        inverse_frequencies = 1.0 / (self.base ** exponents)
        self.register_buffer("inv_freq", inverse_frequencies, persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        # Outer product: one row per position, one column per frequency.
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)

        # The angle table is repeated along the last axis, then stored with
        # two leading singleton axes as non-persistent buffers.
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(buffer_name, values[None, None, :, :].to(dtype), persistent=False)
| |
|
class CodeLlamaForCausalLM(LlamaForCausalLM):
    """Llama causal LM whose attention layers get a rotary embedding built from ``config.rope_theta``."""

    config_class = CodeLlamaConfig

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """Build the parent model, then replace each layer's rotary embedding.

        Args:
            config: Model configuration; its ``rope_theta`` attribute is used
                as the base of every replacement rotary embedding.
        """
        super().__init__(config)
        for decoder_layer in self.model.layers:
            attention = decoder_layer.self_attn
            # Reuse the sizes the attention module already holds so the new
            # embedding matches what the parent constructor configured.
            attention.rotary_emb = CodeLlamaNTKScalingRotaryEmbedding(
                attention.head_dim,
                attention.max_position_embeddings,
                base=config.rope_theta,
            )
| | |
| | |