| | from llama_cpp import * |
| | from ctypes import POINTER, c_size_t |
| | from llama_cpp._internals import ( |
| | _LlamaModel, |
| | _LlamaContext, |
| | _LlamaBatch, |
| | _LlamaTokenDataArray, |
| | ) |
| |
|
| | from KMP_list import kmp_search, compute_lps_array |
| | from Turbo_Colormap import map_value_to_color, NOCOLOR, LEGEND, BACK_WHITE |
| |
|
| |
|
| | class LLMGenerate: |
| | def __init__( |
| | self, |
| | model, |
| | n_keep, |
| | n_discard: int = 256, |
| | im_start=None, |
| | top_k: int = 40, |
| | top_p: float = 0.95, |
| | min_p: float = 0.05, |
| | typical_p: float = 1.0, |
| | temp: float = 0.80, |
| | repeat_penalty: float = 1.1, |
| | repeat_last_n: int = 64, |
| | frequency_penalty: float = 0.0, |
| | presence_penalty: float = 0.0, |
| | tfs_z: float = 1.0, |
| | mirostat_mode: int = 0, |
| | mirostat_tau: float = 5.0, |
| | mirostat_eta: float = 0.1 |
| | ): |
| | def _eval_t(tokens): |
| | return model.eval_t( |
| | tokens=tokens, |
| | n_keep=n_keep, |
| | n_discard=n_discard, |
| | im_start=im_start |
| | ) |
| |
|
| | def _sample_t(logits_processor): |
| | return model.sample_t( |
| | top_k=top_k, |
| | top_p=top_p, |
| | min_p=min_p, |
| | typical_p=typical_p, |
| | temp=temp, |
| | repeat_penalty=repeat_penalty, |
| | repeat_last_n=repeat_last_n, |
| | frequency_penalty=frequency_penalty, |
| | presence_penalty=presence_penalty, |
| | tfs_z=tfs_z, |
| | mirostat_mode=mirostat_mode, |
| | mirostat_tau=mirostat_tau, |
| | mirostat_eta=mirostat_eta, |
| | logits_processor=logits_processor |
| | ) |
| |
|
| | self._eval_t = _eval_t |
| | self._sample_t = _sample_t |
| | self.str_detokenize = model.str_detokenize |
| | self.venv_pop_token = model.venv_pop_token |
| | |
| | self.t_bot = [] |
| | self.completion_tokens = [] |
| | self.history = '' |
| | self.token = None |
| |
|
| | def eval_t(self, tokens): |
| | |
| | self.completion_tokens.extend(tokens) |
| | all_text = self.str_detokenize(self.completion_tokens) |
| | if all_text: |
| | self.t_bot.extend(self.completion_tokens) |
| | self.history += all_text |
| | self.completion_tokens = [] |
| | return self._eval_t(tokens) |
| |
|
| | def sample_t(self, logits_processor): |
| | self.token = self._sample_t(logits_processor) |
| | return self.token |
| |
|
| | def detokenize_sample_t(self): |
| | self.completion_tokens.append(self.token) |
| | all_text = self.str_detokenize(self.completion_tokens) |
| | if not all_text: |
| | return False |
| | self.t_bot.extend(self.completion_tokens) |
| | self.history += all_text |
| | self.completion_tokens = [] |
| | return True |
| |
|
| | def eval_sample_t(self): |
| | return self._eval_t([self.token]) |
| |
|
| | def endswith_t(self, token_list): |
| | return self.token in token_list |
| |
|
| | def endswith_s(self, start_func, str_list, com_func=str.rstrip): |
| | if self.completion_tokens: |
| | return False |
| |
|
| | history = self.history |
| | t_bot = self.t_bot |
| |
|
| | if start_func(history): |
| | history = com_func(history) |
| | for x in str_list: |
| | if history.endswith(x): |
| | n = len(t_bot) |
| | for i in range(1, n): |
| | tmp = self.str_detokenize(t_bot[n - i:]) |
| | tmp = com_func(tmp) |
| | if tmp.endswith(x): |
| | if i > 1: |
| | self.venv_pop_token(i - 1) |
| | if history.endswith(tmp): |
| | self.history = history[:-len(tmp)] |
| | return True |
| | return False |
| |
|
| |
|
| | kv_cache_type = { |
| | 'f32': 0, |
| | 'f16': 1, |
| | 'q8_0': 8, |
| | 'q4_0': 2, |
| | 'q4_1': 3, |
| | 'iq4_nl': 20, |
| | 'q5_0': 6, |
| | 'q5_1': 7 |
| | } |
| |
|
| | class StreamingLLM(Llama): |
| |
|
| | __backend_initialized = False |
| |
|
| | def __init__( |
| | self, |
| | model_path: str, |
| | *, |
| | |
| | n_gpu_layers: int = 0, |
| | split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, |
| | main_gpu: int = 0, |
| | tensor_split: Optional[List[float]] = None, |
| | vocab_only: bool = False, |
| | use_mmap: bool = True, |
| | use_mlock: bool = False, |
| | kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, |
| | |
| | seed: int = llama_cpp.LLAMA_DEFAULT_SEED, |
| | n_ctx: int = 512, |
| | n_batch: int = 512, |
| | n_threads: Optional[int] = None, |
| | n_threads_batch: Optional[int] = None, |
| | rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, |
| | pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, |
| | rope_freq_base: float = 0.0, |
| | rope_freq_scale: float = 0.0, |
| | yarn_ext_factor: float = -1.0, |
| | yarn_attn_factor: float = 1.0, |
| | yarn_beta_fast: float = 32.0, |
| | yarn_beta_slow: float = 1.0, |
| | yarn_orig_ctx: int = 0, |
| | logits_all: bool = False, |
| | embedding: bool = False, |
| | offload_kqv: bool = True, |
| | |
| | last_n_tokens_size: int = 64, |
| | |
| | lora_base: Optional[str] = None, |
| | lora_scale: float = 1.0, |
| | lora_path: Optional[str] = None, |
| | |
| | numa: Union[bool, int] = False, |
| | |
| | chat_format: Optional[str] = None, |
| | chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, |
| | |
| | draft_model: Optional[LlamaDraftModel] = None, |
| | |
| | tokenizer: Optional[BaseLlamaTokenizer] = None, |
| | |
| | verbose: bool = True, |
| | |
| | type_k: str = 'f16', |
| | type_v: str = 'f16', |
| | **kwargs, |
| | ): |
| | """Load a llama.cpp model from `model_path`. |
| | |
| | Examples: |
| | Basic usage |
| | |
| | >>> import llama_cpp |
| | >>> model = llama_cpp.Llama( |
| | ... model_path="path/to/model", |
| | ... ) |
| | >>> print(model("The quick brown fox jumps ", stop=["."])["choices"][0]["text"]) |
| | the lazy dog |
| | |
| | Loading a chat model |
| | |
| | >>> import llama_cpp |
| | >>> model = llama_cpp.Llama( |
| | ... model_path="path/to/model", |
| | ... chat_format="llama-2", |
| | ... ) |
| | >>> print(model.create_chat_completion( |
| | ... messages=[{ |
| | ... "role": "user", |
| | ... "content": "what is the meaning of life?" |
| | ... }] |
| | ... )) |
| | |
| | Args: |
| | model_path: Path to the model. |
| | n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded. |
| | split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. |
| | main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored |
| | tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. |
| | vocab_only: Only load the vocabulary no weights. |
| | use_mmap: Use mmap if possible. |
| | use_mlock: Force the system to keep the model in RAM. |
| | kv_overrides: Key-value overrides for the model. |
| | seed: RNG seed, -1 for random |
| | n_ctx: Text context, 0 = from model |
| | n_batch: Prompt processing maximum batch size |
| | n_threads: Number of threads to use for generation |
| | n_threads_batch: Number of threads to use for batch processing |
| | rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054 |
| | pooling_type: Pooling type, from `enum llama_pooling_type`. |
| | rope_freq_base: RoPE base frequency, 0 = from model |
| | rope_freq_scale: RoPE frequency scaling factor, 0 = from model |
| | yarn_ext_factor: YaRN extrapolation mix factor, negative = from model |
| | yarn_attn_factor: YaRN magnitude scaling factor |
| | yarn_beta_fast: YaRN low correction dim |
| | yarn_beta_slow: YaRN high correction dim |
| | yarn_orig_ctx: YaRN original context size |
| | logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. |
| | embedding: Embedding mode only. |
| | offload_kqv: Offload K, Q, V to GPU. |
| | last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. |
| | lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. |
| | lora_path: Path to a LoRA file to apply to the model. |
| | numa: numa policy |
| | chat_format: String specifying the chat format to use when calling create_chat_completion. |
| | chat_handler: Optional chat handler to use when calling create_chat_completion. |
| | draft_model: Optional draft model to use for speculative decoding. |
| | tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. |
| | verbose: Print verbose output to stderr. |
| | |
| | Raises: |
| | ValueError: If the model path does not exist. |
| | |
| | Returns: |
| | A Llama instance. |
| | """ |
| | self.verbose = verbose |
| |
|
| | set_verbose(verbose) |
| |
|
| | if not StreamingLLM.__backend_initialized: |
| | with suppress_stdout_stderr(disable=verbose): |
| | llama_cpp.llama_backend_init() |
| | StreamingLLM.__backend_initialized = True |
| |
|
| | if isinstance(numa, bool): |
| | self.numa = ( |
| | llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE |
| | if numa |
| | else llama_cpp.GGML_NUMA_STRATEGY_DISABLED |
| | ) |
| | else: |
| | self.numa = numa |
| |
|
| | if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED: |
| | with suppress_stdout_stderr(disable=verbose): |
| | llama_cpp.llama_numa_init(self.numa) |
| |
|
| | self.model_path = model_path |
| |
|
| | |
| | self.model_params = llama_cpp.llama_model_default_params() |
| | self.model_params.n_gpu_layers = ( |
| | 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers |
| | ) |
| | self.model_params.split_mode = split_mode |
| | self.model_params.main_gpu = main_gpu |
| | self.tensor_split = tensor_split |
| | self._c_tensor_split = None |
| | if self.tensor_split is not None: |
| | if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES: |
| | raise ValueError( |
| | f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}" |
| | ) |
| | |
| | FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES |
| | self._c_tensor_split = FloatArray( |
| | *tensor_split |
| | ) |
| | self.model_params.tensor_split = self._c_tensor_split |
| | self.model_params.vocab_only = vocab_only |
| | self.model_params.use_mmap = use_mmap if lora_path is None else False |
| | self.model_params.use_mlock = use_mlock |
| |
|
| | |
| | self.kv_overrides = kv_overrides |
| | if kv_overrides is not None: |
| | |
| | kvo_array_len = len(kv_overrides) + 1 |
| | self._kv_overrides_array = ( |
| | llama_cpp.llama_model_kv_override * kvo_array_len |
| | )() |
| |
|
| | for i, (k, v) in enumerate(kv_overrides.items()): |
| | self._kv_overrides_array[i].key = k.encode("utf-8") |
| | if isinstance(v, bool): |
| | self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL |
| | self._kv_overrides_array[i].value.bool_value = v |
| | elif isinstance(v, int): |
| | self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT |
| | self._kv_overrides_array[i].value.int_value = v |
| | elif isinstance(v, float): |
| | self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT |
| | self._kv_overrides_array[i].value.float_value = v |
| | else: |
| | raise ValueError(f"Unknown value type for {k}: {v}") |
| |
|
| | self._kv_overrides_array[-1].key = ( |
| | b"\0" |
| | ) |
| | self.model_params.kv_overrides = self._kv_overrides_array |
| |
|
| | self.n_batch = min(n_ctx, n_batch) |
| | self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) |
| | self.n_threads_batch = n_threads_batch or max( |
| | multiprocessing.cpu_count() // 2, 1 |
| | ) |
| |
|
| | |
| | self.context_params = llama_cpp.llama_context_default_params() |
| | self.context_params.seed = seed |
| | self.context_params.n_ctx = n_ctx |
| | self.context_params.n_batch = self.n_batch |
| | self.context_params.n_threads = self.n_threads |
| | self.context_params.n_threads_batch = self.n_threads_batch |
| | self.context_params.rope_scaling_type = ( |
| | rope_scaling_type |
| | if rope_scaling_type is not None |
| | else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED |
| | ) |
| | self.context_params.pooling_type = pooling_type |
| | self.context_params.rope_freq_base = ( |
| | rope_freq_base if rope_freq_base != 0.0 else 0 |
| | ) |
| | self.context_params.rope_freq_scale = ( |
| | rope_freq_scale if rope_freq_scale != 0.0 else 0 |
| | ) |
| | self.context_params.yarn_ext_factor = ( |
| | yarn_ext_factor if yarn_ext_factor != 0.0 else 0 |
| | ) |
| | self.context_params.yarn_attn_factor = ( |
| | yarn_attn_factor if yarn_attn_factor != 0.0 else 0 |
| | ) |
| | self.context_params.yarn_beta_fast = ( |
| | yarn_beta_fast if yarn_beta_fast != 0.0 else 0 |
| | ) |
| | self.context_params.yarn_beta_slow = ( |
| | yarn_beta_slow if yarn_beta_slow != 0.0 else 0 |
| | ) |
| | self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 |
| | self.context_params.logits_all = ( |
| | logits_all if draft_model is None else True |
| | ) |
| | self.context_params.embeddings = embedding |
| |
|
| | |
| | print(self.context_params.type_k, self.context_params.type_v) |
| | self.context_params.type_k = kv_cache_type[type_k] |
| | self.context_params.type_v = kv_cache_type[type_v] |
| |
|
| | self.context_params.offload_kqv = offload_kqv |
| |
|
| | |
| | self.last_n_tokens_size = last_n_tokens_size |
| |
|
| | self.cache: Optional[BaseLlamaCache] = None |
| |
|
| | self.lora_base = lora_base |
| | self.lora_scale = lora_scale |
| | self.lora_path = lora_path |
| |
|
| | if not os.path.exists(model_path): |
| | raise ValueError(f"Model path does not exist: {model_path}") |
| |
|
| | self._model = _LlamaModel( |
| | path_model=self.model_path, params=self.model_params, verbose=self.verbose |
| | ) |
| |
|
| | |
| | self.tokenizer_ = tokenizer or LlamaTokenizer(self) |
| |
|
| | |
| | if n_ctx == 0: |
| | n_ctx = self._model.n_ctx_train() |
| | self.n_batch = min(n_ctx, n_batch) |
| | self.context_params.n_ctx = self._model.n_ctx_train() |
| | self.context_params.n_batch = self.n_batch |
| |
|
| | self._ctx = _LlamaContext( |
| | model=self._model, |
| | params=self.context_params, |
| | verbose=self.verbose, |
| | ) |
| |
|
| | self._batch = _LlamaBatch( |
| | n_tokens=self.n_batch, |
| | embd=0, |
| | n_seq_max=self.context_params.n_ctx, |
| | verbose=self.verbose, |
| | ) |
| |
|
| | if self.lora_path: |
| | if self._model.apply_lora_from_file( |
| | self.lora_path, |
| | self.lora_scale, |
| | self.lora_base, |
| | self.n_threads, |
| | ): |
| | raise RuntimeError( |
| | f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" |
| | ) |
| |
|
| | if self.verbose: |
| | print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) |
| |
|
| | self.chat_format = chat_format |
| | self.chat_handler = chat_handler |
| |
|
| | self.draft_model = draft_model |
| |
|
| | self._n_vocab = self.n_vocab() |
| | self._n_ctx = self.n_ctx() |
| |
|
| | self._token_nl = self.token_nl() |
| | self._token_eos = self.token_eos() |
| |
|
| | self._candidates = _LlamaTokenDataArray(n_vocab=self._n_vocab) |
| |
|
| | self.n_tokens = 0 |
| | self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) |
| | self.scores: npt.NDArray[np.single] = np.ndarray( |
| | (n_ctx, self._n_vocab), dtype=np.single |
| | ) |
| |
|
| | self._mirostat_mu = ctypes.c_float( |
| | 2.0 * 5.0 |
| | ) |
| |
|
| | try: |
| | self.metadata = self._model.metadata() |
| | except Exception as e: |
| | self.metadata = {} |
| | if self.verbose: |
| | print(f"Failed to load metadata: {e}", file=sys.stderr) |
| |
|
| | if self.verbose: |
| | print(f"Model metadata: {self.metadata}", file=sys.stderr) |
| |
|
| | if ( |
| | self.chat_format is None |
| | and self.chat_handler is None |
| | and "tokenizer.chat_template" in self.metadata |
| | ): |
| | chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata( |
| | self.metadata |
| | ) |
| |
|
| | if chat_format is not None: |
| | self.chat_format = chat_format |
| | if self.verbose: |
| | print(f"Guessed chat format: {chat_format}", file=sys.stderr) |
| | else: |
| | template = self.metadata["tokenizer.chat_template"] |
| | try: |
| | eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"]) |
| | except: |
| | eos_token_id = self.token_eos() |
| | try: |
| | bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"]) |
| | except: |
| | bos_token_id = self.token_bos() |
| |
|
| | eos_token = self._model.token_get_text(eos_token_id) |
| | bos_token = self._model.token_get_text(bos_token_id) |
| |
|
| | if self.verbose: |
| | print(f"Using gguf chat template: {template}", file=sys.stderr) |
| | print(f"Using chat eos_token: {eos_token}", file=sys.stderr) |
| | print(f"Using chat bos_token: {bos_token}", file=sys.stderr) |
| |
|
| | self.chat_handler = llama_chat_format.Jinja2ChatFormatter( |
| | template=template, eos_token=eos_token, bos_token=bos_token |
| | ).to_chat_handler() |
| |
|
| | if self.chat_format is None and self.chat_handler is None: |
| | self.chat_format = "llama-2" |
| | if self.verbose: |
| | print(f"Using fallback chat format: {chat_format}", file=sys.stderr) |
| | self._venv_init() |
| |
|
| | def str_detokenize(self, tokens) -> str: |
| | return self.detokenize(tokens).decode('utf-8', errors='ignore') |
| |
|
| | def kv_cache_seq_trim(self): |
| | self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) |
| |
|
| | def _venv_init(self): |
| | self.venv = [0] |
| | self.venv_idx_map = [] |
| |
|
| | def venv_create(self, name: str): |
| | self.venv.append(0) |
| | self.venv_idx_map.append(name) |
| | return name |
| |
|
| | def venv_disband(self, name_set): |
| | if len(self.venv) <= 1: |
| | return False |
| | name_set = {x for x in name_set if x in self.venv_idx_map} |
| | if not name_set: |
| | return False |
| | while self.venv_idx_map: |
| | if self.venv_idx_map[0] in name_set: |
| | self.venv_idx_map.pop(0) |
| | tmp = self.venv.pop(1) |
| | self.venv[0] += tmp |
| | else: |
| | break |
| | return True |
| |
|
| | def venv_revision(self, name: str): |
| | if len(self.venv) <= 1: |
| | return False |
| | if name not in self.venv_idx_map: |
| | return False |
| | _s = 0 |
| | while self.venv_idx_map: |
| | if self.venv_idx_map[-1] == name: |
| | break |
| | self.venv_idx_map.pop() |
| | _s += self.venv.pop() |
| | if _s: |
| | self.n_tokens -= min(_s, self.n_tokens) |
| | self.kv_cache_seq_trim() |
| | return True |
| |
|
| | def venv_remove(self, name: str, keep_last=0): |
| | if len(self.venv) <= 1: |
| | return False |
| | if name not in self.venv_idx_map: |
| | return False |
| | venv_idx = self.venv_idx_map.index(name) + 1 |
| | count_name = self.venv_idx_map.count(name) if keep_last else 0 |
| | while self.venv_idx_map: |
| | if keep_last and count_name <= keep_last: |
| | break |
| | self.venv_idx_map.pop(venv_idx - 1) |
| | if venv_idx == len(self.venv) - 1: |
| | |
| | self.n_tokens -= min(self.venv.pop(), self.n_tokens) |
| | self.kv_cache_seq_trim() |
| | break |
| | else: |
| | |
| | n_keep = self.n_tokens - sum(self.venv[i] for i in range(venv_idx, len(self.venv))) |
| | n_discard = self.venv.pop(venv_idx) |
| | self.kv_cache_seq_ltrim(n_keep, n_discard) |
| | try: |
| | venv_idx = self.venv_idx_map.index(name, venv_idx - 1) + 1 |
| | except ValueError: |
| | break |
| | count_name -= 1 |
| | return True |
| |
|
| | def venv_pop_token(self, n=1): |
| | self.n_tokens -= n |
| | self.venv[-1] -= n |
| | self.kv_cache_seq_trim() |
| |
|
| | @property |
| | def venv_info(self): |
| | return str((self.n_tokens, self.venv, self.venv_idx_map)) |
| |
|
| | def venv_viz(self): |
| | completion_tokens = [] |
| | history = LEGEND + '\n' |
| | text_color = NOCOLOR |
| | for i in range(self.venv[-1]): |
| | idx = self.n_tokens - self.venv[-1] + i |
| | token = self._input_ids[idx] |
| | if not completion_tokens: |
| | |
| | score = self.scores[idx-1: idx, :].ravel() |
| | score = np.exp(score) |
| | sum_score = np.sum(score) |
| | probabilities = score[token] / sum_score |
| | if probabilities < 0.001: |
| | text_color = NOCOLOR |
| | else: |
| | if text_color is NOCOLOR: |
| | text_color = BACK_WHITE + map_value_to_color(probabilities) |
| | else: |
| | text_color = map_value_to_color(probabilities) |
| | history += text_color |
| | |
| | completion_tokens.append(token) |
| | all_text = self.str_detokenize(completion_tokens) |
| | if not all_text: |
| | continue |
| | completion_tokens = [] |
| | history += repr(all_text)[1:-1] |
| | return history + NOCOLOR |
| |
|
| | def kv_cache_seq_ltrim(self, n_keep, n_discard=256, n_past=-1, im_start=None): |
| | if n_keep < 0: |
| | return |
| | if n_past < 0: |
| | n_past = self.n_tokens |
| | if im_start is not None: |
| | lps = compute_lps_array(im_start) |
| | _idx = kmp_search(self.input_ids, im_start, n_keep + n_discard, n_past, lps) |
| | if _idx >= n_keep: |
| | n_discard = _idx - n_keep |
| | else: |
| | _idx = kmp_search(self.input_ids, im_start, n_keep, n_past, lps) |
| | if _idx >= n_keep: |
| | n_keep = _idx + len(im_start) |
| | print(im_start, n_keep, n_discard, _idx) |
| | self._ctx.kv_cache_seq_rm(-1, n_keep, n_keep + n_discard) |
| | self._ctx.kv_cache_seq_shift(0, n_keep + n_discard, n_past, -n_discard) |
| | self.input_ids[n_keep:n_past - n_discard] = self.input_ids[n_keep + n_discard:n_past] |
| | self.n_tokens = n_past - n_discard |
| |
|
| | def eval_t(self, tokens, n_keep=4, n_discard=256, im_start=None): |
| | if self._n_ctx < self.n_tokens + len(tokens): |
| | tmp_n_discard = max(n_discard, self.n_tokens + len(tokens) - self._n_ctx) |
| | self.kv_cache_seq_ltrim(n_keep, tmp_n_discard, im_start=im_start) |
| | for i in range(0, len(tokens), self.n_batch): |
| | batch = tokens[i: i + self.n_batch] |
| | n_past = self.n_tokens |
| | n_tokens = len(batch) |
| | self._batch.set_batch( |
| | batch=batch, n_past=n_past, logits_all=self.context_params.logits_all |
| | ) |
| | self._ctx.decode(self._batch) |
| | |
| | self.input_ids[n_past: n_past + n_tokens] = batch |
| | |
| | rows = n_tokens |
| | cols = self._n_vocab |
| | offset = ( |
| | 0 if self.context_params.logits_all else n_tokens - 1 |
| | ) |
| | self.scores[n_past + offset: n_past + n_tokens, :].reshape(-1)[ |
| | : |
| | ] = self._ctx.get_logits()[offset * cols: rows * cols] |
| | |
| | self.n_tokens += n_tokens |
| | self.venv[-1] += n_tokens |
| | return self.n_tokens |
| |
|
| | def sample_t( |
| | self, |
| | top_k: int = 40, |
| | top_p: float = 0.95, |
| | min_p: float = 0.05, |
| | typical_p: float = 1.0, |
| | temp: float = 0.80, |
| | repeat_penalty: float = 1.1, |
| | repeat_last_n: int = 64, |
| | frequency_penalty: float = 0.0, |
| | presence_penalty: float = 0.0, |
| | tfs_z: float = 1.0, |
| | mirostat_mode: int = 0, |
| | mirostat_eta: float = 0.1, |
| | mirostat_tau: float = 5.0, |
| | penalize_nl: bool = True, |
| | logits_processor=None, |
| | grammar: Optional[LlamaGrammar] = None, |
| | ): |
| | last_n_tokens_data = [llama_cpp.llama_token(0)] * max( |
| | 0, repeat_last_n - self.n_tokens |
| | ) + self._input_ids[-repeat_last_n:].tolist() |
| | last_n_tokens_size = len(last_n_tokens_data) |
| | n_vocab = self._n_vocab |
| | n_ctx = self._n_ctx |
| | top_k = n_vocab if top_k <= 0 else top_k |
| | last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size |
| | last_n_tokens_data_c = (llama_cpp.llama_token * last_n_tokens_size)( |
| | *last_n_tokens_data |
| | ) |
| | logits: npt.NDArray[np.single] = self.scores[self.n_tokens - 1: self.n_tokens, :].ravel() |
| |
|
| | if logits_processor is not None: |
| | logits[:] = logits_processor(self._input_ids, logits) |
| |
|
| | self._candidates.copy_logits(logits) |
| | self._ctx.sample_repetition_penalties( |
| | candidates=self._candidates, |
| | last_tokens_data=last_n_tokens_data_c, |
| | penalty_last_n=last_n_tokens_size, |
| | penalty_repeat=repeat_penalty, |
| | penalty_freq=frequency_penalty, |
| | penalty_present=presence_penalty, |
| | ) |
| | if not penalize_nl: |
| | nl_logit = logits[self._token_nl] |
| | self._candidates.candidates.data[self._token_nl].logit = llama_cpp.c_float( |
| | nl_logit |
| | ) |
| |
|
| | if grammar is not None: |
| | self._ctx.sample_grammar( |
| | candidates=self._candidates, |
| | grammar=grammar, |
| | ) |
| |
|
| | if temp < 0.0: |
| | self._ctx.sample_softmax(candidates=self._candidates) |
| | id_ = self._candidates.candidates.data[0].id |
| | elif temp == 0.0: |
| | id_ = self._ctx.sample_token_greedy(candidates=self._candidates) |
| | elif mirostat_mode == 1: |
| | self._ctx.sample_temp(candidates=self._candidates, temp=temp) |
| | id_ = self._ctx.sample_token_mirostat( |
| | candidates=self._candidates, |
| | tau=mirostat_tau, |
| | eta=mirostat_eta, |
| | mu=2.0 * mirostat_tau, |
| | m=100, |
| | ) |
| | elif mirostat_mode == 2: |
| | self._ctx.sample_temp(candidates=self._candidates, temp=temp) |
| | id_ = self._ctx.sample_token_mirostat_v2( |
| | candidates=self._candidates, |
| | tau=mirostat_tau, |
| | eta=mirostat_eta, |
| | mu=2.0 * mirostat_tau, |
| | ) |
| | else: |
| | self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1) |
| | self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1) |
| | self._ctx.sample_typical( |
| | candidates=self._candidates, p=typical_p, min_keep=1 |
| | ) |
| | self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1) |
| | self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1) |
| | self._ctx.sample_temp(candidates=self._candidates, temp=temp) |
| | id_ = self._ctx.sample_token(candidates=self._candidates) |
| | if grammar is not None: |
| | self._ctx.grammar_accept_token(grammar=grammar, token=id_) |
| | return id_ |
| |
|
| | def generate_t( |
| | self, |
| | tokens: Sequence[int], |
| | n_keep, |
| | n_discard: int = 256, |
| | im_start=None, |
| | top_k: int = 40, |
| | top_p: float = 0.95, |
| | min_p: float = 0.05, |
| | typical_p: float = 1.0, |
| | temp: float = 0.80, |
| | repeat_penalty: float = 1.1, |
| | repeat_last_n: int = 64, |
| | frequency_penalty: float = 0.0, |
| | presence_penalty: float = 0.0, |
| | tfs_z: float = 1.0, |
| | mirostat_mode: int = 0, |
| | mirostat_tau: float = 5.0, |
| | mirostat_eta: float = 0.1, |
| | logits_processor: Optional[LogitsProcessorList] = None, |
| | stopping_criteria: Optional[StoppingCriteriaList] = None, |
| | grammar: Optional[LlamaGrammar] = None, |
| | ) -> Generator[int, Optional[Sequence[int]], None]: |
| | typical_p = float(typical_p) |
| | frequency_penalty = float(frequency_penalty) |
| | presence_penalty = float(presence_penalty) |
| | tfs_z = float(tfs_z) |
| | mirostat_tau = float(mirostat_tau) |
| | while True: |
| | self.eval_t(tokens, n_keep, n_discard, im_start=im_start) |
| | token = self.sample_t( |
| | top_k=top_k, |
| | top_p=top_p, |
| | min_p=min_p, |
| | typical_p=typical_p, |
| | temp=temp, |
| | repeat_penalty=repeat_penalty, |
| | repeat_last_n=repeat_last_n, |
| | frequency_penalty=frequency_penalty, |
| | presence_penalty=presence_penalty, |
| | tfs_z=tfs_z, |
| | mirostat_mode=mirostat_mode, |
| | mirostat_tau=mirostat_tau, |
| | mirostat_eta=mirostat_eta, |
| | logits_processor=logits_processor, |
| | grammar=grammar, |
| | ) |
| | if stopping_criteria is not None and stopping_criteria( |
| | self._input_ids, self._scores[-1, :] |
| | ): |
| | return |
| | tokens = yield token |
| | if tokens is None: |
| | tokens = [token] |
| |
|
| | def load_session(self, filepath: str): |
| | n_tokens = POINTER(c_size_t)(c_size_t(0)) |
| | tokens = (llama_cpp.llama_token * self.n_ctx())() |
| | retn = llama_cpp.llama_load_session_file(self._ctx.ctx, |
| | filepath.encode('utf-8'), |
| | tokens, |
| | self.n_ctx(), |
| | n_tokens) |
| | self.n_tokens = n_tokens.contents.value |
| | self.input_ids[:self.n_tokens] = tokens[:self.n_tokens] |
| | self._venv_init() |
| | return retn |
| |
|
| | def save_session(self, filepath: str): |
| | tokens = self._input_ids.tolist() |
| | tokens = (llama_cpp.llama_token * len(tokens))(*tokens) |
| | return llama_cpp.llama_save_session_file(self._ctx.ctx, filepath.encode('utf-8'), tokens, self.n_tokens) |
| |
|