| | #include "llama-adapter.h" |
| |
|
| | #include "llama-impl.h" |
| | #include "llama-mmap.h" |
| | #include "llama-model.h" |
| |
|
| | #include <algorithm> |
| | #include <map> |
| | #include <cassert> |
| | #include <stdexcept> |
| |
|
| | |
| |
|
| | struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const { |
| | if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { |
| | return nullptr; |
| | } |
| |
|
| | return tensors[il]; |
| | } |
| |
|
| | struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { |
| | ggml_tensor * layer_dir = tensor_for(il); |
| | if (layer_dir != nullptr) { |
| | cur = ggml_add(ctx, cur, layer_dir); |
| | } |
| |
|
| | return cur; |
| | } |
| |
|
| | bool llama_adapter_cvec::init(const llama_model & model) { |
| | const auto & hparams = model.hparams; |
| |
|
| | GGML_ASSERT(tensors.empty()); |
| | GGML_ASSERT(ctxs.empty()); |
| | GGML_ASSERT(bufs.empty()); |
| |
|
| | |
| | std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; |
| | auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { |
| | auto it = ctx_map.find(buft); |
| | if (it == ctx_map.end()) { |
| | struct ggml_init_params params = { |
| | hparams.n_layer*ggml_tensor_overhead(), |
| | NULL, |
| | true, |
| | }; |
| |
|
| | ggml_context * ctx = ggml_init(params); |
| | if (!ctx) { |
| | return nullptr; |
| | } |
| |
|
| | ctx_map[buft] = ctx; |
| | ctxs.emplace_back(ctx); |
| |
|
| | return ctx; |
| | } |
| |
|
| | return it->second; |
| | }; |
| |
|
| | |
| | tensors.reserve(hparams.n_layer); |
| | tensors.push_back(nullptr); |
| | for (size_t il = 1; il < hparams.n_layer; il++) { |
| | ggml_backend_buffer_type_t buft = model.select_buft(il); |
| | ggml_context * ctx = ctx_for_buft(buft); |
| | if (!ctx) { |
| | LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); |
| | return false; |
| | } |
| | ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); |
| | tensors.push_back(tensor); |
| | } |
| |
|
| | |
| | bufs.reserve(ctx_map.size()); |
| | for (auto it : ctx_map) { |
| | ggml_backend_buffer_type_t buft = it.first; |
| | ggml_context * ctx = it.second; |
| | ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); |
| | if (!buf) { |
| | LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); |
| | return false; |
| | } |
| | ggml_backend_buffer_clear(buf, 0); |
| | bufs.emplace_back(buf); |
| | } |
| |
|
| | return true; |
| | } |
| |
|
| | int32_t llama_adapter_cvec::apply( |
| | const llama_model & model, |
| | const float * data, |
| | size_t len, |
| | int32_t n_embd, |
| | int32_t il_start, |
| | int32_t il_end) { |
| | const auto & hparams = model.hparams; |
| |
|
| | if (data == nullptr) { |
| | |
| | layer_start = -1; |
| | layer_end = -1; |
| | return 0; |
| | } |
| |
|
| | if (n_embd != (int) hparams.n_embd) { |
| | LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); |
| | return 1; |
| | } |
| |
|
| | if (tensors.empty()) { |
| | if (!init(model)) { |
| | return 1; |
| | } |
| | } |
| |
|
| | layer_start = il_start; |
| | layer_end = il_end; |
| |
|
| | for (size_t il = 1; il < hparams.n_layer; il++) { |
| | assert(tensors[il] != nullptr); |
| |
|
| | const size_t off = n_embd * (il - 1); |
| | if (off + n_embd <= len) { |
| | ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il])); |
| | } |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
| | |
| |
|
| | llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) { |
| | const std::string name(w->name); |
| |
|
| | const auto pos = ab_map.find(name); |
| | if (pos != ab_map.end()) { |
| | return &pos->second; |
| | } |
| |
|
| | return nullptr; |
| | } |
| |
|
| | static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) { |
| | LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); |
| |
|
| | ggml_context * ctx_init; |
| | struct gguf_init_params meta_gguf_params = { |
| | true, |
| | &ctx_init, |
| | }; |
| |
|
| | gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; |
| | if (!ctx_gguf) { |
| | throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); |
| | } |
| |
|
| | ggml_context_ptr ctx { ctx_init }; |
| |
|
| | |
| | { |
| | auto get_kv_str = [&](const std::string & key) -> std::string { |
| | int id = gguf_find_key(ctx_gguf.get(), key.c_str()); |
| | return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); |
| | }; |
| | auto get_kv_f32 = [&](const std::string & key) -> float { |
| | int id = gguf_find_key(ctx_gguf.get(), key.c_str()); |
| | return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); |
| | }; |
| | LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); |
| |
|
| | auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); |
| | if (general_type != "adapter") { |
| | throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); |
| | } |
| |
|
| | auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); |
| | auto general_arch = llm_arch_from_string(general_arch_str); |
| | if (general_arch != model.arch) { |
| | throw std::runtime_error("model arch and LoRA arch mismatch"); |
| | } |
| |
|
| | auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); |
| | if (adapter_type != "lora") { |
| | throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); |
| | } |
| |
|
| | adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); |
| | } |
| |
|
| | int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); |
| |
|
| | |
| | std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; |
| | auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { |
| | auto it = ctx_map.find(buft); |
| | if (it == ctx_map.end()) { |
| | |
| | struct ggml_init_params params = { |
| | n_tensors*ggml_tensor_overhead(), |
| | NULL, |
| | true, |
| | }; |
| | ggml_context * buft_ctx = ggml_init(params); |
| | if (!buft_ctx) { |
| | return nullptr; |
| | } |
| | ctx_map[buft] = buft_ctx; |
| | adapter.ctxs.emplace_back(buft_ctx); |
| | return buft_ctx; |
| | }; |
| | return it->second; |
| | }; |
| |
|
| | |
| | std::map<std::string, llama_adapter_lora_weight> ab_map; |
| | auto str_endswith = [](const std::string & str, const std::string & suffix) { |
| | return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; |
| | }; |
| |
|
| | for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { |
| | std::string name(cur->name); |
| | if (str_endswith(name, ".lora_a")) { |
| | replace_all(name, ".lora_a", ""); |
| | if (ab_map.find(name) == ab_map.end()) { |
| | ab_map[name] = llama_adapter_lora_weight(cur, nullptr); |
| | } else { |
| | ab_map[name].a = cur; |
| | } |
| | } else if (str_endswith(name, ".lora_b")) { |
| | replace_all(name, ".lora_b", ""); |
| | if (ab_map.find(name) == ab_map.end()) { |
| | ab_map[name] = llama_adapter_lora_weight(nullptr, cur); |
| | } else { |
| | ab_map[name].b = cur; |
| | } |
| | } else if (str_endswith(name, "_norm.weight")) { |
| | |
| | |
| | continue; |
| | } else { |
| | throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); |
| | } |
| | } |
| |
|
| | |
| | for (auto & it : ab_map) { |
| | const std::string & name = it.first; |
| | llama_adapter_lora_weight & w = it.second; |
| | bool is_token_embd = str_endswith(name, "token_embd.weight"); |
| |
|
| | if (!w.a || !w.b) { |
| | throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); |
| | } |
| |
|
| | |
| | const auto * model_tensor = model.get_tensor(name.c_str()); |
| | if (!model_tensor) { |
| | throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)"); |
| | } |
| |
|
| | struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); |
| | |
| | if (is_token_embd) { |
| | |
| | if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) { |
| | throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)"); |
| | } |
| | } else { |
| | if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { |
| | throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)"); |
| | } |
| | if (w.a->ne[1] != w.b->ne[0]) { |
| | throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); |
| | } |
| | } |
| |
|
| | |
| | struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); |
| | struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); |
| | ggml_set_name(tensor_a, w.a->name); |
| | ggml_set_name(tensor_b, w.b->name); |
| | adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b); |
| | } |
| |
|
| | |
| | { |
| | adapter.ctxs.reserve(ctx_map.size()); |
| | adapter.bufs.reserve(ctx_map.size()); |
| | for (auto & it : ctx_map) { |
| | ggml_backend_buffer_type_t buft = it.first; |
| | ggml_context * ctx_dev = it.second; |
| | ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) }; |
| | if (!buf) { |
| | throw std::runtime_error("failed to allocate buffer for lora adapter\n"); |
| | } |
| | LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0); |
| | adapter.bufs.emplace_back(std::move(buf)); |
| | } |
| | } |
| |
|
| | |
| | { |
| | llama_file gguf_file(path_lora, "rb"); |
| | std::vector<uint8_t> read_buf; |
| | auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { |
| | size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); |
| | size_t size = ggml_nbytes(orig); |
| | read_buf.resize(size); |
| | gguf_file.seek(offs, SEEK_SET); |
| | gguf_file.read_raw(read_buf.data(), size); |
| | ggml_backend_tensor_set(dev, read_buf.data(), 0, size); |
| | }; |
| | for (auto & it : adapter.ab_map) { |
| | auto orig = ab_map[it.first]; |
| | auto dev = it.second; |
| | set_tensor(orig.a, dev.a); |
| | set_tensor(orig.b, dev.b); |
| | } |
| | } |
| |
|
| | LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); |
| | } |
| |
|
| | struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) { |
| | struct llama_adapter_lora * adapter = new llama_adapter_lora(); |
| |
|
| | try { |
| | llama_adapter_lora_init_impl(*model, path_lora, *adapter); |
| | return adapter; |
| | } catch (const std::exception & err) { |
| | LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); |
| |
|
| | delete adapter; |
| | } |
| |
|
| | return nullptr; |
| | } |
| |
|
// Destroy an adapter with `delete`. Passing nullptr is a no-op.
void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
    delete adapter;
}
| |
|