| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
"""Merge a LoRA into its base model, convert to GGUF (Q5_K_M by default), push to Hub.

Runs inside HF Jobs. Expects env vars:
    BASE_MODEL    - e.g. Sao10K/Llama-3.1-8B-Stheno-v3.4
    ADAPTER_MODEL - the trained LoRA repo
    GGUF_REPO     - target repo for the .gguf
    QUANTIZE      - one of f16, q8_0, q5_k_m, q4_k_m (default q5_k_m)
    HF_TOKEN      - write-scope token (passed via secrets)
"""
| import os |
| import subprocess |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi, create_repo |
| from peft import PeftModel |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
# Required job configuration. Each lookup raises KeyError at import time when
# the variable is missing, so a misconfigured job fails before any download.
BASE_MODEL = os.environ["BASE_MODEL"]
ADAPTER_MODEL = os.environ["ADAPTER_MODEL"]
GGUF_REPO = os.environ["GGUF_REPO"]
TOKEN = os.environ["HF_TOKEN"]

# Optional: quantization type handed to llama-quantize (upper-cased at use).
QUANTIZE = os.getenv("QUANTIZE", "q5_k_m")
|
|
|
|
def _merge_adapter(merged_dir: Path) -> Path:
    """Merge the LoRA adapter into the base model and save the result.

    Loads ``BASE_MODEL`` and its tokenizer, applies ``ADAPTER_MODEL`` on top,
    folds the adapter weights into the base weights, and writes the merged
    model and tokenizer to ``merged_dir``.

    Args:
        merged_dir: Directory the merged checkpoint is written to.

    Returns:
        ``merged_dir``, for convenient chaining.
    """
    print(f"[1/5] Loading base model: {BASE_MODEL}")
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, token=TOKEN)
    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype="auto", token=TOKEN)

    print(f"[2/5] Loading LoRA adapter: {ADAPTER_MODEL}")
    merged = PeftModel.from_pretrained(base, ADAPTER_MODEL, token=TOKEN)

    print("[3/5] Merging LoRA weights into base")
    merged = merged.merge_and_unload()
    merged.save_pretrained(merged_dir, safe_serialization=True)
    tok.save_pretrained(merged_dir)
    # The model objects are local to this helper, so they become collectable
    # on return, before the conversion subprocess loads the weights again.
    return merged_dir


def _convert_to_f16_gguf(merged_dir: Path, f16_path: Path) -> Path:
    """Clone llama.cpp and convert the merged checkpoint to an f16 GGUF.

    Args:
        merged_dir: Directory containing the merged Hugging Face checkpoint.
        f16_path: Destination of the f16 GGUF file.

    Returns:
        ``f16_path``.

    Raises:
        subprocess.CalledProcessError: If cloning, installing the converter's
            requirements, or the conversion itself exits non-zero.
    """
    import sys  # local import: only this helper needs the interpreter path

    print("[4/5] Cloning llama.cpp and running GGUF conversion (f16 intermediate)")
    subprocess.run(
        ["git", "clone", "--depth=1", "https://github.com/ggerganov/llama.cpp", "llama.cpp"],
        check=True,
    )
    # Use the running interpreter rather than whatever "pip"/"python" resolve
    # to on PATH, so the requirements land where the converter will run.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-r", "llama.cpp/requirements.txt"],
        check=True,
    )
    subprocess.run(
        [
            sys.executable, "llama.cpp/convert_hf_to_gguf.py",
            str(merged_dir), "--outtype", "f16", "--outfile", str(f16_path),
        ],
        check=True,
    )
    return f16_path


def _quantize_gguf(f16_path: Path, out_path: Path) -> Path:
    """Build ``llama-quantize`` and quantize the f16 GGUF to ``QUANTIZE``.

    Args:
        f16_path: The f16 GGUF produced by the conversion step.
        out_path: Destination of the quantized GGUF; must differ from
            ``f16_path``.

    Returns:
        ``out_path``.

    Raises:
        subprocess.CalledProcessError: If installing build tools, building,
            or quantizing exits non-zero.
    """
    print(f"[5/5] Installing build tools + building quantize binary, writing {QUANTIZE.upper()}")
    subprocess.run(["apt-get", "update", "-qq"], check=True)
    subprocess.run(["apt-get", "install", "-y", "-qq", "cmake", "build-essential"], check=True)
    subprocess.run(["cmake", "-B", "llama.cpp/build", "llama.cpp"], check=True)
    subprocess.run(
        ["cmake", "--build", "llama.cpp/build", "--target", "llama-quantize", "-j"],
        check=True,
    )
    quantize_bin = "llama.cpp/build/bin/llama-quantize"
    subprocess.run([quantize_bin, str(f16_path), str(out_path), QUANTIZE.upper()], check=True)
    return out_path


def _upload_gguf(local_path: Path, name_in_repo: str) -> None:
    """Create ``GGUF_REPO`` if needed (private) and upload one GGUF file.

    Args:
        local_path: File on disk to upload.
        name_in_repo: Filename the upload gets inside the repo.
    """
    print(f"Uploading {name_in_repo} to {GGUF_REPO}")
    create_repo(GGUF_REPO, repo_type="model", private=True, token=TOKEN, exist_ok=True)
    api = HfApi(token=TOKEN)
    api.upload_file(
        path_or_fileobj=str(local_path),
        path_in_repo=name_in_repo,
        repo_id=GGUF_REPO,
        repo_type="model",
        commit_message=f"upload {name_in_repo}",
    )
    print(f"Done -> https://huggingface.co/{GGUF_REPO}")


def main() -> None:
    """Run the full pipeline: merge LoRA, convert to GGUF, quantize, upload.

    Configuration comes from the module-level constants read from the
    environment (``BASE_MODEL``, ``ADAPTER_MODEL``, ``GGUF_REPO``,
    ``QUANTIZE``, ``TOKEN``).

    Raises:
        subprocess.CalledProcessError: If any external command fails.
    """
    merged_dir = _merge_adapter(Path("merged-model"))
    f16_path = _convert_to_f16_gguf(merged_dir, Path("model-f16.gguf"))

    gguf_name = f"model-{QUANTIZE}.gguf"
    if QUANTIZE.lower() == "f16":
        # The f16 intermediate already is the requested format. Quantizing
        # would make llama-quantize read and write the same model-f16.gguf,
        # so skip the build and upload the intermediate directly.
        print("[5/5] QUANTIZE is f16; skipping quantization, the f16 conversion is the final artifact")
        artifact = f16_path
    else:
        artifact = _quantize_gguf(f16_path, Path(gguf_name))

    _upload_gguf(artifact, gguf_name)
|
|
|
|
# Run the pipeline only when executed as a script, never on import.
if __name__ == "__main__":
    main()
|
|