raunch-training-scripts / convert_gguf.py
4moha's picture
fix: apt-install cmake + build-essential before llama.cpp build
d7a8652 verified
Raw
History Blame Contribute Delete
3.37 kB
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "transformers>=4.45",
# "peft>=0.7.0",
# "huggingface_hub>=0.24",
# "torch>=2.4",
# "sentencepiece",
# "protobuf",
# "gguf",
# ]
# ///
"""Merge a LoRA into its base model, convert to GGUF Q5_K_M, push to Hub.
Runs inside HF Jobs. Expects env vars:
BASE_MODEL - e.g. Sao10K/Llama-3.1-8B-Stheno-v3.4
ADAPTER_MODEL - the trained LoRA repo
GGUF_REPO - target repo for the .gguf
QUANTIZE - one of f16, q8_0, q5_k_m, q4_k_m (default q5_k_m)
HF_TOKEN - write-scope token (passed via secrets)
"""
import os
import subprocess
from pathlib import Path
from huggingface_hub import HfApi, create_repo
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Job configuration, read from the environment once at import time.
# The os.environ[...] lookups raise KeyError when the variable is unset;
# QUANTIZE is the only setting with a fallback value.
# Base model identifier passed to from_pretrained().
BASE_MODEL = os.environ["BASE_MODEL"]
# LoRA adapter identifier that gets merged into the base model.
ADAPTER_MODEL = os.environ["ADAPTER_MODEL"]
# Hub repo that receives the resulting .gguf file.
GGUF_REPO = os.environ["GGUF_REPO"]
# Quantization type; used verbatim in the output filename and upper-cased
# when handed to llama-quantize.
QUANTIZE = os.environ.get("QUANTIZE", "q5_k_m")
# Token used for every Hub download, repo creation, and the final upload.
TOKEN = os.environ["HF_TOKEN"]
def main() -> None:
    """Merge the LoRA into its base model, export a GGUF file, and upload it.

    Steps, in order:
      1. Load the tokenizer and base model named by ``BASE_MODEL``.
      2. Attach the LoRA adapter named by ``ADAPTER_MODEL``.
      3. Merge the adapter weights and save the result to ``merged-model/``.
      4. Clone llama.cpp and convert the merged checkpoint to an f16 GGUF.
      5. Unless ``QUANTIZE`` is f16, install build tools, build
         ``llama-quantize`` and quantize the f16 file.

    The resulting ``model-<QUANTIZE>.gguf`` is then uploaded to ``GGUF_REPO``,
    which is created as a private model repo if it does not exist yet.

    Raises:
        subprocess.CalledProcessError: if any external command (git, pip,
            the converter, apt-get, cmake, llama-quantize) exits non-zero.
    """
    print(f"[1/5] Loading base model: {BASE_MODEL}")
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, token=TOKEN)
    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype="auto", token=TOKEN)

    print(f"[2/5] Loading LoRA adapter: {ADAPTER_MODEL}")
    merged = PeftModel.from_pretrained(base, ADAPTER_MODEL, token=TOKEN)

    print("[3/5] Merging LoRA weights into base")
    merged = merged.merge_and_unload()
    merged_dir = Path("merged-model")
    merged.save_pretrained(merged_dir, safe_serialization=True)
    tok.save_pretrained(merged_dir)
    # Every later step works from the files on disk. Drop the in-memory model
    # so its weights are not held while the converter and quantizer run.
    del base, merged, tok

    print("[4/5] Cloning llama.cpp and running GGUF conversion (f16 intermediate)")
    subprocess.run(
        ["git", "clone", "--depth=1", "https://github.com/ggerganov/llama.cpp", "llama.cpp"],
        check=True,
    )
    subprocess.run(["pip", "install", "-r", "llama.cpp/requirements.txt"], check=True)
    f16_path = Path("model-f16.gguf")
    subprocess.run(
        [
            "python", "llama.cpp/convert_hf_to_gguf.py",
            str(merged_dir), "--outtype", "f16", "--outfile", str(f16_path),
        ],
        check=True,
    )

    quant_type = QUANTIZE.upper()
    out_path = Path(f"model-{QUANTIZE}.gguf")
    if quant_type == "F16":
        # The conversion output already is the requested format. Running
        # llama-quantize here would use the same file as input and output
        # when QUANTIZE is "f16", so skip the build and quantize step.
        print("[5/5] QUANTIZE is f16; using the f16 conversion output as-is")
        if out_path != f16_path:
            # Keep the uploaded filename derived from QUANTIZE exactly as
            # given (e.g. "F16" -> model-F16.gguf).
            f16_path.replace(out_path)
    else:
        print(f"[5/5] Installing build tools + building quantize binary, writing {QUANTIZE.upper()}")
        # The uv Docker image (python3.12-bookworm) has no cmake by default. Install
        # via apt (container runs as root). build-essential brings gcc/g++/make.
        subprocess.run(["apt-get", "update", "-qq"], check=True)
        subprocess.run(
            ["apt-get", "install", "-y", "-qq", "cmake", "build-essential"],
            check=True,
        )
        subprocess.run(["cmake", "-B", "llama.cpp/build", "llama.cpp"], check=True)
        subprocess.run(
            ["cmake", "--build", "llama.cpp/build", "--target", "llama-quantize", "-j"],
            check=True,
        )
        quantize_bin = "llama.cpp/build/bin/llama-quantize"
        subprocess.run([quantize_bin, str(f16_path), str(out_path), quant_type], check=True)

    print(f"Uploading {out_path} to {GGUF_REPO}")
    create_repo(GGUF_REPO, repo_type="model", private=True, token=TOKEN, exist_ok=True)
    api = HfApi(token=TOKEN)
    api.upload_file(
        path_or_fileobj=str(out_path),
        path_in_repo=out_path.name,
        repo_id=GGUF_REPO,
        repo_type="model",
        commit_message=f"upload {out_path.name}",
    )
    print(f"Done -> https://huggingface.co/{GGUF_REPO}")


if __name__ == "__main__":
    main()