Instructions to use compilade/quant-tests with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- llama-cpp-python
How to use compilade/quant-tests with llama-cpp-python:
# !pip install llama-cpp-python
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="compilade/quant-tests",
    filename="TriLM_1.5B_Unpacked-TQ1_0-F16.gguf",
)
output = llm(
    "Once upon a time,",
    max_tokens=512,
    echo=True
)
print(output)
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- llama.cpp
How to use compilade/quant-tests with llama.cpp:
Install from brew
brew install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf compilade/quant-tests:F16
# Run inference directly in the terminal:
llama-cli -hf compilade/quant-tests:F16
Install from WinGet (Windows)
winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf compilade/quant-tests:F16
# Run inference directly in the terminal:
llama-cli -hf compilade/quant-tests:F16
Use pre-built binary
# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf compilade/quant-tests:F16
# Run inference directly in the terminal:
./llama-cli -hf compilade/quant-tests:F16
Build from source code
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf compilade/quant-tests:F16
# Run inference directly in the terminal:
./build/bin/llama-cli -hf compilade/quant-tests:F16
Use Docker
docker model run hf.co/compilade/quant-tests:F16
- LM Studio
- Jan
- Ollama
How to use compilade/quant-tests with Ollama:
ollama run hf.co/compilade/quant-tests:F16
- Unsloth Studio
How to use compilade/quant-tests with Unsloth Studio:
Install Unsloth Studio (macOS, Linux, WSL)
curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for compilade/quant-tests to start chatting
Install Unsloth Studio (Windows)
irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for compilade/quant-tests to start chatting
Using HuggingFace Spaces for Unsloth
# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for compilade/quant-tests to start chatting
- Docker Model Runner
How to use compilade/quant-tests with Docker Model Runner:
docker model run hf.co/compilade/quant-tests:F16
- Lemonade
How to use compilade/quant-tests with Lemonade:
Pull the model
# Download Lemonade from https://lemonade-server.ai/
lemonade pull compilade/quant-tests:F16
Run and chat with the model
lemonade run user.quant-tests-F16
List all available models
lemonade list
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from urllib import request | |
| import os | |
| import shlex | |
| import shutil | |
| import subprocess | |
| import sys | |
| from typing import Any, Sequence | |
| import logging | |
| import json | |
| import argparse | |
# Logger shared by every step of the benchmark.
logger = logging.getLogger("bench")

# Directory containing this script; the default paths below hang off it.
curdir = Path(os.path.dirname(__file__))

# Default locations; both are reassigned from the command-line options when
# the file is run as a script.
MODEL_DIR = curdir / "bench-TriLMs-models"
LLAMA_CPP_PATH = curdir / "."

# Size tags interpolated into the model file names (``TriLM_{size}B_...``).
MODEL_SIZES = (
    "1.5",
    "2.4",
    "3.9",
)

# Default quantization types for quantize() and llama_bench().
ALL_TYPES = (
    "TQ1_0",
    "TQ2_0",
    "Q4_K_M",
    "Q8_0",
    "F16",
    "BF16",
)

# Types benchmarked by the --gpu run.
GPU_TYPES = (
    "TQ2_0",
    "Q4_K_M",
    "Q8_0",
    "F16",
)
def gather_models(sizes: Sequence[str] = MODEL_SIZES):
    """Download the base TriLM GGUF files that are missing from ``MODEL_DIR``.

    For every entry of *sizes*, ``TriLM_{size}B_Unpacked-TQ1_0-F16.gguf`` is
    fetched from the ``compilade/quant-tests`` Hugging Face repository unless
    a file with that name already exists locally.

    Args:
        sizes: Size tags to fetch; defaults to ``MODEL_SIZES``.

    Raises:
        OSError: If the directory cannot be created or a download fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    logger.info("Gathering models")
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    for size in sizes:
        filename = f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        destination = MODEL_DIR / filename
        if destination.exists():
            continue
        url = f"https://huggingface.co/compilade/quant-tests/resolve/main/{filename}"
        logger.info("Fetching %s from %s", filename, url)
        # Download under a temporary name and rename only once the transfer
        # has completed: the existence check above would otherwise mistake a
        # truncated file left by an interrupted run for a finished model.
        partial = destination.with_name(destination.name + ".part")
        try:
            request.urlretrieve(url, partial)
            partial.replace(destination)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            partial.unlink(missing_ok=True)
def _run_build_step(command: Sequence[str], cwd: Path) -> bool:
    """Run one build command inside *cwd* and report whether it succeeded."""
    logger.info("Running: %s", shlex.join(command))
    try:
        returncode = subprocess.run(command, cwd=cwd).returncode
    except OSError as err:
        # Typically the tool itself is missing from PATH.
        logger.error("Could not start %s: %s", command[0], err)
        return False
    if returncode != 0:
        logger.error("%s exited with status %d", command[0], returncode)
        return False
    return True


def build_llama_cpp(options: Sequence[str]):
    """Configure and build the llama.cpp tools used by the benchmark.

    Any previous CMake configuration in ``LLAMA_CPP_PATH / "build"`` is
    discarded, ``cmake`` is run with *options*, and ``make`` then builds
    ``llama-bench``, ``llama-quantize`` and ``test-backend-ops``.

    Both tools run with the build directory as their working directory; the
    working directory of this process is left untouched, so relative paths
    used by the caller stay valid.

    Failures are logged rather than raised. ``make`` is skipped when the
    configure step fails, since it would otherwise build with a stale or
    missing configuration.

    Args:
        options: Extra command-line arguments passed to ``cmake``.
    """
    logger.info("Building llama.cpp")
    builddir = LLAMA_CPP_PATH / "build"
    if builddir.exists():
        # Clear previous config
        cmake_cache = builddir / "CMakeCache.txt"
        cmake_files = builddir / "CMakeFiles"
        logger.info("Removing %s and %s", cmake_cache, cmake_files)
        try:
            cmake_cache.unlink(missing_ok=True)
        except OSError as err:
            logger.warning("Could not remove %s: %s", cmake_cache, err)
        shutil.rmtree(cmake_files, ignore_errors=True)
    builddir.mkdir(exist_ok=True)

    if not _run_build_step(["cmake", "..", *options], builddir):
        return

    # os.cpu_count() returns None when the count cannot be determined.
    jobs = os.cpu_count() or 1
    _run_build_step(
        ["make", f"-j{jobs}", "llama-bench", "llama-quantize", "test-backend-ops"],
        builddir,
    )
def _is_nonempty_file(path: Path) -> bool:
    """Return True when *path* is a regular file with at least one byte."""
    return path.is_file() and path.stat().st_size > 0


def quantize(types: Sequence[str] = ALL_TYPES, sizes: Sequence[str] = MODEL_SIZES):
    """Create every requested quantization of every requested model size.

    Each ``TriLM_{size}B_Unpacked-{type}.gguf`` in ``MODEL_DIR`` that is
    missing or empty is produced from the matching
    ``TriLM_{size}B_Unpacked-TQ1_0-F16.gguf`` with ``llama-quantize
    --allow-requantize``. Existing non-empty targets are left alone.

    A failed conversion is logged and the remaining conversions are still
    attempted.

    Args:
        types: Quantization type names passed to ``llama-quantize``.
        sizes: Size tags selecting the source models.
    """
    logger.info("Make all model types we'll test")
    quantize_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-quantize")
    for size in sizes:
        source = MODEL_DIR / f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        for ty in types:
            target = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            if _is_nonempty_file(target):
                continue
            command = [
                quantize_bin,
                "--allow-requantize",
                str(source),
                str(target),
                ty,
            ]
            logger.info("Running: %s", shlex.join(command))
            try:
                # Argument list, no shell: paths need no quoting.
                returncode = subprocess.run(command).returncode
            except OSError as err:
                logger.error("Could not start %s: %s", quantize_bin, err)
                returncode = -1
            # A zero exit status is not enough: the output must also exist
            # and be non-empty to count as a success.
            if returncode != 0 or not _is_nonempty_file(target):
                logger.error("Failed to quantize to %s", target)
                # Should it still continue?
def llama_bench(
    repetitions: int = 5,
    types: Sequence[str] = ALL_TYPES,
    sizes: Sequence[str] = MODEL_SIZES,
) -> list[dict[str, Any]]:
    """Run ``llama-bench`` on every model for several thread counts.

    Thread counts are the powers of two from 1 to 16 that do not exceed the
    number of CPUs. Each run uses a 512-token prompt (``-p 512``), generates
    128 tokens (``-n 128``) and requests JSON output.

    When a run fails or its output cannot be parsed, the error is logged and
    the remaining thread counts for that model are skipped; other models are
    still benchmarked.

    Args:
        repetitions: Value passed to ``llama-bench -r``.
        types: Quantization types selecting the model files.
        sizes: Size tags selecting the model files.

    Returns:
        The concatenated JSON entries reported by all successful runs.
    """
    logger.info("Test each model one by one for different numbers of threads")
    # os.cpu_count() returns None when the count cannot be determined;
    # comparing an int against None would raise TypeError.
    cpu_count = os.cpu_count() or 1
    threads = [2**i for i in range(5) if 2**i <= cpu_count]
    logger.info(f"Numbers of threads to be tested: {threads}")
    bench_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-bench")
    out: list[dict[str, Any]] = []
    for size in sizes:
        for ty in types:
            model_path = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            for th in threads:
                command = [
                    bench_bin,
                    "-v",
                    "-m",
                    str(model_path),
                    "-t",
                    str(th),
                    "-r",
                    str(repetitions),
                    "-p",
                    "512",
                    "-n",
                    "128",
                    "-o",
                    "json",
                ]
                logger.info("Running: %s", " ".join(command))
                result = subprocess.run(command, capture_output=True)
                logger.debug(result.stderr.decode(errors="ignore"))
                if result.returncode != 0 or not result.stdout:
                    logger.error("Failed to run %s", " ".join(command))
                    break
                try:
                    new_output = json.loads(result.stdout)
                except ValueError as err:
                    # JSONDecodeError and UnicodeDecodeError are both
                    # ValueError subclasses; keep the results gathered so far.
                    logger.error(
                        "Could not parse the output of %s: %s",
                        " ".join(command),
                        err,
                    )
                    break
                logger.info(json.dumps(new_output, indent=4))
                out.extend(new_output)
    return out
def test_backend_perf() -> str:
    """Run ``test-backend-ops perf -o MUL_MAT`` and return its stdout as text.

    The output is also written to the debug log. The exit status of the tool
    is not inspected.
    """
    logger.info("Test MUL_MAT performance")
    binary = LLAMA_CPP_PATH / "build" / "bin" / "test-backend-ops"
    command = [str(binary), "perf", "-o", "MUL_MAT"]
    completed = subprocess.run(command, capture_output=True)
    report = completed.stdout.decode(encoding="utf-8")
    logger.debug(report)
    return report
def parse_args(args: Sequence[str]):
    """Parse the benchmark's command line.

    Args:
        args: Full argument vector; ``args[0]`` is used as the program name
            and the remaining items are parsed as options.

    Returns:
        The ``argparse.Namespace`` with ``gpu``, ``cpu``, ``llama_cpp_path``,
        ``model_dir``, ``repetitions``, ``out`` and ``force``.
    """
    parser = argparse.ArgumentParser(
        prog=args[0],
        description="Benchmark ternary models",
    )

    # Backend selection switches.
    for flag, backend in (("--gpu", "GPU"), ("--cpu", "CPU")):
        parser.add_argument(
            flag, action="store_true", help=f"Run benchmarks on {backend}"
        )

    # Directory options defaulting to the module-level locations.
    directory_options = (
        ("--llama-cpp-path", LLAMA_CPP_PATH, "Path to a llama.cpp checkout"),
        ("--model-dir", MODEL_DIR, "Where the tested models will be stored"),
    )
    for flag, default, description in directory_options:
        parser.add_argument(flag, type=Path, default=default, help=description)

    parser.add_argument(
        "--repetitions",
        type=int,
        default=5,
        help="How many repetitions are run for each test",
    )
    parser.add_argument(
        "--out",
        type=Path,
        default=Path(os.path.curdir, "result.json"),
        help="Path of the benchmark results to be written",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite the result file without asking",
    )

    remaining = args[1:]
    return parser.parse_args(remaining)
if __name__ == "__main__":
    # Script entry point: optionally run the CPU and/or GPU benchmark passes,
    # then write everything that was collected to a single JSON file.
    args = parse_args(sys.argv)
    logging.basicConfig(level=logging.DEBUG)

    # The helper functions read these module-level globals. They are made
    # absolute up front because build_llama_cpp() changes the working
    # directory, which would invalidate relative paths.
    LLAMA_CPP_PATH = Path(args.llama_cpp_path).absolute()
    MODEL_DIR = Path(args.model_dir).absolute()
    output_file = Path(args.out).absolute()

    if output_file.exists() and not args.force:
        ask = input("Result file exists. Do you want to overwrite it? [y/N]")
        if not ask.strip().lower().startswith("y"):
            logger.info("Not running, leaving output file intact")
            # sys.exit() rather than the interactive-only exit() helper,
            # which is missing when the site module is not loaded.
            sys.exit()

    if not (args.cpu or args.gpu):
        logger.warning("Neither --cpu nor --gpu was given; no benchmark will be run")

    results = []
    mulmat_perf = []
    repetitions: int = args.repetitions

    if args.cpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CPU=ON"])
        quantize()
        mulmat_perf.append(test_backend_perf())
        results.extend(llama_bench(repetitions=repetitions))

    if args.gpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CUDA=ON", "-DGGML_CUDA_F16=ON"])
        quantize()
        mulmat_perf.append(test_backend_perf())
        results.extend(llama_bench(repetitions=repetitions, types=GPU_TYPES))

    final_result: dict[str, Any] = {
        "mulmat_perf": mulmat_perf,
        "results": results,
    }

    # Attach hardware descriptions when the reporting tools are available.
    if shutil.which("lscpu") is not None:
        logger.info("Getting CPU info")
        final_result["cpuinfo"] = subprocess.run(
            ["lscpu"], capture_output=True
        ).stdout.decode(encoding="utf-8")
    if args.gpu and shutil.which("nvidia-smi") is not None:
        logger.info("Getting NVIDIA GPU info")
        final_result["gpuinfo"] = subprocess.run(
            ["nvidia-smi", "-q"], capture_output=True
        ).stdout.decode(encoding="utf-8")

    logger.info("Writing output to: %s", output_file)
    logger.debug("Final results: %s", json.dumps(final_result, indent=4))
    # The with-block flushes and closes the file on exit.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_result, f, indent=4)