# Dynamic int8 quantization of an ONNX model via onnxruntime. Dynamic (not
# static) because we do not have a calibration dataset, and dynamic works well
# for decoder-style LMs.

from pathlib import Path


def quantize_int8(fp32_onnx: Path, out_dir: Path) -> Path:
    """Write a dynamically int8-quantized copy of *fp32_onnx* into *out_dir*.

    The output file is always named ``model.onnx`` inside *out_dir*; the
    directory (and any missing parents) is created if needed.

    Args:
        fp32_onnx: Path to the source ONNX model. Anything accepted by
            ``pathlib.Path`` works (e.g. a plain string).
        out_dir: Directory that will receive the quantized ``model.onnx``.

    Returns:
        Path to the quantized model file.

    Raises:
        FileNotFoundError: If *fp32_onnx* does not exist or is not a regular
            file. Raised before any directory is created or any output is
            printed.
    """
    fp32_onnx = Path(fp32_onnx)
    out_dir = Path(out_dir)

    # Fail fast with a clear error instead of letting the quantizer fail later,
    # after the output directory has already been created.
    if not fp32_onnx.is_file():
        raise FileNotFoundError(f"[quantize] input model not found: {fp32_onnx}")

    # Function-local import, as in the original; presumably so that importing
    # this module does not itself require onnxruntime -- confirm with callers.
    from onnxruntime.quantization import QuantType, quantize_dynamic

    out_dir.mkdir(parents=True, exist_ok=True)
    int8_path = out_dir / "model.onnx"

    print(f"[quantize] {fp32_onnx} -> {int8_path}")
    quantize_dynamic(
        model_input=str(fp32_onnx),
        model_output=str(int8_path),
        weight_type=QuantType.QInt8,
    )
    print(f"[quantize] wrote {int8_path} ({int8_path.stat().st_size / 1e6:.1f} MB)")
    return int8_path