proactive-cache / examples /quickstart.py
skhavin's picture
feat: initial release of proactive-cache v0.1.0
b786614
Raw
History Blame Contribute Delete
1.25 kB
"""
quickstart.py — Minimal quickstart example for ProactiveCache.
Shows how to enable O(n) generation on any Hugging Face causal language model.
"""
from transformers import AutoModelForCausalLM, AutoTokenizer
from proactive_cache import ProactiveCache
MODEL = "meta-llama/Llama-3.1-8B"  # swap in any other HF checkpoint id
CACHE_BUDGET = 512  # value passed as `budget` to ProactiveCache.apply
GENERATION_KWARGS = {"max_new_tokens": 200, "do_sample": False}

# Load the tokenizer and the causal LM for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")

# ── Step 1: apply O(n) eviction; keep using the model that apply() returns ──
model = ProactiveCache.apply(model, budget=CACHE_BUDGET)

# ── Step 2: one-off profiling pass on calibration data
#    (saves proactive_cache_prototypes.pkl)
ProactiveCache.profile(model, tokenizer, corpus="wikitext", num_docs=50)

# ── Step 3: from here on, all inference is O(n) ──────────────────────────────
prompt = "In the age of long-context language models,"
encoded = tokenizer(prompt, return_tensors="pt")
encoded = encoded.to(model.device)  # move the inputs to the model's device
generated = model.generate(**encoded, **GENERATION_KWARGS)
completion = tokenizer.decode(generated[0], skip_special_tokens=True)
print(completion)