# Spaces:
# Sleeping
# Sleeping
# File size: 1,245 Bytes
# b786614
"""
quickstart.py — Minimal 10-line example of ProactiveCache.
Shows how to apply O(n) generation to any HuggingFace model.
"""
from transformers import AutoModelForCausalLM, AutoTokenizer
from proactive_cache import ProactiveCache
MODEL = "meta-llama/Llama-3.1-8B"  # any other HF model id can be substituted here

# Fetch the tokenizer and the causal-LM weights for MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")

# ── Step 1: Enable ProactiveCache eviction on the model (budget=512) ─────────
model = ProactiveCache.apply(model, budget=512)

# ── Step 2: One-time profiling pass over calibration data ────────────────────
# (per the original note, this saves proactive_cache_prototypes.pkl)
ProactiveCache.profile(
    model,
    tokenizer,
    corpus="wikitext",
    num_docs=50,
)

# ── Step 3: Generate as usual; inference is now O(n) ─────────────────────────
prompt = "In the age of long-context language models,"

# Tokenize first, then move the encoded batch onto the model's device.
encoded = tokenizer(prompt, return_tensors="pt")
inputs = encoded.to(model.device)

# do_sample=False requests deterministic (non-sampled) decoding.
output = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=False,
)

# Decode the first (only) returned sequence and show it.
completion = tokenizer.decode(output[0], skip_special_tokens=True)
print(completion)
|