""" quickstart.py — Minimal 10-line example of ProactiveCache. Shows how to apply O(n) generation to any HuggingFace model. """ from transformers import AutoModelForCausalLM, AutoTokenizer from proactive_cache import ProactiveCache MODEL = "meta-llama/Llama-3.1-8B" # replace with any HF model # Load model (any HuggingFace CausalLM) tokenizer = AutoTokenizer.from_pretrained(MODEL) model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto") # ── Step 1: Apply O(n) eviction (one line) ─────────────────────────────────── model = ProactiveCache.apply(model, budget=512) # ── Step 2: Profile once on calibration data (saves proactive_cache_prototypes.pkl) ProactiveCache.profile(model, tokenizer, corpus="wikitext", num_docs=50) # ── Step 3: All inference is now O(n) ──────────────────────────────────────── prompt = "In the age of long-context language models," inputs = tokenizer(prompt, return_tensors="pt").to(model.device) output = model.generate(**inputs, max_new_tokens=200, do_sample=False) print(tokenizer.decode(output[0], skip_special_tokens=True))