Update app.py
app.py
CHANGED
@@ -8,6 +8,9 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
 from peft import PeftModel
 
+OFFLOAD_DIR = "offload"
+os.makedirs(OFFLOAD_DIR, exist_ok=True)
+
 
 # ----------------------------
 # 1. CONFIG
@@ -63,15 +66,24 @@ if tokenizer.pad_token_id is None:
 
 def load_lora(repo_id: str):
     """Load one LoRA adapter on top of the base model."""
+    # Base model, let HF/accelerate place layers (GPU + CPU)
     base = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
         torch_dtype=DTYPE,
         device_map="auto",
     )
-
+
+    # IMPORTANT: give accelerate an offload folder
+    model = PeftModel.from_pretrained(
+        base,
+        repo_id,
+        device_map="auto",
+        offload_folder=OFFLOAD_DIR,  # <-- fixes the offload_dir error
+    )
     model.eval()
     return model
 
+
 print("Loading LoRA models (this happens once at startup)...")
 model_confusion = load_lora(HF_CONFUSION)
 model_engagement = load_lora(HF_ENGAGEMENT)
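A note on the hunk above: accelerate only needs an offload folder when its device map sends some modules to "disk"; those weights are then written under OFFLOAD_DIR and paged back in during the forward pass, and dispatching such a model without a folder is what raises the offload_dir error the inline comment refers to. A minimal sketch for checking where the layers actually landed, assuming the models were built by load_lora() above with device_map="auto" (hf_device_map is only present on models dispatched by accelerate):

from collections import Counter

# Module name -> placement, e.g. "cuda:0", "cpu", or "disk" (values may also be bare GPU indices).
placements = getattr(model_confusion, "hf_device_map", {})
print(Counter(placements.values()))
# Any module mapped to "disk" is served from OFFLOAD_DIR at inference time.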
@@ -198,8 +210,11 @@ def make_prompt(label_name: str, means: dict):
 
 def run_one_model(model, label_name: str, means: dict) -> int:
     prompt = make_prompt(label_name, means)
+    # figure out the right device (for auto-sharded models this is e.g. "cuda:0")
+    device = getattr(model, "device", DEVICE)
+
     with torch.no_grad():
-        toks = tokenizer(prompt, return_tensors="pt").to(
+        toks = tokenizer(prompt, return_tensors="pt").to(device)
         out = model.generate(
             **toks,
             max_new_tokens=1,
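For context on the device lookup added above: with device_map="auto" the first layers of the model can sit on the GPU while the freshly tokenized tensors stay on the CPU, so generate() fails with a device-mismatch RuntimeError; moving the inputs to model.device (the device of the model's first parameters, usually the embeddings) avoids that. A small sketch of the same pattern in isolation, assuming DEVICE is the fallback torch device defined in the CONFIG section and model_engagement comes from load_lora() above:

# getattr() falls back to DEVICE in case the wrapper does not expose a .device attribute.
device = getattr(model_engagement, "device", DEVICE)
toks = tokenizer("smoke-test prompt", return_tensors="pt").to(device)
print(device, toks["input_ids"].shape)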
@@ -208,11 +223,15 @@ def run_one_model(model, label_name: str, means: dict) -> int:
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
         )
-    text = tokenizer.decode(
+    text = tokenizer.decode(
+        out[0, toks["input_ids"].shape[1]:],
+        skip_special_tokens=True,
+    )
     m = re.search(r"[0-3]", text)
     return int(m.group()) if m else -1
 
 
+
 # ----------------------------
 # 5. GRADIO PIPELINE
 # ----------------------------
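The decode change in the last hunk keeps only the newly generated tokens. For a decoder-only model, generate() returns the prompt ids followed by the new ids, so decoding the whole sequence would let the [0-3] regex match a digit that happens to appear in the prompt rather than in the model's answer. A short sketch of the slicing, under the same assumptions as above (toks and out produced inside run_one_model):

prompt_len = toks["input_ids"].shape[1]                                   # number of prompt tokens
full   = tokenizer.decode(out[0], skip_special_tokens=True)               # prompt + answer
answer = tokenizer.decode(out[0, prompt_len:], skip_special_tokens=True)  # answer only
# With max_new_tokens=1, answer should hold a single "0"-"3" label, which is what
# re.search(r"[0-3]") extracts; run_one_model returns -1 when no digit is found.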