lakki03 committed (verified)
Commit e984579 · Parent(s): d34b730

Update app.py

Files changed (1):
  1. app.py +22 -3
app.py CHANGED
@@ -8,6 +8,9 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
 from peft import PeftModel
 
+OFFLOAD_DIR = "offload"
+os.makedirs(OFFLOAD_DIR, exist_ok=True)
+
 
 # ----------------------------
 # 1. CONFIG
@@ -63,15 +66,24 @@ if tokenizer.pad_token_id is None:
 
 def load_lora(repo_id: str):
     """Load one LoRA adapter on top of the base model."""
+    # Base model, let HF/accelerate place layers (GPU + CPU)
     base = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
         torch_dtype=DTYPE,
         device_map="auto",
     )
-    model = PeftModel.from_pretrained(base, repo_id)
+
+    # IMPORTANT: give accelerate an offload folder
+    model = PeftModel.from_pretrained(
+        base,
+        repo_id,
+        device_map="auto",
+        offload_folder=OFFLOAD_DIR,  # <-- fixes the offload_dir error
+    )
     model.eval()
     return model
 
+
 print("Loading LoRA models (this happens once at startup)...")
 model_confusion = load_lora(HF_CONFUSION)
 model_engagement = load_lora(HF_ENGAGEMENT)
@@ -198,8 +210,11 @@ def make_prompt(label_name: str, means: dict):
 
 def run_one_model(model, label_name: str, means: dict) -> int:
     prompt = make_prompt(label_name, means)
+    # figure out the right device (for auto-sharded models this is e.g. "cuda:0")
+    device = getattr(model, "device", DEVICE)
+
     with torch.no_grad():
-        toks = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+        toks = tokenizer(prompt, return_tensors="pt").to(device)
         out = model.generate(
             **toks,
             max_new_tokens=1,
@@ -208,11 +223,15 @@ def run_one_model(model, label_name: str, means: dict) -> int:
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
         )
-    text = tokenizer.decode(out[0, toks["input_ids"].shape[1]:], skip_special_tokens=True)
+    text = tokenizer.decode(
+        out[0, toks["input_ids"].shape[1]:],
+        skip_special_tokens=True,
+    )
     m = re.search(r"[0-3]", text)
     return int(m.group()) if m else -1
 
 
+
 # ----------------------------
 # 5. GRADIO PIPELINE
 # ----------------------------
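
For context on the load_lora change: with device_map="auto", accelerate may place some layers on the CPU (or disk), and attaching a PEFT adapter to such a sharded model needs a directory to write the offloaded weights to; omitting it is the offload_dir error the new inline comment refers to. Below is a minimal standalone sketch of the same pattern; the repo ids and dtype are hypothetical placeholders (the Space's real BASE_MODEL, DTYPE, and adapter repos are defined in its CONFIG section).

import os

import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Hypothetical placeholders; the app defines BASE_MODEL, DTYPE and the
# adapter repo ids (HF_CONFUSION, HF_ENGAGEMENT) in its CONFIG section.
BASE_MODEL = "some-org/some-base-model"
ADAPTER_REPO = "some-user/some-lora-adapter"
DTYPE = torch.float16
OFFLOAD_DIR = "offload"

os.makedirs(OFFLOAD_DIR, exist_ok=True)

# Let accelerate shard the base model across the available GPU and CPU.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    device_map="auto",
)

# Attaching the adapter to an offloaded model needs somewhere to write the
# offloaded weights, hence offload_folder.
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    device_map="auto",
    offload_folder=OFFLOAD_DIR,
)
model.eval()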
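
And for the run_one_model change: with an auto-sharded model, the inputs should go to the model's own device (the device of its first shard) rather than a global DEVICE constant, and only the tokens generated after the prompt should be decoded. A hedged sketch of that flow follows; the helper name classify_one is hypothetical, and since not all of the app's generate() arguments are visible in this diff, greedy decoding is used here purely for illustration.

import re

import torch

def classify_one(prompt: str, model, tokenizer) -> int:
    # For auto-sharded models, model.device is the device of the first shard
    # (e.g. "cuda:0"); fall back to CPU if the attribute is missing.
    device = getattr(model, "device", torch.device("cpu"))

    with torch.no_grad():
        toks = tokenizer(prompt, return_tensors="pt").to(device)
        out = model.generate(
            **toks,
            max_new_tokens=1,
            do_sample=False,  # greedy decoding, for illustration only
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, i.e. everything after the prompt.
    new_tokens = out[0, toks["input_ids"].shape[1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # The app expects a single digit in 0-3; -1 signals that no label was parsed.
    m = re.search(r"[0-3]", text)
    return int(m.group()) if m else -1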