bhardwaj08sarthak committed
Commit 700d92f · verified · 1 Parent(s): 767f314

Update app.py

Files changed (1)
  1. app.py +18 -51
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import json
 import gradio as gr
+import spaces
 from huggingface_hub import InferenceClient
 from smolagents import CodeAgent, InferenceClientModel, tool
 
@@ -77,23 +78,7 @@ def classify_and_score(
     target_dok: str,
     agg: str = "max"
 ) -> dict:
-    """Classify a question against Bloom’s and DOK targets and return guidance.
-
-    Args:
-        question: The question text to evaluate for cognitive demand.
-        target_bloom: Target Bloom’s level or range. Accepts exact (e.g., "Analyze")
-            or plus form (e.g., "Apply+") meaning that level or higher.
-        target_dok: Target DOK level or range. Accepts exact (e.g., "DOK3")
-            or span (e.g., "DOK2-DOK3").
-        agg: Aggregation method over phrase similarities within a level
-            (choices: "mean", "max", "topk_mean").
-
-    Returns:
-        A dictionary with:
-        ok: True if both Bloom’s and DOK match the targets.
-        measured: Dict with best levels and per-level scores for Bloom’s and DOK.
-        feedback: Brief guidance describing how to adjust the question to hit targets.
-    """
+    """Classify a question against Bloom’s and DOK targets and return guidance."""
     res = classify_levels_phrases(
         question,
         BLOOMS_PHRASES,
@@ -173,15 +158,16 @@ def classify_and_score(
         "feedback": " ".join(feedback_parts) if feedback_parts else "On target.",
     }
 
-# ------------------------ Backend selection + caching ------------------------
-_LOCAL_MODEL_CACHE = {
-    "model": None,
-    "model_id": None,
-}
+# ------------------------ Backend selection + GPU-wrapped local loader ------------------------
+_LOCAL_MODEL_CACHE = {"model": None, "model_id": None}
 
-def _get_local_model(model_id: str):
-    """Lazy-load and cache a local Transformers model for smolagents."""
-    # Import here so Hosted mode doesn't require local deps.
+@spaces.GPU(duration=30)  # request GPU only when loading/using local model
+def get_local_model_gpu(model_id: str):
+    """
+    Load and cache a local Transformers model for smolagents on GPU.
+    Decorated so Spaces knows this task needs a GPU.
+    """
+    # Import here to keep Hosted mode lightweight.
     try:
         from smolagents import TransformersModel  # provided by smolagents
     except Exception as e:
@@ -196,16 +182,14 @@ def _get_local_model(model_id: str):
     ):
         return _LOCAL_MODEL_CACHE["model"]
 
-    # Instantiate and cache
     local_model = TransformersModel(
         model_id=model_id,
-        device_map="auto"
+        device_map="auto"  # lets accelerate pick the best device(s)
     )
     _LOCAL_MODEL_CACHE["model"] = local_model
     _LOCAL_MODEL_CACHE["model_id"] = model_id
     return local_model
 
-# ------------------------ Agent setup with timeout ------------------------
 def make_agent(
     backend_choice: str,  # "Hosted API" | "Local GPU"
     hf_token: str,
@@ -215,7 +199,8 @@ def make_agent(
     max_tokens: int
 ):
     if backend_choice == "Local GPU":
-        model = _get_local_model(model_id)
+        # This call is GPU-annotated; Spaces will allocate a GPU for it.
+        model = get_local_model_gpu(model_id)
     else:
         client = InferenceClient(
             model=model_id,
@@ -225,7 +210,6 @@ def make_agent(
         model = InferenceClientModel(client=client)
 
     agent = CodeAgent(model=model, tools=[classify_and_score])
-    # Not used by agent core; helpful for debugging
     agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}
     return agent
 
@@ -251,7 +235,6 @@ If you output JSON, ensure it is valid JSON (no trailing commas, use double quot
 
 # ------------------------ Utility: robust JSON extractor ------------------
 def extract_top_level_json(s: str) -> str:
-    """Extract the first top-level JSON object by tracking braces."""
     start = s.find("{")
     if start == -1:
         return ""
@@ -296,7 +279,6 @@ def run_pipeline(
             max_tokens=int(max_tokens),
         )
     except Exception as e:
-        # Surface backend/model setup errors directly
        err = f"ERROR initializing backend '{backend_choice}': {e}"
        return "", err
 
@@ -324,18 +306,6 @@ def run_pipeline(
 
     return final_json, result_text
 
-# ------------------------ Optional Spaces warmup --------------------------
-# If you deploy on HF Spaces and want to pre-allocate GPU for Local mode,
-# you can try to warm up the model at startup by setting:
-#   BACKEND_WARMUP=1 and BACKEND_WARMUP_MODEL=<model id>
-if (os.getenv("SYSTEM") == "spaces") and os.getenv("BACKEND_WARMUP") == "1":
-    try:
-        wm = os.getenv("BACKEND_WARMUP_MODEL", "swiss-ai/Apertus-70B-Instruct-2509")
-        _get_local_model(wm)
-        print(f"[Warmup] Local GPU model loaded: {wm}")
-    except Exception as e:
-        print(f"[Warmup] Skipped or failed: {e}")
-
 # ------------------------ Gradio UI --------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
@@ -358,7 +328,7 @@ with gr.Blocks() as demo:
         )
         model_id = gr.Textbox(
             value="swiss-ai/Apertus-70B-Instruct-2509",
-            label="Model ID (repo or local path)"
+            label="Model ID (repo id for Hosted, or local repo for GPU)"
         )
         timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s, Hosted API only)")
 
@@ -393,7 +363,6 @@ with gr.Blocks() as demo:
     temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
     max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")
 
-    # Helpful hint text depending on backend
    backend_tips = gr.Markdown(
        "*Hosted API:* uses Hugging Face Inference endpoints. Provide a token if needed.\n\n"
        "*Local GPU:* loads the model into the Space with `TransformersModel (device_map='auto')`. "
@@ -405,13 +374,11 @@ with gr.Blocks() as demo:
    final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
    transcript = gr.Textbox(label="Agent Transcript", lines=18)
 
-    # Dynamically show/hide token & timeout based on backend
    def _toggle_backend_fields(choice):
-        # Show token + timeout only for Hosted API
        return (
-            gr.update(visible=(choice == "Hosted API")),
-            gr.update(visible=True),  # model_id always visible
-            gr.update(visible=(choice == "Hosted API"))
+            gr.update(visible=(choice == "Hosted API")),  # hf_token
+            gr.update(visible=True),                      # model_id always visible
+            gr.update(visible=(choice == "Hosted API"))   # timeout slider
        )
 
    backend_choice.change(
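
For context on the `@spaces.GPU` change above: on ZeroGPU Spaces the app runs on CPU until a decorated function is called, and a GPU is attached only for that call, for at most `duration` seconds. Below is a minimal, self-contained sketch of that standard pattern; the small model and function name are illustrative, not the Space's actual `model_id`.

```python
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model; the real Space passes model_id in from the UI.
tok = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")  # loaded on CPU at startup

@spaces.GPU(duration=30)  # GPU is allocated only while this function runs
def generate(prompt: str) -> str:
    model.to("cuda")                                      # move to the attached GPU
    inputs = tok(prompt, return_tensors="pt").to("cuda")
    out = model.generate(**inputs, max_new_tokens=64)
    return tok.decode(out[0], skip_special_tokens=True)
```

Wrapping the loader itself (as this commit does with `get_local_model_gpu`) means the GPU request happens lazily on the first "Local GPU" run rather than at process start, which is why the old `BACKEND_WARMUP` startup block could be dropped.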
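The `extract_top_level_json` helper only shows its `s.find("{")` prologue in this diff; the removed docstring says it pulls out the first top-level JSON object by tracking braces. A minimal sketch of that technique (not the file's exact implementation), with string-awareness so braces inside quoted values do not miscount:

```python
def extract_top_level_json_sketch(s: str) -> str:
    """Return the first balanced {...} block in s, or "" if none is found."""
    start = s.find("{")
    if start == -1:
        return ""
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(s[start:], start):
        if in_string:
            if escaped:
                escaped = False          # previous char was a backslash
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return s[start:i + 1]    # first balanced top-level object
    return ""
```

For example, `extract_top_level_json_sketch('noise {"a": {"b": 1}} tail')` returns `'{"a": {"b": 1}}'`.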
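The final hunk cuts off at `backend_choice.change(`. In Gradio, a `.change(fn, inputs, outputs)` handler maps the tuple returned by `_toggle_backend_fields` onto its `outputs` list in order. A hypothetical wiring consistent with the three `gr.update(...)` values above (the actual argument list lies outside the diff context):

```python
backend_choice.change(
    _toggle_backend_fields,
    inputs=backend_choice,
    outputs=[hf_token, model_id, timeout],  # hypothetical; order must match the returned updates
)
```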