bhardwaj08sarthak committed
Commit 700d92f · verified · 1 Parent(s): 767f314

Update app.py

Files changed (1)
  1. app.py +18 -51
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import json
 import gradio as gr
+import spaces
 from huggingface_hub import InferenceClient
 from smolagents import CodeAgent, InferenceClientModel, tool
 
@@ -77,23 +78,7 @@ def classify_and_score(
     target_dok: str,
     agg: str = "max"
 ) -> dict:
-    """Classify a question against Bloom’s and DOK targets and return guidance.
-
-    Args:
-        question: The question text to evaluate for cognitive demand.
-        target_bloom: Target Bloom’s level or range. Accepts exact (e.g., "Analyze")
-            or plus form (e.g., "Apply+") meaning that level or higher.
-        target_dok: Target DOK level or range. Accepts exact (e.g., "DOK3")
-            or span (e.g., "DOK2-DOK3").
-        agg: Aggregation method over phrase similarities within a level
-            (choices: "mean", "max", "topk_mean").
-
-    Returns:
-        A dictionary with:
-        ok: True if both Bloom’s and DOK match the targets.
-        measured: Dict with best levels and per-level scores for Bloom’s and DOK.
-        feedback: Brief guidance describing how to adjust the question to hit targets.
-    """
+    """Classify a question against Bloom’s and DOK targets and return guidance."""
     res = classify_levels_phrases(
         question,
         BLOOMS_PHRASES,
@@ -173,15 +158,16 @@ def classify_and_score(
         "feedback": " ".join(feedback_parts) if feedback_parts else "On target.",
     }
 
-# ------------------------ Backend selection + caching ------------------------
-_LOCAL_MODEL_CACHE = {
-    "model": None,
-    "model_id": None,
-}
+# ------------------------ Backend selection + GPU-wrapped local loader ------------------------
+_LOCAL_MODEL_CACHE = {"model": None, "model_id": None}
 
-def _get_local_model(model_id: str):
-    """Lazy-load and cache a local Transformers model for smolagents."""
-    # Import here so Hosted mode doesn't require local deps.
+@spaces.GPU(duration=30)  # request GPU only when loading/using local model
+def get_local_model_gpu(model_id: str):
+    """
+    Load and cache a local Transformers model for smolagents on GPU.
+    Decorated so Spaces knows this task needs a GPU.
+    """
+    # Import here to keep Hosted mode lightweight.
     try:
         from smolagents import TransformersModel  # provided by smolagents
     except Exception as e:
@@ -196,16 +182,14 @@ def _get_local_model(model_id: str):
     ):
         return _LOCAL_MODEL_CACHE["model"]
 
-    # Instantiate and cache
     local_model = TransformersModel(
         model_id=model_id,
-        device_map="auto"
+        device_map="auto"  # lets accelerate pick the best device(s)
     )
     _LOCAL_MODEL_CACHE["model"] = local_model
     _LOCAL_MODEL_CACHE["model_id"] = model_id
     return local_model
 
-# ------------------------ Agent setup with timeout ------------------------
 def make_agent(
     backend_choice: str,  # "Hosted API" | "Local GPU"
     hf_token: str,
@@ -215,7 +199,8 @@ def make_agent(
     max_tokens: int
 ):
     if backend_choice == "Local GPU":
-        model = _get_local_model(model_id)
+        # This call is GPU-annotated; Spaces will allocate a GPU for it.
+        model = get_local_model_gpu(model_id)
     else:
         client = InferenceClient(
             model=model_id,
@@ -225,7 +210,6 @@ def make_agent(
         model = InferenceClientModel(client=client)
 
     agent = CodeAgent(model=model, tools=[classify_and_score])
-    # Not used by agent core; helpful for debugging
     agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}
     return agent
 
@@ -251,7 +235,6 @@ If you output JSON, ensure it is valid JSON (no trailing commas, use double quot
 
 # ------------------------ Utility: robust JSON extractor ------------------
 def extract_top_level_json(s: str) -> str:
-    """Extract the first top-level JSON object by tracking braces."""
     start = s.find("{")
     if start == -1:
         return ""
@@ -296,7 +279,6 @@ def run_pipeline(
             max_tokens=int(max_tokens),
         )
     except Exception as e:
-        # Surface backend/model setup errors directly
        err = f"ERROR initializing backend '{backend_choice}': {e}"
        return "", err
 
@@ -324,18 +306,6 @@ def run_pipeline(
 
     return final_json, result_text
 
-# ------------------------ Optional Spaces warmup --------------------------
-# If you deploy on HF Spaces and want to pre-allocate GPU for Local mode,
-# you can try to warm up the model at startup by setting:
-#   BACKEND_WARMUP=1 and BACKEND_WARMUP_MODEL=<model id>
-if (os.getenv("SYSTEM") == "spaces") and os.getenv("BACKEND_WARMUP") == "1":
-    try:
-        wm = os.getenv("BACKEND_WARMUP_MODEL", "swiss-ai/Apertus-70B-Instruct-2509")
-        _get_local_model(wm)
-        print(f"[Warmup] Local GPU model loaded: {wm}")
-    except Exception as e:
-        print(f"[Warmup] Skipped or failed: {e}")
-
 # ------------------------ Gradio UI --------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
@@ -358,7 +328,7 @@ with gr.Blocks() as demo:
         )
         model_id = gr.Textbox(
             value="swiss-ai/Apertus-70B-Instruct-2509",
-            label="Model ID (repo or local path)"
+            label="Model ID (repo id for Hosted, or local repo for GPU)"
         )
         timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s, Hosted API only)")
 
@@ -393,7 +363,6 @@ with gr.Blocks() as demo:
     temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
     max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")
 
-    # Helpful hint text depending on backend
    backend_tips = gr.Markdown(
        "*Hosted API:* uses Hugging Face Inference endpoints. Provide a token if needed.\n\n"
        "*Local GPU:* loads the model into the Space with `TransformersModel (device_map='auto')`. "
@@ -405,13 +374,11 @@ with gr.Blocks() as demo:
    final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
    transcript = gr.Textbox(label="Agent Transcript", lines=18)
 
-    # Dynamically show/hide token & timeout based on backend
    def _toggle_backend_fields(choice):
-        # Show token + timeout only for Hosted API
        return (
-            gr.update(visible=(choice == "Hosted API")),
-            gr.update(visible=True),  # model_id always visible
-            gr.update(visible=(choice == "Hosted API"))
+            gr.update(visible=(choice == "Hosted API")),  # hf_token
+            gr.update(visible=True),                      # model_id always visible
+            gr.update(visible=(choice == "Hosted API"))   # timeout slider
        )
 
    backend_choice.change(
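
For context on the `@spaces.GPU` change above: on ZeroGPU Spaces the app runs on CPU until a decorated function is called, and a GPU is attached only for that call, for at most `duration` seconds. Below is a minimal, self-contained sketch of that standard pattern; the small model and function name are illustrative, not the Space's actual `model_id`.

```python
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model; the real Space passes model_id in from the UI.
tok = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")  # loaded on CPU at startup

@spaces.GPU(duration=30)  # GPU is allocated only while this function runs
def generate(prompt: str) -> str:
    model.to("cuda")                                      # move to the attached GPU
    inputs = tok(prompt, return_tensors="pt").to("cuda")
    out = model.generate(**inputs, max_new_tokens=64)
    return tok.decode(out[0], skip_special_tokens=True)
```

Wrapping the loader itself (as this commit does with `get_local_model_gpu`) means the GPU request happens lazily on the first "Local GPU" run rather than at process start, which is why the old `BACKEND_WARMUP` startup block could be dropped.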
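The `extract_top_level_json` helper only shows its `s.find("{")` prologue in this diff; the removed docstring says it pulls out the first top-level JSON object by tracking braces. A minimal sketch of that technique (not the file's exact implementation), with string-awareness so braces inside quoted values do not miscount:

```python
def extract_top_level_json_sketch(s: str) -> str:
    """Return the first balanced {...} block in s, or "" if none is found."""
    start = s.find("{")
    if start == -1:
        return ""
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(s[start:], start):
        if in_string:
            if escaped:
                escaped = False          # previous char was a backslash
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return s[start:i + 1]    # first balanced top-level object
    return ""
```

For example, `extract_top_level_json_sketch('noise {"a": {"b": 1}} tail')` returns `'{"a": {"b": 1}}'`.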
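The final hunk cuts off at `backend_choice.change(`. In Gradio, a `.change(fn, inputs, outputs)` handler maps the tuple returned by `_toggle_backend_fields` onto its `outputs` list in order. A hypothetical wiring consistent with the three `gr.update(...)` values above (the actual argument list lies outside the diff context):

```python
backend_choice.change(
    _toggle_backend_fields,
    inputs=backend_choice,
    outputs=[hf_token, model_id, timeout],  # hypothetical; order must match the returned updates
)
```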