Jiaqi-hkust committed on
Commit
4f2a894
·
verified ·
1 Parent(s): b0ac577

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +25 -16
app.py CHANGED
@@ -40,12 +40,6 @@ if not is_spaces:
40
 
41
  MODEL_PATH = os.getenv("MODEL_PATH", "Jiaqi-hkust/Robust-R1-RL")
42
 
43
- def gpu_decorator(func):
44
- """条件应用 GPU 装饰器"""
45
- if spaces_available and GPU is not None:
46
- return GPU(func)
47
- return func
48
-
49
  print(f"==========================================")
50
  print(f"Initializing application...")
51
  print(f"==========================================")
@@ -55,27 +49,38 @@ class ModelHandler:
55
  self.model_path = model_path
56
  self.model = None
57
  self.processor = None
58
- self._load_model()
59
 
60
  def _load_model(self):
 
 
 
 
61
  try:
62
  print(f"⏳ Loading model weights, this may take a few minutes...")
63
 
64
  self.processor = AutoProcessor.from_pretrained(self.model_path)
65
 
66
- if torch.cuda.is_available():
67
- device_capability = torch.cuda.get_device_capability()
68
- use_flash_attention = device_capability[0] >= 8
69
- print(f"🔧 CUDA available, device capability: {device_capability}")
70
- else:
71
- use_flash_attention = False
72
- print(f"🔧 Using CPU or non-CUDA device")
 
 
 
 
 
 
 
 
73
 
74
  self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
75
  self.model_path,
76
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
77
  device_map="auto",
78
- # attn_implementation="flash_attention_2" if use_flash_attention else "eager",
79
  attn_implementation="sdpa",
80
  trust_remote_code=True
81
  )
@@ -85,6 +90,10 @@ class ModelHandler:
85
  raise e
86
 
87
  def predict(self, message_dict, history, temperature, max_tokens):
 
 
 
 
88
  text = message_dict.get("text", "")
89
  files = message_dict.get("files", [])
90
 
 
40
 
41
  MODEL_PATH = os.getenv("MODEL_PATH", "Jiaqi-hkust/Robust-R1-RL")
42
 
 
 
 
 
 
 
43
  print(f"==========================================")
44
  print(f"Initializing application...")
45
  print(f"==========================================")
 
49
  self.model_path = model_path
50
  self.model = None
51
  self.processor = None
52
+ # 不在 __init__ 中加载模型,延迟到实际使用时
53
 
54
  def _load_model(self):
55
+ """延迟加载模型,在 GPU 装饰器函数内部调用"""
56
+ if self.model is not None:
57
+ return # 已经加载过了
58
+
59
  try:
60
  print(f"⏳ Loading model weights, this may take a few minutes...")
61
 
62
  self.processor = AutoProcessor.from_pretrained(self.model_path)
63
 
64
+ # 在 ZeroGPU 环境中,避免过早检查 CUDA
65
+ # 让 device_map="auto" 自动处理设备分配
66
+ try:
67
+ cuda_available = torch.cuda.is_available()
68
+ if cuda_available:
69
+ device_capability = torch.cuda.get_device_capability()
70
+ print(f"🔧 CUDA available, device capability: {device_capability}")
71
+ torch_dtype = torch.bfloat16
72
+ else:
73
+ print(f"🔧 Using CPU or non-CUDA device")
74
+ torch_dtype = torch.float32
75
+ except RuntimeError:
76
+ # ZeroGPU 环境中可能暂时无法检查 CUDA
77
+ print(f"🔧 CUDA check skipped (ZeroGPU environment)")
78
+ torch_dtype = torch.bfloat16 # 假设有 GPU,让 device_map 处理
79
 
80
  self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
81
  self.model_path,
82
+ torch_dtype=torch_dtype,
83
  device_map="auto",
 
84
  attn_implementation="sdpa",
85
  trust_remote_code=True
86
  )
 
90
  raise e
91
 
92
  def predict(self, message_dict, history, temperature, max_tokens):
93
+ # 确保模型已加载
94
+ if self.model is None:
95
+ self._load_model()
96
+
97
  text = message_dict.get("text", "")
98
  files = message_dict.get("files", [])
99