Update app.py
app.py
@@ -341,7 +341,7 @@ def track_video(n_frames, video_state):

     images = [cv2.resize(img, (W_, H_)) for img in images]
     video_state["origin_images"] = images
-
+    images_np = np.array(images)

     sam2_checkpoint = "./sam2/SAM2-Video-Predictor/checkpoints/sam2_hiera_large.pt"
     config = "sam2_hiera_l.yaml"
@@ -350,19 +350,19 @@ def track_video(n_frames, video_state):
     )

     inference_state = video_predictor_local.init_state(
-        images=
+        images=images_np / 255, device="cuda"
     )

     if len(torch.from_numpy(video_state["masks"][0]).shape) == 3:
-
+        mask0 = torch.from_numpy(video_state["masks"][0])[:, :, 0]
     else:
-
+        mask0 = torch.from_numpy(video_state["masks"][0])

     video_predictor_local.add_new_mask(
         inference_state=inference_state,
         frame_idx=0,
         obj_id=obj_id,
-        mask=
+        mask=mask0,
     )

     output_frames = []
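In the hunk above, `init_state` is given the frame stack scaled to [0, 1] and `add_new_mask` is given a single-channel mask, so a three-channel first-frame mask is collapsed to one channel before seeding the tracker. A minimal sketch of just that preprocessing, with placeholder arrays standing in for the app's frames and mask:

```python
import numpy as np
import torch

# Placeholder inputs standing in for the app's data: a stack of uint8 RGB
# frames and the annotated mask for frame 0 (may be H x W or H x W x 3).
frames_u8 = np.zeros((8, 480, 640, 3), dtype=np.uint8)
mask_np = np.zeros((480, 640, 3), dtype=np.uint8)

# Frames scaled to [0, 1], as passed to init_state(images=...).
frames_01 = frames_u8 / 255

# Collapse a 3-channel mask to a single channel before add_new_mask(mask=...).
mask0 = torch.from_numpy(mask_np)
if mask0.ndim == 3:
    mask0 = mask0[:, :, 0]
```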
@@ -375,7 +375,7 @@ def track_video(n_frames, video_state):
     for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor_local.propagate_in_video(
         inference_state
     ):
-        frame =
+        frame = images_np[out_frame_idx].astype(np.float32) / 255.0
         mask = np.zeros((H, W, 3), dtype=np.float32)
         for i, logit in enumerate(out_mask_logits):
             out_mask = logit.cpu().squeeze().detach().numpy()
@@ -388,8 +388,6 @@ def track_video(n_frames, video_state):
         painted = np.uint8(np.clip(painted * 255, 0, 255))
         output_frames.append(painted)

-    video_state["masks"] = mask_frames
-
     video_file = f"/tmp/{time.time()}-{random.random()}-tracked_output.mp4"
     clip = ImageSequenceClip(output_frames, fps=15)
     clip.write_videofile(
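The unchanged lines in this hunk write the painted frames out with moviepy's `ImageSequenceClip`. A small self-contained sketch of that write path, assuming the classic moviepy 1.x import; the placeholder frames and output path are illustrative:

```python
import numpy as np
from moviepy.editor import ImageSequenceClip  # classic moviepy 1.x import path

# Placeholder frames: a list of H x W x 3 uint8 RGB arrays, like the painted frames above.
output_frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

clip = ImageSequenceClip(output_frames, fps=15)
clip.write_videofile("/tmp/tracked_output.mp4", codec="libx264", audio=False)
```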
@@ -404,7 +402,7 @@ def track_video(n_frames, video_state):
     except Exception as e:
         print("Error checking video file:", repr(e))

-    return video_file,
+    return video_file, images, mask_frames


 @spaces.GPU(duration=150)
@@ -415,10 +413,11 @@ def inference_and_return_video(
     ref_patch_ratio,
     fg_threshold,
     seed,
-
+    video_frames,
+    mask_frames,
     ref_state,
 ):
-    if
+    if video_frames is None or mask_frames is None:
         print("No video frames or video masks.")
         return None, None, None

@@ -426,11 +425,11 @@ def inference_and_return_video(
         print("Reference image or reference mask missing.")
         return None, None, None

-    images =
-    masks =
+    images = video_frames
+    masks = mask_frames

-
-
+    video_frames_pil = []
+    mask_frames_pil = []
     for img, msk in zip(images, masks):
         if not isinstance(img, np.ndarray):
             img = np.asarray(img)
@@ -447,10 +446,10 @@ def inference_and_return_video(
         m2 = (m2 > 0.5).astype(np.uint8) * 255
         msk_pil = Image.fromarray(m2, mode="L")

-
-
+        video_frames_pil.append(img_pil)
+        mask_frames_pil.append(msk_pil)

-    num_frames = len(
+    num_frames = len(video_frames_pil)

     h0, w0 = images[0].shape[:2]
     if h0 > w0:
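This hunk collects the per-frame conversions into the two new PIL lists. For reference, the conversion pairs an RGB frame with a binarized single-channel mask; a standalone sketch with placeholder arrays in place of the app's data:

```python
import numpy as np
from PIL import Image

# Placeholder frame/mask pair standing in for one element of images/masks.
frame = np.zeros((480, 640, 3), dtype=np.uint8)
soft_mask = np.zeros((480, 640), dtype=np.float32)  # values in [0, 1]

img_pil = Image.fromarray(frame)                  # RGB frame as a PIL image
m2 = (soft_mask > 0.5).astype(np.uint8) * 255     # binarize to {0, 255}
msk_pil = Image.fromarray(m2, mode="L")           # single-channel PIL mask

video_frames_pil = [img_pil]
mask_frames_pil = [msk_pil]
```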
@@ -470,8 +469,8 @@ def inference_and_return_video(
     pipe.to("cuda")
     with torch.no_grad():
         retex_frames, mesh_frames, ref_img_out = pipe(
-            video=
-            mask=
+            video=video_frames_pil,
+            mask=mask_frames_pil,
             reference_image=ref_img_pil,
             reference_mask=ref_mask_pil,
             conditioning_scale=1.0,
@@ -608,6 +607,9 @@ with gr.Blocks() as demo:
         }
     )

+    video_frames_state = gr.State(None)
+    mask_frames_state = gr.State(None)
+
     gr.Markdown(f"<div style='text-align:center;'>{text}</div>")

     with gr.Column():
@@ -754,7 +756,8 @@ with gr.Blocks() as demo:
             ref_patch_slider,
             fg_threshold_slider,
             seed_slider,
-
+            video_frames_state,
+            mask_frames_state,
             ref_state,
         ],
         outputs=[remove_video, mesh_video, ref_image_final],
@@ -777,7 +780,7 @@ with gr.Blocks() as demo:
     track_btn.click(
         track_video,
         inputs=[n_frames_slider, video_state],
-        outputs=[video_output,
+        outputs=[video_output, video_frames_state, mask_frames_state],
     )

     ref_image_input.change(
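Taken together, the UI hunks replace the old implicit hand-off (stashing the tracked masks back into `video_state`) with two explicit `gr.State` holders: `track_video` now returns the tracked frames and masks as extra outputs, and `inference_and_return_video` receives them as extra inputs. A minimal self-contained sketch of that Gradio pattern, with illustrative names in place of the app's components and callbacks:

```python
import gradio as gr

def produce(n):
    # Stand-in for track_video: return something to display plus data to stash in State.
    frames = list(range(int(n)))
    masks = [f"mask-{i}" for i in frames]
    return f"tracked {len(frames)} frames", frames, masks

def consume(frames, masks):
    # Stand-in for inference_and_return_video: read the stashed data back.
    if frames is None or masks is None:
        return "No video frames or video masks."
    return f"processing {len(frames)} frames / {len(masks)} masks"

with gr.Blocks() as demo:
    frames_state = gr.State(None)  # analogous to video_frames_state
    masks_state = gr.State(None)   # analogous to mask_frames_state

    n_frames = gr.Number(value=8, label="n_frames")
    status = gr.Textbox(label="status")
    result = gr.Textbox(label="result")
    track_btn = gr.Button("Track")
    run_btn = gr.Button("Run")

    # The first callback fills the State holders; the second consumes them.
    track_btn.click(produce, inputs=[n_frames], outputs=[status, frames_state, masks_state])
    run_btn.click(consume, inputs=[frames_state, masks_state], outputs=[result])

if __name__ == "__main__":
    demo.launch()
```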