Fix crash when the last audio chunk has fewer frames than the patch stride

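After splitting the input into chunks of target_length frames, zero-pad the
last chunk (and its attention mask, when one is given) up to self.time_patches
frames if it is shorter than that. This prevents a RuntimeError in the Conv2d
patch embedding when the input length is not a multiple of target_length.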

Files changed (1) hide show

modeling_dasheng_encoder.py +8 -2

modeling_dasheng_encoder.py CHANGED Viewed

@@ -235,10 +235,16 @@ class DashengEncoder(nn.Module):
         x = rearrange(x, "b f t -> b 1 f t")
         x = self.init_bn(x)
-        input_splits = x.split(self.target_length, dim=-1)
         masks = [None for _ in range(len(input_splits))]
         if attention_mask is not None:
-            masks = attention_mask.split(self.target_length, dim=-1)
         outputs = []
         for i, (input_split_x, mask) in enumerate(zip(input_splits, masks)):

         x = rearrange(x, "b f t -> b 1 f t")
         x = self.init_bn(x)
+        input_splits = list(x.split(self.target_length, dim=-1))
         masks = [None for _ in range(len(input_splits))]
         if attention_mask is not None:
+            masks = list(attention_mask.split(self.target_length, dim=-1))
+        if input_splits[-1].shape[-1] < self.time_patches:
+            pad_size = self.time_patches - input_splits[-1].shape[-1]
+            input_splits[-1] = torch.nn.functional.pad(input_splits[-1], (0, pad_size))
+            if masks[-1] is not None:
+                masks[-1] = torch.nn.functional.pad(masks[-1], (0, pad_size), value=0)
         outputs = []
         for i, (input_split_x, mask) in enumerate(zip(input_splits, masks)):