alex committed on
Commit b2ed9cf · 1 Parent(s): 49ba373

3 is good enough

Files changed (1)
  1. ovi/modules/attention.py +29 -9
ovi/modules/attention.py CHANGED
@@ -55,7 +55,7 @@ def flash_attention(
     assert q.device.type == 'cuda' and q.size(-1) <= 256
 
     # params
-    b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
+    b, lq, nheads, lk, out_dtype = q.size(0), q.size(1), q.size(2), k.size(1), q.dtype
 
     def half(x):
         return x if x.dtype in half_dtypes else x.to(dtype)
@@ -93,26 +93,46 @@ def flash_attention(
 
     # apply attention
     if FLASH_ATTN_3_AVAILABLE:
-        # Note: dropout_p, window_size are not supported in FA3 now.
-        x = flash_attn_interface.flash_attn_varlen_func(
+        ret = flash_attn_interface.flash_attn_varlen_func(
             q=q,
             k=k,
             v=v,
             cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
                 0, dtype=torch.int32).to(q.device, non_blocking=True),
             cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
-                0, dtype=torch.int32).to(q.device, non_blocking=True),
+                0, dtype=torch.int32).to(k.device, non_blocking=True),
             seqused_q=None,
             seqused_k=None,
             max_seqlen_q=lq,
             max_seqlen_k=lk,
             softmax_scale=softmax_scale,
             causal=causal,
-            deterministic=deterministic)
-
-        if isinstance(x, tuple):
-            x = x[0]
-        x = x.unflatten(0, (b, lq))
+            deterministic=deterministic
+        )
+
+        # Some FA3 wheels return (out, softmax_lse); some return just out.
+        out0 = ret[0] if isinstance(ret, (tuple, list)) else ret
+
+        # Normalize FA3 output layout to (total_q, nheads, headdim)
+        total_q = b * lq
+        if out0.dim() == 3:
+            if out0.shape[0] == total_q:
+                pass  # (total_q, nheads, headdim) -> good
+            elif out0.shape[0] == nheads and out0.shape[1] == total_q:
+                # heads-first -> transpose to (total_q, nheads, headdim)
+                out0 = out0.transpose(0, 1).contiguous()
+            else:
+                raise RuntimeError(
+                    f"Unexpected FA3 output shape {tuple(out0.shape)}; "
+                    f"expected (total_q, nheads, headdim) or (nheads, total_q, headdim)"
+                )
+        else:
+            raise RuntimeError(
+                f"Unexpected FA3 output rank {out0.dim()} with shape {tuple(out0.shape)}; "
+                f"expected a 3D tensor."
+            )
+
+        x = out0.unflatten(0, (b, lq))
 
     else:
         assert FLASH_ATTN_2_AVAILABLE
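
For context on what the new FA3 branch does, here is a small, self-contained sketch (not part of the commit) of the cu_seqlens construction used in the call above and of the output-layout normalization the commit adds. The sizes b=2, lq=4, nheads=6, headdim=64 are hypothetical, chosen only to make the shapes visible, and the helper normalize_fa3_out is an illustration of the same checks, not a function from the repo.

import torch

# Hypothetical sizes for illustration only; not taken from the repo.
b, lq, nheads, headdim = 2, 4, 6, 64
q_lens = torch.tensor([lq, lq], dtype=torch.int32)

# cu_seqlens as built in the call above: prepend a zero, then cumulative sum.
cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
print(cu_seqlens_q.tolist())  # [0, 4, 8]

total_q = b * lq

def normalize_fa3_out(ret):
    # Mirrors the new logic: unwrap an (out, softmax_lse) tuple if present,
    # then accept either (total_q, nheads, headdim) or (nheads, total_q, headdim).
    out0 = ret[0] if isinstance(ret, (tuple, list)) else ret
    if out0.dim() != 3:
        raise RuntimeError(f"Unexpected FA3 output rank {out0.dim()}")
    if out0.shape[0] == total_q:
        return out0
    if out0.shape[0] == nheads and out0.shape[1] == total_q:
        return out0.transpose(0, 1).contiguous()
    raise RuntimeError(f"Unexpected FA3 output shape {tuple(out0.shape)}")

# Tokens-first output passes through unchanged.
tokens_first = torch.randn(total_q, nheads, headdim)
# Heads-first output (paired with a dummy softmax_lse) gets transposed.
heads_first = (torch.randn(nheads, total_q, headdim), torch.randn(nheads, total_q))

for ret in (tokens_first, heads_first):
    x = normalize_fa3_out(ret).unflatten(0, (b, lq))
    print(tuple(x.shape))  # (2, 4, 6, 64) in both cases

Either layout ends up as (b, lq, nheads, headdim) after the final unflatten, which is why the guard raises on anything unexpected instead of guessing a layout.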