Alexander Bagus committed on
Commit 26893dc · 1 Parent(s): be751d2
.gitignore CHANGED
@@ -1,9 +1,9 @@
 
+/models/
 
 # Packages
 *.egg
 *.egg-info
-dist
 build
 eggs
 parts
videox_fun/dist/__init__.py ADDED
@@ -0,0 +1,72 @@
+import importlib.util
+
+from .cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0
+from .flux2_xfuser import Flux2MultiGPUsAttnProcessor2_0
+from .flux_xfuser import FluxMultiGPUsAttnProcessor2_0
+from .fsdp import shard_model
+from .fuser import (get_sequence_parallel_rank,
+                    get_sequence_parallel_world_size, get_sp_group,
+                    get_world_group, init_distributed_environment,
+                    initialize_model_parallel, sequence_parallel_all_gather,
+                    sequence_parallel_chunk, set_multi_gpus_devices,
+                    xFuserLongContextAttention)
+from .hunyuanvideo_xfuser import HunyuanVideoMultiGPUsAttnProcessor2_0
+from .qwen_xfuser import QwenImageMultiGPUsAttnProcessor2_0
+from .wan_xfuser import usp_attn_forward, usp_attn_s2v_forward
+from .z_image_xfuser import ZMultiGPUsSingleStreamAttnProcessor
+
+# The pai_fuser is an internally developed acceleration package, which can be used on PAI.
+if importlib.util.find_spec("paifuser") is not None:
+    # --------------------------------------------------------------- #
+    # The simple_wrapper is used to solve the problem
+    # about conflicts between cython and torch.compile
+    # --------------------------------------------------------------- #
+    def simple_wrapper(func):
+        def inner(*args, **kwargs):
+            return func(*args, **kwargs)
+        return inner
+
+    # --------------------------------------------------------------- #
+    # Sparse Attention Kernel
+    # --------------------------------------------------------------- #
+    from paifuser.models import parallel_magvit_vae
+    from paifuser.ops import wan_usp_sparse_attention_wrapper
+
+    from . import wan_xfuser
+
+    # --------------------------------------------------------------- #
+    # Sparse Attention
+    # --------------------------------------------------------------- #
+    usp_sparse_attn_wrap_forward = simple_wrapper(wan_usp_sparse_attention_wrapper()(wan_xfuser.usp_attn_forward))
+    wan_xfuser.usp_attn_forward = usp_sparse_attn_wrap_forward
+    usp_attn_forward = usp_sparse_attn_wrap_forward
+    print("Import PAI VAE Turbo and Sparse Attention")
+
+    # --------------------------------------------------------------- #
+    # Fast Rope Kernel
+    # --------------------------------------------------------------- #
+    import types
+
+    import torch
+    from paifuser.ops import (ENABLE_KERNEL, usp_fast_rope_apply_qk,
+                              usp_rope_apply_real_qk)
+
+    def deepcopy_function(f):
+        return types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__, closure=f.__closure__)
+
+    local_rope_apply_qk = deepcopy_function(wan_xfuser.rope_apply_qk)
+
+    if ENABLE_KERNEL:
+        def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
+            if torch.is_grad_enabled():
+                return local_rope_apply_qk(q, k, grid_sizes, freqs)
+            else:
+                return usp_fast_rope_apply_qk(q, k, grid_sizes, freqs)
+
+    else:
+        def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
+            return usp_rope_apply_real_qk(q, k, grid_sizes, freqs)
+
+    wan_xfuser.rope_apply_qk = adaptive_fast_usp_rope_apply_qk
+    rope_apply_qk = adaptive_fast_usp_rope_apply_qk
+    print("Import PAI Fast rope")
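
A minimal, self-contained sketch (not part of the commit) of the dispatch pattern used above: keep a detached copy of the original function via types.FunctionType, then route to the fast kernel only when autograd is disabled. The slow_rope/fast_rope functions below are placeholders, not paifuser APIs.

    import types

    import torch


    def deepcopy_function(f):
        # Rebind the code object so later monkey-patching of the module
        # attribute does not affect this saved reference.
        return types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
                                  argdefs=f.__defaults__, closure=f.__closure__)


    def slow_rope(q):   # stands in for the original rope_apply_qk
        return q * 2


    def fast_rope(q):   # stands in for the accelerated kernel
        return q * 2


    local_slow_rope = deepcopy_function(slow_rope)


    def adaptive_rope(q):
        # Training (grad enabled) keeps the original implementation;
        # inference takes the fast path.
        if torch.is_grad_enabled():
            return local_slow_rope(q)
        return fast_rope(q)


    with torch.no_grad():
        print(adaptive_rope(torch.ones(2)))  # fast path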
videox_fun/dist/cogvideox_xfuser.py ADDED
@@ -0,0 +1,93 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.attention import Attention
+from diffusers.models.embeddings import apply_rotary_emb
+
+from .fuser import (get_sequence_parallel_rank,
+                    get_sequence_parallel_world_size, get_sp_group,
+                    init_distributed_environment, initialize_model_parallel,
+                    xFuserLongContextAttention)
+
+class CogVideoXMultiGPUsAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE if needed
+        if image_rotary_emb is not None:
+            query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+            if not attn.is_cross_attention:
+                key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+
+        img_q = query[:, :, text_seq_length:].transpose(1, 2)
+        txt_q = query[:, :, :text_seq_length].transpose(1, 2)
+        img_k = key[:, :, text_seq_length:].transpose(1, 2)
+        txt_k = key[:, :, :text_seq_length].transpose(1, 2)
+        img_v = value[:, :, text_seq_length:].transpose(1, 2)
+        txt_v = value[:, :, :text_seq_length].transpose(1, 2)
+
+        hidden_states = xFuserLongContextAttention()(
+            None,
+            img_q, img_k, img_v, dropout_p=0.0, causal=False,
+            joint_tensor_query=txt_q,
+            joint_tensor_key=txt_k,
+            joint_tensor_value=txt_v,
+            joint_strategy='front',
+        )
+
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+        )
+        return hidden_states, encoder_hidden_states
+
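
A hedged usage sketch (not part of the commit): attaching this processor to a diffusers CogVideoX transformer. It assumes the model exposes diffusers' set_attn_processor, that the checkpoint id below is available, and that the distributed environment from fuser.py has already been initialized.

    import torch
    from diffusers import CogVideoXTransformer3DModel

    from videox_fun.dist import CogVideoXMultiGPUsAttnProcessor2_0

    transformer = CogVideoXTransformer3DModel.from_pretrained(
        "THUDM/CogVideoX-5b", subfolder="transformer", torch_dtype=torch.bfloat16
    )
    # Route every attention block through the sequence-parallel processor.
    transformer.set_attn_processor(CogVideoXMultiGPUsAttnProcessor2_0())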
videox_fun/dist/flux2_xfuser.py ADDED
@@ -0,0 +1,194 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.attention_processor import Attention
+
+from .fuser import xFuserLongContextAttention
+
+
+def _get_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
+    query = attn.to_q(hidden_states)
+    key = attn.to_k(hidden_states)
+    value = attn.to_v(hidden_states)
+
+    encoder_query = encoder_key = encoder_value = None
+    if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
+        encoder_query = attn.add_q_proj(encoder_hidden_states)
+        encoder_key = attn.add_k_proj(encoder_hidden_states)
+        encoder_value = attn.add_v_proj(encoder_hidden_states)
+
+    return query, key, value, encoder_query, encoder_key, encoder_value
+
+
+def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
+    return _get_projections(attn, hidden_states, encoder_hidden_states)
+
+
+def apply_rotary_emb(
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    use_real: bool = True,
+    use_real_unbind_dim: int = -1,
+    sequence_dim: int = 2,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
+    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
+    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
+    tensors contain rotary embeddings and are returned as real tensors.
+
+    Args:
+        x (`torch.Tensor`):
+            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
+        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if use_real:
+        cos, sin = freqs_cis  # [S, D]
+        if sequence_dim == 2:
+            cos = cos[None, None, :, :]
+            sin = sin[None, None, :, :]
+        elif sequence_dim == 1:
+            cos = cos[None, :, None, :]
+            sin = sin[None, :, None, :]
+        else:
+            raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
+
+        cos, sin = cos.to(x.device), sin.to(x.device)
+
+        if use_real_unbind_dim == -1:
+            # Used for flux, cogvideox, hunyuan-dit
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, H, S, D//2]
+            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        elif use_real_unbind_dim == -2:
+            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, H, S, D//2]
+            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+        else:
+            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+
+        return out
+    else:
+        # used for lumina
+        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        freqs_cis = freqs_cis.unsqueeze(2)
+        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+
+        return x_out.type_as(x)
+
+
+class Flux2MultiGPUsAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the Flux2 model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("Flux2MultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: "FluxAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        text_seq_len: int = None,
+    ) -> torch.FloatTensor:
+        # Determine which type of attention we're processing
+        is_parallel_self_attn = hasattr(attn, 'to_qkv_mlp_proj') and attn.to_qkv_mlp_proj is not None
+
+        if is_parallel_self_attn:
+            # Parallel in (QKV + MLP in) projection
+            hidden_states = attn.to_qkv_mlp_proj(hidden_states)
+            qkv, mlp_hidden_states = torch.split(
+                hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
+            )
+
+            # Handle the attention logic
+            query, key, value = qkv.chunk(3, dim=-1)
+
+        else:
+            query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
+                attn, hidden_states, encoder_hidden_states
+            )
+
+        # Common processing for query, key, value
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        # Handle encoder projections (only for standard attention)
+        if not is_parallel_self_attn and attn.added_kv_proj_dim is not None:
+            encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
+            encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
+            encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
+
+            encoder_query = attn.norm_added_q(encoder_query)
+            encoder_key = attn.norm_added_k(encoder_key)
+
+            query = torch.cat([encoder_query, query], dim=1)
+            key = torch.cat([encoder_key, key], dim=1)
+            value = torch.cat([encoder_value, value], dim=1)
+
+        # Apply rotary embeddings
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
+
+        if not is_parallel_self_attn and attn.added_kv_proj_dim is not None and text_seq_len is None:
+            text_seq_len = encoder_query.shape[1]
+
+        txt_query, txt_key, txt_value = query[:, :text_seq_len], key[:, :text_seq_len], value[:, :text_seq_len]
+        img_query, img_key, img_value = query[:, text_seq_len:], key[:, text_seq_len:], value[:, text_seq_len:]
+
+        half_dtypes = (torch.float16, torch.bfloat16)
+        def half(x):
+            return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
+
+        hidden_states = xFuserLongContextAttention()(
+            None,
+            half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
+            joint_tensor_query=half(txt_query) if txt_query is not None else None,
+            joint_tensor_key=half(txt_key) if txt_key is not None else None,
+            joint_tensor_value=half(txt_value) if txt_value is not None else None,
+            joint_strategy='front',
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if is_parallel_self_attn:
+            # Handle the feedforward (FF) logic
+            mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
+
+            # Concatenate and parallel output projection
+            hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
+            hidden_states = attn.to_out(hidden_states)
+
+            return hidden_states
+
+        else:
+            # Split encoder and latent hidden states if encoder was used
+            if encoder_hidden_states is not None:
+                encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
+                    [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
+                )
+                encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+            # Project output
+            hidden_states = attn.to_out[0](hidden_states)
+            hidden_states = attn.to_out[1](hidden_states)
+
+            if encoder_hidden_states is not None:
+                return hidden_states, encoder_hidden_states
+            else:
+                return hidden_states
videox_fun/dist/flux_xfuser.py ADDED
@@ -0,0 +1,165 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.attention_processor import Attention
+
+from .fuser import xFuserLongContextAttention
+
+
+def _get_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
+    query = attn.to_q(hidden_states)
+    key = attn.to_k(hidden_states)
+    value = attn.to_v(hidden_states)
+
+    encoder_query = encoder_key = encoder_value = None
+    if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
+        encoder_query = attn.add_q_proj(encoder_hidden_states)
+        encoder_key = attn.add_k_proj(encoder_hidden_states)
+        encoder_value = attn.add_v_proj(encoder_hidden_states)
+
+    return query, key, value, encoder_query, encoder_key, encoder_value
+
+
+def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
+    return _get_projections(attn, hidden_states, encoder_hidden_states)
+
+
+def apply_rotary_emb(
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    use_real: bool = True,
+    use_real_unbind_dim: int = -1,
+    sequence_dim: int = 2,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
+    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
+    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
+    tensors contain rotary embeddings and are returned as real tensors.
+
+    Args:
+        x (`torch.Tensor`):
+            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
+        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if use_real:
+        cos, sin = freqs_cis  # [S, D]
+        if sequence_dim == 2:
+            cos = cos[None, None, :, :]
+            sin = sin[None, None, :, :]
+        elif sequence_dim == 1:
+            cos = cos[None, :, None, :]
+            sin = sin[None, :, None, :]
+        else:
+            raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
+
+        cos, sin = cos.to(x.device), sin.to(x.device)
+
+        if use_real_unbind_dim == -1:
+            # Used for flux, cogvideox, hunyuan-dit
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, H, S, D//2]
+            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        elif use_real_unbind_dim == -2:
+            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, H, S, D//2]
+            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+        else:
+            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+
+        return out
+    else:
+        # used for lumina
+        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        freqs_cis = freqs_cis.unsqueeze(2)
+        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+
+        return x_out.type_as(x)
+
+
+class FluxMultiGPUsAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the Flux model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("FluxMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: "FluxAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        text_seq_len: int = None,
+    ) -> torch.FloatTensor:
+        query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
+            attn, hidden_states, encoder_hidden_states
+        )
+
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        if attn.added_kv_proj_dim is not None:
+            encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
+            encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
+            encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
+
+            encoder_query = attn.norm_added_q(encoder_query)
+            encoder_key = attn.norm_added_k(encoder_key)
+
+            query = torch.cat([encoder_query, query], dim=1)
+            key = torch.cat([encoder_key, key], dim=1)
+            value = torch.cat([encoder_value, value], dim=1)
+
+        # Apply rotary embeddings
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
+
+        if attn.added_kv_proj_dim is not None and text_seq_len is None:
+            text_seq_len = encoder_query.shape[1]
+
+        txt_query, txt_key, txt_value = query[:, :text_seq_len], key[:, :text_seq_len], value[:, :text_seq_len]
+        img_query, img_key, img_value = query[:, text_seq_len:], key[:, text_seq_len:], value[:, text_seq_len:]
+
+        half_dtypes = (torch.float16, torch.bfloat16)
+        def half(x):
+            return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
+
+        hidden_states = xFuserLongContextAttention()(
+            None,
+            half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
+            joint_tensor_query=half(txt_query) if txt_query is not None else None,
+            joint_tensor_key=half(txt_key) if txt_key is not None else None,
+            joint_tensor_value=half(txt_value) if txt_value is not None else None,
+            joint_strategy='front',
+        )
+
+        # Reshape back
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(img_query.dtype)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
+                [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
+            )
+            hidden_states = attn.to_out[0](hidden_states)
+            hidden_states = attn.to_out[1](hidden_states)
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
videox_fun/dist/fsdp.py ADDED
@@ -0,0 +1,44 @@
+# Copied from https://github.com/Wan-Video/Wan2.1/blob/main/wan/distributed/fsdp.py
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import gc
+from functools import partial
+
+import torch
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
+from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
+from torch.distributed.utils import _free_storage
+
+
+def shard_model(
+    model,
+    device_id,
+    param_dtype=torch.bfloat16,
+    reduce_dtype=torch.float32,
+    buffer_dtype=torch.float32,
+    process_group=None,
+    sharding_strategy=ShardingStrategy.FULL_SHARD,
+    sync_module_states=True,
+    module_to_wrapper=None,
+):
+    model = FSDP(
+        module=model,
+        process_group=process_group,
+        sharding_strategy=sharding_strategy,
+        auto_wrap_policy=partial(
+            lambda_auto_wrap_policy, lambda_fn=lambda m: m in (model.blocks if module_to_wrapper is None else module_to_wrapper)),
+        mixed_precision=MixedPrecision(
+            param_dtype=param_dtype,
+            reduce_dtype=reduce_dtype,
+            buffer_dtype=buffer_dtype),
+        device_id=device_id,
+        sync_module_states=sync_module_states)
+    return model
+
+def free_model(model):
+    for m in model.modules():
+        if isinstance(m, FSDP):
+            _free_storage(m._handle.flat_param.data)
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
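
A hedged usage sketch for shard_model: it assumes the script is launched with torchrun (one process per GPU) and that the model exposes a .blocks ModuleList, which the default auto-wrap policy above relies on. build_transformer is a hypothetical constructor, not part of this commit.

    import torch
    import torch.distributed as dist

    from videox_fun.dist import shard_model

    dist.init_process_group("nccl")
    local_rank = dist.get_rank() % torch.cuda.device_count()
    model = build_transformer()  # hypothetical: any module with a `.blocks` ModuleList
    # Wrap the blocks with FSDP, keeping parameters in bf16 and reductions in fp32.
    model = shard_model(model, device_id=local_rank, param_dtype=torch.bfloat16)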
videox_fun/dist/fuser.py ADDED
@@ -0,0 +1,87 @@
+import importlib.util
+
+import torch
+import torch.distributed as dist
+
+try:
+    # The pai_fuser is an internally developed acceleration package, which can be used on PAI.
+    if importlib.util.find_spec("paifuser") is not None:
+        import paifuser
+        from paifuser.xfuser.core.distributed import (
+            get_sequence_parallel_rank, get_sequence_parallel_world_size,
+            get_sp_group, get_world_group, init_distributed_environment,
+            initialize_model_parallel, model_parallel_is_initialized)
+        from paifuser.xfuser.core.long_ctx_attention import \
+            xFuserLongContextAttention
+        print("Import PAI DiT Turbo")
+    else:
+        import xfuser
+        from xfuser.core.distributed import (get_sequence_parallel_rank,
+                                             get_sequence_parallel_world_size,
+                                             get_sp_group, get_world_group,
+                                             init_distributed_environment,
+                                             initialize_model_parallel,
+                                             model_parallel_is_initialized)
+        from xfuser.core.long_ctx_attention import xFuserLongContextAttention
+        print("Xfuser import successful")
+except Exception as ex:
+    get_sequence_parallel_world_size = None
+    get_sequence_parallel_rank = None
+    xFuserLongContextAttention = None
+    get_sp_group = None
+    get_world_group = None
+    init_distributed_environment = None
+    initialize_model_parallel = None
+
+def set_multi_gpus_devices(ulysses_degree, ring_degree, classifier_free_guidance_degree=1):
+    if ulysses_degree > 1 or ring_degree > 1 or classifier_free_guidance_degree > 1:
+        if get_sp_group is None:
+            raise RuntimeError("xfuser is not installed.")
+        dist.init_process_group("nccl")
+        print('parallel inference enabled: ulysses_degree=%d ring_degree=%d classifier_free_guidance_degree=%d rank=%d world_size=%d' % (
+            ulysses_degree, ring_degree, classifier_free_guidance_degree, dist.get_rank(),
+            dist.get_world_size()))
+        assert dist.get_world_size() == ring_degree * ulysses_degree * classifier_free_guidance_degree, \
+            "number of GPUs(%d) should be equal to ring_degree * ulysses_degree * classifier_free_guidance_degree." % dist.get_world_size()
+        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
+        initialize_model_parallel(sequence_parallel_degree=ring_degree * ulysses_degree,
+                                  classifier_free_guidance_degree=classifier_free_guidance_degree,
+                                  ring_degree=ring_degree,
+                                  ulysses_degree=ulysses_degree)
+        # device = torch.device("cuda:%d" % dist.get_rank())
+        device = torch.device(f"cuda:{get_world_group().local_rank}")
+        print('rank=%d device=%s' % (get_world_group().rank, str(device)))
+    else:
+        device = "cuda"
+    return device
+
+def sequence_parallel_chunk(x, dim=1):
+    if get_sequence_parallel_world_size is None or not model_parallel_is_initialized():
+        return x
+
+    sp_world_size = get_sequence_parallel_world_size()
+    if sp_world_size <= 1:
+        return x
+
+    sp_rank = get_sequence_parallel_rank()
+    sp_group = get_sp_group()
+
+    if x.size(dim) % sp_world_size != 0:
+        raise ValueError(f"Dim {dim} of x ({x.size(dim)}) not divisible by SP world size ({sp_world_size})")
+
+    chunks = torch.chunk(x, sp_world_size, dim=dim)
+    x = chunks[sp_rank]
+
+    return x
+
+def sequence_parallel_all_gather(x, dim=1):
+    if get_sequence_parallel_world_size is None or not model_parallel_is_initialized():
+        return x
+
+    sp_world_size = get_sequence_parallel_world_size()
+    if sp_world_size <= 1:
+        return x  # No gathering needed
+
+    sp_group = get_sp_group()
+    gathered_x = sp_group.all_gather(x, dim=dim)
+    return gathered_x
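
A hedged sketch of the sequence-parallel helpers, assuming xfuser is installed and the script runs under torchrun with two processes (e.g. torchrun --nproc_per_node=2 demo.py): each rank keeps its slice of the sequence, and all_gather reassembles the full tensor on every rank.

    import torch

    from videox_fun.dist import (sequence_parallel_all_gather,
                                 sequence_parallel_chunk, set_multi_gpus_devices)

    device = set_multi_gpus_devices(ulysses_degree=2, ring_degree=1)
    x = torch.randn(1, 8, 16, device=device)           # [batch, seq, dim]
    local = sequence_parallel_chunk(x, dim=1)          # this rank's slice of the sequence
    full = sequence_parallel_all_gather(local, dim=1)  # reassembled on every rank
    assert full.shape == x.shape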
videox_fun/dist/hunyuanvideo_xfuser.py ADDED
@@ -0,0 +1,166 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.attention import Attention
+from diffusers.models.embeddings import apply_rotary_emb
+
+from .fuser import (get_sequence_parallel_rank,
+                    get_sequence_parallel_world_size, get_sp_group,
+                    init_distributed_environment, initialize_model_parallel,
+                    xFuserLongContextAttention)
+
+def extract_seqlens_from_mask(attn_mask, text_seq_length):
+    if attn_mask is None:
+        return None
+
+    if len(attn_mask.shape) == 4:
+        bs, _, _, seq_len = attn_mask.shape
+
+        if attn_mask.dtype == torch.bool:
+            valid_mask = attn_mask.squeeze(1).squeeze(1)
+        else:
+            valid_mask = ~torch.isinf(attn_mask.squeeze(1).squeeze(1))
+    else:
+        raise ValueError(
+            "attn_mask should be a 4D tensor, but got {}".format(
+                attn_mask.shape))
+
+    seqlens = valid_mask[:, -text_seq_length:].sum(dim=1)
+    return seqlens
+
+class HunyuanVideoMultiGPUsAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the HunyuanVideo model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if xFuserLongContextAttention is not None:
+            try:
+                self.hybrid_seq_parallel_attn = xFuserLongContextAttention()
+            except Exception:
+                self.hybrid_seq_parallel_attn = None
+        else:
+            self.hybrid_seq_parallel_attn = None
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("HunyuanVideoMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if attn.add_q_proj is None and encoder_hidden_states is not None:
+            hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
+
+        # 1. QKV projections
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+        # 2. QK normalization
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # 3. Rotational positional embeddings applied to latent stream
+        if image_rotary_emb is not None:
+            if attn.add_q_proj is None and encoder_hidden_states is not None:
+                query = torch.cat(
+                    [
+                        apply_rotary_emb(query[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
+                        query[:, :, -encoder_hidden_states.shape[1] :],
+                    ],
+                    dim=2,
+                )
+                key = torch.cat(
+                    [
+                        apply_rotary_emb(key[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
+                        key[:, :, -encoder_hidden_states.shape[1] :],
+                    ],
+                    dim=2,
+                )
+            else:
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+
+        # 4. Encoder condition QKV projection and normalization
+        if attn.add_q_proj is not None and encoder_hidden_states is not None:
+            encoder_query = attn.add_q_proj(encoder_hidden_states)
+            encoder_key = attn.add_k_proj(encoder_hidden_states)
+            encoder_value = attn.add_v_proj(encoder_hidden_states)
+
+            encoder_query = encoder_query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+            encoder_key = encoder_key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+            encoder_value = encoder_value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+            if attn.norm_added_q is not None:
+                encoder_query = attn.norm_added_q(encoder_query)
+            if attn.norm_added_k is not None:
+                encoder_key = attn.norm_added_k(encoder_key)
+
+            query = torch.cat([query, encoder_query], dim=2)
+            key = torch.cat([key, encoder_key], dim=2)
+            value = torch.cat([value, encoder_value], dim=2)
+
+        # 5. Attention
+        if encoder_hidden_states is not None:
+            text_seq_length = encoder_hidden_states.size(1)
+
+            q_lens = k_lens = extract_seqlens_from_mask(attention_mask, text_seq_length)
+
+            img_q = query[:, :, :-text_seq_length].transpose(1, 2)
+            txt_q = query[:, :, -text_seq_length:].transpose(1, 2)
+            img_k = key[:, :, :-text_seq_length].transpose(1, 2)
+            txt_k = key[:, :, -text_seq_length:].transpose(1, 2)
+            img_v = value[:, :, :-text_seq_length].transpose(1, 2)
+            txt_v = value[:, :, -text_seq_length:].transpose(1, 2)
+
+            hidden_states = torch.zeros_like(query.transpose(1, 2))
+            local_q_length = img_q.size()[1]
+            for i in range(len(q_lens)):
+                hidden_states[i][:local_q_length + q_lens[i]] = self.hybrid_seq_parallel_attn(
+                    None,
+                    img_q[i].unsqueeze(0), img_k[i].unsqueeze(0), img_v[i].unsqueeze(0), dropout_p=0.0, causal=False,
+                    joint_tensor_query=txt_q[i][:q_lens[i]].unsqueeze(0),
+                    joint_tensor_key=txt_k[i][:q_lens[i]].unsqueeze(0),
+                    joint_tensor_value=txt_v[i][:q_lens[i]].unsqueeze(0),
+                    joint_strategy='rear',
+                )
+        else:
+            query = query.transpose(1, 2)
+            key = key.transpose(1, 2)
+            value = value.transpose(1, 2)
+            hidden_states = self.hybrid_seq_parallel_attn(
+                None,
+                query, key, value, dropout_p=0.0, causal=False
+            )
+
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # 6. Output projection
+        if encoder_hidden_states is not None:
+            hidden_states, encoder_hidden_states = (
+                hidden_states[:, : -encoder_hidden_states.shape[1]],
+                hidden_states[:, -encoder_hidden_states.shape[1] :],
+            )
+
+            if getattr(attn, "to_out", None) is not None:
+                hidden_states = attn.to_out[0](hidden_states)
+                hidden_states = attn.to_out[1](hidden_states)
+
+            if getattr(attn, "to_add_out", None) is not None:
+                encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        return hidden_states, encoder_hidden_states
+
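
A minimal sketch of what extract_seqlens_from_mask computes: for a 4D boolean mask of shape [B, 1, 1, S], it returns, per sample, how many of the last text_seq_length positions are valid. The toy tensors below are illustrative only.

    import torch

    mask = torch.ones(2, 1, 1, 10, dtype=torch.bool)
    mask[1, ..., -3:] = False            # second sample has 3 padded text tokens
    text_seq_length = 4
    valid = mask.squeeze(1).squeeze(1)   # [B, S]
    seqlens = valid[:, -text_seq_length:].sum(dim=1)
    print(seqlens)                       # tensor([4, 1])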
videox_fun/dist/qwen_xfuser.py ADDED
@@ -0,0 +1,176 @@
+import functools
+import glob
+import json
+import math
+import os
+import types
+import warnings
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.cuda.amp as amp
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+from diffusers.loaders.single_file_model import FromOriginalModelMixin
+from diffusers.models.attention import FeedForward
+from diffusers.models.attention_processor import Attention
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
+from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
+                             scale_lora_layers, unscale_lora_layers)
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from torch import nn
+from .fuser import (get_sequence_parallel_rank,
+                    get_sequence_parallel_world_size, get_sp_group,
+                    init_distributed_environment, initialize_model_parallel,
+                    xFuserLongContextAttention)
+
+def apply_rotary_emb_qwen(
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    use_real: bool = True,
+    use_real_unbind_dim: int = -1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
+    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
+    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
+    tensors contain rotary embeddings and are returned as real tensors.
+
+    Args:
+        x (`torch.Tensor`):
+            Query or key tensor to apply rotary embeddings. [B, S, H, D] xk (torch.Tensor): Key tensor to apply
+        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if use_real:
+        cos, sin = freqs_cis  # [S, D]
+        cos = cos[None, None]
+        sin = sin[None, None]
+        cos, sin = cos.to(x.device), sin.to(x.device)
+
+        if use_real_unbind_dim == -1:
+            # Used for flux, cogvideox, hunyuan-dit
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        elif use_real_unbind_dim == -2:
+            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
+            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+        else:
+            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+
+        return out
+    else:
+        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        freqs_cis = freqs_cis.unsqueeze(1)
+        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+
+        return x_out.type_as(x)
+
+
+class QwenImageMultiGPUsAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the QwenImage model. It applies a rotary embedding on
+    query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("QwenImageMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,  # Image stream
+        encoder_hidden_states: torch.FloatTensor = None,  # Text stream
+        encoder_hidden_states_mask: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        if encoder_hidden_states is None:
+            raise ValueError("QwenImageMultiGPUsAttnProcessor2_0 requires encoder_hidden_states (text stream)")
+
+        seq_txt = encoder_hidden_states.shape[1]
+
+        # Compute QKV for image stream (sample projections)
+        img_query = attn.to_q(hidden_states)
+        img_key = attn.to_k(hidden_states)
+        img_value = attn.to_v(hidden_states)
+
+        # Compute QKV for text stream (context projections)
+        txt_query = attn.add_q_proj(encoder_hidden_states)
+        txt_key = attn.add_k_proj(encoder_hidden_states)
+        txt_value = attn.add_v_proj(encoder_hidden_states)
+
+        # Reshape for multi-head attention
+        img_query = img_query.unflatten(-1, (attn.heads, -1))
+        img_key = img_key.unflatten(-1, (attn.heads, -1))
+        img_value = img_value.unflatten(-1, (attn.heads, -1))
+
+        txt_query = txt_query.unflatten(-1, (attn.heads, -1))
+        txt_key = txt_key.unflatten(-1, (attn.heads, -1))
+        txt_value = txt_value.unflatten(-1, (attn.heads, -1))
+
+        # Apply QK normalization
+        if attn.norm_q is not None:
+            img_query = attn.norm_q(img_query)
+        if attn.norm_k is not None:
+            img_key = attn.norm_k(img_key)
+        if attn.norm_added_q is not None:
+            txt_query = attn.norm_added_q(txt_query)
+        if attn.norm_added_k is not None:
+            txt_key = attn.norm_added_k(txt_key)
+
+        # Apply RoPE
+        if image_rotary_emb is not None:
+            img_freqs, txt_freqs = image_rotary_emb
+            img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
+            img_key = apply_rotary_emb_qwen(img_key, img_freqs, use_real=False)
+            txt_query = apply_rotary_emb_qwen(txt_query, txt_freqs, use_real=False)
+            txt_key = apply_rotary_emb_qwen(txt_key, txt_freqs, use_real=False)
+
+        # Concatenate for joint attention
+        # Order: [text, image]
+        # joint_query = torch.cat([txt_query, img_query], dim=1)
+        # joint_key = torch.cat([txt_key, img_key], dim=1)
+        # joint_value = torch.cat([txt_value, img_value], dim=1)
+
+        half_dtypes = (torch.float16, torch.bfloat16)
+        def half(x):
+            return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
+
+        joint_hidden_states = xFuserLongContextAttention()(
+            None,
+            half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
+            joint_tensor_query=half(txt_query),
+            joint_tensor_key=half(txt_key),
+            joint_tensor_value=half(txt_value),
+            joint_strategy='front',
+        )
+
+        # Reshape back
+        joint_hidden_states = joint_hidden_states.flatten(2, 3)
+        joint_hidden_states = joint_hidden_states.to(img_query.dtype)
+
+        # Split attention outputs back
+        txt_attn_output = joint_hidden_states[:, :seq_txt, :]  # Text part
+        img_attn_output = joint_hidden_states[:, seq_txt:, :]  # Image part
+
+        # Apply output projections
+        img_attn_output = attn.to_out[0](img_attn_output)
+        if len(attn.to_out) > 1:
+            img_attn_output = attn.to_out[1](img_attn_output)  # dropout
+
+        txt_attn_output = attn.to_add_out(txt_attn_output)
+
+        return img_attn_output, txt_attn_output
videox_fun/dist/wan_xfuser.py ADDED
@@ -0,0 +1,180 @@
+import torch
+import torch.cuda.amp as amp
+
+from .fuser import (get_sequence_parallel_rank,
+                    get_sequence_parallel_world_size, get_sp_group,
+                    init_distributed_environment, initialize_model_parallel,
+                    xFuserLongContextAttention)
+
+
+def pad_freqs(original_tensor, target_len):
+    seq_len, s1, s2 = original_tensor.shape
+    pad_size = target_len - seq_len
+    padding_tensor = torch.ones(
+        pad_size,
+        s1,
+        s2,
+        dtype=original_tensor.dtype,
+        device=original_tensor.device)
+    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
+    return padded_tensor
+
+@amp.autocast(enabled=False)
+@torch.compiler.disable()
+def rope_apply(x, grid_sizes, freqs):
+    """
+    x: [B, L, N, C].
+    grid_sizes: [B, 3].
+    freqs: [M, C // 2].
+    """
+    s, n, c = x.size(1), x.size(2), x.size(3) // 2
+    # split freqs
+    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
+
+    # loop over samples
+    output = []
+    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
+        seq_len = f * h * w
+
+        # precompute multipliers
+        x_i = torch.view_as_complex(x[i, :s].to(torch.float32).reshape(
+            s, n, -1, 2))
+        freqs_i = torch.cat([
+            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
+            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
+        ],
+                            dim=-1).reshape(seq_len, 1, -1)
+
+        # apply rotary embedding
+        sp_size = get_sequence_parallel_world_size()
+        sp_rank = get_sequence_parallel_rank()
+        freqs_i = pad_freqs(freqs_i, s * sp_size)
+        s_per_rank = s
+        freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
+                                                       s_per_rank), :, :]
+        x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
+        x_i = torch.cat([x_i, x[i, s:]])
+
+        # append to collection
+        output.append(x_i)
+    return torch.stack(output)
+
+def rope_apply_qk(q, k, grid_sizes, freqs):
+    q = rope_apply(q, grid_sizes, freqs)
+    k = rope_apply(k, grid_sizes, freqs)
+    return q, k
+
+def usp_attn_forward(self,
+                     x,
+                     seq_lens,
+                     grid_sizes,
+                     freqs,
+                     dtype=torch.bfloat16,
+                     t=0):
+    b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
+    half_dtypes = (torch.float16, torch.bfloat16)
+
+    def half(x):
+        return x if x.dtype in half_dtypes else x.to(dtype)
+
+    # query, key, value function
+    def qkv_fn(x):
+        q = self.norm_q(self.q(x)).view(b, s, n, d)
+        k = self.norm_k(self.k(x)).view(b, s, n, d)
+        v = self.v(x).view(b, s, n, d)
+        return q, k, v
+
+    q, k, v = qkv_fn(x)
+    q, k = rope_apply_qk(q, k, grid_sizes, freqs)
+
+    # TODO: We should use unpadded q, k, v for attention.
+    # k_lens = seq_lens // get_sequence_parallel_world_size()
+    # if k_lens is not None:
+    #     q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
+    #     k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
+    #     v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
+
+    x = xFuserLongContextAttention()(
+        None,
+        query=half(q),
+        key=half(k),
+        value=half(v),
+        window_size=self.window_size)
+
+    # TODO: padding after attention.
+    # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
+
+    # output
+    x = x.flatten(2)
+    x = self.o(x)
+    return x
+
+@amp.autocast(enabled=False)
+@torch.compiler.disable()
+def s2v_rope_apply(x, grid_sizes, freqs):
+    s, n, c = x.size(1), x.size(2), x.size(3) // 2
+    # loop over samples
+    output = []
+    for i, _ in enumerate(x):
+        s = x.size(1)
+        # precompute multipliers
+        x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
+            s, n, -1, 2))
+        freqs_i = freqs[i]
+        freqs_i_rank = pad_freqs(freqs_i, s)
+        x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
+        x_i = torch.cat([x_i, x[i, s:]])
+        # append to collection
+        output.append(x_i)
+    return torch.stack(output).float()
+
+def s2v_rope_apply_qk(q, k, grid_sizes, freqs):
+    q = s2v_rope_apply(q, grid_sizes, freqs)
+    k = s2v_rope_apply(k, grid_sizes, freqs)
+    return q, k
+
+def usp_attn_s2v_forward(self,
+                         x,
+                         seq_lens,
+                         grid_sizes,
+                         freqs,
+                         dtype=torch.bfloat16,
+                         t=0):
+    b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
+    half_dtypes = (torch.float16, torch.bfloat16)
+
+    def half(x):
+        return x if x.dtype in half_dtypes else x.to(dtype)
+
+    # query, key, value function
+    def qkv_fn(x):
+        q = self.norm_q(self.q(x)).view(b, s, n, d)
+        k = self.norm_k(self.k(x)).view(b, s, n, d)
+        v = self.v(x).view(b, s, n, d)
+        return q, k, v
+
+    q, k, v = qkv_fn(x)
+    q, k = s2v_rope_apply_qk(q, k, grid_sizes, freqs)
+
+    # TODO: We should use unpadded q, k, v for attention.
+    # k_lens = seq_lens // get_sequence_parallel_world_size()
+    # if k_lens is not None:
+    #     q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
+    #     k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
+    #     v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
+
+    x = xFuserLongContextAttention()(
+        None,
+        query=half(q),
+        key=half(k),
+        value=half(v),
+        window_size=self.window_size)
+
+    # TODO: padding after attention.
+    # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
+
+    # output
+    x = x.flatten(2)
+    x = self.o(x)
+    return x
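
usp_attn_forward and usp_attn_s2v_forward are written as unbound forward functions. A hedged sketch of how they might be wired in, assuming a Wan-style transformer whose blocks expose a self_attn module with the q/k/v/o projections, norm_q/norm_k, num_heads, head_dim and window_size attributes these functions read:

    import types

    from videox_fun.dist import usp_attn_forward

    def enable_sequence_parallel_attention(transformer):
        # Replace each self-attention forward with the sequence-parallel version.
        for block in transformer.blocks:
            block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
        return transformer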
videox_fun/dist/z_image_xfuser.py ADDED
@@ -0,0 +1,85 @@
+import torch
+import torch.cuda.amp as amp
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.attention import Attention
+
+from .fuser import (get_sequence_parallel_rank,
+                    get_sequence_parallel_world_size, get_sp_group,
+                    init_distributed_environment, initialize_model_parallel,
+                    xFuserLongContextAttention)
+
+class ZMultiGPUsSingleStreamAttnProcessor:
+    """
+    Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
+    original Z-ImageAttention module.
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        freqs_cis: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        # Apply Norms
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE
+        def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+            with torch.amp.autocast("cuda", enabled=False):
+                x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
+                freqs_cis = freqs_cis.unsqueeze(2)
+                x_out = torch.view_as_real(x * freqs_cis).flatten(3)
+            return x_out.type_as(x_in)  # todo
+
+        if freqs_cis is not None:
+            query = apply_rotary_emb(query, freqs_cis)
+            key = apply_rotary_emb(key, freqs_cis)
+
+        # Cast to correct dtype
+        dtype = query.dtype
+        query, key = query.to(dtype), key.to(dtype)
+
+        # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
+        if attention_mask is not None and attention_mask.ndim == 2:
+            attention_mask = attention_mask[:, None, None, :]
+
+        # Compute joint attention
+        hidden_states = xFuserLongContextAttention()(
+            query,
+            key,
+            value,
+        )
+
+        # Reshape back
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(dtype)
+
+        output = attn.to_out[0](hidden_states)
+        if len(attn.to_out) > 1:  # dropout
+            output = attn.to_out[1](output)
+
+        return output