Commit: basic setup

Files changed:
- .gitignore +35 -0
- README.md +1 -0
- app.py +90 -2
- deps/statistics_bodilex.npy +3 -0
- diffusion/__init__.py +46 -0
- diffusion/diffusion_utils.py +88 -0
- diffusion/gaussian_diffusion.py +875 -0
- diffusion/respace.py +129 -0
- diffusion/timestep_sampler.py +150 -0
- gen_utils.py +11 -0
- geometry_utils.py +89 -0
- model_utils.py +64 -0
- normalization.py +150 -0
- requirements.txt +1 -0
- text_encoder.py +59 -0
- tmed_denoiser.py +404 -0
.gitignore ADDED

```
@@ -0,0 +1,35 @@
.err
*.out
/cluster_scripts
/condor_logs
lightning_logs
sinc-env
fast-cluster
eval-deps
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
```
README.md CHANGED

```
@@ -8,6 +8,7 @@ sdk_version: 4.36.1
 app_file: app.py
 pinned: false
 license: other
+models : ["openai/clip-vit-large-patch14"]
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py CHANGED

```
@@ -1,3 +1,4 @@
+from geometry_utils import diffout2motion
 import gradio as gr
 import spaces
 import torch
@@ -5,6 +6,36 @@ import random
 
 zero = torch.Tensor([0]).cuda()
 print(zero.device)  # <-- 'cpu' 🤔
+# Gül Varol
+
+WEBSITE = """
+<div class="embed_hidden">
+<h1 style='text-align: center'> ACRONYM: The actual title </h1>
+
+<h2 style='text-align: center'>
+<a href="https://google.com" target="_blank"><nobr>fname m. lname</nobr></a>
+<a href="https://google.com" target="_blank"><nobr>fname m. lname</nobr></a>
+<a href="https://google.com" target="_blank"><nobr>fname m. lname</nobr></a>
+</h2>
+
+<h2 style='text-align: center'>
+<nobr>XXX 2024</nobr>
+</h2>
+
+<h3 style="text-align:center;">
+<a target="_blank" href="https://arxiv.org/"> <button type="button" class="btn btn-primary btn-lg"> Paper </button></a>
+<a target="_blank" href="https://github.com/"> <button type="button" class="btn btn-primary btn-lg"> Code </button></a>
+<a target="_blank" href="google.com"> <button type="button" class="btn btn-primary btn-lg"> Webpage </button></a>
+<a target="_blank" href="bibfile.com"> <button type="button" class="btn btn-primary btn-lg"> BibTex </button></a>
+</h3>
+
+<h3> Description </h3>
+<p>
+This space illustrates <a href='project.com' target='_blank'><b>XXX</b></a>, a method for XXX.
+What does it do?
+</p>
+</div>
+"""
 
 @spaces.GPU
 def greet(n):
@@ -20,17 +51,74 @@ def clear():
 
 def random_number():
     return str(random.uniform(0, 100))
+from huggingface_hub import hf_hub_download, hf_hub_url, cached_download
+
+def download_models():
+    REPO_ID = 'athn-nik/example-model'
 
+    return hf_hub_download(REPO_ID, filename="min_checkpoint.ckpt")
+
 with gr.Blocks() as demo:
+    gr.Markdown(WEBSITE)
+
     input_text = gr.Textbox(label="Input Text")
-    output_text = gr.Textbox(label="Output Text")
+    # output_text = gr.Textbox(label="Output Text")
 
     with gr.Row():
         retrieve_button = gr.Button("Retrieve")
         clear_button = gr.Button("Clear")
         random_button = gr.Button("Random")
+    from normalization import Normalizer
+    normalizer = Normalizer()
+    # tmed_den = load_model()
+    from diffusion import create_diffusion
+    from text_encoder import ClipTextEncoder
+    from tmed_denoiser import TMED_denoiser
+    model_ckpt = download_models()
+    checkpoint = torch.load(model_ckpt)
+    print(checkpoint.keys())
+    checkpoint = {k.replace('denoiser.', ''): v for k, v in checkpoint.items()}
+    tmed_denoiser = TMED_denoiser().load_state_dict(checkpoint, strict=False)
+    text_encoder = ClipTextEncoder()
+    texts_cond = [input_text]
+    diffusion_process = create_diffusion(timestep_respacing=None,
+                                         learn_sigma=False, sigma_small=True,
+                                         diffusion_steps=300,
+                                         noise_schedule='squaredcos_cap_v2',
+                                         predict_type='sample',
+                                         predict_xstart=True)  # noise vs sample
+    # uncond_tokens = [""] * len(texts_cond)
+    # if self.condition == 'text':
+    #     uncond_tokens.extend(texts_cond)
+    # elif self.condition == 'text_uncond':
+    #     uncond_tokens.extend(uncond_tokens)
+    bsz = 1
+    seqlen_tgt = 180
+    no_of_texts = len(texts_cond)
+    texts_cond = ['']*no_of_texts + texts_cond
+    texts_cond = ['']*no_of_texts + texts_cond
+    text_emb, text_mask = text_encoder(texts_cond)
 
-
+    cond_emb_motion = torch.zeros(1, bsz,
+                                  512,
+                                  device='cuda')
+    cond_motion_mask = torch.ones((bsz, 1),
+                                  dtype=bool, device='cuda')
+    mask_target = torch.ones((1, bsz),
+                             dtype=bool, device='cuda')
+    # complete noise
+    diff_out = tmed_denoiser.diffusion_reverse(text_emb,
+                                               text_mask,
+                                               cond_emb_motion,
+                                               cond_motion_mask,
+                                               mask_target,
+                                               diffusion_process,
+                                               init_vec=None,
+                                               init_from='noise',
+                                               gd_text=4.0,
+                                               gd_motion=2.0,
+                                               steps_num=300)
+    edited_motion = diffout2motion(diff_out)
     clear_button.click(clear, outputs=input_text)
     random_button.click(random_number, outputs=input_text)
 
```
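Two things in the committed block above are worth flagging. `torch.nn.Module.load_state_dict` returns a NamedTuple of missing and unexpected keys rather than the module, so the `tmed_denoiser` assigned on new line 81 is not the model that `diffusion_reverse` is later called on; and the `['']*no_of_texts` prepend on new lines 98-99 runs twice, leaving two unconditional copies in the batch rather than the usual one (presumably intended for classifier-free guidance, given the `gd_text`/`gd_motion` scales). Also, `hf_hub_url` and `cached_download` are imported but unused, and `cached_download` is deprecated in recent `huggingface_hub` releases. A corrected loading sketch, assuming as the commit does that `TMED_denoiser()` is constructible with its defaults (its signature is not shown in this diff):

```python
import torch
from tmed_denoiser import TMED_denoiser  # repo module added in this commit

# Strip the Lightning-style 'denoiser.' prefix, as in the commit.
checkpoint = torch.load("min_checkpoint.ckpt", map_location="cpu")
checkpoint = {k.replace("denoiser.", ""): v for k, v in checkpoint.items()}

model = TMED_denoiser()
# load_state_dict returns (missing_keys, unexpected_keys), not the module,
# so keep the module reference separate.
missing, unexpected = model.load_state_dict(checkpoint, strict=False)
model = model.eval().cuda()
```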
deps/statistics_bodilex.npy ADDED

```
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a0be87962557d3149203eb4586f3e670c1bd7785765ad8cef9ed91f6277a2c2
size 4826
```
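The three lines above are a Git LFS pointer: the repository stores only the blob's sha256 and byte size, and the 4.8 kB array itself lives in LFS storage. A sketch of fetching and loading it; the repo id is a placeholder and the array's internal format is not shown in the diff:

```python
import numpy as np
from huggingface_hub import hf_hub_download

# "user/space-name" is a placeholder for the actual Space id.
path = hf_hub_download("user/space-name",
                       filename="deps/statistics_bodilex.npy",
                       repo_type="space")
# allow_pickle in case the file stores a dict of statistics (an assumption).
stats = np.load(path, allow_pickle=True)
```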
diffusion/__init__.py ADDED

```
@@ -0,0 +1,46 @@
# Modified from OpenAI's diffusion repos
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps


def create_diffusion(
    timestep_respacing,
    noise_schedule="linear",
    use_kl=False,
    sigma_small=False,
    predict_xstart=False,
    learn_sigma=True,
    rescale_learned_sigmas=False,
    diffusion_steps=1000
):
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE
    if timestep_respacing is None or timestep_respacing == "":
        timestep_respacing = [diffusion_steps]
    return SpacedDiffusion(
        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
        betas=betas,
        model_mean_type=(
            gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
        ),
        model_var_type=(
            (
                gd.ModelVarType.FIXED_LARGE
                if not sigma_small
                else gd.ModelVarType.FIXED_SMALL
            )
            if not learn_sigma
            else gd.ModelVarType.LEARNED_RANGE
        ),
        loss_type=loss_type
        # rescale_timesteps=rescale_timesteps,
    )
```
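`create_diffusion` is the factory app.py calls. Note that app.py passes `predict_type='sample'`, which the signature above does not define, so that call would raise a TypeError as committed. A call using only the parameters defined here (a sketch, not taken from the commit):

```python
from diffusion import create_diffusion

diffusion = create_diffusion(
    timestep_respacing=None,            # falls back to [diffusion_steps]
    noise_schedule="squaredcos_cap_v2",
    learn_sigma=False,
    sigma_small=True,                   # -> ModelVarType.FIXED_SMALL
    predict_xstart=True,                # -> ModelMeanType.START_X
    diffusion_steps=300,
)
```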
diffusion/diffusion_utils.py ADDED

```
@@ -0,0 +1,88 @@
# Modified from OpenAI's diffusion repos
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

import torch as th
import numpy as np


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.
    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    tensor = None
    for obj in (mean1, logvar1, mean2, logvar2):
        if isinstance(obj, th.Tensor):
            tensor = obj
            break
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
    logvar1, logvar2 = [
        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
        for x in (logvar1, logvar2)
    ]

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    )


def approx_standard_normal_cdf(x):
    """
    A fast approximation of the cumulative distribution function of the
    standard normal.
    """
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))


def continuous_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a continuous Gaussian distribution.
    :param x: the targets
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    normalized_x = centered_x * inv_stdv
    log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)
    return log_probs


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.
    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
    cdf_min = approx_standard_normal_cdf(min_in)
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    cdf_delta = cdf_plus - cdf_min
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
    )
    assert log_probs.shape == x.shape
    return log_probs
```
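These helpers back the variational-bound terms in gaussian_diffusion.py. As a quick sanity check of `normal_kl` (a sketch, not part of the commit): the KL divergence of a Gaussian with itself is zero, and it becomes positive as soon as the variances differ:

```python
import torch as th
from diffusion.diffusion_utils import normal_kl

mean = th.zeros(4, 3)
logvar = th.zeros(4, 3)
print(normal_kl(mean, logvar, mean, logvar).abs().max().item())   # 0.0: KL(p || p) = 0
print(normal_kl(mean, logvar, mean, logvar + 1.0).mean().item())  # ~0.18: > 0 once variances differ
```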
diffusion/gaussian_diffusion.py
ADDED
|
@@ -0,0 +1,875 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from OpenAI's diffusion repos
|
| 2 |
+
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
|
| 3 |
+
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
|
| 4 |
+
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
import math
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch as th
|
| 11 |
+
import enum
|
| 12 |
+
|
| 13 |
+
from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def mean_flat(tensor):
|
| 17 |
+
"""
|
| 18 |
+
Take the mean over all non-batch dimensions.
|
| 19 |
+
"""
|
| 20 |
+
return tensor.mean(dim=list(range(1, len(tensor.shape))))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ModelMeanType(enum.Enum):
|
| 24 |
+
"""
|
| 25 |
+
Which type of output the model predicts.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
|
| 29 |
+
START_X = enum.auto() # the model predicts x_0
|
| 30 |
+
EPSILON = enum.auto() # the model predicts epsilon
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ModelVarType(enum.Enum):
|
| 34 |
+
"""
|
| 35 |
+
What is used as the model's output variance.
|
| 36 |
+
The LEARNED_RANGE option has been added to allow the model to predict
|
| 37 |
+
values between FIXED_SMALL and FIXED_LARGE, making its job easier.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
LEARNED = enum.auto()
|
| 41 |
+
FIXED_SMALL = enum.auto()
|
| 42 |
+
FIXED_LARGE = enum.auto()
|
| 43 |
+
LEARNED_RANGE = enum.auto()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class LossType(enum.Enum):
|
| 47 |
+
MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
|
| 48 |
+
RESCALED_MSE = (
|
| 49 |
+
enum.auto()
|
| 50 |
+
) # use raw MSE loss (with RESCALED_KL when learning variances)
|
| 51 |
+
KL = enum.auto() # use the variational lower-bound
|
| 52 |
+
RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
|
| 53 |
+
|
| 54 |
+
def is_vb(self):
|
| 55 |
+
return self == LossType.KL or self == LossType.RESCALED_KL
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
|
| 59 |
+
betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
|
| 60 |
+
warmup_time = int(num_diffusion_timesteps * warmup_frac)
|
| 61 |
+
betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
|
| 62 |
+
return betas
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
|
| 66 |
+
"""
|
| 67 |
+
This is the deprecated API for creating beta schedules.
|
| 68 |
+
See get_named_beta_schedule() for the new library of schedules.
|
| 69 |
+
"""
|
| 70 |
+
if beta_schedule == "quad":
|
| 71 |
+
betas = (
|
| 72 |
+
np.linspace(
|
| 73 |
+
beta_start ** 0.5,
|
| 74 |
+
beta_end ** 0.5,
|
| 75 |
+
num_diffusion_timesteps,
|
| 76 |
+
dtype=np.float64,
|
| 77 |
+
)
|
| 78 |
+
** 2
|
| 79 |
+
)
|
| 80 |
+
elif beta_schedule == "linear":
|
| 81 |
+
betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
|
| 82 |
+
elif beta_schedule == "warmup10":
|
| 83 |
+
betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
|
| 84 |
+
elif beta_schedule == "warmup50":
|
| 85 |
+
betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
|
| 86 |
+
elif beta_schedule == "const":
|
| 87 |
+
betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
|
| 88 |
+
elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
|
| 89 |
+
betas = 1.0 / np.linspace(
|
| 90 |
+
num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
|
| 91 |
+
)
|
| 92 |
+
else:
|
| 93 |
+
raise NotImplementedError(beta_schedule)
|
| 94 |
+
assert betas.shape == (num_diffusion_timesteps,)
|
| 95 |
+
return betas
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
|
| 99 |
+
"""
|
| 100 |
+
Get a pre-defined beta schedule for the given name.
|
| 101 |
+
The beta schedule library consists of beta schedules which remain similar
|
| 102 |
+
in the limit of num_diffusion_timesteps.
|
| 103 |
+
Beta schedules may be added, but should not be removed or changed once
|
| 104 |
+
they are committed to maintain backwards compatibility.
|
| 105 |
+
"""
|
| 106 |
+
if schedule_name == "linear":
|
| 107 |
+
# Linear schedule from Ho et al, extended to work for any number of
|
| 108 |
+
# diffusion steps.
|
| 109 |
+
scale = 1000 / num_diffusion_timesteps
|
| 110 |
+
return get_beta_schedule(
|
| 111 |
+
"linear",
|
| 112 |
+
beta_start=scale * 0.0001,
|
| 113 |
+
beta_end=scale * 0.02,
|
| 114 |
+
num_diffusion_timesteps=num_diffusion_timesteps,
|
| 115 |
+
)
|
| 116 |
+
elif schedule_name == "squaredcos_cap_v2":
|
| 117 |
+
return betas_for_alpha_bar(
|
| 118 |
+
num_diffusion_timesteps,
|
| 119 |
+
lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
|
| 120 |
+
)
|
| 121 |
+
else:
|
| 122 |
+
raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
|
| 126 |
+
"""
|
| 127 |
+
Create a beta schedule that discretizes the given alpha_t_bar function,
|
| 128 |
+
which defines the cumulative product of (1-beta) over time from t = [0,1].
|
| 129 |
+
:param num_diffusion_timesteps: the number of betas to produce.
|
| 130 |
+
:param alpha_bar: a lambda that takes an argument t from 0 to 1 and
|
| 131 |
+
produces the cumulative product of (1-beta) up to that
|
| 132 |
+
part of the diffusion process.
|
| 133 |
+
:param max_beta: the maximum beta to use; use values lower than 1 to
|
| 134 |
+
prevent singularities.
|
| 135 |
+
"""
|
| 136 |
+
betas = []
|
| 137 |
+
for i in range(num_diffusion_timesteps):
|
| 138 |
+
t1 = i / num_diffusion_timesteps
|
| 139 |
+
t2 = (i + 1) / num_diffusion_timesteps
|
| 140 |
+
betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
|
| 141 |
+
return np.array(betas)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class GaussianDiffusion:
|
| 145 |
+
"""
|
| 146 |
+
Utilities for training and sampling diffusion models.
|
| 147 |
+
Original ported from this codebase:
|
| 148 |
+
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
|
| 149 |
+
:param betas: a 1-D numpy array of betas for each diffusion timestep,
|
| 150 |
+
starting at T and going to 1.
|
| 151 |
+
"""
|
| 152 |
+
|
| 153 |
+
def __init__(
|
| 154 |
+
self,
|
| 155 |
+
*,
|
| 156 |
+
betas,
|
| 157 |
+
model_mean_type,
|
| 158 |
+
model_var_type,
|
| 159 |
+
loss_type
|
| 160 |
+
):
|
| 161 |
+
|
| 162 |
+
self.model_mean_type = model_mean_type
|
| 163 |
+
self.model_var_type = model_var_type
|
| 164 |
+
self.loss_type = loss_type
|
| 165 |
+
|
| 166 |
+
# Use float64 for accuracy.
|
| 167 |
+
betas = np.array(betas, dtype=np.float64)
|
| 168 |
+
self.betas = betas
|
| 169 |
+
assert len(betas.shape) == 1, "betas must be 1-D"
|
| 170 |
+
assert (betas > 0).all() and (betas <= 1).all()
|
| 171 |
+
|
| 172 |
+
self.num_timesteps = int(betas.shape[0])
|
| 173 |
+
|
| 174 |
+
alphas = 1.0 - betas
|
| 175 |
+
self.alphas_cumprod = np.cumprod(alphas, axis=0)
|
| 176 |
+
self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
|
| 177 |
+
self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
|
| 178 |
+
assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
|
| 179 |
+
|
| 180 |
+
# calculations for diffusion q(x_t | x_{t-1}) and others
|
| 181 |
+
self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
|
| 182 |
+
self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
|
| 183 |
+
self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
|
| 184 |
+
self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
|
| 185 |
+
self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
|
| 186 |
+
|
| 187 |
+
# calculations for posterior q(x_{t-1} | x_t, x_0)
|
| 188 |
+
self.posterior_variance = (
|
| 189 |
+
betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
|
| 190 |
+
)
|
| 191 |
+
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
|
| 192 |
+
self.posterior_log_variance_clipped = np.log(
|
| 193 |
+
np.append(self.posterior_variance[1], self.posterior_variance[1:])
|
| 194 |
+
) if len(self.posterior_variance) > 1 else np.array([])
|
| 195 |
+
|
| 196 |
+
self.posterior_mean_coef1 = (
|
| 197 |
+
betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
|
| 198 |
+
)
|
| 199 |
+
self.posterior_mean_coef2 = (
|
| 200 |
+
(1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
def q_mean_variance(self, x_start, t):
|
| 204 |
+
"""
|
| 205 |
+
Get the distribution q(x_t | x_0).
|
| 206 |
+
:param x_start: the [N x C x ...] tensor of noiseless inputs.
|
| 207 |
+
:param t: the number of diffusion steps (minus 1). Here, 0 means one step.
|
| 208 |
+
:return: A tuple (mean, variance, log_variance), all of x_start's shape.
|
| 209 |
+
"""
|
| 210 |
+
mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
|
| 211 |
+
variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
|
| 212 |
+
log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
|
| 213 |
+
return mean, variance, log_variance
|
| 214 |
+
|
| 215 |
+
def q_sample(self, x_start, t, noise=None):
|
| 216 |
+
"""
|
| 217 |
+
Diffuse the data for a given number of diffusion steps.
|
| 218 |
+
In other words, sample from q(x_t | x_0).
|
| 219 |
+
:param x_start: the initial data batch.
|
| 220 |
+
:param t: the number of diffusion steps (minus 1). Here, 0 means one step.
|
| 221 |
+
:param noise: if specified, the split-out normal noise.
|
| 222 |
+
:return: A noisy version of x_start.
|
| 223 |
+
"""
|
| 224 |
+
if noise is None:
|
| 225 |
+
noise = th.randn_like(x_start)
|
| 226 |
+
assert noise.shape == x_start.shape
|
| 227 |
+
return (
|
| 228 |
+
_extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
|
| 229 |
+
+ _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
def q_posterior_mean_variance(self, x_start, x_t, t):
|
| 233 |
+
"""
|
| 234 |
+
Compute the mean and variance of the diffusion posterior:
|
| 235 |
+
q(x_{t-1} | x_t, x_0)
|
| 236 |
+
"""
|
| 237 |
+
assert x_start.shape == x_t.shape
|
| 238 |
+
posterior_mean = (
|
| 239 |
+
_extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
|
| 240 |
+
+ _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
|
| 241 |
+
)
|
| 242 |
+
posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
|
| 243 |
+
posterior_log_variance_clipped = _extract_into_tensor(
|
| 244 |
+
self.posterior_log_variance_clipped, t, x_t.shape
|
| 245 |
+
)
|
| 246 |
+
assert (
|
| 247 |
+
posterior_mean.shape[0]
|
| 248 |
+
== posterior_variance.shape[0]
|
| 249 |
+
== posterior_log_variance_clipped.shape[0]
|
| 250 |
+
== x_start.shape[0]
|
| 251 |
+
)
|
| 252 |
+
return posterior_mean, posterior_variance, posterior_log_variance_clipped
|
| 253 |
+
|
| 254 |
+
def p_mean_variance(self, model, x, t, clip_denoised=False, denoised_fn=None, model_kwargs=None):
|
| 255 |
+
"""
|
| 256 |
+
Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
|
| 257 |
+
the initial x, x_0.
|
| 258 |
+
:param model: the model, which takes a signal and a batch of timesteps
|
| 259 |
+
as input.
|
| 260 |
+
:param x: the [N x C x ...] tensor at time t.
|
| 261 |
+
:param t: a 1-D Tensor of timesteps.
|
| 262 |
+
:param clip_denoised: if True, clip the denoised signal into [-1, 1].
|
| 263 |
+
:param denoised_fn: if not None, a function which applies to the
|
| 264 |
+
x_start prediction before it is used to sample. Applies before
|
| 265 |
+
clip_denoised.
|
| 266 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
| 267 |
+
pass to the model. This can be used for conditioning.
|
| 268 |
+
:return: a dict with the following keys:
|
| 269 |
+
- 'mean': the model mean output.
|
| 270 |
+
- 'variance': the model variance output.
|
| 271 |
+
- 'log_variance': the log of 'variance'.
|
| 272 |
+
- 'pred_xstart': the prediction for x_0.
|
| 273 |
+
"""
|
| 274 |
+
if model_kwargs is None:
|
| 275 |
+
model_kwargs = {}
|
| 276 |
+
|
| 277 |
+
B, C = x.shape[:2]
|
| 278 |
+
assert t.shape == (B,)
|
| 279 |
+
model_output = model(x, t, **model_kwargs)
|
| 280 |
+
if isinstance(model_output, tuple):
|
| 281 |
+
model_output, extra = model_output
|
| 282 |
+
else:
|
| 283 |
+
extra = None
|
| 284 |
+
|
| 285 |
+
if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
|
| 286 |
+
assert model_output.shape == (B, C * 2, *x.shape[2:])
|
| 287 |
+
model_output, model_var_values = th.split(model_output, C, dim=1)
|
| 288 |
+
min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
|
| 289 |
+
max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
|
| 290 |
+
# The model_var_values is [-1, 1] for [min_var, max_var].
|
| 291 |
+
frac = (model_var_values + 1) / 2
|
| 292 |
+
model_log_variance = frac * max_log + (1 - frac) * min_log
|
| 293 |
+
model_variance = th.exp(model_log_variance)
|
| 294 |
+
else:
|
| 295 |
+
model_variance, model_log_variance = {
|
| 296 |
+
# for fixedlarge, we set the initial (log-)variance like so
|
| 297 |
+
# to get a better decoder log likelihood.
|
| 298 |
+
ModelVarType.FIXED_LARGE: (
|
| 299 |
+
np.append(self.posterior_variance[1], self.betas[1:]),
|
| 300 |
+
np.log(np.append(self.posterior_variance[1], self.betas[1:])),
|
| 301 |
+
),
|
| 302 |
+
ModelVarType.FIXED_SMALL: (
|
| 303 |
+
self.posterior_variance,
|
| 304 |
+
self.posterior_log_variance_clipped,
|
| 305 |
+
),
|
| 306 |
+
}[self.model_var_type]
|
| 307 |
+
model_variance = _extract_into_tensor(model_variance, t, x.shape)
|
| 308 |
+
model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
|
| 309 |
+
|
| 310 |
+
def process_xstart(x):
|
| 311 |
+
if denoised_fn is not None:
|
| 312 |
+
x = denoised_fn(x)
|
| 313 |
+
if clip_denoised:
|
| 314 |
+
return x.clamp(-1, 1)
|
| 315 |
+
return x
|
| 316 |
+
|
| 317 |
+
if self.model_mean_type == ModelMeanType.START_X:
|
| 318 |
+
pred_xstart = process_xstart(model_output)
|
| 319 |
+
else:
|
| 320 |
+
pred_xstart = process_xstart(
|
| 321 |
+
self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
|
| 322 |
+
)
|
| 323 |
+
model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
|
| 324 |
+
|
| 325 |
+
assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
|
| 326 |
+
return {
|
| 327 |
+
"mean": model_mean,
|
| 328 |
+
"variance": model_variance,
|
| 329 |
+
"log_variance": model_log_variance,
|
| 330 |
+
"pred_xstart": pred_xstart,
|
| 331 |
+
"extra": extra,
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
def _predict_xstart_from_eps(self, x_t, t, eps):
|
| 335 |
+
assert x_t.shape == eps.shape
|
| 336 |
+
return (
|
| 337 |
+
_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
|
| 338 |
+
- _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
|
| 342 |
+
return (
|
| 343 |
+
_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
|
| 344 |
+
) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
|
| 345 |
+
|
| 346 |
+
def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
|
| 347 |
+
"""
|
| 348 |
+
Compute the mean for the previous step, given a function cond_fn that
|
| 349 |
+
computes the gradient of a conditional log probability with respect to
|
| 350 |
+
x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
|
| 351 |
+
condition on y.
|
| 352 |
+
This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
|
| 353 |
+
"""
|
| 354 |
+
gradient = cond_fn(x, t, **model_kwargs)
|
| 355 |
+
new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
|
| 356 |
+
return new_mean
|
| 357 |
+
|
| 358 |
+
def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
|
| 359 |
+
"""
|
| 360 |
+
Compute what the p_mean_variance output would have been, should the
|
| 361 |
+
model's score function be conditioned by cond_fn.
|
| 362 |
+
See condition_mean() for details on cond_fn.
|
| 363 |
+
Unlike condition_mean(), this instead uses the conditioning strategy
|
| 364 |
+
from Song et al (2020).
|
| 365 |
+
"""
|
| 366 |
+
alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
|
| 367 |
+
|
| 368 |
+
eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
|
| 369 |
+
eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
|
| 370 |
+
|
| 371 |
+
out = p_mean_var.copy()
|
| 372 |
+
out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
|
| 373 |
+
out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
|
| 374 |
+
return out
|
| 375 |
+
|
| 376 |
+
def p_sample(
|
| 377 |
+
self,
|
| 378 |
+
model,
|
| 379 |
+
x,
|
| 380 |
+
t,
|
| 381 |
+
clip_denoised=False,
|
| 382 |
+
denoised_fn=None,
|
| 383 |
+
cond_fn=None,
|
| 384 |
+
model_kwargs=None,
|
| 385 |
+
):
|
| 386 |
+
"""
|
| 387 |
+
Sample x_{t-1} from the model at the given timestep.
|
| 388 |
+
:param model: the model to sample from.
|
| 389 |
+
:param x: the current tensor at x_{t-1}.
|
| 390 |
+
:param t: the value of t, starting at 0 for the first diffusion step.
|
| 391 |
+
:param clip_denoised: if True, clip the x_start prediction to [-1, 1].
|
| 392 |
+
:param denoised_fn: if not None, a function which applies to the
|
| 393 |
+
x_start prediction before it is used to sample.
|
| 394 |
+
:param cond_fn: if not None, this is a gradient function that acts
|
| 395 |
+
similarly to the model.
|
| 396 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
| 397 |
+
pass to the model. This can be used for conditioning.
|
| 398 |
+
:return: a dict containing the following keys:
|
| 399 |
+
- 'sample': a random sample from the model.
|
| 400 |
+
- 'pred_xstart': a prediction of x_0.
|
| 401 |
+
"""
|
| 402 |
+
out = self.p_mean_variance(
|
| 403 |
+
model,
|
| 404 |
+
x,
|
| 405 |
+
t,
|
| 406 |
+
clip_denoised=clip_denoised,
|
| 407 |
+
denoised_fn=denoised_fn,
|
| 408 |
+
model_kwargs=model_kwargs,
|
| 409 |
+
)
|
| 410 |
+
noise = th.randn_like(x)
|
| 411 |
+
nonzero_mask = (
|
| 412 |
+
(t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
|
| 413 |
+
) # no noise when t == 0
|
| 414 |
+
if cond_fn is not None:
|
| 415 |
+
out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
|
| 416 |
+
sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
|
| 417 |
+
return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
| 418 |
+
|
| 419 |
+
def p_sample_loop(
|
| 420 |
+
self,
|
| 421 |
+
model,
|
| 422 |
+
shape,
|
| 423 |
+
noise=None,
|
| 424 |
+
clip_denoised=False,
|
| 425 |
+
denoised_fn=None,
|
| 426 |
+
cond_fn=None,
|
| 427 |
+
model_kwargs=None,
|
| 428 |
+
device=None,
|
| 429 |
+
progress=False,
|
| 430 |
+
):
|
| 431 |
+
"""
|
| 432 |
+
Generate samples from the model.
|
| 433 |
+
:param model: the model module.
|
| 434 |
+
:param shape: the shape of the samples, (N, C, H, W).
|
| 435 |
+
:param noise: if specified, the noise from the encoder to sample.
|
| 436 |
+
Should be of the same shape as `shape`.
|
| 437 |
+
:param clip_denoised: if True, clip x_start predictions to [-1, 1].
|
| 438 |
+
:param denoised_fn: if not None, a function which applies to the
|
| 439 |
+
x_start prediction before it is used to sample.
|
| 440 |
+
:param cond_fn: if not None, this is a gradient function that acts
|
| 441 |
+
similarly to the model.
|
| 442 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
| 443 |
+
pass to the model. This can be used for conditioning.
|
| 444 |
+
:param device: if specified, the device to create the samples on.
|
| 445 |
+
If not specified, use a model parameter's device.
|
| 446 |
+
:param progress: if True, show a tqdm progress bar.
|
| 447 |
+
:return: a non-differentiable batch of samples.
|
| 448 |
+
"""
|
| 449 |
+
final = None
|
| 450 |
+
for sample in self.p_sample_loop_progressive(
|
| 451 |
+
model,
|
| 452 |
+
shape,
|
| 453 |
+
noise=noise,
|
| 454 |
+
clip_denoised=clip_denoised,
|
| 455 |
+
denoised_fn=denoised_fn,
|
| 456 |
+
cond_fn=cond_fn,
|
| 457 |
+
model_kwargs=model_kwargs,
|
| 458 |
+
device=device,
|
| 459 |
+
progress=progress,
|
| 460 |
+
):
|
| 461 |
+
final = sample
|
| 462 |
+
return final["sample"]
|
| 463 |
+
|
| 464 |
+
def p_sample_loop_progressive(
|
| 465 |
+
self,
|
| 466 |
+
model,
|
| 467 |
+
shape,
|
| 468 |
+
noise=None,
|
| 469 |
+
clip_denoised=False,
|
| 470 |
+
denoised_fn=None,
|
| 471 |
+
cond_fn=None,
|
| 472 |
+
model_kwargs=None,
|
| 473 |
+
device=None,
|
| 474 |
+
progress=False,
|
| 475 |
+
):
|
| 476 |
+
"""
|
| 477 |
+
Generate samples from the model and yield intermediate samples from
|
| 478 |
+
each timestep of diffusion.
|
| 479 |
+
Arguments are the same as p_sample_loop().
|
| 480 |
+
Returns a generator over dicts, where each dict is the return value of
|
| 481 |
+
p_sample().
|
| 482 |
+
"""
|
| 483 |
+
if device is None:
|
| 484 |
+
device = next(model.parameters()).device
|
| 485 |
+
assert isinstance(shape, (tuple, list))
|
| 486 |
+
if noise is not None:
|
| 487 |
+
img = noise
|
| 488 |
+
else:
|
| 489 |
+
img = th.randn(*shape, device=device)
|
| 490 |
+
indices = list(range(self.num_timesteps))[::-1]
|
| 491 |
+
|
| 492 |
+
if progress:
|
| 493 |
+
# Lazy import so that we don't depend on tqdm.
|
| 494 |
+
from tqdm.auto import tqdm
|
| 495 |
+
|
| 496 |
+
indices = tqdm(indices)
|
| 497 |
+
|
| 498 |
+
for i in indices:
|
| 499 |
+
t = th.tensor([i] * shape[0], device=device)
|
| 500 |
+
with th.no_grad():
|
| 501 |
+
out = self.p_sample(
|
| 502 |
+
model,
|
| 503 |
+
img,
|
| 504 |
+
t,
|
| 505 |
+
clip_denoised=False,
|
| 506 |
+
denoised_fn=denoised_fn,
|
| 507 |
+
cond_fn=cond_fn,
|
| 508 |
+
model_kwargs=model_kwargs,
|
| 509 |
+
)
|
| 510 |
+
yield out
|
| 511 |
+
img = out["sample"]
|
| 512 |
+
|
| 513 |
+
def ddim_sample(
|
| 514 |
+
self,
|
| 515 |
+
model,
|
| 516 |
+
x,
|
| 517 |
+
t,
|
| 518 |
+
clip_denoised=False,
|
| 519 |
+
denoised_fn=None,
|
| 520 |
+
cond_fn=None,
|
| 521 |
+
model_kwargs=None,
|
| 522 |
+
eta=0.0,
|
| 523 |
+
):
|
| 524 |
+
"""
|
| 525 |
+
Sample x_{t-1} from the model using DDIM.
|
| 526 |
+
Same usage as p_sample().
|
| 527 |
+
"""
|
| 528 |
+
out = self.p_mean_variance(
|
| 529 |
+
model,
|
| 530 |
+
x,
|
| 531 |
+
t,
|
| 532 |
+
clip_denoised=False,
|
| 533 |
+
denoised_fn=denoised_fn,
|
| 534 |
+
model_kwargs=model_kwargs,
|
| 535 |
+
)
|
| 536 |
+
if cond_fn is not None:
|
| 537 |
+
out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
|
| 538 |
+
|
| 539 |
+
# Usually our model outputs epsilon, but we re-derive it
|
| 540 |
+
# in case we used x_start or x_prev prediction.
|
| 541 |
+
eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
|
| 542 |
+
|
| 543 |
+
alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
|
| 544 |
+
alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
|
| 545 |
+
sigma = (
|
| 546 |
+
eta
|
| 547 |
+
* th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
|
| 548 |
+
* th.sqrt(1 - alpha_bar / alpha_bar_prev)
|
| 549 |
+
)
|
| 550 |
+
# Equation 12.
|
| 551 |
+
noise = th.randn_like(x)
|
| 552 |
+
mean_pred = (
|
| 553 |
+
out["pred_xstart"] * th.sqrt(alpha_bar_prev)
|
| 554 |
+
+ th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
|
| 555 |
+
)
|
| 556 |
+
nonzero_mask = (
|
| 557 |
+
(t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
|
| 558 |
+
) # no noise when t == 0
|
| 559 |
+
sample = mean_pred + nonzero_mask * sigma * noise
|
| 560 |
+
return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
| 561 |
+
|
| 562 |
+
def ddim_reverse_sample(
|
| 563 |
+
self,
|
| 564 |
+
model,
|
| 565 |
+
x,
|
| 566 |
+
t,
|
| 567 |
+
clip_denoised=False,
|
| 568 |
+
denoised_fn=None,
|
| 569 |
+
cond_fn=None,
|
| 570 |
+
model_kwargs=None,
|
| 571 |
+
eta=0.0,
|
| 572 |
+
):
|
| 573 |
+
"""
|
| 574 |
+
Sample x_{t+1} from the model using DDIM reverse ODE.
|
| 575 |
+
"""
|
| 576 |
+
assert eta == 0.0, "Reverse ODE only for deterministic path"
|
| 577 |
+
out = self.p_mean_variance(
|
| 578 |
+
model,
|
| 579 |
+
x,
|
| 580 |
+
t,
|
| 581 |
+
clip_denoised=clip_denoised,
|
| 582 |
+
denoised_fn=denoised_fn,
|
| 583 |
+
model_kwargs=model_kwargs,
|
| 584 |
+
)
|
| 585 |
+
if cond_fn is not None:
|
| 586 |
+
out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
|
| 587 |
+
# Usually our model outputs epsilon, but we re-derive it
|
| 588 |
+
# in case we used x_start or x_prev prediction.
|
| 589 |
+
eps = (
|
| 590 |
+
_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
|
| 591 |
+
- out["pred_xstart"]
|
| 592 |
+
) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
|
| 593 |
+
alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
|
| 594 |
+
|
| 595 |
+
# Equation 12. reversed
|
| 596 |
+
mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
|
| 597 |
+
|
| 598 |
+
return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
|
| 599 |
+
|
| 600 |
+
def ddim_sample_loop(
|
| 601 |
+
self,
|
| 602 |
+
model,
|
| 603 |
+
shape,
|
| 604 |
+
noise=None,
|
| 605 |
+
clip_denoised=True,
|
| 606 |
+
denoised_fn=None,
|
| 607 |
+
cond_fn=None,
|
| 608 |
+
model_kwargs=None,
|
| 609 |
+
device=None,
|
| 610 |
+
progress=False,
|
| 611 |
+
eta=0.0,
|
| 612 |
+
):
|
| 613 |
+
"""
|
| 614 |
+
Generate samples from the model using DDIM.
|
| 615 |
+
Same usage as p_sample_loop().
|
| 616 |
+
"""
|
| 617 |
+
final = None
|
| 618 |
+
for sample in self.ddim_sample_loop_progressive(
|
| 619 |
+
model,
|
| 620 |
+
shape,
|
| 621 |
+
noise=noise,
|
| 622 |
+
clip_denoised=clip_denoised,
|
| 623 |
+
denoised_fn=denoised_fn,
|
| 624 |
+
cond_fn=cond_fn,
|
| 625 |
+
model_kwargs=model_kwargs,
|
| 626 |
+
device=device,
|
| 627 |
+
progress=progress,
|
| 628 |
+
eta=eta,
|
| 629 |
+
):
|
| 630 |
+
final = sample
|
| 631 |
+
return final["sample"]
|
| 632 |
+
|
| 633 |
+
def ddim_sample_loop_progressive(
|
| 634 |
+
self,
|
| 635 |
+
model,
|
| 636 |
+
shape,
|
| 637 |
+
noise=None,
|
| 638 |
+
clip_denoised=True,
|
| 639 |
+
denoised_fn=None,
|
| 640 |
+
cond_fn=None,
|
| 641 |
+
model_kwargs=None,
|
| 642 |
+
device=None,
|
| 643 |
+
progress=False,
|
| 644 |
+
eta=0.0,
|
| 645 |
+
):
|
| 646 |
+
"""
|
| 647 |
+
Use DDIM to sample from the model and yield intermediate samples from
|
| 648 |
+
each timestep of DDIM.
|
| 649 |
+
Same usage as p_sample_loop_progressive().
|
| 650 |
+
"""
|
| 651 |
+
if device is None:
|
| 652 |
+
device = next(model.parameters()).device
|
| 653 |
+
assert isinstance(shape, (tuple, list))
|
| 654 |
+
if noise is not None:
|
| 655 |
+
img = noise
|
| 656 |
+
else:
|
| 657 |
+
img = th.randn(*shape, device=device)
|
| 658 |
+
indices = list(range(self.num_timesteps))[::-1]
|
| 659 |
+
|
| 660 |
+
if progress:
|
| 661 |
+
# Lazy import so that we don't depend on tqdm.
|
| 662 |
+
from tqdm.auto import tqdm
|
| 663 |
+
|
| 664 |
+
indices = tqdm(indices)
|
| 665 |
+
|
| 666 |
+
for i in indices:
|
| 667 |
+
t = th.tensor([i] * shape[0], device=device)
|
| 668 |
+
with th.no_grad():
|
| 669 |
+
out = self.ddim_sample(
|
| 670 |
+
model,
|
| 671 |
+
img,
|
| 672 |
+
t,
|
| 673 |
+
clip_denoised=clip_denoised,
|
| 674 |
+
denoised_fn=denoised_fn,
|
| 675 |
+
cond_fn=cond_fn,
|
| 676 |
+
model_kwargs=model_kwargs,
|
| 677 |
+
eta=eta,
|
| 678 |
+
)
|
| 679 |
+
yield out
|
| 680 |
+
img = out["sample"]
|
| 681 |
+
|
| 682 |
+
def _vb_terms_bpd(
|
| 683 |
+
self, model, x_start, x_t, t, clip_denoised=False, model_kwargs=None
|
| 684 |
+
):
|
| 685 |
+
"""
|
| 686 |
+
Get a term for the variational lower-bound.
|
| 687 |
+
The resulting units are bits (rather than nats, as one might expect).
|
| 688 |
+
This allows for comparison to other papers.
|
| 689 |
+
:return: a dict with the following keys:
|
| 690 |
+
- 'output': a shape [N] tensor of NLLs or KLs.
|
| 691 |
+
- 'pred_xstart': the x_0 predictions.
|
| 692 |
+
"""
|
| 693 |
+
true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
|
| 694 |
+
x_start=x_start, x_t=x_t, t=t
|
| 695 |
+
)
|
| 696 |
+
out = self.p_mean_variance(
|
| 697 |
+
model, x_t, t, clip_denoised=False, model_kwargs=model_kwargs
|
| 698 |
+
)
|
| 699 |
+
kl = normal_kl(
|
| 700 |
+
true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
|
| 701 |
+
)
|
| 702 |
+
kl = mean_flat(kl) / np.log(2.0)
|
| 703 |
+
|
| 704 |
+
decoder_nll = -discretized_gaussian_log_likelihood(
|
| 705 |
+
x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
|
| 706 |
+
)
|
| 707 |
+
assert decoder_nll.shape == x_start.shape
|
| 708 |
+
decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
|
| 709 |
+
|
| 710 |
+
# At the first timestep return the decoder NLL,
|
| 711 |
+
# otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
|
| 712 |
+
output = th.where((t == 0), decoder_nll, kl)
|
| 713 |
+
return {"output": output, "pred_xstart": out["pred_xstart"]}
|
| 714 |
+
|
| 715 |
+
def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
|
| 716 |
+
"""
|
| 717 |
+
Compute training losses for a single timestep.
|
| 718 |
+
:param model: the model to evaluate loss on.
|
| 719 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
| 720 |
+
:param t: a batch of timestep indices.
|
| 721 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
| 722 |
+
pass to the model. This can be used for conditioning.
|
| 723 |
+
:param noise: if specified, the specific Gaussian noise to try to remove.
|
| 724 |
+
:return: a dict with the key "loss" containing a tensor of shape [N].
|
| 725 |
+
Some mean or variance settings may also have other keys.
|
| 726 |
+
"""
|
| 727 |
+
if model_kwargs is None:
|
| 728 |
+
model_kwargs = {}
|
| 729 |
+
if noise is None:
|
| 730 |
+
noise = th.randn_like(x_start)
|
| 731 |
+
x_t = self.q_sample(x_start, t, noise=noise)
|
| 732 |
+
|
| 733 |
+
terms = {}
|
| 734 |
+
|
| 735 |
+
if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
|
| 736 |
+
terms["loss"] = self._vb_terms_bpd(
|
| 737 |
+
model=model,
|
| 738 |
+
x_start=x_start,
|
| 739 |
+
x_t=x_t,
|
| 740 |
+
t=t,
|
| 741 |
+
clip_denoised=False,
|
| 742 |
+
model_kwargs=model_kwargs,
|
| 743 |
+
)["output"]
|
| 744 |
+
if self.loss_type == LossType.RESCALED_KL:
|
| 745 |
+
terms["loss"] *= self.num_timesteps
|
| 746 |
+
elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
|
| 747 |
+
model_output = model(x_t, t, **model_kwargs)
|
| 748 |
+
|
| 749 |
+
if self.model_var_type in [
|
| 750 |
+
ModelVarType.LEARNED,
|
| 751 |
+
ModelVarType.LEARNED_RANGE,
|
| 752 |
+
]:
|
| 753 |
+
B, C = x_t.shape[:2]
|
| 754 |
+
assert model_output.shape == (B, C * 2, *x_t.shape[2:])
|
| 755 |
+
model_output, model_var_values = th.split(model_output, C, dim=1)
|
| 756 |
+
# Learn the variance using the variational bound, but don't let
|
| 757 |
+
# it affect our mean prediction.
|
| 758 |
+
frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
|
| 759 |
+
terms["vb"] = self._vb_terms_bpd(
|
| 760 |
+
model=lambda *args, r=frozen_out: r,
|
| 761 |
+
x_start=x_start,
|
| 762 |
+
x_t=x_t,
|
| 763 |
+
t=t,
|
| 764 |
+
clip_denoised=False,
|
| 765 |
+
)["output"]
|
| 766 |
+
if self.loss_type == LossType.RESCALED_MSE:
|
| 767 |
+
# Divide by 1000 for equivalence with initial implementation.
|
| 768 |
+
# Without a factor of 1/1000, the VB term hurts the MSE term.
|
| 769 |
+
terms["vb"] *= self.num_timesteps / 1000.0
|
| 770 |
+
|
| 771 |
+
target = {
|
| 772 |
+
ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
|
| 773 |
+
x_start=x_start, x_t=x_t, t=t
|
| 774 |
+
)[0],
|
| 775 |
+
ModelMeanType.START_X: x_start,
|
| 776 |
+
ModelMeanType.EPSILON: noise,
|
| 777 |
+
}[self.model_mean_type]
|
| 778 |
+
assert model_output.shape == target.shape == x_start.shape
|
| 779 |
+
terms["mse"] = mean_flat((target - model_output) ** 2)
|
| 780 |
+
terms["target"] = target
|
| 781 |
+
terms['model_output'] = model_output
|
| 782 |
+
if "vb" in terms:
|
| 783 |
+
terms["loss"] = terms["mse"] + terms["vb"]
|
| 784 |
+
else:
|
| 785 |
+
terms["loss"] = terms["mse"]
|
| 786 |
+
else:
|
| 787 |
+
raise NotImplementedError(self.loss_type)
|
+
+        return terms
+
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+        This term can't be optimized, as it only depends on the encoder.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+
+    def calc_bpd_loop(self, model, x_start, clip_denoised=False, model_kwargs=None):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+
+        vb = []
+        xstart_mse = []
+        mse = []
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = th.tensor([t] * batch_size, device=device)
+            noise = th.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            # Calculate VLB term at the current timestep
+            with th.no_grad():
+                out = self._vb_terms_bpd(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                )
+            vb.append(out["output"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+
+        vb = th.stack(vb, dim=1)
+        xstart_mse = th.stack(xstart_mse, dim=1)
+        mse = th.stack(mse, dim=1)
+
+        prior_bpd = self._prior_bpd(x_start)
+        total_bpd = vb.sum(dim=1) + prior_bpd
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res + th.zeros(broadcast_shape, device=timesteps.device)
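As a quick illustration of the broadcasting that _extract_into_tensor performs (a minimal sketch with made-up shapes, not part of the commit): it gathers one schedule value per batch element and pads trailing singleton dims so the result broadcasts against an [N x C x ...] tensor.

    import numpy as np
    import torch as th

    betas = np.linspace(1e-4, 2e-2, 1000)      # a 1-D schedule array
    t = th.tensor([0, 499, 999])               # one timestep per batch element
    out = _extract_into_tensor(betas, t, (3, 207, 1))
    # out.shape == (3, 207, 1): betas[t] repeated over the feature dims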
diffusion/respace.py
ADDED
@@ -0,0 +1,129 @@
+# Modified from OpenAI's diffusion repos
+# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+import numpy as np
+import torch as th
+
+from .gaussian_diffusion import GaussianDiffusion
+
+
+def space_timesteps(num_timesteps, section_counts):
+    """
+    Create a list of timesteps to use from an original diffusion process,
+    given the number of timesteps we want to take from equally-sized portions
+    of the original process.
+    For example, if there are 300 timesteps and the section counts are [10,15,20],
+    then the first 100 timesteps are strided to be 10 timesteps, the second 100
+    are strided to be 15 timesteps, and the final 100 are strided to be 20.
+    If the stride is a string starting with "ddim", then the fixed striding
+    from the DDIM paper is used, and only one section is allowed.
+    :param num_timesteps: the number of diffusion steps in the original
+                          process to divide up.
+    :param section_counts: either a list of numbers, or a string containing
+                           comma-separated numbers, indicating the step count
+                           per section. As a special case, use "ddimN" where N
+                           is a number of steps to use the striding from the
+                           DDIM paper.
+    :return: a set of diffusion steps from the original process to use.
+    """
+    if isinstance(section_counts, str):
+        if section_counts.startswith("ddim"):
+            desired_count = int(section_counts[len("ddim") :])
+            for i in range(1, num_timesteps):
+                if len(range(0, num_timesteps, i)) == desired_count:
+                    return set(range(0, num_timesteps, i))
+            raise ValueError(
+                f"cannot create exactly {desired_count} steps with an integer stride"
+            )
+        section_counts = [int(x) for x in section_counts.split(",")]
+    size_per = num_timesteps // len(section_counts)
+    extra = num_timesteps % len(section_counts)
+    start_idx = 0
+    all_steps = []
+    for i, section_count in enumerate(section_counts):
+        size = size_per + (1 if i < extra else 0)
+        if size < section_count:
+            raise ValueError(
+                f"cannot divide section of {size} steps into {section_count}"
+            )
+        if section_count <= 1:
+            frac_stride = 1
+        else:
+            frac_stride = (size - 1) / (section_count - 1)
+        cur_idx = 0.0
+        taken_steps = []
+        for _ in range(section_count):
+            taken_steps.append(start_idx + round(cur_idx))
+            cur_idx += frac_stride
+        all_steps += taken_steps
+        start_idx += size
+    return set(all_steps)
+
+
+class SpacedDiffusion(GaussianDiffusion):
+    """
+    A diffusion process which can skip steps in a base diffusion process.
+    :param use_timesteps: a collection (sequence or set) of timesteps from the
+                          original diffusion process to retain.
+    :param kwargs: the kwargs to create the base diffusion process.
+    """
+
+    def __init__(self, use_timesteps, **kwargs):
+        self.use_timesteps = set(use_timesteps)
+        self.timestep_map = []
+        self.original_num_steps = len(kwargs["betas"])
+
+        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
+        last_alpha_cumprod = 1.0
+        new_betas = []
+        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+            if i in self.use_timesteps:
+                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                last_alpha_cumprod = alpha_cumprod
+                self.timestep_map.append(i)
+        kwargs["betas"] = np.array(new_betas)
+        super().__init__(**kwargs)
+
+    def p_mean_variance(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
+
+    def training_losses(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().training_losses(self._wrap_model(model), *args, **kwargs)
+
+    def condition_mean(self, cond_fn, *args, **kwargs):
+        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
+
+    def condition_score(self, cond_fn, *args, **kwargs):
+        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
+
+    def _wrap_model(self, model):
+        if isinstance(model, _WrappedModel):
+            return model
+        return _WrappedModel(
+            model, self.timestep_map, self.original_num_steps
+        )
+
+    def _scale_timesteps(self, t):
+        # Scaling is done by the wrapped model.
+        return t
+
+
+class _WrappedModel:
+    def __init__(self, model, timestep_map, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        # self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+
+    def __call__(self, x, ts, **kwargs):
+        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        # if self.rescale_timesteps:
+        #     new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, new_ts, **kwargs)
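A quick usage sketch for space_timesteps (the first call restates the docstring's own example; the other values are illustrative, not from the commit):

    space_timesteps(300, [10, 15, 20])   # 45 steps drawn from three 100-step sections
    space_timesteps(300, "10,15,20")     # same, via the comma-separated string form
    space_timesteps(1000, "ddim50")      # 50 steps with the fixed DDIM stride (every 20th step)

SpacedDiffusion is then constructed with use_timesteps set to such a set, plus the kwargs of the base process (including betas).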
diffusion/timestep_sampler.py
ADDED
@@ -0,0 +1,150 @@
+# Modified from OpenAI's diffusion repos
+# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+from abc import ABC, abstractmethod
+
+import numpy as np
+import torch as th
+import torch.distributed as dist
+
+
+def create_named_schedule_sampler(name, diffusion):
+    """
+    Create a ScheduleSampler from a library of pre-defined samplers.
+    :param name: the name of the sampler.
+    :param diffusion: the diffusion object to sample for.
+    """
+    if name == "uniform":
+        return UniformSampler(diffusion)
+    elif name == "loss-second-moment":
+        return LossSecondMomentResampler(diffusion)
+    else:
+        raise NotImplementedError(f"unknown schedule sampler: {name}")
+
+
+class ScheduleSampler(ABC):
+    """
+    A distribution over timesteps in the diffusion process, intended to reduce
+    variance of the objective.
+    By default, samplers perform unbiased importance sampling, in which the
+    objective's mean is unchanged.
+    However, subclasses may override sample() to change how the resampled
+    terms are reweighted, allowing for actual changes in the objective.
+    """
+
+    @abstractmethod
+    def weights(self):
+        """
+        Get a numpy array of weights, one per diffusion step.
+        The weights needn't be normalized, but must be positive.
+        """
+
+    def sample(self, batch_size, device):
+        """
+        Importance-sample timesteps for a batch.
+        :param batch_size: the number of timesteps.
+        :param device: the torch device to save to.
+        :return: a tuple (timesteps, weights):
+                 - timesteps: a tensor of timestep indices.
+                 - weights: a tensor of weights to scale the resulting losses.
+        """
+        w = self.weights()
+        p = w / np.sum(w)
+        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
+        indices = th.from_numpy(indices_np).long().to(device)
+        weights_np = 1 / (len(p) * p[indices_np])
+        weights = th.from_numpy(weights_np).float().to(device)
+        return indices, weights
+
+
+class UniformSampler(ScheduleSampler):
+    def __init__(self, diffusion):
+        self.diffusion = diffusion
+        self._weights = np.ones([diffusion.num_timesteps])
+
+    def weights(self):
+        return self._weights
+
+
+class LossAwareSampler(ScheduleSampler):
+    def update_with_local_losses(self, local_ts, local_losses):
+        """
+        Update the reweighting using losses from a model.
+        Call this method from each rank with a batch of timesteps and the
+        corresponding losses for each of those timesteps.
+        This method will perform synchronization to make sure all of the ranks
+        maintain the exact same reweighting.
+        :param local_ts: an integer Tensor of timesteps.
+        :param local_losses: a 1D Tensor of losses.
+        """
+        batch_sizes = [
+            th.tensor([0], dtype=th.int32, device=local_ts.device)
+            for _ in range(dist.get_world_size())
+        ]
+        dist.all_gather(
+            batch_sizes,
+            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
+        )
+
+        # Pad all_gather batches to be the maximum batch size.
+        batch_sizes = [x.item() for x in batch_sizes]
+        max_bs = max(batch_sizes)
+
+        timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
+        loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
+        dist.all_gather(timestep_batches, local_ts)
+        dist.all_gather(loss_batches, local_losses)
+        timesteps = [
+            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
+        ]
+        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
+        self.update_with_all_losses(timesteps, losses)
+
+    @abstractmethod
+    def update_with_all_losses(self, ts, losses):
+        """
+        Update the reweighting using losses from a model.
+        Sub-classes should override this method to update the reweighting
+        using losses from the model.
+        This method directly updates the reweighting without synchronizing
+        between workers. It is called by update_with_local_losses from all
+        ranks with identical arguments. Thus, it should have deterministic
+        behavior to maintain state across workers.
+        :param ts: a list of int timesteps.
+        :param losses: a list of float losses, one per timestep.
+        """
+
+
+class LossSecondMomentResampler(LossAwareSampler):
+    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
+        self.diffusion = diffusion
+        self.history_per_term = history_per_term
+        self.uniform_prob = uniform_prob
+        self._loss_history = np.zeros(
+            [diffusion.num_timesteps, history_per_term], dtype=np.float64
+        )
+        # np.int was removed in NumPy 1.24; use the explicit np.int64
+        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)
+
+    def weights(self):
+        if not self._warmed_up():
+            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
+        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
+        weights /= np.sum(weights)
+        weights *= 1 - self.uniform_prob
+        weights += self.uniform_prob / len(weights)
+        return weights
+
+    def update_with_all_losses(self, ts, losses):
+        for t, loss in zip(ts, losses):
+            if self._loss_counts[t] == self.history_per_term:
+                # Shift out the oldest loss term.
+                self._loss_history[t, :-1] = self._loss_history[t, 1:]
+                self._loss_history[t, -1] = loss
+            else:
+                self._loss_history[t, self._loss_counts[t]] = loss
+                self._loss_counts[t] += 1
+
+    def _warmed_up(self):
+        return (self._loss_counts == self.history_per_term).all()
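A minimal sketch of how these samplers are used in a training loop (assuming a diffusion object that exposes num_timesteps; model and x_start are placeholders):

    sampler = create_named_schedule_sampler("uniform", diffusion)
    t, weights = sampler.sample(batch_size=32, device='cpu')
    # losses = diffusion.training_losses(model, x_start, t)["loss"]
    # loss = (losses * weights).mean()

For the uniform sampler all weights are 1; LossSecondMomentResampler starts uniform and only reweights once every timestep has history_per_term recorded losses.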
gen_utils.py
ADDED
@@ -0,0 +1,11 @@
+import torch
+import numpy as np
+
+
+def cast_dict_to_tensors(d, device="cpu"):
+    if isinstance(d, dict):
+        return {k: cast_dict_to_tensors(v, device) for k, v in d.items()}
+    elif isinstance(d, np.ndarray):
+        return torch.from_numpy(d).float().to(device)
+    elif isinstance(d, torch.Tensor):
+        return d.to(device)
+    else:
+        return d
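cast_dict_to_tensors recursively converts a (possibly nested) dict of numpy arrays into float tensors on the requested device; this is exactly how normalization.py below loads its statistics:

    stats = cast_dict_to_tensors(
        np.load('deps/statistics_bodilex.npy', allow_pickle=True)[()],
        device='cuda')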
geometry_utils.py
ADDED
@@ -0,0 +1,89 @@
+import torch
+
+
+def diffout2motion(self, diffout):
+    # NOTE: written as an unbound method; `self` must be a Normalizer-like
+    # object exposing input_feats, input_feats_dims, device, cat_inputs,
+    # uncat_inputs and unnorm_inputs (the original code used `self` without
+    # declaring it in the signature).
+    # Expected feature layout:
+    # - "body_transl_delta_pelv_xy_wo_z"
+    # - "body_transl_z"
+    # - "z_orient_delta"
+    # - "body_orient_xy"
+    # - "body_pose"
+    # - "body_joints_local_wo_z_rot"
+    # Lazy import, kept inside the function as in the original so the module
+    # imports without these dependencies; transform_body_pose is assumed to
+    # live in the same module as the other helpers (the original imported it
+    # only after its first use).
+    from src.tools.transforms3d import (transform_body_pose, apply_rot_delta,
+                                        get_z_rot, change_for)
+
+    feats_unnorm = self.cat_inputs(self.unnorm_inputs(
+                                        self.uncat_inputs(diffout,
+                                                          self.input_feats_dims),
+                                        self.input_feats))[0]
+    # FIRST POSE FOR GENERATION & DELTAS FOR INTEGRATION
+    if "body_joints_local_wo_z_rot" in self.input_feats:
+        idx = self.input_feats.index("body_joints_local_wo_z_rot")
+        feats_unnorm = feats_unnorm[..., :-self.input_feats_dims[idx]]
+
+    first_trans = torch.zeros(*diffout.shape[:-1], 3,
+                              device=self.device)[:, [0]]
+    if 'z_orient_delta' in self.input_feats:
+        first_orient_z = torch.eye(3, device=self.device).unsqueeze(0)  # (1, 3, 3)
+        first_orient_z = first_orient_z.repeat(feats_unnorm.shape[0], 1, 1)  # (B, 3, 3)
+        first_orient_z = transform_body_pose(first_orient_z, 'rot->6d')
+
+        # integrate the z-orientation deltas --> z component of the orientation
+        z_orient_delta = feats_unnorm[..., 9:15]
+        prev_z = first_orient_z
+        full_z_angle = [first_orient_z[:, None]]
+        for i in range(1, z_orient_delta.shape[1]):
+            curr_z = apply_rot_delta(prev_z, z_orient_delta[:, i])
+            prev_z = curr_z.clone()
+            full_z_angle.append(curr_z[:, None])
+        full_z_angle = torch.cat(full_z_angle, dim=1)
+        full_z_angle_rotmat = get_z_rot(full_z_angle)
+        xy_orient = feats_unnorm[..., 3:9]
+        xy_orient_rotmat = transform_body_pose(xy_orient, '6d->rot')
+
+        # GLOBAL ORIENTATION: compose the integrated z rotation with the xy part
+        full_global_orient_rotmat = full_z_angle_rotmat @ xy_orient_rotmat
+        full_global_orient = transform_body_pose(full_global_orient_rotmat,
+                                                 'rot->6d')
+
+    first_trans = self.cat_inputs(self.unnorm_inputs([first_trans],
+                                                     ['body_transl']))[0]
+
+    # apply deltas: velocity in the global c.f. added to the state position
+    assert 'body_transl_delta_pelv' in self.input_feats
+    pelvis_delta = feats_unnorm[..., :3]
+    trans_vel_pelv = change_for(pelvis_delta[:, 1:],
+                                full_global_orient_rotmat[:, :-1],
+                                forward=False)
+
+    # FULL TRANSLATION
+    full_trans = torch.cumsum(trans_vel_pelv, dim=1) + first_trans
+    full_trans = torch.cat([first_trans, full_trans], dim=1)
+
+    full_rots = torch.cat([full_global_orient,
+                           feats_unnorm[..., -21*6:]],
+                          dim=-1)
+    full_motion_unnorm = torch.cat([full_trans,
+                                    full_rots], dim=-1)
+
+    return full_motion_unnorm
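Because diffout2motion is written against a Normalizer-like self, a plausible call looks as follows (a sketch under that assumption; note that the function also expects a device attribute, which the committed Normalizer does not set):

    normalizer = Normalizer()                     # from normalization.py below
    normalizer.device = 'cuda'                    # assumed by diffout2motion
    motion = diffout2motion(normalizer, diffout)  # diffout: [B, T, 207] denoised features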
model_utils.py
ADDED
@@ -0,0 +1,64 @@
+import numpy as np
+import torch
+from torch import nn
+
+
+class TimestepEmbedderMDM(nn.Module):
+    def __init__(self, latent_dim):
+        super().__init__()
+        self.latent_dim = latent_dim
+
+        time_embed_dim = self.latent_dim
+        self.sequence_pos_encoder = PositionalEncoding(d_model=self.latent_dim)
+        # TODO: make the time embedding learnable
+        self.time_embed = nn.Sequential(
+            nn.Linear(self.latent_dim, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim),
+        )
+
+    def forward(self, timesteps):
+        return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1,
+                 max_len=5000, batch_first=False, negative=False):
+        super().__init__()
+        self.batch_first = batch_first
+
+        self.dropout = nn.Dropout(p=dropout)
+        self.max_len = max_len
+
+        self.negative = negative
+
+        if negative:
+            pe = torch.zeros(2*max_len, d_model)
+            position = torch.arange(-max_len, max_len, dtype=torch.float).unsqueeze(1)
+        else:
+            pe = torch.zeros(max_len, d_model)
+            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+
+        self.register_buffer('pe', pe, persistent=False)
+
+    def forward(self, x, hist_frames=0):
+        if not self.negative:
+            center = 0
+            assert hist_frames == 0
+            first = 0
+        else:
+            center = self.max_len
+            first = center - hist_frames
+        if self.batch_first:
+            last = first + x.shape[1]
+            x = x + self.pe.permute(1, 0, 2)[:, first:last, :]
+        else:
+            last = first + x.shape[0]
+            x = x + self.pe[first:last, :]
+        return self.dropout(x)
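A shape-check sketch for the timestep embedder (illustrative values, not part of the commit): the positional-encoding table is indexed by the diffusion timestep, so each sample yields one latent-dim token.

    emb = TimestepEmbedderMDM(latent_dim=512)
    t = torch.tensor([999, 10])   # one diffusion timestep per batch element
    print(emb(t).shape)           # torch.Size([1, 2, 512])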
normalization.py
ADDED
@@ -0,0 +1,150 @@
+from os.path import exists
+from typing import List
+
+import numpy as np
+import torch
+from einops import rearrange
+from torch import Tensor
+
+from gen_utils import cast_dict_to_tensors
+
+
+class Normalizer:
+    def __init__(self, statistics_path: str = 'deps/statistics_bodilex.npy',
+                 nfeats: int = 207,
+                 input_feats: List[str] = ["body_transl_delta_pelv",
+                                           "body_orient_xy",
+                                           "z_orient_delta", "body_pose",
+                                           "body_joints_local_wo_z_rot"],
+                 dim_per_feat: List[int] = [3, 6, 6, 126, 66], *args, **kwargs):
+
+        self.stats = self.load_norm_statistics(statistics_path, 'cuda')
+        self.nfeats = nfeats
+        self.dim_per_feat = dim_per_feat
+        self.input_feats_dims = list(dim_per_feat)
+        self.input_feats = list(input_feats)
+
+    def load_norm_statistics(self, path, device):
+        # workaround for cluster local/sync
+        assert exists(path)
+        stats = np.load(path, allow_pickle=True)[()]
+        return cast_dict_to_tensors(stats, device=device)
+
+    def norm_and_cat(self, batch, features_types):
+        """
+        turn batch data into the format the forward() function expects
+        """
+        seq_first = lambda t: rearrange(t, 'b s ... -> s b ...')
+        input_batch = {}
+        ## PREPARE INPUT ##
+        mo_types = ['source', 'target']
+        for mot in mo_types:
+            list_of_feat_tensors = [seq_first(batch[f'{feat_type}_{mot}'])
+                                    for feat_type in features_types
+                                    if f'{feat_type}_{mot}' in batch.keys()]
+            # normalise and cat to a unified feature vector
+            list_of_feat_tensors_normed = self.norm_inputs(list_of_feat_tensors,
+                                                           features_types)
+            x_norm, _ = self.cat_inputs(list_of_feat_tensors_normed)
+            input_batch[mot] = x_norm
+        return input_batch
+
+    def norm_and_cat_single_motion(self, batch, features_types):
+        """
+        turn batch data into the format the forward() function expects
+        """
+        seq_first = lambda t: rearrange(t, 'b s ... -> s b ...')
+        input_batch = {}
+        ## PREPARE INPUT ##
+        list_of_feat_tensors = [seq_first(batch[feat_type])
+                                for feat_type in features_types]
+        # normalise and cat to a unified feature vector
+        list_of_feat_tensors_normed = self.norm_inputs(list_of_feat_tensors,
+                                                       features_types)
+        x_norm, _ = self.cat_inputs(list_of_feat_tensors_normed)
+        input_batch['motion'] = x_norm
+        return input_batch
+
+    def norm(self, x, stats):
+        mean = stats['mean'].to('cuda')
+        std = stats['std'].to('cuda')
+        return (x - mean) / (std + 1e-5)
+
+    def unnorm(self, x, stats):
+        mean = stats['mean'].to('cuda')
+        std = stats['std'].to('cuda')
+        return x * (std + 1e-5) + mean
+
+    def unnorm_state(self, state_norm: Tensor) -> Tensor:
+        # unnorm state; NOTE: expects self.first_pose_feats(_dims) to be set
+        # elsewhere -- they are not initialised in this class as committed
+        return self.cat_inputs(
+            self.unnorm_inputs(self.uncat_inputs(state_norm,
+                                                 self.first_pose_feats_dims),
+                               self.first_pose_feats))[0]
+
+    def unnorm_delta(self, delta_norm: Tensor) -> Tensor:
+        # unnorm delta
+        return self.cat_inputs(
+            self.unnorm_inputs(self.uncat_inputs(delta_norm,
+                                                 self.input_feats_dims),
+                               self.input_feats))[0]
+
+    def norm_state(self, state: Tensor) -> Tensor:
+        # normalise state; same caveat as unnorm_state above
+        return self.cat_inputs(
+            self.norm_inputs(self.uncat_inputs(state,
+                                               self.first_pose_feats_dims),
+                             self.first_pose_feats))[0]
+
+    def norm_delta(self, delta: Tensor) -> Tensor:
+        # normalise delta
+        return self.cat_inputs(
+            self.norm_inputs(self.uncat_inputs(delta, self.input_feats_dims),
+                             self.input_feats))[0]
+
+    def cat_inputs(self, x_list: List[Tensor]):
+        """
+        cat the inputs to a unified vector and return their lengths in order
+        to un-cat them later
+        """
+        return torch.cat(x_list, dim=-1), [x.shape[-1] for x in x_list]
+
+    def uncat_inputs(self, x: Tensor, lengths: List[int]):
+        """
+        split the unified feature vector back to its original parts
+        """
+        return torch.split(x, lengths, dim=-1)
+
+    def norm_inputs(self, x_list: List[Tensor], names: List[str]):
+        """
+        Normalise inputs using the self.stats metrics
+        """
+        x_norm = []
+        for x, name in zip(x_list, names):
+            x_norm.append(self.norm(x, self.stats[name]))
+        return x_norm
+
+    def unnorm_inputs(self, x_list: List[Tensor], names: List[str]):
+        """
+        Un-normalise inputs using the self.stats metrics
+        """
+        x_unnorm = []
+        for x, name in zip(x_list, names):
+            x_unnorm.append(self.unnorm(x, self.stats[name]))
+        return x_unnorm
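A round-trip sketch for the per-feature normalization (runs only where deps/statistics_bodilex.npy exists and a GPU is available, since the stats are loaded to 'cuda'):

    normalizer = Normalizer()
    x = torch.randn(16, 60, 207, device='cuda')   # [B, T, 207] unified feature vector
    parts = normalizer.uncat_inputs(x, normalizer.input_feats_dims)
    normed = normalizer.norm_inputs(parts, normalizer.input_feats)
    x_back, dims = normalizer.cat_inputs(
        normalizer.unnorm_inputs(normed, normalizer.input_feats))
    # x_back reproduces x up to float error; dims == [3, 6, 6, 126, 66], summing to 207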
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
 spaces
 gradio==4.36.1
 torch
+transformers==4.41.2
text_encoder.py
ADDED
@@ -0,0 +1,59 @@
+import os
+from typing import List
+
+import torch
+from torch import nn
+
+
+class ClipTextEncoder(nn.Module):
+    def __init__(
+        self,
+        modelpath: str = 'deps/clip-vit-large-patch14',  # or clip-vit-base-patch32
+        finetune: bool = False,
+        **kwargs
+    ) -> None:
+
+        super().__init__()
+        from transformers import logging
+        from transformers import AutoModel, AutoTokenizer
+        logging.set_verbosity_error()
+        # Tokenizer
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
+        self.text_model = AutoModel.from_pretrained(modelpath)
+
+        # Don't train the model
+        if not finetune:
+            # was `self.text_model.training = False`, which does not
+            # propagate to submodules; .eval() is the correct idiom
+            self.text_model.eval()
+            for p in self.text_model.parameters():
+                p.requires_grad = False
+
+        # Then configure the model
+        self.max_length = self.tokenizer.model_max_length
+        self.text_encoded_dim = self.text_model.config.text_config.hidden_size
+
+    def forward(self, texts: List[str]):
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            texts,
+            padding="max_length",
+            truncation=True,
+            max_length=self.max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(self.text_model.device)
+        txt_att_mask = text_inputs.attention_mask.to(self.text_model.device)
+        # truncate to the max length CLIP can handle
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
+
+        # text encoder forward: per-token embeddings of shape
+        # (batch_size, seq_length, text_encoded_dim)
+        text_embeddings = self.text_model.text_model(text_input_ids,
+                                                     # attention_mask=txt_att_mask
+                                                     ).last_hidden_state
+        return text_embeddings, txt_att_mask.bool()
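A usage sketch for the text encoder. The default modelpath points at a local deps/ copy; the README metadata lists openai/clip-vit-large-patch14, so the hub id should work equally (assuming hub access):

    text_encoder = ClipTextEncoder(modelpath='openai/clip-vit-large-patch14')
    embs, mask = text_encoder(['raise the left hand higher'])
    # embs: [1, 77, 768] per-token CLIP features; mask: [1, 77] bool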
tmed_denoiser.py
ADDED
@@ -0,0 +1,404 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from model_utils import TimestepEmbedderMDM
+from model_utils import PositionalEncoding
+
+
+class TMED_denoiser(nn.Module):
+
+    def __init__(self,
+                 nfeats: int = 207,
+                 condition: str = "text",
+                 latent_dim: int = 512,
+                 ff_size: int = 1024,
+                 num_layers: int = 8,
+                 num_heads: int = 4,
+                 dropout: float = 0.1,
+                 activation: str = "gelu",
+                 text_encoded_dim: int = 768,
+                 pred_delta_motion: bool = False,
+                 use_sep: bool = True,
+                 **kwargs) -> None:
+
+        super().__init__()
+        self.latent_dim = latent_dim
+        self.pred_delta_motion = pred_delta_motion
+        self.text_encoded_dim = text_encoded_dim
+        self.condition = condition
+        self.feat_comb_coeff = nn.Parameter(torch.tensor([1.0]))
+        self.pose_proj_in_source = nn.Linear(nfeats, self.latent_dim)
+        self.pose_proj_in_target = nn.Linear(nfeats, self.latent_dim)
+        self.pose_proj_out = nn.Linear(self.latent_dim, nfeats)
+
+        # emb proj
+        if self.condition in ["text", "text_uncond"]:
+            # text condition: embed the timestep into latent_dim
+            self.embed_timestep = TimestepEmbedderMDM(self.latent_dim)
+            # project text to latent_dim if the dims differ
+            if text_encoded_dim != self.latent_dim:
+                self.emb_proj = nn.Linear(text_encoded_dim, self.latent_dim)
+        else:
+            raise TypeError(f"condition type {self.condition} not supported")
+        self.use_sep = use_sep
+        self.query_pos = PositionalEncoding(self.latent_dim, dropout)
+        self.mem_pos = PositionalEncoding(self.latent_dim, dropout)
+        if self.use_sep:
+            self.sep_token = nn.Parameter(torch.randn(1, self.latent_dim))
+
+        # use the torch transformer
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=self.latent_dim,
+            nhead=num_heads,
+            dim_feedforward=ff_size,
+            dropout=dropout,
+            activation=activation)
+        self.encoder = nn.TransformerEncoder(encoder_layer,
+                                             num_layers=num_layers)
+
+    def forward(self,
+                noised_motion,
+                timestep,
+                in_motion_mask,
+                text_embeds,
+                condition_mask,
+                motion_embeds=None,
+                lengths=None,
+                **kwargs):
+        # 0. dimension matching: [bs, nframes, nfeats] -> [nframes, bs, nfeats]
+        bs = noised_motion.shape[0]
+        noised_motion = noised_motion.permute(1, 0, 2)
+        motion_in_mask = in_motion_mask
+
+        # Token layout:
+        # time_embedding | text_embedding   | frames_source    | frames_target
+        # 1 * lat_d      | max_text * lat_d | max_frames*lat_d | max_frames*lat_d
+
+        # 1. time embedding
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timestep.expand(noised_motion.shape[1]).clone()
+        time_emb = self.embed_timestep(timesteps).to(dtype=noised_motion.dtype)
+        if self.condition in ["text", "text_uncond"]:
+            # make it seq first
+            text_embeds = text_embeds.permute(1, 0, 2)
+            if self.text_encoded_dim != self.latent_dim:
+                # [n_tokens, bs, latent_dim] <= [n_tokens, bs, text_encoded_dim]
+                text_emb_latent = self.emb_proj(text_embeds)
+            else:
+                text_emb_latent = text_embeds
+            emb_latent = torch.cat((time_emb, text_emb_latent), 0)
+
+            if motion_embeds is not None:
+                zeroes_mask = (motion_embeds == 0).all(dim=-1)
+                if motion_embeds.shape[-1] != self.latent_dim:
+                    motion_embeds_proj = self.pose_proj_in_source(motion_embeds)
+                    motion_embeds_proj[zeroes_mask] = 0
+                else:
+                    motion_embeds_proj = motion_embeds
+        else:
+            raise TypeError(f"condition type {self.condition} not supported")
+
+        # 4. transformer
+        proj_noised_motion = self.pose_proj_in_target(noised_motion)
+
+        if motion_embeds is not None:
+            if self.use_sep:
+                sep_token_batch = torch.tile(self.sep_token, (bs,)).reshape(bs, -1)
+                xseq = torch.cat((emb_latent, motion_embeds_proj,
+                                  sep_token_batch[None],
+                                  proj_noised_motion), axis=0)
+            else:
+                xseq = torch.cat((emb_latent, motion_embeds_proj,
+                                  proj_noised_motion), axis=0)
+        else:
+            # no source-motion conditioning: only time/text tokens and the
+            # noised target motion (the original referenced motion_embeds_proj
+            # unconditionally, which is undefined on this path; this matches
+            # the padding mask built below)
+            xseq = torch.cat((emb_latent, proj_noised_motion), axis=0)
+        xseq = self.query_pos(xseq)
+
+        # build the padding mask
+        if motion_embeds is None:
+            time_token_mask = torch.ones((bs, time_emb.shape[0]),
+                                         dtype=bool, device=xseq.device)
+            aug_mask = torch.cat((time_token_mask,
+                                  condition_mask[:, :text_emb_latent.shape[0]],
+                                  motion_in_mask), 1)
+        else:
+            time_token_mask = torch.ones((bs, time_emb.shape[0]),
+                                         dtype=bool,
+                                         device=xseq.device)
+            if self.use_sep:
+                sep_token_mask = torch.ones((bs, self.sep_token.shape[0]),
+                                            dtype=bool,
+                                            device=xseq.device)
+                aug_mask = torch.cat((time_token_mask,
+                                      condition_mask[:, :text_emb_latent.shape[0]],
+                                      condition_mask[:, text_emb_latent.shape[0]:],
+                                      sep_token_mask,
+                                      motion_in_mask,
+                                      ), 1)
+            else:
+                aug_mask = torch.cat((time_token_mask,
+                                      condition_mask[:, :text_emb_latent.shape[0]],
+                                      condition_mask[:, text_emb_latent.shape[0]:],
+                                      motion_in_mask,
+                                      ), 1)
+        tokens = self.encoder(xseq, src_key_padding_mask=~aug_mask)
+
+        if motion_embeds is not None:
+            denoised_motion_proj = tokens[emb_latent.shape[0]:]
+            if self.use_sep:
+                useful_tokens = motion_embeds_proj.shape[0] + 1
+            else:
+                useful_tokens = motion_embeds_proj.shape[0]
+            denoised_motion_proj = denoised_motion_proj[useful_tokens:]
+        else:
+            denoised_motion_proj = tokens[emb_latent.shape[0]:]
+
+        denoised_motion = self.pose_proj_out(denoised_motion_proj)
+        if self.pred_delta_motion and motion_embeds is not None:
+            tgt_size = len(denoised_motion)
+            if len(denoised_motion) > len(motion_embeds):
+                pad_for_src = tgt_size - len(motion_embeds)
+                motion_embeds = F.pad(motion_embeds,
+                                      (0, 0, 0, 0, 0, pad_for_src))
+            denoised_motion = denoised_motion + motion_embeds[:tgt_size]
+
+        # zero out the padded area
+        denoised_motion[~motion_in_mask.T] = 0
+        # 5. [bs, nframes, nfeats] <= [nframes, bs, nfeats]
+        denoised_motion = denoised_motion.permute(1, 0, 2)
+        return denoised_motion
+
+    def forward_with_guidance(self,
+                              noised_motion,
+                              timestep,
+                              in_motion_mask,
+                              text_embeds,
+                              condition_mask,
+                              guidance_motion,
+                              guidance_text_n_motion,
+                              motion_embeds=None,
+                              lengths=None,
+                              inpaint_dict=None,
+                              max_steps=None,
+                              prob_way='3way',
+                              **kwargs):
+        # Two cases: text-only conditioning (two replicas) and
+        # text + source-motion conditioning (three replicas).
+        if max_steps is not None:
+            # linearly anneal the guidance scales over the diffusion steps
+            curr_ts = timestep[0].item()
+            guidance_motion = max(1, guidance_motion * 2 * curr_ts / max_steps)
+            guidance_text_n_motion = max(1, guidance_text_n_motion * 2 * curr_ts / max_steps)
+
+        if motion_embeds is None:
+            half = noised_motion[: len(noised_motion) // 2]
+            combined = torch.cat([half, half], dim=0)
+            model_out = self.forward(combined, timestep,
+                                     in_motion_mask=in_motion_mask,
+                                     text_embeds=text_embeds,
+                                     condition_mask=condition_mask,
+                                     motion_embeds=motion_embeds,
+                                     lengths=lengths)
+            uncond_eps, cond_eps_text = torch.split(model_out, len(model_out) // 2,
+                                                    dim=0)
+            # make it BxSxfeatures
+            if inpaint_dict is not None:
+                source_mot = inpaint_dict['start_motion'].permute(1, 0, 2)
+                if source_mot.shape[1] >= uncond_eps.shape[1]:
+                    source_mot = source_mot[:, :uncond_eps.shape[1]]
+                else:
+                    pad = uncond_eps.shape[1] - source_mot.shape[1]
+                    # pad the tensor on the second dimension (time)
+                    source_mot = F.pad(source_mot, (0, 0, 0, pad), 'constant', 0)
+
+                mot_len = source_mot.shape[1]
+                # repeat the mask for all the frames
+                mask_src_parts = inpaint_dict['mask'].unsqueeze(1).repeat(1,
+                                                                          mot_len,
+                                                                          1)
+                uncond_eps = uncond_eps*(~mask_src_parts) + source_mot*mask_src_parts
+                cond_eps_text = cond_eps_text*(~mask_src_parts) + source_mot*mask_src_parts
+            half_eps = uncond_eps + guidance_text_n_motion * (cond_eps_text - uncond_eps)
+            eps = torch.cat([half_eps, half_eps], dim=0)
+        else:
+            third = noised_motion[: len(noised_motion) // 3]
+            combined = torch.cat([third, third, third], dim=0)
+            model_out = self.forward(combined, timestep,
+                                     in_motion_mask=in_motion_mask,
+                                     text_embeds=text_embeds,
+                                     condition_mask=condition_mask,
+                                     motion_embeds=motion_embeds,
+                                     lengths=lengths)
+            uncond_eps, cond_eps_motion, cond_eps_text_n_motion = torch.split(
+                model_out, len(model_out) // 3, dim=0)
+            if inpaint_dict is not None:
+                source_mot = inpaint_dict['start_motion'].permute(1, 0, 2)
+                if source_mot.shape[1] >= uncond_eps.shape[1]:
+                    source_mot = source_mot[:, :uncond_eps.shape[1]]
+                else:
+                    pad = uncond_eps.shape[1] - source_mot.shape[1]
+                    # pad the tensor on the second dimension (time)
+                    source_mot = F.pad(source_mot, (0, 0, 0, pad), 'constant', 0)
+
+                mot_len = source_mot.shape[1]
+                # repeat the mask for all the frames
+                mask_src_parts = inpaint_dict['mask'].unsqueeze(1).repeat(1,
+                                                                          mot_len,
+                                                                          1)
+                uncond_eps = uncond_eps*(~mask_src_parts) + source_mot*mask_src_parts
+                # NOTE: the original assigned to an undefined `cond_eps_text`
+                # here; the motion-conditioned branch is the one to overwrite
+                cond_eps_motion = cond_eps_motion*(~mask_src_parts) + source_mot*mask_src_parts
+                cond_eps_text_n_motion = cond_eps_text_n_motion*(~mask_src_parts) + source_mot*mask_src_parts
+            if prob_way == '3way':
+                third_eps = uncond_eps + guidance_motion * (cond_eps_motion - uncond_eps) + \
+                    guidance_text_n_motion * (cond_eps_text_n_motion - cond_eps_motion)
+            if prob_way == '2way':
+                third_eps = uncond_eps + guidance_text_n_motion * (cond_eps_text_n_motion - uncond_eps)
+
+            eps = torch.cat([third_eps, third_eps, third_eps], dim=0)
+        return eps
+
+    def _diffusion_reverse(self, text_embeds, text_masks_from_enc,
+                           motion_embeds, cond_motion_masks,
+                           inp_motion_mask, diff_process,
+                           init_vec=None,
+                           init_from='noise',
+                           gd_text=None, gd_motion=None,
+                           mode='full_cond',
+                           return_init_noise=False,
+                           steps_num=None,
+                           inpaint_dict=None,
+                           use_linear=False,
+                           prob_way='3way'):
+        # init latents
+        bsz = inp_motion_mask.shape[0]
+        assert mode in ['full_cond', 'text_cond', 'mot_cond']
+        assert inp_motion_mask is not None
+        if init_vec is None:
+            initial_latents = torch.randn(
+                (bsz, inp_motion_mask.shape[1], 207),
+                device=inp_motion_mask.device,
+                dtype=torch.float,
+            )
+        else:
+            initial_latents = init_vec
+
+        if text_embeds is not None:
+            max_text_len = text_embeds.shape[1]
+        else:
+            max_text_len = 0
+        max_motion_len = cond_motion_masks.shape[1]
+        text_masks = text_masks_from_enc.clone()
+        nomotion_mask = torch.zeros(bsz, max_motion_len,
+                                    dtype=torch.bool).to('cuda')
+        motion_masks = torch.cat([nomotion_mask,
+                                  cond_motion_masks,
+                                  cond_motion_masks],
+                                 dim=0)
+        aug_mask = torch.cat([text_masks,
+                              motion_masks],
+                             dim=1)
+
+        # Setup classifier-free guidance: replicate the latents once per branch
+        if motion_embeds is not None:
+            z = torch.cat([initial_latents, initial_latents, initial_latents], 0)
+        else:
+            z = torch.cat([initial_latents, initial_latents], 0)
+
+        if use_linear:
+            max_steps_diff = diff_process.num_timesteps
+        else:
+            max_steps_diff = None
+        if motion_embeds is not None:
+            model_kwargs = dict(in_motion_mask=torch.cat([inp_motion_mask,
+                                                          inp_motion_mask,
+                                                          inp_motion_mask], 0),
+                                text_embeds=text_embeds,
+                                condition_mask=aug_mask,
+                                # uncond / motion-cond / text+motion-cond branches
+                                motion_embeds=torch.cat([torch.zeros_like(motion_embeds),
+                                                         motion_embeds,
+                                                         motion_embeds], 1),
+                                guidance_motion=gd_motion,
+                                guidance_text_n_motion=gd_text,
+                                inpaint_dict=inpaint_dict,
+                                max_steps=max_steps_diff,
+                                prob_way=prob_way)
+        else:
+            model_kwargs = dict(in_motion_mask=torch.cat([inp_motion_mask,
+                                                          inp_motion_mask], 0),
+                                text_embeds=text_embeds,
+                                condition_mask=aug_mask,
+                                motion_embeds=None,
+                                guidance_motion=gd_motion,
+                                guidance_text_n_motion=gd_text,
+                                inpaint_dict=inpaint_dict,
+                                max_steps=max_steps_diff)
+
+        # Sample:
+        samples = diff_process.p_sample_loop(self.forward_with_guidance,
+                                             z.shape, z,
+                                             clip_denoised=False,
+                                             model_kwargs=model_kwargs,
+                                             progress=True,
+                                             device=initial_latents.device,)
+        # keep the fully-conditioned branch, drop the guidance replicas
+        _, _, samples = samples.chunk(3, dim=0)
+
+        final_diffout = samples.permute(1, 0, 2)
+        if return_init_noise:
+            return initial_latents, final_diffout
+        else:
+            return final_diffout
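Putting the pieces together, a rough end-to-end editing sketch. This is glue code under assumptions: the diffusion-process factory (presumably in diffusion/__init__.py), the trained checkpoint, and the exact mask construction are not shown in this commit, so every variable below other than the class names is hypothetical:

    denoiser = TMED_denoiser().to('cuda').eval()          # plus loading trained weights
    text_emb, text_mask = text_encoder(['do it faster'])  # ClipTextEncoder from above
    diffout = denoiser._diffusion_reverse(
        text_emb, cond_token_mask,         # masks over the conditioning tokens
        motion_embeds, source_frame_mask,  # normalised source motion + frame mask
        target_frame_mask, diffusion,      # frames to generate + diffusion process
        gd_text=2.0, gd_motion=4.0)
    edited_motion = diffout2motion(normalizer, diffout)   # back to translation + rotations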