import numpy as np
import torch
from torch import nn


class TimestepEmbedderMDM(nn.Module):
    """Embeds diffusion timesteps by indexing a sinusoidal table and
    projecting it through a small MLP (as in MDM)."""

    def __init__(self, latent_dim):
        super().__init__()
        self.latent_dim = latent_dim
        time_embed_dim = self.latent_dim
        self.sequence_pos_encoder = PositionalEncoding(d_model=self.latent_dim)

        # TODO add time embedding learnable
        self.time_embed = nn.Sequential(
            nn.Linear(self.latent_dim, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps):
        # pe has shape [max_len, 1, d_model]; indexing with a [batch] tensor of
        # timesteps yields [batch, 1, d_model], permuted to [1, batch, d_model].
        return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)


class PositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding. With `negative=True` the table
    also covers negative positions so history frames can be addressed."""

    def __init__(self, d_model, dropout=0.1, max_len=5000, batch_first=False, negative=False):
        super().__init__()
        self.batch_first = batch_first
        self.dropout = nn.Dropout(p=dropout)
        self.max_len = max_len
        self.negative = negative

        if negative:
            # Positions cover [-max_len, max_len); row index max_len is position 0.
            pe = torch.zeros(2 * max_len, d_model)
            position = torch.arange(-max_len, max_len, dtype=torch.float).unsqueeze(1)
        else:
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # [max_len, 1, d_model]

        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x, hist_frames=0):
        if not self.negative:
            assert hist_frames == 0
            first = 0
        else:
            center = self.max_len
            first = center - hist_frames
        if self.batch_first:
            # x: [batch, seq_len, d_model]
            last = first + x.shape[1]
            x = x + self.pe.permute(1, 0, 2)[:, first:last, :]
        else:
            # x: [seq_len, batch, d_model]
            last = first + x.shape[0]
            x = x + self.pe[first:last, :]
        return self.dropout(x)
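

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Shows the expected tensor shapes; latent_dim=256, batch=4, seq_len=60, and
# the timestep range [0, 1000) below are assumptions, not values from the source.
if __name__ == "__main__":
    latent_dim = 256
    embedder = TimestepEmbedderMDM(latent_dim)
    pos_enc = PositionalEncoding(d_model=latent_dim)

    # Timestep embedding: one embedding vector per batch element.
    timesteps = torch.randint(0, 1000, (4,))  # [batch]
    t_emb = embedder(timesteps)               # [1, batch, latent_dim]

    # Positional encoding: added to a [seq_len, batch, d_model] tensor
    # (batch_first=False by default).
    x = torch.randn(60, 4, latent_dim)
    x = pos_enc(x)                            # same shape, with PE added
    print(t_emb.shape, x.shape)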