# PMA-VAE / model.py
# Uploaded by krystv via huggingface_hub (commit 92f566d, verified)
"""
PMA-VAE: Parallel Mobile Artistic Variational Autoencoder
=========================================================
Attention-free, mobile-deployable VAE with:
- Parallel 2D Mamba/SSM blocks (no sequential pixel loops)
- Mobile depthwise-separable convolutions
- Multi-scale latents: z_base (H/16), z_detail (H/8), z_style (global vector)
- FiLM style conditioning throughout decoder
- Designed for: image generation, super-resolution, artifact removal, style transfer
Architecture:
Image → PixelUnshuffle stem → MobileConv + Parallel 2D Mamba encoder
→ Multi-scale latent (base + detail + style)
→ Light parallel decoder with FiLM modulation → Reconstructed image
Total params target: ~20-40M (encoder heavier, decoder light for mobile)
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
# ==============================================================================
# Parallel Scan (Blelloch-style) — Pure PyTorch, no CUDA kernels
# Based on: https://github.com/alxndrTL/mamba.py/blob/main/mambapy/pscan.py
# ==============================================================================
class PScan(torch.autograd.Function):
    """
    Parallel prefix scan (Blelloch-style) in pure PyTorch.

    Computes the first-order linear recurrence
        y[t] = A[t] * y[t-1] + X[t],   with y[-1] = 0,
    for all t in O(log L) parallel steps instead of a sequential loop.

    Shapes: A and X are (B, D, L, N); the scan runs along the L axis
    independently for every (b, d, n) element.
    """

    @staticmethod
    def pscan_forward(A, X):
        """Tree scan over clones of A, X; returns the inclusive scan (B, D, L, N)."""
        B, D, L, N = A.size()
        orig_L = L
        # Pad L up to a power of 2 with the scan's identity elements
        # (A=1, X=0) so the binary-tree sweeps are well formed. Padding
        # at the END cannot influence positions < orig_L (causal scan).
        if L & (L - 1) != 0:  # not a power of 2
            next_pow2 = 1 << (L - 1).bit_length()
            pad = next_pow2 - L
            A = F.pad(A, (0, 0, 0, pad), value=1.0)
            X = F.pad(X, (0, 0, 0, pad), value=0.0)
            L = next_pow2
        num_steps = int(math.log2(L))
        Aa = A.clone()
        Xa = X.clone()
        # Up-sweep (reduce): at level k, fold each pair of adjacent
        # 2^k-segments into the RIGHT endpoint of the pair, which lives at
        # indices step-1, 2*step-1, ...
        # BUGFIX: this previously iterated arange(half-1, L, step), which
        # at k=0 produced idx=0,2,4,... and idx_prev=-1,1,3,... — pairing
        # element 0 with element -1 (wraparound) and accumulating into the
        # left element of each pair, so the scan result was wrong.
        for k in range(num_steps):
            step = 1 << (k + 1)
            half = step >> 1
            idx = torch.arange(step - 1, L, step, device=A.device)
            idx_prev = idx - half
            Xa[:, :, idx] = Aa[:, :, idx] * Xa[:, :, idx_prev] + Xa[:, :, idx]
            Aa[:, :, idx] = Aa[:, :, idx] * Aa[:, :, idx_prev]
        # Down-sweep: completed prefixes sit at segment ends (step-1,
        # 2*step-1, ...); compose each into the midpoint of the segment to
        # its right. Aa at a midpoint still holds the transition product of
        # its own sub-segment from the up-sweep, which is exactly the
        # factor needed for the composition.
        for k in range(num_steps - 2, -1, -1):
            step = 1 << (k + 1)
            half = step >> 1
            idx = torch.arange(step - 1, L, step, device=A.device)
            tgt = idx + half
            tgt = tgt[tgt < L]
            src = tgt - half
            Xa[:, :, tgt] = Aa[:, :, tgt] * Xa[:, :, src] + Xa[:, :, tgt]
        return Xa[:, :, :orig_L]

    @staticmethod
    def forward(ctx, A_in, X_in):
        # pscan_forward works on internal clones, so the inputs are not mutated.
        result = PScan.pscan_forward(A_in, X_in)
        ctx.save_for_backward(A_in, X_in, result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """
        Reverse-mode gradients of the recurrence.

        With y[t] = A[t] * y[t-1] + X[t] and g[t] = dL/dy[t] accumulated
        backwards (g[t] = grad_out[t] + A[t+1] * g[t+1]):
            dL/dX[t] = g[t]
            dL/dA[t] = g[t] * y[t-1]   (elementwise; y[-1] = 0)
        Implemented as a simple sequential reverse loop — O(L), correct,
        and cheap relative to the rest of the graph.
        """
        A_in, X_in, result = ctx.saved_tensors
        B, D, L, N = A_in.size()
        grad_A = torch.zeros_like(A_in)
        grad_X = torch.zeros_like(X_in)
        grad_h = torch.zeros(B, D, N, device=A_in.device, dtype=A_in.dtype)
        for t in range(L - 1, -1, -1):
            grad_h = grad_h + grad_output[:, :, t]
            grad_X[:, :, t] = grad_h
            if t > 0:
                # BUGFIX: each (b, d, n) entry of A is an independent
                # parameter of the recurrence, so the gradient is the plain
                # elementwise product. The previous code summed over N and
                # broadcast the result, mixing gradients across state dims.
                grad_A[:, :, t] = grad_h * result[:, :, t - 1]
                grad_h = grad_h * A_in[:, :, t]
            # t == 0: y[-1] = 0, so grad_A[:, :, 0] stays zero.
        return grad_A, grad_X


pscan = PScan.apply
# ==============================================================================
# Selective State Space (S6) Block — The core Mamba mechanism
# ==============================================================================
class SelectiveSSM(nn.Module):
    """
    Selective State Space Model (S6) from the Mamba paper.

    Uses a parallel prefix scan for O(L) computation without sequential
    loops. For 2D images the caller flattens H*W into a sequence, runs the
    SSM, and reshapes back.

    Args:
        d_model: embedding width of the input/output sequence.
        d_state: SSM hidden-state size N per inner channel.
        d_conv: kernel size of the depthwise 1D conv run before the SSM.
        expand: width multiplier for the inner SSM channels (d_inner).
        use_parallel_scan: Blelloch scan (True) or sequential loop (False).
    """
    def __init__(self, d_model, d_state=16, d_conv=4, expand=2, use_parallel_scan=True):
        super().__init__()
        self.d_model = d_model
        self.d_state = d_state
        self.d_conv = d_conv
        self.expand = expand
        self.d_inner = int(expand * d_model)
        self.use_parallel_scan = use_parallel_scan
        # Input projection: x → (x_ssm, z); x_ssm goes through the SSM,
        # z is the SiLU gate applied to the SSM output.
        self.in_proj = nn.Linear(d_model, self.d_inner * 2, bias=False)
        # Depthwise causal-ish 1D conv for local context before the SSM;
        # padding=d_conv-1 over-pads on the right, trimmed in forward().
        self.conv1d = nn.Conv1d(
            self.d_inner, self.d_inner,
            kernel_size=d_conv, bias=True,
            groups=self.d_inner, padding=d_conv - 1
        )
        # Input-dependent SSM parameters: per position emits
        # [dt (1) | B (d_state) | C (d_state)].
        self.x_proj = nn.Linear(self.d_inner, self.d_state * 2 + 1, bias=False)
        # Broadcast the scalar dt to one timescale per inner channel.
        self.dt_proj = nn.Linear(1, self.d_inner, bias=True)
        # A matrix: structured init (1..d_state per row), stored in log
        # space; negated+exponentiated in ssm_parallel so A is always < 0.
        A = torch.arange(1, d_state + 1, dtype=torch.float32).unsqueeze(0).expand(self.d_inner, -1)
        self.A_log = nn.Parameter(torch.log(A))
        # D: learned direct (skip) path weight per inner channel.
        self.D = nn.Parameter(torch.ones(self.d_inner))
        # Output projection back to d_model.
        self.out_proj = nn.Linear(self.d_inner, d_model, bias=False)
        # Pre-norm applied in forward().
        self.norm = nn.RMSNorm(d_model)
    def ssm_parallel(self, x):
        """Run the selective scan. x: (B, L, d_inner) → (B, L, d_inner)."""
        B_size, L, D = x.shape
        # A < 0 guarantees |exp(dt*A)| < 1 (stable recurrence) since dt > 0.
        A = -torch.exp(self.A_log.float())  # (d_inner, d_state)
        D_skip = self.D.float()
        # Input-dependent dt, B, C (the "selective" part of S6).
        x_dbl = self.x_proj(x)  # (B, L, d_state*2 + 1)
        dt, B_mat, C_mat = x_dbl.split([1, self.d_state, self.d_state], dim=-1)
        dt = F.softplus(self.dt_proj(dt))  # (B, L, d_inner), strictly positive
        # Discretize (ZOH-style): dA = exp(dt*A), input term dB*x = dt*B*x.
        dA = torch.exp(dt.unsqueeze(-1) * A.unsqueeze(0).unsqueeze(0))  # (B, L, D, N)
        dBx = dt.unsqueeze(-1) * B_mat.unsqueeze(2) * x.unsqueeze(-1)  # (B, L, D, N)
        # Rearrange for the scan: (B, D, L, N), scanning over L.
        dA = dA.permute(0, 2, 1, 3).contiguous()
        dBx = dBx.permute(0, 2, 1, 3).contiguous()
        if self.use_parallel_scan:
            # Parallel prefix scan: h[t] = dA[t]*h[t-1] + dBx[t].
            h = pscan(dA, dBx)  # (B, D, L, N)
        else:
            # Sequential fallback (same recurrence, O(L) python loop).
            h = torch.zeros_like(dBx)
            state = torch.zeros(B_size, self.d_inner, self.d_state,
                                device=x.device, dtype=x.dtype)
            for t in range(L):
                state = dA[:, :, t] * state + dBx[:, :, t]
                h[:, :, t] = state
        # Readout: y[t] = C[t] · h[t] + D * x[t].
        h = h.permute(0, 2, 1, 3)  # (B, L, D, N)
        C_mat_exp = C_mat.unsqueeze(2)  # (B, L, 1, N)
        y = (h * C_mat_exp).sum(-1)  # (B, L, D)
        y = y + D_skip * x
        return y
    def forward(self, x):
        """x: (B, L, d_model) → (B, L, d_model); pre-norm residual block."""
        residual = x
        x = self.norm(x)
        # Project and split into SSM path and gate.
        xz = self.in_proj(x)  # (B, L, 2*d_inner)
        x_ssm, z = xz.chunk(2, dim=-1)
        # Depthwise 1D conv for local context; trim the right over-padding
        # back to the original sequence length.
        x_ssm = rearrange(x_ssm, 'b l d -> b d l')
        x_ssm = self.conv1d(x_ssm)[:, :, :residual.shape[1]]
        x_ssm = rearrange(x_ssm, 'b d l -> b l d')
        x_ssm = F.silu(x_ssm)
        # Selective scan.
        y = self.ssm_parallel(x_ssm)
        # Gate and project back, with residual.
        y = y * F.silu(z)
        return self.out_proj(y) + residual
# ==============================================================================
# 2D Cross-Scan for Vision — VMamba style
# ==============================================================================
def cross_scan_2d(x):
    """
    Unfold a 2D feature map into 4 directional 1D sequences (VMamba-style).

    x: (B, H, W, C)
    Returns a list of 4 tensors, each (B, H*W, C):
      [0] row-major raster (top-left → bottom-right)
      [1] reversed raster (both spatial axes flipped, then row-major)
      [2] column-major (transpose, then raster)
      [3] reversed column-major
    """
    B, H, W, C = x.shape
    d1 = x.reshape(B, H * W, C)
    d2 = x.flip([1, 2]).reshape(B, H * W, C)
    xt = x.permute(0, 2, 1, 3)  # (B, W, H, C) — column-major view
    d3 = xt.reshape(B, W * H, C)
    d4 = xt.flip([1, 2]).reshape(B, W * H, C)
    return [d1, d2, d3, d4]


def cross_merge_2d(ys, H, W):
    """
    Inverse of cross_scan_2d: fold the 4 directional sequences back to 2D
    and average them.

    ys: list of 4 tensors (B, H*W, C) in the order produced by cross_scan_2d
    Returns: (B, H, W, C)

    BUGFIX: directions 3/4 were previously unflattened with a row-major
    '(h w)' einops pattern although they were flattened column-major
    '(w h)'. That transposed those two branches when H == W and scrambled
    them when H != W. They must be unflattened to (B, W, H, C) first and
    only then transposed back.
    """
    B = ys[0].shape[0]
    C = ys[0].shape[-1]
    d1 = ys[0].reshape(B, H, W, C)
    d2 = ys[1].reshape(B, H, W, C).flip([1, 2])
    # Column-major branches: unflatten as (W, H), undo the flip where
    # needed, then transpose back to (H, W).
    d3 = ys[2].reshape(B, W, H, C).permute(0, 2, 1, 3)
    d4 = ys[3].reshape(B, W, H, C).flip([1, 2]).permute(0, 2, 1, 3)
    return (d1 + d2 + d3 + d4) * 0.25
class Mamba2DBlock(nn.Module):
    """
    Attention-free 2D Mamba block using the cross-scan pattern.

    The feature map is unfolded into four directional 1D sequences, each is
    run through a single shared SelectiveSSM, and the results are folded
    back to 2D and averaged before a norm + pointwise mix projection. The
    whole block is wrapped in a residual connection.
    """

    def __init__(self, channels, d_state=16, expand=2, use_parallel_scan=True):
        super().__init__()
        self.channels = channels
        # One SSM reused for all 4 directions — weight sharing keeps the
        # parameter count down.
        self.ssm = SelectiveSSM(
            d_model=channels,
            d_state=d_state,
            d_conv=4,
            expand=expand,
            use_parallel_scan=use_parallel_scan,
        )
        self.mix_proj = nn.Linear(channels, channels)
        self.norm = nn.RMSNorm(channels)

    def forward(self, x):
        """x: (B, C, H, W) → (B, C, H, W)"""
        _, _, height, width = x.shape
        shortcut = x
        channels_last = x.permute(0, 2, 3, 1)  # (B, H, W, C)
        directional = cross_scan_2d(channels_last)
        scanned = [self.ssm(seq) for seq in directional]
        fused = cross_merge_2d(scanned, height, width)  # (B, H, W, C)
        fused = self.mix_proj(self.norm(fused))
        return fused.permute(0, 3, 1, 2) + shortcut
# ==============================================================================
# Mobile Convolution Blocks
# ==============================================================================
class SqueezeExcitation(nn.Module):
"""Channel attention via squeeze-excitation."""
def __init__(self, channels, reduction=4):
super().__init__()
reduced = max(8, channels // reduction)
self.pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channels, reduced),
nn.SiLU(inplace=True),
nn.Linear(reduced, channels),
nn.Sigmoid()
)
def forward(self, x):
B, C, H, W = x.shape
w = self.pool(x).view(B, C)
w = self.fc(w).view(B, C, 1, 1)
return x * w
class FiLM(nn.Module):
    """Feature-wise Linear Modulation: per-channel affine driven by a conditioning vector."""

    def __init__(self, cond_dim, channels):
        super().__init__()
        # Single projection emits [gamma | beta], each of size `channels`.
        self.proj = nn.Linear(cond_dim, channels * 2)

    def forward(self, x, cond):
        """x: (B, C, H, W); cond: (B, cond_dim) → x * (1 + gamma) + beta."""
        channels = x.shape[1]
        gamma, beta = self.proj(cond).chunk(2, dim=-1)  # each (B, C)
        scale = 1 + gamma.reshape(-1, channels, 1, 1)
        shift = beta.reshape(-1, channels, 1, 1)
        return x * scale + shift
class MobileConvBlock(nn.Module):
    """
    Mobile-friendly inverted-residual block (MobileNetV3-style):
    optional 1x1 expand → 3x3 depthwise → Squeeze-Excitation → 1x1 project,
    with optional FiLM style conditioning on the output and an identity
    residual or learned skip connection.

    Args:
        in_ch / out_ch: input / output channel counts.
        expand_ratio: width multiplier for the hidden depthwise stage.
        stride: spatial stride of the depthwise conv.
        use_se: insert Squeeze-Excitation after the depthwise stage.
        cond_dim: if set, FiLM-modulate the output with a (B, cond_dim) vector.
    """
    def __init__(self, in_ch, out_ch, expand_ratio=4, stride=1,
                 use_se=True, cond_dim=None):
        super().__init__()
        mid_ch = in_ch * expand_ratio
        # Identity residual only when spatial size and channels both match.
        self.use_residual = (stride == 1 and in_ch == out_ch)
        layers = []
        # Pointwise expansion (skipped when expand_ratio == 1).
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_ch, mid_ch, 1, bias=False),
                nn.BatchNorm2d(mid_ch),
                nn.SiLU(inplace=True),
            ])
        # Depthwise 3x3 — carries the stride.
        layers.extend([
            nn.Conv2d(mid_ch, mid_ch, 3, stride=stride, padding=1,
                      groups=mid_ch, bias=False),
            nn.BatchNorm2d(mid_ch),
            nn.SiLU(inplace=True),
        ])
        self.conv = nn.Sequential(*layers)
        # Squeeze-Excitation on the expanded features.
        self.se = SqueezeExcitation(mid_ch) if use_se else nn.Identity()
        # Linear (non-activated) projection back to out_ch.
        self.project = nn.Sequential(
            nn.Conv2d(mid_ch, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch),
        )
        # Optional FiLM conditioning on the projected output.
        self.film = FiLM(cond_dim, out_ch) if cond_dim else None
        # Skip path for the non-identity cases. `self.skip` is assigned on
        # every branch, so forward() can rely on it unconditionally.
        # NOTE(review): the stride==1 projection skip has no BatchNorm while
        # the strided one does — looks unintentional, but adding one would
        # invalidate existing checkpoints, so it is left as-is.
        if not self.use_residual and stride == 1:
            self.skip = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        elif not self.use_residual:
            self.skip = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )
        else:
            self.skip = nn.Identity()

    def forward(self, x, cond=None):
        """x: (B, in_ch, H, W); cond: optional (B, cond_dim) style vector."""
        out = self.conv(x)
        out = self.se(out)
        out = self.project(out)
        if self.film is not None and cond is not None:
            out = self.film(out, cond)
        if self.use_residual:
            return out + x
        # FIX: the previous `out + self.skip(x) if hasattr(self, 'skip') else out`
        # was a dead guard — `skip` is always defined in __init__ — and the
        # conditional expression bound more loosely than `+`, obscuring that.
        # Behavior is unchanged; the intent is now explicit.
        return out + self.skip(x)
class GatedConvBlock(nn.Module):
"""Gated convolution block — alternative to attention for global mixing."""
def __init__(self, channels):
super().__init__()
self.norm = nn.GroupNorm(min(32, channels), channels)
self.proj = nn.Conv2d(channels, channels * 2, 1)
self.dw = nn.Conv2d(channels, channels, 5, padding=2, groups=channels)
self.out = nn.Conv2d(channels, channels, 1)
def forward(self, x):
residual = x
x = self.norm(x)
gate, val = self.proj(x).chunk(2, dim=1)
val = self.dw(val)
x = val * F.silu(gate)
return self.out(x) + residual
# ==============================================================================
# PMA-VAE Encoder
# ==============================================================================
class PMAEncoder(nn.Module):
    """
    Encoder with progressive downsampling:
        H → H/2 → H/4 → H/8 → H/16
    Outputs multi-scale latents as diagonal-Gaussian (mu, logvar) pairs:
        - z_base:   (B, latent_base_dim,   H/16, W/16)
        - z_detail: (B, latent_detail_dim, H/8,  W/8)
        - z_style:  (B, latent_style_dim) global vector

    Inputs should have H, W divisible by 16 (PixelUnshuffle needs even
    sizes and three further stride-2 stages follow) — assumed, not checked.
    """
    def __init__(self, in_channels=3,
                 stage_channels=(64, 128, 192, 256),
                 stage_blocks=(2, 2, 4, 4),
                 latent_base_dim=32,
                 latent_detail_dim=8,
                 latent_style_dim=128,
                 d_state=16,
                 use_parallel_scan=True):
        # NOTE(review): stage_blocks[3] is never read — stages 1/2/3 use
        # indices 0/1/2; confirm whether a fourth stage was planned.
        super().__init__()
        self.latent_base_dim = latent_base_dim
        self.latent_detail_dim = latent_detail_dim
        self.latent_style_dim = latent_style_dim
        # Stem: PixelUnshuffle is a lossless 2x downsample (space→channels),
        # followed by a conv to the first stage width.
        self.stem = nn.Sequential(
            nn.PixelUnshuffle(2),  # (B, C*4, H/2, W/2)
            nn.Conv2d(in_channels * 4, stage_channels[0], 3, padding=1, bias=False),
            nn.BatchNorm2d(stage_channels[0]),
            nn.SiLU(inplace=True),
        )
        # Stage 1: H/2 → H/4, MobileConv only (cheap at high resolution).
        self.stage1 = self._make_mobile_stage(
            stage_channels[0], stage_channels[1], stage_blocks[0], stride=2
        )
        # Stage 2: H/4 → H/8, MobileConv + some Mamba (mamba_ratio=0.5).
        self.stage2 = self._make_hybrid_stage(
            stage_channels[1], stage_channels[2], stage_blocks[1],
            stride=2, d_state=d_state, mamba_ratio=0.5,
            use_parallel_scan=use_parallel_scan
        )
        # Detail latent head, tapped at H/8 resolution (fused mid-decoder).
        self.detail_head_mu = nn.Conv2d(stage_channels[2], latent_detail_dim, 1)
        self.detail_head_logvar = nn.Conv2d(stage_channels[2], latent_detail_dim, 1)
        # Stage 3: H/8 → H/16, Mamba-heavy (global context, low resolution).
        self.stage3 = self._make_hybrid_stage(
            stage_channels[2], stage_channels[3], stage_blocks[2],
            stride=2, d_state=d_state, mamba_ratio=0.75,
            use_parallel_scan=use_parallel_scan
        )
        # One extra global mixing block at H/16.
        self.global_mix = GatedConvBlock(stage_channels[3])
        # Base latent head at H/16 resolution.
        self.base_head_mu = nn.Conv2d(stage_channels[3], latent_base_dim, 1)
        self.base_head_logvar = nn.Conv2d(stage_channels[3], latent_base_dim, 1)
        # Style latent head: global average pool → linear mu/logvar heads.
        self.style_pool = nn.AdaptiveAvgPool2d(1)
        self.style_head_mu = nn.Linear(stage_channels[3], latent_style_dim)
        self.style_head_logvar = nn.Linear(stage_channels[3], latent_style_dim)

    def _make_mobile_stage(self, in_ch, out_ch, num_blocks, stride=1):
        # First block carries the stride/channel change; the rest keep shape.
        blocks = [MobileConvBlock(in_ch, out_ch, stride=stride)]
        for _ in range(num_blocks - 1):
            blocks.append(MobileConvBlock(out_ch, out_ch))
        return nn.Sequential(*blocks)

    def _make_hybrid_stage(self, in_ch, out_ch, num_blocks, stride=1,
                           d_state=16, mamba_ratio=0.5, use_parallel_scan=True):
        # Returns a ModuleList (not Sequential) — forward() iterates manually.
        blocks = nn.ModuleList()
        # First block handles the stride/channel change.
        blocks.append(MobileConvBlock(in_ch, out_ch, stride=stride))
        # Split the remaining blocks between MobileConv and Mamba; the
        # Mamba blocks come last, after the cheaper local convs.
        num_mamba = max(1, int((num_blocks - 1) * mamba_ratio))
        num_mobile = (num_blocks - 1) - num_mamba
        for _ in range(num_mobile):
            blocks.append(MobileConvBlock(out_ch, out_ch))
        for _ in range(num_mamba):
            blocks.append(Mamba2DBlock(out_ch, d_state=d_state, expand=2,
                                       use_parallel_scan=use_parallel_scan))
        return blocks

    def forward(self, x):
        """
        x: (B, 3, H, W)
        Returns: dict with mu/logvar for base, detail, style latents.
        """
        # Stem: H → H/2
        x = self.stem(x)
        # Stage 1: H/2 → H/4
        x = self.stage1(x)
        # Stage 2: H/4 → H/8
        for block in self.stage2:
            x = block(x)
        # Detail latent taps the H/8 feature map before stage 3.
        detail_mu = self.detail_head_mu(x)
        detail_logvar = self.detail_head_logvar(x)
        # Stage 3: H/8 → H/16
        for block in self.stage3:
            x = block(x)
        # Global mixing at the lowest resolution.
        x = self.global_mix(x)
        # Base latent at H/16.
        base_mu = self.base_head_mu(x)
        base_logvar = self.base_head_logvar(x)
        # Style latent: spatially pooled global descriptor.
        style_feat = self.style_pool(x).flatten(1)
        style_mu = self.style_head_mu(style_feat)
        style_logvar = self.style_head_logvar(style_feat)
        return {
            'base_mu': base_mu, 'base_logvar': base_logvar,
            'detail_mu': detail_mu, 'detail_logvar': detail_logvar,
            'style_mu': style_mu, 'style_logvar': style_logvar,
        }
# ==============================================================================
# PMA-VAE Decoder
# ==============================================================================
class UpsampleBlock(nn.Module):
    """2x spatial upsample: 3x3 conv to 4x channels, then PixelShuffle, then BN + SiLU."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch * 4, 3, padding=1, bias=False)
        self.ps = nn.PixelShuffle(2)
        self.norm = nn.BatchNorm2d(out_ch)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        y = self.conv(x)   # (B, 4*out_ch, H, W)
        y = self.ps(y)     # (B, out_ch, 2H, 2W)
        return self.act(self.norm(y))
class PMADecoder(nn.Module):
    """
    Lightweight decoder for mobile deployment.

    Pipeline:
        z_base (H/16) + z_style → stage1 → up → fuse z_detail (H/8)
        → stage2 → up → stage3 → up → stage4 → up → output head.
    The style vector conditions stages via FiLM; the output is
    Tanh-bounded to [-1, 1].
    """
    def __init__(self, out_channels=3,
                 stage_channels=(256, 192, 128, 96, 64),
                 latent_base_dim=32,
                 latent_detail_dim=8,
                 latent_style_dim=128,
                 d_state=16,
                 use_parallel_scan=True):
        super().__init__()
        # Project the base latent into feature space at H/16.
        self.base_proj = nn.Sequential(
            nn.Conv2d(latent_base_dim, stage_channels[0], 3, padding=1, bias=False),
            nn.BatchNorm2d(stage_channels[0]),
            nn.SiLU(inplace=True),
        )
        # Stage 1 (H/16): FiLM-conditioned MobileConv + one Mamba block.
        self.stage1_blocks = nn.ModuleList([
            MobileConvBlock(stage_channels[0], stage_channels[0],
                            cond_dim=latent_style_dim),
            Mamba2DBlock(stage_channels[0], d_state=d_state,
                         use_parallel_scan=use_parallel_scan),
        ])
        # Upsample H/16 → H/8.
        self.up1 = UpsampleBlock(stage_channels[0], stage_channels[1])
        # Concatenate the detail latent at H/8 and fuse with a 1x1 conv.
        self.detail_fuse = nn.Sequential(
            nn.Conv2d(stage_channels[1] + latent_detail_dim, stage_channels[1], 1, bias=False),
            nn.BatchNorm2d(stage_channels[1]),
            nn.SiLU(inplace=True),
        )
        # Stage 2 (H/8): two FiLM MobileConvs + one Mamba block.
        self.stage2_blocks = nn.ModuleList([
            MobileConvBlock(stage_channels[1], stage_channels[1],
                            cond_dim=latent_style_dim),
            MobileConvBlock(stage_channels[1], stage_channels[1],
                            cond_dim=latent_style_dim),
            Mamba2DBlock(stage_channels[1], d_state=d_state,
                         use_parallel_scan=use_parallel_scan),
        ])
        # Upsample H/8 → H/4.
        self.up2 = UpsampleBlock(stage_channels[1], stage_channels[2])
        # Stage 3 (H/4): only the first block has a FiLM head.
        self.stage3_blocks = nn.ModuleList([
            MobileConvBlock(stage_channels[2], stage_channels[2],
                            cond_dim=latent_style_dim),
            MobileConvBlock(stage_channels[2], stage_channels[2]),
        ])
        # Upsample H/4 → H/2.
        self.up3 = UpsampleBlock(stage_channels[2], stage_channels[3])
        # Stage 4 (H/2): plain MobileConvs, no style conditioning.
        self.stage4_blocks = nn.ModuleList([
            MobileConvBlock(stage_channels[3], stage_channels[3]),
            MobileConvBlock(stage_channels[3], stage_channels[3]),
        ])
        # Upsample H/2 → H (PixelShuffle).
        self.up4 = UpsampleBlock(stage_channels[3], stage_channels[4])
        # Output head; Tanh bounds the reconstruction to [-1, 1].
        self.head = nn.Sequential(
            nn.Conv2d(stage_channels[4], stage_channels[4], 3, padding=1),
            nn.SiLU(inplace=True),
            nn.Conv2d(stage_channels[4], out_channels, 3, padding=1),
            nn.Tanh(),  # output [-1, 1]
        )

    def forward(self, z_base, z_detail, z_style):
        """
        z_base:   (B, latent_base_dim, H/16, W/16)
        z_detail: (B, latent_detail_dim, H/8, W/8)
        z_style:  (B, latent_style_dim)
        Returns:  (B, out_channels, H, W) in [-1, 1].
        """
        # Project base latent into feature space.
        x = self.base_proj(z_base)
        # Stage 1 (H/16): the style vector is routed only to MobileConvBlocks
        # (Mamba blocks take no conditioning input).
        for block in self.stage1_blocks:
            if isinstance(block, MobileConvBlock):
                x = block(x, cond=z_style)
            else:
                x = block(x)
        # Upsample to H/8.
        x = self.up1(x)
        # Fuse detail latent.
        x = self.detail_fuse(torch.cat([x, z_detail], dim=1))
        # Stage 2 (H/8).
        for block in self.stage2_blocks:
            if isinstance(block, MobileConvBlock):
                x = block(x, cond=z_style)
            else:
                x = block(x)
        # Upsample to H/4.
        x = self.up2(x)
        # Stage 3 (H/4). The second block was built without cond_dim, so it
        # has no FiLM head and silently ignores the cond argument.
        for block in self.stage3_blocks:
            if isinstance(block, MobileConvBlock):
                x = block(x, cond=z_style)
            else:
                x = block(x)
        # Upsample to H/2.
        x = self.up3(x)
        # Stage 4 (H/2): unconditioned.
        for block in self.stage4_blocks:
            x = block(x)
        # Upsample to H.
        x = self.up4(x)
        # Output head.
        return self.head(x)
# ==============================================================================
# Full PMA-VAE Model
# ==============================================================================
class PMAVAE(nn.Module):
    """
    Parallel Mobile Artistic VAE — full encoder/decoder model.

    Attention-free (Mamba SSM + mobile convolutions), with a multi-scale
    latent space (base + detail + style), FiLM style conditioning in the
    decoder, and a parallel-scan training path.

    Args:
        in_channels: image channels (3 for RGB); also used for the output.
        enc_channels / dec_channels: per-stage widths for encoder / decoder.
        enc_blocks: number of blocks per encoder stage.
        latent_base_dim: channels of the H/16 base latent.
        latent_detail_dim: channels of the H/8 detail latent.
        latent_style_dim: size of the global style vector.
        d_state: SSM state dimension.
        use_parallel_scan: Blelloch scan (True) or sequential fallback (False).
    """

    def __init__(self,
                 in_channels=3,
                 enc_channels=(64, 128, 192, 256),
                 dec_channels=(256, 192, 128, 96, 64),
                 enc_blocks=(2, 2, 4, 4),
                 latent_base_dim=32,
                 latent_detail_dim=8,
                 latent_style_dim=128,
                 d_state=16,
                 use_parallel_scan=True):
        super().__init__()
        # Settings shared verbatim by both halves of the model.
        shared = dict(
            latent_base_dim=latent_base_dim,
            latent_detail_dim=latent_detail_dim,
            latent_style_dim=latent_style_dim,
            d_state=d_state,
            use_parallel_scan=use_parallel_scan,
        )
        self.encoder = PMAEncoder(
            in_channels=in_channels,
            stage_channels=enc_channels,
            stage_blocks=enc_blocks,
            **shared,
        )
        self.decoder = PMADecoder(
            out_channels=in_channels,
            stage_channels=dec_channels,
            **shared,
        )

    def reparameterize(self, mu, logvar):
        """Sample z = mu + eps * std while training; return mu at eval time."""
        if not self.training:
            return mu
        std = (0.5 * logvar).exp()
        return mu + torch.randn_like(std) * std

    def encode(self, x):
        """Encode an image to the multi-scale posterior dict (mu/logvar per latent)."""
        return self.encoder(x)

    def decode(self, z_base, z_detail, z_style):
        """Decode sampled latents back to an image."""
        return self.decoder(z_base, z_detail, z_style)

    def forward(self, x):
        """
        Full pass: encode → sample each latent → decode.
        Returns: (recon, posteriors_dict)
        """
        posteriors = self.encode(x)
        samples = [
            self.reparameterize(posteriors[name + '_mu'], posteriors[name + '_logvar'])
            for name in ('base', 'detail', 'style')
        ]
        return self.decode(*samples), posteriors

    def get_last_decoder_layer(self):
        """Weight of the final conv (before Tanh) — for adaptive discriminator balancing."""
        return self.decoder.head[-2].weight

    @torch.no_grad()
    def encode_to_latent(self, x):
        """Deterministic encoding: return the three posterior means only."""
        posteriors = self.encode(x)
        return (posteriors['base_mu'], posteriors['detail_mu'], posteriors['style_mu'])

    @torch.no_grad()
    def decode_from_latent(self, z_base, z_detail, z_style):
        """Decode from given latents (inference mode)."""
        return self.decode(z_base, z_detail, z_style)

    def count_parameters(self):
        """Parameter counts for encoder, decoder, and total (raw and in millions)."""
        stats = {
            'encoder': sum(p.numel() for p in self.encoder.parameters()),
            'decoder': sum(p.numel() for p in self.decoder.parameters()),
        }
        stats['total'] = stats['encoder'] + stats['decoder']
        for key in ('encoder', 'decoder', 'total'):
            stats[key + '_M'] = stats[key] / 1e6
        return stats
# ==============================================================================
# Model Configs
# ==============================================================================
def pmavae_tiny(**kwargs):
    """Tiny config for smoke testing (~5M params)."""
    defaults = dict(
        enc_channels=(32, 64, 96, 128),
        dec_channels=(128, 96, 64, 48, 32),
        enc_blocks=(1, 1, 2, 2),
        latent_base_dim=16,
        latent_detail_dim=4,
        latent_style_dim=64,
        d_state=8,
    )
    # Duplicate keywords in **kwargs raise TypeError, same as explicit args.
    return PMAVAE(**defaults, **kwargs)
def pmavae_small(**kwargs):
    """Small config sized for Colab free tier (~20M params)."""
    defaults = dict(
        enc_channels=(48, 96, 144, 192),
        dec_channels=(192, 144, 96, 72, 48),
        enc_blocks=(2, 2, 3, 3),
        latent_base_dim=24,
        latent_detail_dim=6,
        latent_style_dim=96,
        d_state=16,
    )
    # Duplicate keywords in **kwargs raise TypeError, same as explicit args.
    return PMAVAE(**defaults, **kwargs)
def pmavae_base(**kwargs):
    """Base config (~40M params)."""
    defaults = dict(
        enc_channels=(64, 128, 192, 256),
        dec_channels=(256, 192, 128, 96, 64),
        enc_blocks=(2, 2, 4, 4),
        latent_base_dim=32,
        latent_detail_dim=8,
        latent_style_dim=128,
        d_state=16,
    )
    # Duplicate keywords in **kwargs raise TypeError, same as explicit args.
    return PMAVAE(**defaults, **kwargs)
if __name__ == '__main__':
    # Smoke test: build the tiny config and run one forward pass on CPU.
    # The sequential scan path is used here because it is cheaper to trace
    # on CPU; the parallel-scan path is exercised identically in training.
    device = 'cpu'
    model = pmavae_tiny(use_parallel_scan=False).to(device)
    x = torch.randn(2, 3, 256, 256, device=device)
    recon, posteriors = model(x)
    print(f"Input: {x.shape}")
    print(f"Recon: {recon.shape}")
    # Print each posterior tensor's shape (mu/logvar for base, detail, style).
    for k, v in posteriors.items():
        print(f" {k}: {v.shape}")
    # Parameter breakdown in millions.
    params = model.count_parameters()
    print(f"\nParams: {params['total_M']:.2f}M (enc: {params['encoder_M']:.2f}M, dec: {params['decoder_M']:.2f}M)")