prediff_code / scripts /train_diffusion /prediff_lightning_module.py

Upload folder using huggingface_hub

7667a87 verified 8 months ago

51.6 kB

	from omegaconf import OmegaConf
	import os
	from shutil import copyfile
	import warnings
	from typing import Dict,Sequence,Union
	import inspect

	import numpy as np
	import torch
	from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
	import torchmetrics
	from lightning.pytorch import Trainer, loggers as pl_loggers
	from lightning.pytorch.profilers import PyTorchProfiler
	from lightning.pytorch.strategies import DDPStrategy
	from lightning.pytorch.callbacks import (
	Callback, LearningRateMonitor, DeviceStatsMonitor,
	EarlyStopping, ModelCheckpoint
	)
	from lightning.pytorch.utilities import grad_norm
	from einops import rearrange

	from models.vae import AutoencoderKL
	from models.knowledge_alignment import SEVIRAvgIntensityAlignment,get_alignment_kwargs_avg_x
	from models.diffusion import LatentDiffusion
	from models.core_model.cuboid_transformer import CuboidTransformerUNet
	from datamodule import SEVIRLightningDataModule,vis_sevir_seq
	from utils.path import (
	default_exps_dir,
	default_pretrained_vae_dir,default_pretrained_alignment_dir
	)
	from utils.optim import disable_train,warmup_lambda
	from utils.layout import step_layout_to_in_out_slice
	from evaluation import FrechetVideoDistance,SEVIRSkillScore


	class PreDiffSEVIRPLModule(LatentDiffusion):
	def __init__(self,
	             total_num_steps: int,
	             oc_file: str = None,
	             save_dir: str = None):
	    r"""
	    Build the latent diffusion module (latent model, VAE, optional
	    knowledge alignment, metrics) from an OmegaConf config.

	    Parameters
	    ----------
	    total_num_steps: int
	        Total number of optimization steps. Read later by
	        `configure_optimizers` and `set_trainer_kwargs`.
	    oc_file: str
	        Optional path to a YAML config. It is merged on top of the defaults
	        from `get_base_config` and copied to the save dir by `configure_save`.
	    save_dir: str
	        Experiment directory, joined onto `default_exps_dir` in `configure_save`.
	        NOTE(review): `configure_save` feeds this to `os.path.join`, so the
	        default `None` looks unsupported - confirm with callers.
	    """
	    self.total_num_steps = total_num_steps
	    if oc_file is not None:
	        # Use a context manager so the config file handle is closed again.
	        with open(oc_file, "r") as cfg_fp:
	            oc_from_file = OmegaConf.load(cfg_fp)
	    else:
	        oc_from_file = None
	    oc = self.get_base_config(oc_from_file=oc_from_file)
	    # Statement order relative to `super().__init__()` is kept from the original.
	    self.save_hyperparameters(oc)
	    self.oc = oc

	    latent_model_cfg = OmegaConf.to_object(oc.model.latent_model)
	    num_blocks = len(latent_model_cfg["depth"])
	    self_pattern = latent_model_cfg["self_pattern"]
	    if isinstance(self_pattern, str):
	        # A single pattern name is shared by every block.
	        block_attn_patterns = [self_pattern] * num_blocks
	    else:
	        # `latent_model_cfg` is already a plain container (from `to_object`),
	        # so `OmegaConf.to_container` would reject it; copy the list instead.
	        block_attn_patterns = list(self_pattern)
	    latent_model = CuboidTransformerUNet(
	        input_shape=latent_model_cfg["input_shape"],
	        target_shape=latent_model_cfg["target_shape"],
	        base_units=latent_model_cfg["base_units"],
	        scale_alpha=latent_model_cfg["scale_alpha"],
	        num_heads=latent_model_cfg["num_heads"],
	        attn_drop=latent_model_cfg["attn_drop"],
	        proj_drop=latent_model_cfg["proj_drop"],
	        ffn_drop=latent_model_cfg["ffn_drop"],
	        # inter-attn downsample/upsample
	        downsample=latent_model_cfg["downsample"],
	        downsample_type=latent_model_cfg["downsample_type"],
	        upsample_type=latent_model_cfg["upsample_type"],
	        upsample_kernel_size=latent_model_cfg["upsample_kernel_size"],
	        # attention
	        depth=latent_model_cfg["depth"],
	        block_attn_patterns=block_attn_patterns,
	        # global vectors
	        num_global_vectors=latent_model_cfg["num_global_vectors"],
	        use_global_vector_ffn=latent_model_cfg["use_global_vector_ffn"],
	        use_global_self_attn=latent_model_cfg["use_global_self_attn"],
	        separate_global_qkv=latent_model_cfg["separate_global_qkv"],
	        global_dim_ratio=latent_model_cfg["global_dim_ratio"],
	        # misc
	        ffn_activation=latent_model_cfg["ffn_activation"],
	        gated_ffn=latent_model_cfg["gated_ffn"],
	        norm_layer=latent_model_cfg["norm_layer"],
	        padding_type=latent_model_cfg["padding_type"],
	        checkpoint_level=latent_model_cfg["checkpoint_level"],
	        pos_embed_type=latent_model_cfg["pos_embed_type"],
	        use_relative_pos=latent_model_cfg["use_relative_pos"],
	        self_attn_use_final_proj=latent_model_cfg["self_attn_use_final_proj"],
	        # initialization
	        attn_linear_init_mode=latent_model_cfg["attn_linear_init_mode"],
	        ffn_linear_init_mode=latent_model_cfg["ffn_linear_init_mode"],
	        ffn2_linear_init_mode=latent_model_cfg["ffn2_linear_init_mode"],
	        attn_proj_linear_init_mode=latent_model_cfg["attn_proj_linear_init_mode"],
	        conv_init_mode=latent_model_cfg["conv_init_mode"],
	        # a single config entry drives both the down- and up-sampling init mode
	        down_linear_init_mode=latent_model_cfg["down_up_linear_init_mode"],
	        up_linear_init_mode=latent_model_cfg["down_up_linear_init_mode"],
	        global_proj_linear_init_mode=latent_model_cfg["global_proj_linear_init_mode"],
	        norm_init_mode=latent_model_cfg["norm_init_mode"],
	        # timestep embedding for diffusion
	        time_embed_channels_mult=latent_model_cfg["time_embed_channels_mult"],
	        time_embed_use_scale_shift_norm=latent_model_cfg["time_embed_use_scale_shift_norm"],
	        time_embed_dropout=latent_model_cfg["time_embed_dropout"],
	        unet_res_connect=latent_model_cfg["unet_res_connect"]
	    )

	    vae_cfg = OmegaConf.to_object(oc.model.vae)
	    first_stage_model = AutoencoderKL(
	        down_block_types=vae_cfg["down_block_types"],
	        in_channels=vae_cfg["in_channels"],
	        block_out_channels=vae_cfg["block_out_channels"],
	        act_fn=vae_cfg["act_fn"],
	        latent_channels=vae_cfg["latent_channels"],
	        up_block_types=vae_cfg["up_block_types"],
	        norm_num_groups=vae_cfg["norm_num_groups"],
	        layers_per_block=vae_cfg["layers_per_block"],
	        out_channels=vae_cfg["out_channels"], )
	    # `get_model_config` defines no default for this key, so use `.get` to
	    # reach the warning branch instead of raising `KeyError`.
	    pretrained_ckpt_path = vae_cfg.get("pretrained_ckpt_path")
	    if pretrained_ckpt_path is not None:
	        # NOTE: `torch.load` unpickles the file; only point it at trusted checkpoints.
	        state_dict = torch.load(os.path.join(default_pretrained_vae_dir, pretrained_ckpt_path),
	                                map_location=torch.device("cpu"))
	        first_stage_model.load_state_dict(state_dict=state_dict)
	    else:
	        warnings.warn("Pretrained weights for `AutoencoderKL` not set. Run for sanity check only.")

	    diffusion_cfg = OmegaConf.to_object(oc.model.diffusion)
	    super(PreDiffSEVIRPLModule, self).__init__(
	        torch_nn_module=latent_model,
	        layout=oc.layout.layout,
	        data_shape=diffusion_cfg["data_shape"],
	        timesteps=diffusion_cfg["timesteps"],
	        beta_schedule=diffusion_cfg["beta_schedule"],
	        loss_type=self.oc.optim.loss_type,
	        monitor=self.oc.optim.monitor,
	        use_ema=diffusion_cfg["use_ema"],
	        log_every_t=diffusion_cfg["log_every_t"],
	        clip_denoised=diffusion_cfg["clip_denoised"],
	        linear_start=diffusion_cfg["linear_start"],
	        linear_end=diffusion_cfg["linear_end"],
	        cosine_s=diffusion_cfg["cosine_s"],
	        given_betas=diffusion_cfg["given_betas"],
	        original_elbo_weight=diffusion_cfg["original_elbo_weight"],
	        v_posterior=diffusion_cfg["v_posterior"],
	        l_simple_weight=diffusion_cfg["l_simple_weight"],
	        parameterization=diffusion_cfg["parameterization"],
	        learn_logvar=diffusion_cfg["learn_logvar"],
	        logvar_init=diffusion_cfg["logvar_init"],
	        # latent diffusion
	        latent_shape=diffusion_cfg["latent_shape"],
	        first_stage_model=first_stage_model,
	        cond_stage_model=diffusion_cfg["cond_stage_model"],
	        num_timesteps_cond=diffusion_cfg["num_timesteps_cond"],
	        cond_stage_trainable=diffusion_cfg["cond_stage_trainable"],
	        cond_stage_forward=diffusion_cfg["cond_stage_forward"],
	        scale_by_std=diffusion_cfg["scale_by_std"],
	        scale_factor=diffusion_cfg["scale_factor"], )
	    # knowledge alignment
	    knowledge_alignment_cfg = OmegaConf.to_object(oc.model.align)
	    self.alignment_type = knowledge_alignment_cfg["alignment_type"]
	    # Alignment is active only when an alignment type is configured.
	    self.use_alignment = self.alignment_type is not None
	    if self.use_alignment:
	        alignment_ckpt_path = os.path.join(default_pretrained_alignment_dir,
	                                           knowledge_alignment_cfg["model_ckpt_path"])
	        self.alignment_obj = SEVIRAvgIntensityAlignment(
	            alignment_type=knowledge_alignment_cfg["alignment_type"],
	            guide_scale=knowledge_alignment_cfg["guide_scale"],
	            model_type=knowledge_alignment_cfg["model_type"],
	            model_args=knowledge_alignment_cfg["model_args"],
	            model_ckpt_path=alignment_ckpt_path
	        )
	        disable_train(self.alignment_obj.model)
	        self.alignment_model = self.alignment_obj.model
	        alignment_fn = self.alignment_obj.get_mean_shift
	    else:
	        alignment_fn = None
	    self.set_alignment(alignment_fn=alignment_fn)
	    # lr_scheduler
	    self.total_num_steps = total_num_steps
	    # logging
	    self.save_dir = save_dir
	    self.logging_prefix = oc.logging.logging_prefix
	    # visualization
	    self.train_example_data_idx_list = list(oc.eval.train_example_data_idx_list)
	    self.val_example_data_idx_list = list(oc.eval.val_example_data_idx_list)
	    self.test_example_data_idx_list = list(oc.eval.test_example_data_idx_list)
	    self.eval_example_only = oc.eval.eval_example_only

	    # Metrics for samples drawn without knowledge alignment.
	    if self.oc.eval.eval_unaligned:
	        self.valid_mse = torchmetrics.MeanSquaredError()
	        self.valid_mae = torchmetrics.MeanAbsoluteError()
	        self.valid_score = SEVIRSkillScore(
	            mode=self.oc.dataset.metrics_mode,
	            seq_len=self.oc.layout.out_len,
	            layout=self.layout,
	            threshold_list=self.oc.dataset.threshold_list,
	            metrics_list=self.oc.dataset.metrics_list,
	            eps=1e-4
	        )
	        self.test_mse = torchmetrics.MeanSquaredError()
	        self.test_mae = torchmetrics.MeanAbsoluteError()
	        self.test_ssim = torchmetrics.image.StructuralSimilarityIndexMeasure()
	        self.test_score = SEVIRSkillScore(
	            mode=self.oc.dataset.metrics_mode,
	            seq_len=self.oc.layout.out_len,
	            layout=self.layout,
	            threshold_list=self.oc.dataset.threshold_list,
	            metrics_list=self.oc.dataset.metrics_list,
	            eps=1e-4
	        )
	        self.test_fvd = FrechetVideoDistance(
	            feature=self.oc.eval.fvd_features,
	            layout=self.layout,
	            reset_real_features=False,
	            normalize=False,
	            auto_t=True, )
	    # Metrics for samples drawn with knowledge alignment.
	    if self.oc.eval.eval_aligned:
	        self.valid_aligned_mse = torchmetrics.MeanSquaredError()
	        self.valid_aligned_mae = torchmetrics.MeanAbsoluteError()
	        self.valid_aligned_score = SEVIRSkillScore(
	            mode=self.oc.dataset.metrics_mode,
	            seq_len=self.oc.layout.out_len,
	            layout=self.layout,
	            threshold_list=self.oc.dataset.threshold_list,
	            metrics_list=self.oc.dataset.metrics_list,
	            eps=1e-4, )
	        self.test_aligned_mse = torchmetrics.MeanSquaredError()
	        self.test_aligned_mae = torchmetrics.MeanAbsoluteError()
	        self.test_aligned_ssim = torchmetrics.image.StructuralSimilarityIndexMeasure()
	        self.test_aligned_score = SEVIRSkillScore(
	            mode=self.oc.dataset.metrics_mode,
	            seq_len=self.oc.layout.out_len,
	            layout=self.layout,
	            threshold_list=self.oc.dataset.threshold_list,
	            metrics_list=self.oc.dataset.metrics_list,
	            eps=1e-4, )
	        self.test_aligned_fvd = FrechetVideoDistance(
	            feature=self.oc.eval.fvd_features,
	            layout=self.layout,
	            reset_real_features=False,
	            normalize=False,
	            auto_t=True, )

	    self.configure_save(cfg_file_path=oc_file)

	def configure_save(self, cfg_file_path=None):
	    r"""
	    Resolve and create the experiment directory tree.

	    `self.save_dir` is re-rooted under `default_exps_dir`; the config file
	    (if given) is copied there as `cfg.yaml` unless that very file is
	    already in place; `examples/` and `npy/` sub-directories are created.

	    Parameters
	    ----------
	    cfg_file_path: str
	        Optional path of the config file to copy into the save dir.
	    """
	    exp_root = os.path.join(default_exps_dir, self.save_dir)
	    self.save_dir = exp_root
	    os.makedirs(exp_root, exist_ok=True)

	    if cfg_file_path is not None:
	        cfg_copy_path = os.path.join(exp_root, "cfg.yaml")
	        # Skip the copy only when the target exists and is the same file as the source.
	        already_in_place = os.path.exists(cfg_copy_path) and \
	            os.path.samefile(cfg_file_path, cfg_copy_path)
	        if not already_in_place:
	            copyfile(cfg_file_path, cfg_copy_path)

	    for attr_name, sub_dir in (("example_save_dir", "examples"),
	                               ("npy_save_dir", "npy")):
	        sub_path = os.path.join(exp_root, sub_dir)
	        setattr(self, attr_name, sub_path)
	        os.makedirs(sub_path, exist_ok=True)

	# region Get Default Config
	def get_base_config(self, oc_from_file=None):
	    r"""
	    Assemble the default config from the per-section getters and, when
	    given, merge `oc_from_file` on top of it.

	    Parameters
	    ----------
	    oc_from_file
	        Optional config (as loaded by OmegaConf) that is merged onto the defaults.

	    Returns
	    -------
	    oc
	        The defaults, or the result of `OmegaConf.merge(defaults, oc_from_file)`.
	    """
	    section_getters = (
	        ("layout", self.get_layout_config),
	        ("optim", self.get_optim_config),
	        ("logging", self.get_logging_config),
	        ("trainer", self.get_trainer_config),
	        ("eval", self.get_eval_config),
	        ("model", self.get_model_config),
	        ("dataset", self.get_dataset_config),
	    )
	    oc = OmegaConf.create()
	    for section_name, build_section in section_getters:
	        oc[section_name] = build_section()
	    if oc_from_file is None:
	        return oc
	    return OmegaConf.merge(oc, oc_from_file)

	@staticmethod
	def get_layout_config():
	    r"""
	    Default layout section: input/output sequence lengths and steps, frame
	    size, channel count and the axis layout string.
	    """
	    return OmegaConf.create({
	        # temporal layout
	        "in_len": 7,
	        "out_len": 6,
	        "in_step": 1,
	        "out_step": 1,
	        "in_out_diff": 1,
	        # frame layout
	        "img_height": 128,
	        "img_width": 128,
	        "data_channels": 4,
	        "layout": "NTHWC",
	    })

	@staticmethod
	def get_model_config():
	    r"""
	    Default model section with four sub-sections: `diffusion`, `align`
	    (knowledge alignment), `latent_model` and `vae`.

	    NOTE(review): `__init__` also reads `vae.pretrained_ckpt_path`, which has
	    no default here - presumably it is supplied by the config file.
	    """
	    layout_cfg = PreDiffSEVIRPLModule.get_layout_config()
	    data_channels = layout_cfg.data_channels

	    diffusion = {
	        "data_shape": (layout_cfg.out_len,
	                       layout_cfg.img_height,
	                       layout_cfg.img_width,
	                       data_channels),
	        "timesteps": 1000,
	        "beta_schedule": "linear",
	        "use_ema": True,
	        # log every `log_every_t` timesteps. Must be smaller than `timesteps`.
	        "log_every_t": 100,
	        "clip_denoised": False,
	        "linear_start": 1e-4,
	        "linear_end": 2e-2,
	        "cosine_s": 8e-3,
	        "given_betas": None,
	        "original_elbo_weight": 0.,
	        "v_posterior": 0.,
	        "l_simple_weight": 1.,
	        "parameterization": "eps",
	        "learn_logvar": None,
	        "logvar_init": 0.,
	        # latent diffusion
	        "latent_shape": [10, 16, 16, 4],
	        "cond_stage_model": "__is_first_stage__",
	        "num_timesteps_cond": None,
	        "cond_stage_trainable": False,
	        "cond_stage_forward": None,
	        "scale_by_std": False,
	        "scale_factor": 1.0,
	        "latent_cond_shape": [10, 16, 16, 4],
	    }

	    # Earthformer-style arguments of the alignment network
	    align_model_args = {
	        "input_shape": [6, 16, 16, 4],
	        "out_channels": 2,
	        "base_units": 16,
	        "block_units": None,
	        "scale_alpha": 1.0,
	        "depth": [1, 1],
	        "downsample": 2,
	        "downsample_type": "patch_merge",
	        "block_attn_patterns": "axial",
	        "num_heads": 4,
	        "attn_drop": 0.0,
	        "proj_drop": 0.0,
	        "ffn_drop": 0.0,
	        "ffn_activation": "gelu",
	        "gated_ffn": False,
	        "norm_layer": "layer_norm",
	        "use_inter_ffn": True,
	        "hierarchical_pos_embed": False,
	        "pos_embed_type": 't+h+w',
	        "padding_type": "zero",
	        "checkpoint_level": 0,
	        "use_relative_pos": True,
	        "self_attn_use_final_proj": True,
	        # global vectors
	        "num_global_vectors": 0,
	        "use_global_vector_ffn": True,
	        "use_global_self_attn": False,
	        "separate_global_qkv": False,
	        "global_dim_ratio": 1,
	        # initialization
	        "attn_linear_init_mode": "0",
	        "ffn_linear_init_mode": "0",
	        "ffn2_linear_init_mode": "2",
	        "attn_proj_linear_init_mode": "2",
	        "conv_init_mode": "0",
	        "down_linear_init_mode": "0",
	        "global_proj_linear_init_mode": "2",
	        "norm_init_mode": "0",
	        # timestep embedding for diffusion
	        "time_embed_channels_mult": 4,
	        "time_embed_use_scale_shift_norm": False,
	        "time_embed_dropout": 0.0,
	        # readout
	        "pool": "attention",
	        "readout_seq": True,
	        "out_len": 6,
	    }

	    # knowledge alignment
	    align = {
	        "alignment_type": None,
	        "guide_scale": 1.0,
	        "model_type": "cuboid",
	        "model_ckpt_path": "tmp.pt",
	        "model_args": align_model_args,
	    }

	    latent_model = {
	        "input_shape": [10, 16, 16, 4],
	        "target_shape": [10, 16, 16, 4],
	        "base_units": 4,
	        # block_units = null
	        "scale_alpha": 1.0,
	        "num_heads": 4,
	        "attn_drop": 0.1,
	        "proj_drop": 0.1,
	        "ffn_drop": 0.1,
	        # inter-attn downsample/upsample
	        "downsample": 2,
	        "downsample_type": "patch_merge",
	        "upsample_type": "upsample",
	        "upsample_kernel_size": 3,
	        # cuboid attention
	        "depth": [1, 1],
	        "self_pattern": "axial",
	        # global vectors
	        "num_global_vectors": 0,
	        "use_dec_self_global": False,
	        "dec_self_update_global": True,
	        "use_dec_cross_global": False,
	        "use_global_vector_ffn": False,
	        "use_global_self_attn": True,
	        "separate_global_qkv": True,
	        "global_dim_ratio": 1,
	        # misc
	        "ffn_activation": "gelu",
	        "gated_ffn": False,
	        "norm_layer": "layer_norm",
	        "padding_type": "zeros",
	        "pos_embed_type": "t+h+w",
	        "checkpoint_level": 0,
	        "use_relative_pos": True,
	        "self_attn_use_final_proj": True,
	        # initialization
	        "attn_linear_init_mode": "0",
	        "ffn_linear_init_mode": "0",
	        "ffn2_linear_init_mode": "2",
	        "attn_proj_linear_init_mode": "2",
	        "conv_init_mode": "0",
	        "down_up_linear_init_mode": "0",
	        "global_proj_linear_init_mode": "2",
	        "norm_init_mode": "0",
	        # timestep embedding for diffusion
	        "time_embed_channels_mult": 4,
	        "time_embed_use_scale_shift_norm": False,
	        "time_embed_dropout": 0.0,
	        "unet_res_connect": True,
	    }

	    # block layout taken from stable-diffusion-v1-5
	    vae = {
	        "data_channels": data_channels,
	        "down_block_types": ['DownEncoderBlock2D', 'DownEncoderBlock2D',
	                             'DownEncoderBlock2D', 'DownEncoderBlock2D'],
	        "in_channels": data_channels,
	        "block_out_channels": [128, 256, 512, 512],
	        "act_fn": 'silu',
	        "latent_channels": 4,
	        "up_block_types": ['UpDecoderBlock2D', 'UpDecoderBlock2D',
	                           'UpDecoderBlock2D', 'UpDecoderBlock2D'],
	        "norm_num_groups": 32,
	        "layers_per_block": 2,
	        "out_channels": data_channels,
	    }

	    return OmegaConf.create({
	        "diffusion": diffusion,
	        "align": align,
	        "latent_model": latent_model,
	        "vae": vae,
	    })

	@staticmethod
	def get_dataset_config():
	    r"""
	    Default dataset section: dataset name, frame size, sequence lengths,
	    sampling options, split dates, metric settings and augmentation mode.

	    NOTE(review): `get_sevir_datamodule` also reads `val_ratio`, which has no
	    default here - presumably it is supplied by the config file.
	    """
	    out_len = 6
	    return OmegaConf.create({
	        "dataset_name": "sevir_lr",
	        "img_height": 128,
	        "img_width": 128,
	        "in_len": 7,
	        "out_len": out_len,
	        "seq_len": 13,
	        "plot_stride": 1,
	        "interval_real_time": 10,
	        "sample_mode": "sequent",
	        # the sampling stride defaults to the output length
	        "stride": out_len,
	        "layout": "NTHWC",
	        "start_date": None,
	        "train_val_split_date": (2019, 1, 1),
	        "train_test_split_date": (2019, 6, 1),
	        "end_date": None,
	        "metrics_mode": "0",
	        "metrics_list": ('csi', 'pod', 'sucr', 'bias'),
	        "threshold_list": (16, 74, 133, 160, 181, 219),
	        "aug_mode": "1",
	    })

	@staticmethod
	def get_optim_config():
	    r"""
	    Default optimization section: batch sizes, optimizer hyper-parameters,
	    LR schedule, and checkpoint / early-stopping settings.
	    """
	    return OmegaConf.create({
	        "seed": None,
	        "total_batch_size": 32,
	        "micro_batch_size": 8,
	        "float32_matmul_precision": "high",
	        # optimizer
	        "method": "adamw",
	        "lr": 1.0E-6,
	        "wd": 1.0E-2,
	        "betas": (0.9, 0.999),
	        "gradient_clip_val": 1.0,
	        "max_epochs": 50,
	        "loss_type": "l2",
	        # scheduler; `configure_optimizers` handles 'cosine' and 'none'
	        "warmup_percentage": 0.2,
	        "lr_scheduler_mode": "cosine",
	        "min_lr_ratio": 1.0E-3,
	        "warmup_min_lr_ratio": 0.0,
	        # checkpoint monitor and early stopping
	        "monitor": "valid_loss_epoch",
	        "early_stop": False,
	        "early_stop_mode": "min",
	        "early_stop_patience": 5,
	        "save_top_k": 1,
	    })

	@staticmethod
	def get_logging_config():
	    r"""
	    Default logging section: name prefix, monitor callbacks, wandb and
	    profiler switches, and whether test predictions are saved as `.npy`.
	    """
	    return OmegaConf.create({
	        "logging_prefix": "PreDiff",
	        "monitor_lr": True,
	        "monitor_device": False,
	        "track_grad_norm": -1,
	        "use_wandb": False,
	        # `set_trainer_kwargs` accepts None or "pytorch"
	        "profiler": None,
	        "save_npy": False,
	    })

	@staticmethod
	def get_trainer_config():
	    r"""
	    Default trainer section: validation frequency, logging interval ratio,
	    numeric precision, DDP unused-parameter search and sanity-check steps.
	    """
	    return OmegaConf.create({
	        "check_val_every_n_epoch": 1,
	        # fraction of `total_num_steps` between two logging calls (0.001 = 0.1%)
	        "log_step_ratio": 0.001,
	        "precision": 32,
	        "find_unused_parameters": True,
	        "num_sanity_val_steps": 2,
	    })

	@staticmethod
	def get_eval_config():
	    r"""
	    Default evaluation section: example indices to visualize per split,
	    which sampling variants to evaluate, and plotting / FVD options.
	    """
	    return OmegaConf.create({
	        # data indices whose predictions are visualized
	        "train_example_data_idx_list": [0, ],
	        "val_example_data_idx_list": [0, ],
	        "test_example_data_idx_list": [0, ],
	        "eval_example_only": False,
	        # sampling variants to evaluate
	        "eval_aligned": True,
	        "eval_unaligned": True,
	        "num_samples_per_context": 1,
	        # plotting
	        "font_size": 20,
	        "label_offset": (-0.5, 0.5),
	        "label_avg_int": False,
	        "fvd_features": 400,
	    })
	# endregion

	# region Trainer and Optimizer Config
	def configure_optimizers(self):
	    r"""
	    Build the optimizer and, unless `lr_scheduler_mode == 'none'`, a
	    warmup + cosine-annealing LR schedule that is stepped every training step.

	    Returns
	    -------
	    ret: dict
	        `{'optimizer': ...}` or `{'optimizer': ..., 'lr_scheduler': {...}}`.

	    Raises
	    ------
	    NotImplementedError
	        If `optim.method` is not "adamw" or `optim.lr_scheduler_mode` is
	        neither 'none' nor 'cosine'.
	    """
	    optim_cfg = self.oc.optim
	    params = list(self.torch_nn_module.parameters())
	    if self.cond_stage_trainable:
	        print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
	        params = params + list(self.cond_stage_model.parameters())
	    if self.learn_logvar:
	        print('Diffusion model optimizing logvar')
	        params.append(self.logvar)

	    if optim_cfg.method != "adamw":
	        raise NotImplementedError(f"opimization method {optim_cfg.method} not supported.")
	    # Pass the configured weight decay explicitly; it used to be ignored, so
	    # AdamW always ran with its built-in default regardless of `optim.wd`.
	    optimizer = torch.optim.AdamW(params,
	                                  lr=optim_cfg.lr,
	                                  betas=tuple(optim_cfg.betas),
	                                  weight_decay=optim_cfg.wd)

	    scheduler_mode = optim_cfg.lr_scheduler_mode
	    if scheduler_mode == 'none':
	        return {'optimizer': optimizer}
	    if scheduler_mode != 'cosine':
	        raise NotImplementedError(f"lr_scheduler_mode {scheduler_mode} not supported.")

	    warmup_iter = int(np.round(optim_cfg.warmup_percentage * self.total_num_steps))
	    warmup_scheduler = LambdaLR(optimizer,
	                                lr_lambda=warmup_lambda(warmup_steps=warmup_iter,
	                                                        min_lr_ratio=optim_cfg.warmup_min_lr_ratio))
	    # Keep `T_max` positive even if warmup covers every step.
	    cosine_scheduler = CosineAnnealingLR(optimizer,
	                                         T_max=max(1, self.total_num_steps - warmup_iter),
	                                         eta_min=optim_cfg.min_lr_ratio * optim_cfg.lr)
	    # Switch from warmup to cosine annealing after `warmup_iter` steps.
	    lr_scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, cosine_scheduler],
	                                milestones=[warmup_iter])
	    lr_scheduler_config = {
	        'scheduler': lr_scheduler,
	        'interval': 'step',
	        'frequency': 1,
	    }
	    return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_config}

	def set_trainer_kwargs(self, **kwargs):
	    r"""
	    Default kwargs used when initializing pl.Trainer

	    Parameters
	    ----------
	    kwargs
	        Extra Trainer kwargs. `callbacks` (list of `Callback`) and `logger`
	        (list of loggers) are extended with the defaults built here; every
	        other entry overrides the default of the same name.

	    Returns
	    -------
	    ret: dict
	        Keyword arguments for `Trainer(...)`. Precedence, lowest to highest:
	        built-in defaults, matching keys of `oc.trainer`, then `kwargs`.
	    """
	    if self.oc.logging.profiler is None:
	        profiler = None
	    elif self.oc.logging.profiler == "pytorch":
	        profiler = PyTorchProfiler(filename=f"{self.oc.logging.logging_prefix}_PyTorchProfiler.log")
	    else:
	        raise NotImplementedError
	    checkpoint_callback = ModelCheckpoint(
	        monitor=self.oc.optim.monitor,
	        dirpath=os.path.join(self.save_dir, "checkpoints"),
	        filename="{epoch:03d}_{val/loss:.4f}",
	        auto_insert_metric_name=False,
	        save_top_k=self.oc.optim.save_top_k,
	        save_last=True,
	        mode="min",
	    )
	    callbacks = kwargs.pop("callbacks", [])
	    assert isinstance(callbacks, list)
	    for ele in callbacks:
	        assert isinstance(ele, Callback)
	    # Work on a copy so the caller's list is not extended in place.
	    callbacks = list(callbacks)
	    callbacks.append(checkpoint_callback)
	    if self.oc.logging.monitor_lr:
	        callbacks.append(LearningRateMonitor(logging_interval='step'))
	    if self.oc.logging.monitor_device:
	        callbacks.append(DeviceStatsMonitor())
	    if self.oc.optim.early_stop:
	        callbacks.append(EarlyStopping(monitor=self.oc.optim.monitor,
	                                       min_delta=0.0,
	                                       patience=self.oc.optim.early_stop_patience,
	                                       verbose=False,
	                                       mode=self.oc.optim.early_stop_mode))

	    # Copy for the same reason as `callbacks` above.
	    logger = list(kwargs.pop("logger", []))
	    tb_logger = pl_loggers.TensorBoardLogger(save_dir=self.save_dir)
	    csv_logger = pl_loggers.CSVLogger(save_dir=self.save_dir)
	    logger += [tb_logger, csv_logger]
	    if self.oc.logging.use_wandb:
	        # `logging_name` / `run_id` have no defaults in `get_logging_config`;
	        # fall back to None when the config does not provide them.
	        wandb_logger = pl_loggers.WandbLogger(
	            name=self.oc.logging.get("logging_name"),
	            id=self.oc.logging.get("run_id"),
	            project=self.oc.logging.logging_prefix,
	            save_dir=self.save_dir
	        )
	        logger.append(wandb_logger)

	    log_every_n_steps = max(1, int(self.oc.trainer.log_step_ratio * self.total_num_steps))
	    trainer_init_keys = inspect.signature(Trainer).parameters.keys()
	    ret = dict(
	        callbacks=callbacks,
	        # log
	        logger=logger,
	        log_every_n_steps=log_every_n_steps,
	        profiler=profiler,
	        # save
	        default_root_dir=self.save_dir,
	        # ddp
	        accelerator="gpu",
	        strategy=DDPStrategy(find_unused_parameters=self.oc.trainer.find_unused_parameters),
	        # optimization
	        max_epochs=self.oc.optim.max_epochs,
	        check_val_every_n_epoch=self.oc.trainer.check_val_every_n_epoch,
	        gradient_clip_val=self.oc.optim.gradient_clip_val,
	        # NVIDIA amp
	        precision=self.oc.trainer.precision,
	        # misc
	        num_sanity_val_steps=self.oc.trainer.num_sanity_val_steps,
	        inference_mode=False,
	    )
	    # Forward only those `oc.trainer` entries that `Trainer.__init__` accepts.
	    oc_trainer_kwargs = OmegaConf.to_object(self.oc.trainer)
	    oc_trainer_kwargs = {key: val for key, val in oc_trainer_kwargs.items() if key in trainer_init_keys}
	    ret.update(oc_trainer_kwargs)
	    ret.update(kwargs)
	    return ret
	# endregion

	# region Properties Extraction and Misc Calc
	@classmethod
	def get_total_num_steps(
	cls,
	num_samples: int,
	total_batch_size: int,
	epoch: int = None):
	r"""
	Parameters
	----------
	num_samples: int
	The number of samples of the datasets. `num_samples / micro_batch_size` is the number of steps per epoch.
	total_batch_size: int
	`total_batch_size == micro_batch_size * world_size * grad_accum`
	epoch: int
	"""
	if epoch is None:
	epoch = cls.get_optim_config().max_epochs
	return int(epoch * num_samples / total_batch_size)

	@staticmethod
	def get_sevir_datamodule(dataset_cfg,
	                         micro_batch_size: int = 1,
	                         num_workers: int = 4):
	    r"""
	    Create the `SEVIRLightningDataModule` described by `dataset_cfg`.

	    Parameters
	    ----------
	    dataset_cfg
	        Mapping with the keys read below.
	        NOTE(review): `val_ratio` is read here but has no default in
	        `get_dataset_config` - presumably it comes from the config file.
	    micro_batch_size: int
	        Forwarded as the datamodule's `batch_size`.
	    num_workers: int
	        Forwarded as the datamodule's `num_workers`.

	    Returns
	    -------
	    dm: SEVIRLightningDataModule
	    """
	    dm_kwargs = dict(
	        seq_len=dataset_cfg["seq_len"],
	        sample_mode=dataset_cfg["sample_mode"],
	        stride=dataset_cfg["stride"],
	        batch_size=micro_batch_size,
	        layout=dataset_cfg["layout"],
	        # options fixed by this module rather than read from the config
	        output_type=np.float32,
	        preprocess=True,
	        rescale_method="01",
	        verbose=False,
	        aug_mode=dataset_cfg["aug_mode"],
	        ret_contiguous=False,
	        # datamodule_only
	        dataset_name=dataset_cfg["dataset_name"],
	        start_date=dataset_cfg["start_date"],
	        train_test_split_date=dataset_cfg["train_test_split_date"],
	        end_date=dataset_cfg["end_date"],
	        val_ratio=dataset_cfg["val_ratio"],
	        num_workers=num_workers,
	    )
	    return SEVIRLightningDataModule(**dm_kwargs)

	@property
	def in_slice(self):
	if not hasattr(self, "_in_slice"):
	in_slice, out_slice = step_layout_to_in_out_slice(
	layout=self.oc.layout.layout,
	in_len=self.oc.layout.in_len, in_step= self.oc.layout.in_step,
	out_len=self.oc.layout.out_len, out_step = self.oc.layout.out_step,
	in_out_diff= self.oc.layout.in_out_diff
	)
	self._in_slice = in_slice
	self._out_slice = out_slice
	return self._in_slice

	@property
	def out_slice(self):
	if not hasattr(self, "_out_slice"):
	in_slice, out_slice = step_layout_to_in_out_slice(
	layout=self.oc.layout.layout,
	in_len=self.oc.layout.in_len, in_step= self.oc.layout.in_step,
	out_len=self.oc.layout.out_len, out_step = self.oc.layout.out_step,
	in_out_diff= self.oc.layout.in_out_diff
	)
	self._in_slice = in_slice
	self._out_slice = out_slice
	return self._out_slice

	@torch.no_grad()
	def get_input(self, batch, **kwargs):
	r"""
	dataset dependent
	re-implement it for each specific dataset

	Parameters
	----------
	batch: Any
	raw data batch from specific dataloader

	Returns
	-------
	out: Sequence[torch.Tensor, Dict[str, Any]]
	out[0] should be a torch.Tensor which is the target to generate
	out[1] should be a dict consists of several key-value pairs for conditioning
	"""
	return self._get_input_sevirlr(batch=batch, return_verbose=kwargs.get("return_verbose", False))

	@torch.no_grad()
	def _get_input_sevirlr(self, batch, return_verbose=False):
	seq = batch
	in_seq = seq[self.in_slice]
	out_seq = seq[self.out_slice].contiguous()
	if return_verbose:
	return out_seq, {"y": in_seq}, in_seq
	else:
	return out_seq, {"y": in_seq}
	# endregion

	# region Operation Step
	def training_step(self, batch, batch_idx):
	    r"""
	    Compute and log the training loss; on rank 0, for the configured example
	    indices and epochs, also draw samples and save a visualization.

	    Parameters
	    ----------
	    batch
	        Batch from the training dataloader.
	    batch_idx: int
	        Index of the batch; `batch_idx * micro_batch_size` is compared
	        against `train_example_data_idx_list`.

	    Returns
	    -------
	    loss
	        The loss returned by `self(batch)`.
	    """
	    loss, loss_dict = self(batch)
	    self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=False)
	    micro_batch_size = batch.shape[self.batch_axis]
	    data_idx = int(batch_idx * micro_batch_size)

	    # Visualize only in epochs that are also validated, on rank 0, for selected examples.
	    is_vis_epoch = self.current_epoch % self.oc.trainer.check_val_every_n_epoch == 0
	    if not (is_vis_epoch and self.local_rank == 0
	            and data_idx in self.train_example_data_idx_list):
	        return loss

	    def to_numpy(tensor):
	        return tensor.detach().float().cpu().numpy()

	    target_seq, cond, context_seq = self.get_input(batch, return_verbose=True)
	    aligned_preds, aligned_labels = [], []
	    plain_preds, plain_labels = [], []
	    for sample_idx in range(self.oc.eval.num_samples_per_context):
	        # sampling guided by knowledge alignment
	        if self.use_alignment and self.oc.eval.eval_aligned:
	            if self.alignment_type != "avg_x":
	                raise NotImplementedError
	            alignment_kwargs = get_alignment_kwargs_avg_x(context_seq=context_seq,
	                                                          target_seq=target_seq)
	            aligned_sample = self.sample(
	                cond=cond,
	                batch_size=micro_batch_size,
	                return_intermediates=False,
	                use_alignment=True,
	                alignment_kwargs=alignment_kwargs,
	                verbose=False,
	            ).contiguous()
	            aligned_preds.append(to_numpy(aligned_sample[0]))
	            aligned_labels.append(f"{self.oc.logging.logging_prefix}_aligned_pred_{sample_idx}")
	        # plain sampling without alignment
	        if self.oc.eval.eval_unaligned:
	            plain_sample = self.sample(
	                cond=cond,
	                batch_size=micro_batch_size,
	                return_intermediates=False,
	                verbose=False,
	            ).contiguous()
	            plain_preds.append(to_numpy(plain_sample[0]))
	            plain_labels.append(f"{self.oc.logging.logging_prefix}_pred_{sample_idx}")

	    # Aligned predictions are listed before the unaligned ones.
	    self.save_vis_step_end(
	        data_idx=data_idx,
	        context_seq=to_numpy(context_seq[0]),
	        target_seq=to_numpy(target_seq[0]),
	        pred_seq=aligned_preds + plain_preds,
	        pred_label=aligned_labels + plain_labels,
	        mode="train",
	    )
	    return loss

	def validation_step(self, batch, batch_idx):
	    r"""
	    Log validation losses (with and without EMA weights), then draw samples,
	    update the validation metrics and save a visualization.

	    Sampling runs for every batch unless `eval_example_only` is set, in
	    which case it runs only for indices in `val_example_data_idx_list`.

	    Parameters
	    ----------
	    batch
	        Batch from the validation dataloader.
	    batch_idx: int
	        Index of the batch.
	    """
	    _, loss_dict_no_ema = self(batch)
	    with self.ema_scope():
	        _, raw_ema_losses = self(batch)
	    loss_dict_ema = {key + '_ema': value for key, value in raw_ema_losses.items()}
	    for loss_dict in (loss_dict_no_ema, loss_dict_ema):
	        self.log_dict(loss_dict, prog_bar=False, logger=True, on_step=False, on_epoch=True, sync_dist=True)

	    micro_batch_size = batch.shape[self.batch_axis]
	    data_idx = int(batch_idx * micro_batch_size)
	    if self.eval_example_only and data_idx not in self.val_example_data_idx_list:
	        return

	    def to_numpy(tensor):
	        return tensor.detach().float().cpu().numpy()

	    target_seq, cond, context_seq = self.get_input(batch, return_verbose=True)
	    aligned_preds, aligned_labels = [], []
	    plain_preds, plain_labels = [], []
	    for sample_idx in range(self.oc.eval.num_samples_per_context):
	        # sampling guided by knowledge alignment
	        if self.use_alignment and self.oc.eval.eval_aligned:
	            if self.alignment_type != "avg_x":
	                raise NotImplementedError
	            alignment_kwargs = get_alignment_kwargs_avg_x(context_seq=context_seq,
	                                                          target_seq=target_seq)
	            aligned_sample = self.sample(
	                cond=cond,
	                batch_size=micro_batch_size,
	                return_intermediates=False,
	                use_alignment=True,
	                alignment_kwargs=alignment_kwargs,
	                verbose=False,
	            ).contiguous()
	            aligned_preds.append(to_numpy(aligned_sample[0]))
	            aligned_labels.append(f"{self.oc.logging.logging_prefix}_aligned_pred_{sample_idx}")
	            # the metrics are fed float32 predictions
	            if aligned_sample.dtype is not torch.float:
	                aligned_sample = aligned_sample.float()
	            self.valid_aligned_mse(aligned_sample, target_seq)
	            self.valid_aligned_mae(aligned_sample, target_seq)
	            self.valid_aligned_score.update(aligned_sample, target_seq)
	        # plain sampling without alignment
	        if self.oc.eval.eval_unaligned:
	            plain_sample = self.sample(
	                cond=cond,
	                batch_size=micro_batch_size,
	                return_intermediates=False,
	                verbose=False,
	            ).contiguous()
	            plain_preds.append(to_numpy(plain_sample[0]))
	            plain_labels.append(f"{self.oc.logging.logging_prefix}_pred_{sample_idx}")
	            if plain_sample.dtype is not torch.float:
	                plain_sample = plain_sample.float()
	            self.valid_mse(plain_sample, target_seq)
	            self.valid_mae(plain_sample, target_seq)
	            self.valid_score.update(plain_sample, target_seq)

	    # Aligned predictions are listed before the unaligned ones.
	    self.save_vis_step_end(
	        data_idx=data_idx,
	        context_seq=to_numpy(context_seq[0]),
	        target_seq=to_numpy(target_seq[0]),
	        pred_seq=aligned_preds + plain_preds,
	        pred_label=aligned_labels + plain_labels,
	        mode="val",
	        suffix=f"_rank{self.local_rank}",
	    )

	def on_validation_epoch_end(self):
	    r"""
	    Compute, log and reset the validation metrics accumulated during the epoch.

	    Unaligned metrics are handled when `eval.eval_unaligned` is set. Aligned
	    metrics are handled only when alignment is actually in use, which is the
	    same condition under which `validation_step` updates them; otherwise
	    they would be computed without ever having been updated.
	    """
	    if self.oc.eval.eval_unaligned:
	        self._log_and_reset_valid_metrics(
	            prefix="valid",
	            mse_metric=self.valid_mse,
	            mae_metric=self.valid_mae,
	            score_metric=self.valid_score)
	    if self.use_alignment and self.oc.eval.eval_aligned:
	        self._log_and_reset_valid_metrics(
	            prefix="valid_aligned",
	            mse_metric=self.valid_aligned_mse,
	            mae_metric=self.valid_aligned_mae,
	            score_metric=self.valid_aligned_score)

	def _log_and_reset_valid_metrics(self, prefix, mse_metric, mae_metric, score_metric):
	    r"""
	    Compute one group of validation metrics, log them as `{prefix}_*_epoch`
	    and reset the metric objects.
	    """
	    epoch_mse = mse_metric.compute()
	    epoch_mae = mae_metric.compute()
	    epoch_score = score_metric.compute()
	    # The logged loss is the negated average CSI.
	    epoch_loss = -epoch_score["avg"]["csi"]

	    self.log(f'{prefix}_loss_epoch', epoch_loss, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
	    self.log(f'{prefix}_mse_epoch', epoch_mse, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
	    self.log(f'{prefix}_mae_epoch', epoch_mae, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
	    self.log_score_epoch_end(score_dict=epoch_score, prefix=prefix)
	    mse_metric.reset()
	    mae_metric.reset()
	    score_metric.reset()


	def test_step(self, batch, batch_idx):
	micro_batch_size = batch.shape[self.batch_axis]
	data_idx = int(batch_idx * micro_batch_size)
	if not self.eval_example_only or data_idx in self.val_example_data_idx_list:
	target_seq, cond, context_seq = \
	self.get_input(batch, return_verbose=True)
	target_seq_bchw = rearrange(target_seq, "b t h w c -> (b t) c h w")
	aligned_pred_seq_list = []
	aligned_pred_label_list = []
	pred_seq_list = []
	pred_label_list = []
	for i in range(self.oc.eval.num_samples_per_context):
	# aligned sampling
	if self.use_alignment and self.oc.eval.eval_aligned:
	if self.alignment_type == "avg_x":
	alignment_kwargs = get_alignment_kwargs_avg_x(context_seq=context_seq,
	target_seq=target_seq)
	else:
	raise NotImplementedError
	pred_seq = self.sample(
	cond=cond,
	batch_size=micro_batch_size,
	return_intermediates=False,
	use_alignment=True,
	alignment_kwargs=alignment_kwargs,
	verbose=False, ).contiguous()
	if self.oc.logging.save_npy:
	npy_path = os.path.join(self.npy_save_dir,
	f"batch{batch_idx}_rank{self.local_rank}_sample{i}_aligned.npy")
	np.save(npy_path, pred_seq.detach().float().cpu().numpy())
	aligned_pred_seq_list.append(pred_seq[0].detach().float().cpu().numpy())
	aligned_pred_label_list.append(f"{self.oc.logging.logging_prefix}_aligned_pred_{i}")
	if pred_seq.dtype is not torch.float:
	pred_seq = pred_seq.float()
	self.test_aligned_mse(pred_seq, target_seq)
	self.test_aligned_mae(pred_seq, target_seq)
	self.test_aligned_score.update(pred_seq, target_seq)
	# self.test_aligned_fvd.update(pred_seq, real=False)
	pred_seq_bchw = rearrange(pred_seq, "b t h w c -> (b t) c h w")
	self.test_aligned_ssim(pred_seq_bchw, target_seq_bchw)
	# no alignment
	if self.oc.eval.eval_unaligned:
	pred_seq = self.sample(
	cond=cond,
	batch_size=micro_batch_size,
	return_intermediates=False,
	verbose=False, ).contiguous()
	if self.oc.logging.save_npy:
	npy_path = os.path.join(self.npy_save_dir,
	f"batch{batch_idx}_rank{self.local_rank}_sample{i}.npy")
	np.save(npy_path, pred_seq.detach().float().cpu().numpy())
	pred_seq_list.append(pred_seq[0].detach().float().cpu().numpy())
	pred_label_list.append(f"{self.oc.logging.logging_prefix}_pred_{i}")
	if pred_seq.dtype is not torch.float:
	pred_seq = pred_seq.float()
	self.test_mse(pred_seq, target_seq)
	self.test_mae(pred_seq, target_seq)
	self.test_score.update(pred_seq, target_seq)
	# self.test_fvd.update(pred_seq, real=False)
	pred_seq_bchw = rearrange(pred_seq, "b t h w c -> (b t) c h w")
	self.test_ssim(pred_seq_bchw, target_seq_bchw)
	# if self.use_alignment and self.oc.eval.eval_aligned:
	# self.test_aligned_fvd.update(target_seq, real=True)
	# if self.oc.eval.eval_unaligned:
	# self.test_fvd.update(target_seq, real=True)
	pred_seq_list = aligned_pred_seq_list + pred_seq_list
	pred_label_list = aligned_pred_label_list + pred_label_list
	self.save_vis_step_end(
	data_idx=data_idx,
	context_seq=context_seq[0].detach().float().cpu().numpy(),
	target_seq=target_seq[0].detach().float().cpu().numpy(),
	pred_seq=pred_seq_list,
	pred_label=pred_label_list,
	mode="test",
	suffix=f"_rank{self.local_rank}", )
	def on_test_epoch_end(self):
	    r"""
	    Compute, log and reset the metrics accumulated over the test epoch.

	    Each evaluation mode enabled in `self.oc.eval` (`eval_unaligned`, `eval_aligned`)
	    is handled independently: its MSE, MAE, SSIM and skill-score metrics are computed,
	    logged under the `test` / `test_aligned` prefix and then reset.
	    """
	    # every scalar of this hook is logged with the same options
	    log_kwargs = dict(prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
	    eval_cfg = self.oc.eval

	    if eval_cfg.eval_unaligned:
	        mse = self.test_mse.compute()
	        mae = self.test_mae.compute()
	        ssim = self.test_ssim.compute()
	        score = self.test_score.compute()

	        self.log('test_mse_epoch', mse, **log_kwargs)
	        self.log('test_mae_epoch', mae, **log_kwargs)
	        self.log('test_ssim_epoch', ssim, **log_kwargs)
	        self.log_score_epoch_end(score_dict=score, prefix="test")
	        # NOTE: FVD (`self.test_fvd`) is currently disabled, so it is neither
	        # computed, logged nor reset here
	        for metric in (self.test_mse, self.test_mae, self.test_ssim, self.test_score):
	            metric.reset()

	    # NOTE(review): `test_step` only updates the aligned metrics when `self.use_alignment`
	    # is also true, whereas this hook checks `eval_aligned` alone - confirm the two
	    # conditions cannot disagree.
	    if eval_cfg.eval_aligned:
	        mse = self.test_aligned_mse.compute()
	        mae = self.test_aligned_mae.compute()
	        ssim = self.test_aligned_ssim.compute()
	        score = self.test_aligned_score.compute()

	        self.log('test_aligned_mse_epoch', mse, **log_kwargs)
	        self.log('test_aligned_mae_epoch', mae, **log_kwargs)
	        self.log('test_aligned_ssim_epoch', ssim, **log_kwargs)
	        self.log_score_epoch_end(score_dict=score, prefix="test_aligned")
	        # NOTE: FVD (`self.test_aligned_fvd`) is currently disabled
	        for metric in (self.test_aligned_mse, self.test_aligned_mae,
	                       self.test_aligned_ssim, self.test_aligned_score):
	            metric.reset()
	# endregion

	def save_vis_step_end(
	        self,
	        data_idx: int,
	        context_seq: np.ndarray,
	        target_seq: np.ndarray,
	        pred_seq: Union[np.ndarray, Sequence[np.ndarray]],
	        pred_label: Union[str, Sequence[str]] = None,
	        label_mode: str = "name",
	        mode: str = "train",
	        prefix: str = "",
	        suffix: str = "", ):
	    r"""
	    Save a PNG comparing context, target and prediction(s) for a selected example.

	    Parameters
	    ----------
	    data_idx: int
	        Index of the example. A figure is only written when it is contained in the
	        example index list of the given `mode`.
	    context_seq, target_seq: np.ndarray
	        layout should not include batch
	    pred_seq: np.ndarray or Sequence[np.ndarray]
	        A single prediction, or several predictions; layout should not include batch.
	    pred_label: str or Sequence[str]
	        Label of the single prediction, or one label per prediction.
	    label_mode: str
	        "name" uses the labels as given, "avg_int" appends the mean of each sequence.
	    mode: str
	        One of "train", "val", "test"; selects the example index list and is part of
	        the file name.
	    prefix, suffix: str
	        Prepended / appended to the PNG file name (before the extension).

	    Raises
	    ------
	    ValueError
	        If `mode` is not one of "train", "val", "test".
	    TypeError
	        If `pred_label` is neither a str nor a sequence in "avg_int" mode, or is not a
	        non-string sequence while `pred_seq` is a sequence.
	    NotImplementedError
	        If `label_mode` is not one of "name", "avg_int".
	    """
	    if mode == "train":
	        example_data_idx_list = self.train_example_data_idx_list
	    elif mode == "val":
	        example_data_idx_list = self.val_example_data_idx_list
	    elif mode == "test":
	        example_data_idx_list = self.test_example_data_idx_list
	    else:
	        raise ValueError(f"Wrong mode {mode}! Must be in ['train', 'val', 'test'].")

	    # `str` is itself a `Sequence`, so a single label has to be detected first;
	    # otherwise it would be iterated character by character.
	    single_label = isinstance(pred_label, str)
	    if label_mode == "name":
	        # use the given label
	        context_label = "context"
	        target_label = "target"
	    elif label_mode == "avg_int":
	        context_label = f"context\navg_int={np.mean(context_seq):.4f}"
	        target_label = f"target\navg_int={np.mean(target_seq):.4f}"
	        if single_label:
	            pred_label = f"{pred_label}\navg_int={np.mean(pred_seq):.4f}"
	        elif isinstance(pred_label, Sequence):
	            pred_label = [f"{label}\navg_int={np.mean(seq):.4f}" for label, seq in zip(pred_label, pred_seq)]
	        else:
	            raise TypeError(f"Wrong pred_label type {type(pred_label)}! must be in [str, Sequence[str]].")
	    else:
	        raise NotImplementedError

	    if isinstance(pred_seq, Sequence):
	        if single_label or not isinstance(pred_label, Sequence):
	            raise TypeError(
	                f"pred_label must be a Sequence[str] when pred_seq is a sequence, got {type(pred_label)}.")
	        seq_list = [context_seq, target_seq] + list(pred_seq)
	        # `list(...)` so that tuples and other non-list sequences of labels are accepted
	        label_list = [context_label, target_label] + list(pred_label)
	    else:
	        seq_list = [context_seq, target_seq, pred_seq]
	        label_list = [context_label, target_label, pred_label]

	    if data_idx in example_data_idx_list:
	        png_save_name = f"{prefix}{mode}_epoch_{self.current_epoch}_data_{data_idx}{suffix}.png"
	        vis_sevir_seq(
	            save_path=os.path.join(self.example_save_dir, png_save_name),
	            seq=seq_list,
	            label=label_list,
	            interval_real_time=10,
	            plot_stride=1, fs=self.oc.eval.fs,
	            label_offset=self.oc.eval.label_offset,
	            label_avg_int=self.oc.eval.label_avg_int, )

	def log_score_epoch_end(self, score_dict: Dict, prefix: str = "valid"):
	    r"""
	    Log the per-threshold and averaged skill scores of one epoch.

	    Parameters
	    ----------
	    score_dict: Dict
	        Indexed as `score_dict[threshold][metric]`; an optional "avg" entry is
	        indexed as `score_dict["avg"][metric]`.
	    prefix: str
	        Prefix of every logged key: `{prefix}_{metric}_{threshold}_epoch` and
	        `{prefix}_{metric}_avg_epoch`.
	    """
	    log_kwargs = dict(prog_bar=True, on_step=False, on_epoch=True, sync_dist=True)
	    avg_scores = score_dict.get("avg", None)
	    for metric_name in self.oc.dataset.metrics_list:
	        for threshold in self.oc.dataset.threshold_list:
	            threshold_score = np.mean(score_dict[threshold][metric_name]).item()
	            self.log(f"{prefix}_{metric_name}_{threshold}_epoch", threshold_score, **log_kwargs)
	        # the averaged entry is optional and logged once per metric
	        if avg_scores is None:
	            continue
	        avg_score = np.mean(avg_scores[metric_name]).item()
	        self.log(f"{prefix}_{metric_name}_avg_epoch", avg_score, **log_kwargs)

	def on_before_optimizer_step(self, optimizer):
	    r"""
	    Log the gradient norms of `self.torch_nn_module` right before the optimizer step.

	    The norm type is read from `self.oc.logging.track_grad_norm`; a value of -1
	    turns the logging off. `optimizer` is not used.
	    """
	    norm_type = self.oc.logging.track_grad_norm
	    if norm_type == -1:
	        return
	    # If using mixed precision, the gradients are already unscaled here
	    # reference: https://lightning.ai/docs/pytorch/2.0.9/debug/debugging_intermediate.html#look-out-for-exploding-gradients
	    layer_norms = grad_norm(self.torch_nn_module, norm_type=norm_type)
	    self.log_dict(layer_norms)