Upload folder using huggingface_hub

d2f661a verified 9 months ago

13.8 kB

	from datetime import datetime, timedelta
	import os
	import pickle

	from numba import njit, prange, types
	from numba.typed import Dict
	import numpy as np
	from torch.utils.data import Dataset, IterableDataset

	from .patches import unpack_patches
	from .sampling import EqualFrequencySampler


	class BatchGenerator:
	    """Build batches of predictor and target variables from patch indices.

	    Each model variable in ``variables`` is a mapping that this class reads
	    the following keys from:

	    * ``"sources"``: names of the raw datasets the variable is built from,
	    * ``"timesteps"``: array of time offsets (in units of the variable's
	      timestep) to read for every sample,
	    * ``"transform"``: callable applied to the raw source arrays,
	    * ``"timestep_secs"`` (optional): timestep length in seconds; defaults
	      to the generator's ``interval``.

	    Sample coordinates come from ``self.sampler`` as rows of
	    ``(t0, i0, j0)``.
	    """

	    def __init__(
	        self,
	        variables,
	        raw,
	        predictors,
	        target,
	        primary_var,
	        time_range_sampling=(-1,2),
	        forecast_raw_vars=(),
	        sampling_bins=None,
	        sampler_file=None,
	        sample_shape=(4,4),
	        batch_size=32,
	        interval=timedelta(minutes=5),
	        random_seed=None,
	        augment=False
	    ):
	        """Index the raw data and create (or load) the sampler.

	        Args:
	            variables: mapping of variable name to variable description
	                (see the class docstring for the keys that are read).
	            raw: mapping of raw dataset name to raw patch data, passed to
	                ``unpack_patches``. Forecast datasets are stored under
	                ``"<name>-<lag>"`` keys.
	            predictors: sequence of predictor variable names.
	            target: name of the target variable.
	            primary_var: variable whose first source drives the sampler.
	            time_range_sampling: forwarded to ``EqualFrequencySampler``.
	            forecast_raw_vars: raw dataset base names that are forecasts.
	            sampling_bins: forwarded to ``EqualFrequencySampler``.
	            sampler_file: optional path used to cache the pickled sampler.
	            sample_shape: number of patches per sample along each axis.
	            batch_size: default number of samples per batch.
	            interval: base time step of the data.
	            random_seed: seed for the augmentation random generator.
	            augment: whether to apply random transpose/flip augmentation.
	        """
	        super().__init__()
	        self.batch_size = batch_size
	        self.interval = interval
	        self.interval_secs = np.int64(self.interval.total_seconds())
	        self.variables = variables
	        self.predictors = predictors
	        self.target = target
	        # list() so that predictors may be given as a tuple as well
	        self.used_variables = list(predictors) + [target]
	        self.rng = np.random.RandomState(seed=random_seed)
	        self.augment = augment

	        # setup indices for retrieving source raw data
	        self.sources = set.union(
	            *(set(variables[v]["sources"]) for v in self.used_variables)
	        )
	        self.forecast_raw_vars = set(forecast_raw_vars) & self.sources
	        self.patch_index = {}
	        for raw_name_base in self.sources:
	            if raw_name_base in forecast_raw_vars:
	                # forecasts are stored as one raw dataset per lag
	                raw_names = [
	                    rn for rn in raw if rn.startswith(raw_name_base+"-")
	                ]
	            else:
	                raw_names = [raw_name_base]
	            for raw_name in raw_names:
	                self.setup_index(raw_name, raw[raw_name], sample_shape)

	        # expose the per-lag indices of each forecast under its base name
	        for raw_name in self.forecast_raw_vars:
	            patch_index_var = {
	                k: v for (k,v) in self.patch_index.items()
	                if k.startswith(raw_name+"-")
	            }
	            self.patch_index[raw_name] = \
	                ForecastPatchIndexWrapper(patch_index_var)

	        # setup samplers
	        if (sampler_file is None) or not os.path.isfile(sampler_file):
	            print("No cached sampler found, creating a new one...")
	            primary_raw_var = variables[primary_var]["sources"][0]
	            # widest range of timesteps (in units of the base interval)
	            # spanned by any variable
	            t0 = t1 = None
	            for var_data in variables.values():
	                timesteps = var_data["timesteps"][[0,-1]].copy()
	                timesteps[0] -= 1
	                ts_secs = timesteps * \
	                    var_data.get("timestep_secs", self.interval_secs)
	                timesteps = ts_secs // self.interval_secs
	                t0 = timesteps[0] if t0 is None else min(t0,timesteps[0])
	                t1 = timesteps[-1] if t1 is None else max(t1,timesteps[-1])
	            time_range_valid = (t0,t1+1)
	            self.sampler = EqualFrequencySampler(
	                sampling_bins, raw[primary_raw_var],
	                self.patch_index[primary_raw_var], sample_shape,
	                time_range_valid, time_range_sampling=time_range_sampling,
	                timestep_secs=self.interval_secs
	            )
	            if sampler_file is not None:
	                print(f"Caching sampler to {sampler_file}.")
	                with open(sampler_file, 'wb') as f:
	                    pickle.dump(self.sampler, f)
	        else:
	            print(f"Loading cached sampler from {sampler_file}.")
	            # NOTE(review): pickle.load executes arbitrary code from the
	            # file; only point sampler_file at trusted cache files.
	            with open(sampler_file, 'rb') as f:
	                self.sampler = pickle.load(f)

	    def setup_index(self, raw_name, raw_data, box_size):
	        """Create the ``PatchIndex`` for one raw dataset.

	        ``zero_value`` defaults to 0 and ``missing_value`` defaults to
	        ``zero_value`` when the raw data does not define them.
	        """
	        zero_value = raw_data.get("zero_value", 0)
	        missing_value = raw_data.get("missing_value", zero_value)

	        self.patch_index[raw_name] = PatchIndex(
	            *unpack_patches(raw_data),
	            zero_value=zero_value,
	            missing_value=missing_value,
	            interval=self.interval,
	            box_size=box_size
	        )

	    def augmentations(self):
	        """Draw random 0/1 flags for (transpose, flipud, fliplr)."""
	        return tuple(self.rng.randint(2, size=3))

	    def augment_batch(self, batch, transpose, flipud, fliplr):
	        """Return a copy of ``batch`` with the selected augmentations.

	        The transpose and flips act on the last two axes and are only
	        applied when ``self.augment`` is set; a copy is returned either way.
	        """
	        if self.augment:
	            if transpose:
	                batch = np.swapaxes(batch, -1, -2)
	            flips = []
	            if flipud:
	                flips.append(-2)
	            if fliplr:
	                flips.append(-1)
	            if flips:
	                batch = np.flip(batch, axis=tuple(flips))
	        return batch.copy()

	    def batch(self, samples=None, batch_size=None):
	        """Build one batch.

	        Args:
	            samples: array whose rows are ``(t0, i0, j0)`` sample
	                coordinates; drawn from the sampler when omitted.
	            batch_size: number of samples to draw when ``samples`` is not
	                given; defaults to ``self.batch_size``.

	        Returns:
	            ``(pred_batch, target_batch)`` where ``pred_batch`` is a list of
	            ``(data, relative_times)`` tuples, one per predictor, and
	            ``target_batch`` is the target data array only.
	        """
	        if batch_size is None:
	            batch_size = self.batch_size

	        if samples is None:
	            # get the sample coordinates from the sampler
	            samples = self.sampler(batch_size)

	        (t0,i0,j0) = samples.T

	        # use the same augmentation for every variable in the batch
	        if self.augment:
	            augmentations = self.augmentations()

	        batch = {}
	        for var_name in self.used_variables:
	            var_data = self.variables[var_name]

	            # different timestep from standard (e.g. forecast); round down
	            # to times where we have data available
	            ts_secs = var_data.get("timestep_secs", self.interval_secs)
	            t0_shifted = t0 - (t0 % ts_secs)
	            t = t0_shifted[:,None] + ts_secs*var_data["timesteps"][None,:]
	            t_relative = (t - t0[:,None]) / self.interval_secs

	            # read raw data from index
	            raw_data = [
	                self.patch_index[raw_name](t,i0,j0)
	                for raw_name in var_data["sources"]
	            ]

	            # transform to model variable
	            batch_var = var_data["transform"](*raw_data)

	            # add channel dimension if not already present
	            if batch_var.ndim == 4:
	                batch_var = np.expand_dims(batch_var, 1)

	            # data augmentation
	            # NOTE(review): augment_batch copies the data, but without
	            # augmentation batch_var may still be a view of a reusable
	            # PatchIndex buffer if the transform does not copy - confirm
	            # that transforms always return new arrays.
	            if self.augment:
	                batch_var = self.augment_batch(batch_var, *augmentations)

	            # bundle with time coordinates
	            batch[var_name] = (batch_var, t_relative.astype(np.float32))

	        pred_batch = [batch[v] for v in self.predictors]
	        target_batch = batch[self.target][0] # no time coordinates for target
	        return (pred_batch, target_batch)

	    def batches(self, *args, num=None, **kwargs):
	        """Yield batches built by :meth:`batch`.

	        Args:
	            *args, **kwargs: forwarded to :meth:`batch`.
	            num: number of batches to yield; yields forever when ``None``.
	        """
	        if num is None:
	            while True:
	                yield self.batch(*args, **kwargs)
	        else:
	            for _ in range(num):
	                yield self.batch(*args, **kwargs)


	class StreamBatchDataset(IterableDataset):
	    """Iterable dataset that streams freshly generated batches.

	    Every pass over the dataset requests ``batches_per_epoch`` batches
	    from the wrapped batch generator.
	    """

	    def __init__(self, batch_gen, batches_per_epoch):
	        super().__init__()
	        self.batches_per_epoch = batches_per_epoch
	        self.batch_gen = batch_gen

	    def __iter__(self):
	        """Yield one epoch worth of batches from the generator."""
	        epoch_length = self.batches_per_epoch
	        for generated in self.batch_gen.batches(num=epoch_length):
	            yield generated


	class DeterministicBatchDataset(Dataset):
	    """Map-style dataset with a fixed, pre-drawn set of sample coordinates.

	    The sample coordinates of all ``batches_per_epoch`` batches are drawn
	    once at construction time, so item ``ind`` always refers to the same
	    samples.

	    Note that construction replaces ``batch_gen.sampler.rng`` with a new
	    ``RandomState`` seeded by ``random_seed``; this affects the shared
	    ``batch_gen`` object as well.
	    """

	    def __init__(self, batch_gen, batches_per_epoch, random_seed=None):
	        super().__init__()
	        self.batch_gen = batch_gen
	        self.batches_per_epoch = batches_per_epoch
	        # reseed the sampler so the pre-drawn samples are reproducible
	        self.batch_gen.sampler.rng = np.random.RandomState(seed=random_seed)
	        self.samples = [
	            self.batch_gen.sampler(self.batch_gen.batch_size)
	            for _ in range(self.batches_per_epoch)
	        ]

	    def __len__(self):
	        """Return the number of batches per epoch."""
	        return self.batches_per_epoch

	    def __getitem__(self, ind):
	        """Build the batch for the ``ind``-th pre-drawn set of samples."""
	        return self.batch_gen.batch(samples=self.samples[ind])


	class PatchIndex:
	    """Lookup from ``(time, i, j)`` patch coordinates to stored patch data.

	    Coordinates listed in ``patch_coords``/``patch_times`` map to a row of
	    ``patch_data``; coordinates listed in ``zero_patch_coords``/
	    ``zero_patch_times`` map to ``IDX_ZERO``. Coordinates in neither list
	    are treated as missing when a batch is built.

	    Calling the index assembles samples of ``box_size`` patches along each
	    spatial axis into a reusable internal buffer.
	    """
	    IDX_ZERO = -1
	    IDX_MISSING = -2

	    def __init__(
	        self, patch_data, patch_coords, patch_times,
	        zero_patch_coords, zero_patch_times,
	        interval=timedelta(minutes=5),
	        box_size=(4,4), zero_value=0,
	        missing_value=0
	    ):
	        """Build the coordinate lookup table.

	        Args:
	            patch_data: array of patches, indexed by patch number along
	                the first axis.
	            patch_coords: per-patch ``(i, j)`` coordinates.
	            patch_times: per-patch time coordinate.
	            zero_patch_coords: ``(i, j)`` coordinates of all-zero patches.
	            zero_patch_times: time coordinates of all-zero patches.
	            interval: base time step of the data.
	            box_size: number of patches per sample along each axis.
	            zero_value: fill value for patches marked as zero.
	            missing_value: fill value for patches not found in the index.
	        """
	        self.dt = int(round(interval.total_seconds()))
	        self.box_size = box_size
	        self.zero_value = zero_value
	        self.missing_value = missing_value
	        self.patch_data = patch_data
	        # spatial size of one assembled sample
	        self.sample_shape = (
	            self.patch_data.shape[1]*box_size[0],
	            self.patch_data.shape[2]*box_size[1]
	        )

	        self.patch_index = Dict.empty(
	            key_type=types.UniTuple(types.int64, 3),
	            value_type=types.int64
	        )
	        init_patch_index(self.patch_index, patch_coords, patch_times)
	        init_patch_index_zero(self.patch_index, zero_patch_coords,
	            zero_patch_times, PatchIndex.IDX_ZERO)

	        self._batch = None

	    def _alloc_batch(self, batch_size, num_timesteps):
	        """Return a work buffer holding ``batch_size`` samples.

	        The underlying buffer is reused between calls and only reallocated
	        when it is too small along the batch or time axis. The returned
	        array is a view cropped to ``batch_size`` rows, so rows left over
	        from an earlier, larger batch are never handed out; the time axis
	        may still be longer than ``num_timesteps``.
	        """
	        buffer_too_small = (self._batch is None) or \
	            (self._batch.shape[0] < batch_size) or \
	            (self._batch.shape[1] < num_timesteps)
	        if buffer_too_small:
	            # drop the old buffer before allocating the new one
	            del self._batch
	            self._batch = np.zeros(
	                (batch_size,num_timesteps)+self.sample_shape,
	                self.patch_data.dtype
	            )
	        # cropping the leading axis keeps the view C-contiguous
	        return self._batch[:batch_size]

	    def __call__(self, t, i0_all, j0_all):
	        """Assemble the samples for times ``t`` and corners ``(i0, j0)``.

	        Args:
	            t: 2D array of time coordinates, one row per sample.
	            i0_all: first patch row of each sample.
	            j0_all: first patch column of each sample.

	        Returns:
	            A view of the internal buffer with ``t.shape`` as its two
	            leading dimensions. The buffer is overwritten by the next
	            call, so copy the result if it must outlive that.
	        """
	        (batch_size, num_timesteps) = t.shape
	        batch = self._alloc_batch(batch_size, num_timesteps)

	        i1_all = i0_all + self.box_size[0]
	        j1_all = j0_all + self.box_size[1]
	        bi_size = self.patch_data.shape[1]
	        bj_size = self.patch_data.shape[2]

	        build_batch(batch, self.patch_data, self.patch_index,
	            t, i0_all, i1_all, j0_all, j1_all,
	            bi_size, bj_size, self.zero_value,
	            self.missing_value)

	        return batch[:,:num_timesteps,...]


	@njit
	def init_patch_index(patch_index, patch_coords, patch_times):
	    """Register every stored patch under its ``(time, i, j)`` key.

	    The value stored for a key is the patch's position along the first
	    axis of ``patch_coords``.
	    """
	    num_patches = patch_coords.shape[0]
	    for patch_num in range(num_patches):
	        key = (
	            patch_times[patch_num],
	            np.int64(patch_coords[patch_num,0]),
	            np.int64(patch_coords[patch_num,1]),
	        )
	        patch_index[key] = patch_num


	@njit
	def init_patch_index_zero(patch_index, zero_patch_coords,
	    zero_patch_times, idx_zero):
	    """Register every zero patch under its ``(time, i, j)`` key.

	    All zero patches share the same marker value ``idx_zero``.
	    """
	    num_zero_patches = zero_patch_coords.shape[0]
	    for patch_num in range(num_zero_patches):
	        key = (
	            zero_patch_times[patch_num],
	            np.int64(zero_patch_coords[patch_num,0]),
	            np.int64(zero_patch_coords[patch_num,1]),
	        )
	        patch_index[key] = idx_zero


	# numba can't find these values from PatchIndex
	IDX_ZERO = PatchIndex.IDX_ZERO
	IDX_MISSING = PatchIndex.IDX_MISSING
	@njit(parallel=True)
	def build_batch(
	    batch, patch_data, patch_index,
	    t_all, i0_all, i1_all, j0_all, j1_all,
	    bi_size, bj_size, zero_value, missing_value
	):
	    """Fill ``batch`` with the patches of every sample and timestep.

	    For sample ``k`` and each time in ``t_all[k]``, the patches with
	    coordinates in ``[i0, i1) x [j0, j1)`` are looked up in
	    ``patch_index`` and written to the matching ``bi_size x bj_size``
	    tile of ``batch``. Tiles marked as zero are filled with
	    ``zero_value``; tiles absent from the index are filled with
	    ``missing_value``. Samples are processed in parallel.
	    """
	    num_samples = t_all.shape[0]
	    num_timesteps = t_all.shape[1]
	    for k in prange(num_samples):
	        i_start = i0_all[k]
	        i_stop = i1_all[k]
	        j_start = j0_all[k]
	        j_stop = j1_all[k]

	        for bt in range(num_timesteps):
	            t = t_all[k,bt]
	            for i in range(i_start, i_stop):
	                # rows of the output tile for patch row i
	                row0 = (i-i_start) * bi_size
	                row1 = row0 + bi_size
	                for j in range(j_start, j_stop):
	                    # columns of the output tile for patch column j
	                    col0 = (j-j_start) * bj_size
	                    col1 = col0 + bj_size
	                    ind = int(patch_index.get((t,i,j), IDX_MISSING))
	                    if ind >= 0:
	                        batch[k,bt,row0:row1,col0:col1] = patch_data[ind]
	                    elif ind == IDX_ZERO:
	                        batch[k,bt,row0:row1,col0:col1] = zero_value
	                    elif ind == IDX_MISSING:
	                        batch[k,bt,row0:row1,col0:col1] = missing_value


	class ForecastPatchIndexWrapper(PatchIndex):
	    """Combine the per-lag indices of one forecast variable.

	    ``patch_index`` maps names of the form ``"<base>-<lag hours>"`` to
	    callable patch indices. Calling the wrapper reads every lag and, for
	    each sample and timestep, keeps the data of the lag selected from the
	    time elapsed since the forecast that covers the sample's first
	    timestep.

	    ``PatchIndex.__init__`` is deliberately not called; only the
	    attributes needed by ``_alloc_batch`` are set.
	    """

	    def __init__(self, patch_index):
	        """Validate the lag names and derive the forecast interval.

	        Raises:
	            ValueError: if the names do not share a single base name, if
	                the lags are not evenly spaced or contain duplicates, or
	                if 24 hours is not a multiple of the lag spacing.
	        """
	        self.patch_index = patch_index
	        raw_names = {"-".join(v.split("-")[:-1]) for v in patch_index}
	        if len(raw_names) != 1:
	            raise ValueError(
	                "Can only wrap variables with the same base name")
	        self.raw_name = list(raw_names)[0]
	        lags_hour = [int(v.split("-")[-1]) for v in patch_index]
	        self.lags_hour = set(lags_hour)
	        lag_spacings = np.diff(sorted(lags_hour))
	        if len(set(lag_spacings)) != 1:
	            raise ValueError("Lags must be evenly spaced")
	        forecast_interval_hour = int(lag_spacings[0])
	        if forecast_interval_hour <= 0:
	            # duplicate lags (e.g. "x-1" and "x-01") give a zero spacing,
	            # which would otherwise slip through the modulo check below
	            raise ValueError("Lags must be evenly spaced")
	        if (24 % forecast_interval_hour):
	            raise ValueError(
	                "24 hours must be a multiple of the forecast interval")
	        self.forecast_interval_hour = forecast_interval_hour
	        self.forecast_interval = 3600 * forecast_interval_hour

	        # need to set these for _alloc_batch to work
	        self._batch = None
	        v = list(self.patch_index.keys())[0]
	        self.sample_shape = self.patch_index[v].sample_shape
	        self.patch_data = self.patch_index[v].patch_data

	    def __call__(self, t, i0, j0):
	        """Assemble samples, picking the matching forecast lag per time.

	        Args:
	            t: 2D array of time coordinates in seconds, one row per sample.
	            i0: first patch row of each sample.
	            j0: first patch column of each sample.

	        Returns:
	            A view of the internal buffer with ``t.shape`` as its two
	            leading dimensions.
	        """
	        (batch_size, num_timesteps) = t.shape
	        batch = self._alloc_batch(batch_size, num_timesteps)

	        # ensure that all data come from the same forecast
	        t0 = t[:,:1]
	        start_time_from_fc = t0 % self.forecast_interval
	        time_from_fc = start_time_from_fc + (t - t0)
	        lags_hour = (time_from_fc // self.forecast_interval) * \
	            self.forecast_interval_hour

	        # NOTE(review): entries whose lag is not in self.lags_hour are not
	        # written here and keep whatever the reused buffer held before -
	        # confirm that callers only request times covered by the lags.
	        for lag in self.lags_hour:
	            raw_name_lag = f"{self.raw_name}-{lag}"
	            batch_lag = self.patch_index[raw_name_lag](t,i0,j0)
	            lag_mask = (lags_hour == lag)
	            copy_masked_times(batch_lag, batch, lag_mask)

	        # crop both leading axes: the reused buffer may be larger than the
	        # current request along the batch axis as well as the time axis
	        return batch[:batch_size,:num_timesteps,...]


	@njit(parallel=True)
	def copy_masked_times(from_batch, to_batch, mask):
	    """Copy the selected (sample, timestep) slices between batches.

	    For every ``(k, bt)`` where ``mask[k, bt]`` is true, the spatial slice
	    ``from_batch[k, bt]`` is written to ``to_batch[k, bt]``; other slices
	    of ``to_batch`` are left unchanged. Samples are processed in parallel.
	    """
	    num_samples = from_batch.shape[0]
	    num_timesteps = from_batch.shape[1]
	    for sample in prange(num_samples):
	        for step in range(num_timesteps):
	            if mask[sample,step]:
	                to_batch[sample,step,:,:] = from_batch[sample,step,:,:]