anwm / err
de99's picture
Upload err
27c8922 verified
W0926 20:42:57.914000 970483 site-packages/torch/distributed/run.py:793]
W0926 20:42:57.914000 970483 site-packages/torch/distributed/run.py:793] *****************************************
W0926 20:42:57.914000 970483 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0926 20:42:57.914000 970483 site-packages/torch/distributed/run.py:793] *****************************************
[rank2]: Traceback (most recent call last):
[rank2]: File "/data1/tpz/nwm-main/isolated_nwm_infer_v5.py", line 291, in <module>
[rank2]: main(args)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank2]: return func(*args, **kwargs)
[rank2]: File "/data1/tpz/nwm-main/isolated_nwm_infer_v5.py", line 264, in main
[rank2]: generate_rollout(args, curr_rollout_output_dir, rollout_fps, idxs, model_lst, obs_image, gt_image, delta, num_cond, device, aug_image)
[rank2]: File "/data1/tpz/nwm-main/isolated_nwm_infer_v5.py", line 113, in generate_rollout
[rank2]: x_pred_pixels = model_forward_wrapper(all_models, curr_obs, curr_delta, rollout_stride, args.latent_size, num_cond=num_cond, num_goals=1, device=device, x_supervised=sup_image[:, i:i+1])
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank2]: return func(*args, **kwargs)
[rank2]: File "/data1/tpz/nwm-main/isolated_nwm_infer_v5.py", line 80, in model_forward_wrapper
[rank2]: x = vae.encode(x).latent_dist.sample().mul_(0.18215).unflatten(0, (B, T))
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/diffusers/utils/accelerate_utils.py", line 46, in wrapper
[rank2]: return method(self, *args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_kl.py", line 278, in encode
[rank2]: h = self._encode(x)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_kl.py", line 252, in _encode
[rank2]: enc = self.encoder(x)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/diffusers/models/autoencoders/vae.py", line 168, in forward
[rank2]: sample = down_block(sample)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/diffusers/models/unets/unet_2d_blocks.py", line 1442, in forward
[rank2]: hidden_states = resnet(hidden_states, temb=None)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/diffusers/models/resnet.py", line 327, in forward
[rank2]: hidden_states = self.norm1(hidden_states)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 313, in forward
[rank2]: return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps)
[rank2]: File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/nn/functional.py", line 2955, in group_norm
[rank2]: return torch.group_norm(
[rank2]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 19.14 GiB. GPU 2 has a total capacity of 39.56 GiB of which 16.76 GiB is free. Including non-PyTorch memory, this process has 22.79 GiB memory in use. Of the allocated memory 21.10 GiB is allocated by PyTorch, and 488.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
W0926 21:44:41.600000 970483 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 970563 closing signal SIGTERM
W0926 21:44:41.610000 970483 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 970564 closing signal SIGTERM
W0926 21:44:41.615000 970483 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 970566 closing signal SIGTERM
W0926 21:44:41.632000 970483 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 970567 closing signal SIGTERM
E0926 21:44:42.802000 970483 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 2 (pid: 970565) of binary: /data1/tpz/anaconda3/envs/nwm3/bin/python3.10
Traceback (most recent call last):
File "/data1/tpz/anaconda3/envs/nwm3/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
run(args)
File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
elastic_launch(
File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/data1/tpz/anaconda3/envs/nwm3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
isolated_nwm_infer_v5.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2025-09-26_21:44:41
host : localhost
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 970565)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================