Lately, I have been using DDW extensively again and have run into this "sporadic" error once more. It only occurs for some of the many jobs, but it keeps popping up. I double-checked with @rdrighetto, and we have the latest version.
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/ddw/fit_model │
│ .py:273 in fit_model │
│ │
│ 270 │ # fit the model │
│ 271 │ if val_data_exists and resume_from_checkpoint is None: │
│ 272 │ │ trainer.validate(lit_unet, val_dataloader) │
│ ❱ 273 │ trainer.fit( │
│ 274 │ │ #ckpt_path=resume_from_checkpoint, # for pytorch-lightning >= │
│ 275 │ │ model=lit_unet, │
│ 276 │ │ train_dataloaders=fitting_dataloader, │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/trainer.py:579 in fit │
│ │
│ 576 │ │ if not isinstance(model, pl.LightningModule): │
│ 577 │ │ │ raise TypeError(f"`Trainer.fit()` requires a `LightningMo │
│ 578 │ │ self.strategy._lightning_module = model │
│ ❱ 579 │ │ call._call_and_handle_interrupt( │
│ 580 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 581 │ │ ) │
│ 582 │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/call.py:38 in _call_and_handle_interrupt │
│ │
│ 35 │ │ if trainer.strategy.launcher is not None: │
│ 36 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 37 │ │ else: │
│ ❱ 38 │ │ │ return trainer_fn(*args, **kwargs) │
│ 39 │ │
│ 40 │ except _TunerExitException: │
│ 41 │ │ trainer._call_teardown_hook() │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/trainer.py:621 in _fit_impl │
│ │
│ 618 │ │ │ model_provided=True, │
│ 619 │ │ │ model_connected=self.lightning_module is not None, │
│ 620 │ │ ) │
│ ❱ 621 │ │ self._run(model, ckpt_path=self.ckpt_path) │
│ 622 │ │ │
│ 623 │ │ assert self.state.stopped │
│ 624 │ │ self.training = False │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/trainer.py:1058 in _run │
│ │
│ 1055 │ │ │
│ 1056 │ │ self._checkpoint_connector.resume_end() │
│ 1057 │ │ │
│ ❱ 1058 │ │ results = self._run_stage() │
│ 1059 │ │ │
│ 1060 │ │ log.detail(f"{self.__class__.__name__}: trainer tearing down" │
│ 1061 │ │ self._teardown() │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/trainer.py:1137 in _run_stage │
│ │
│ 1134 │ │ │ return self._run_evaluate() │
│ 1135 │ │ if self.predicting: │
│ 1136 │ │ │ return self._run_predict() │
│ ❱ 1137 │ │ self._run_train() │
│ 1138 │ │
│ 1139 │ def _pre_training_routine(self) -> None: │
│ 1140 │ │ # wait for all to join if on distributed │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/trainer.py:1160 in _run_train │
│ │
│ 1157 │ │ self.fit_loop.trainer = self │
│ 1158 │ │ │
│ 1159 │ │ with torch.autograd.set_detect_anomaly(self._detect_anomaly): │
│ ❱ 1160 │ │ │ self.fit_loop.run() │
│ 1161 │ │
│ 1162 │ def _run_evaluate(self) -> _EVALUATE_OUTPUT: │
│ 1163 │ │ assert self.evaluating │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/loops/loop.py:200 in run │
│ │
│ 197 │ │ │ try: │
│ 198 │ │ │ │ self.on_advance_start(*args, **kwargs) │
│ 199 │ │ │ │ self.advance(*args, **kwargs) │
│ ❱ 200 │ │ │ │ self.on_advance_end() │
│ 201 │ │ │ │ self._restarting = False │
│ 202 │ │ │ except StopIteration: │
│ 203 │ │ │ │ break │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/loops/fit_loop.py:296 in on_advance_end │
│ │
│ 293 │ │ │
│ 294 │ │ # call train epoch end hooks │
│ 295 │ │ self.trainer._call_callback_hooks("on_train_epoch_end") │
│ ❱ 296 │ │ self.trainer._call_lightning_module_hook("on_train_epoch_end") │
│ 297 │ │ │
│ 298 │ │ self.trainer._logger_connector.on_epoch_end() │
│ 299 │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/pytorch_light │
│ ning/trainer/trainer.py:1302 in _call_lightning_module_hook │
│ │
│ 1299 │ │ pl_module._current_fx_name = hook_name │
│ 1300 │ │ │
│ 1301 │ │ with self.profiler.profile(f"[LightningModule]{pl_module.__cl │
│ ❱ 1302 │ │ │ output = fn(*args, **kwargs) │
│ 1303 │ │ │
│ 1304 │ │ # restore current_fx when nested context │
│ 1305 │ │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/ddw/utils/une │
│ t.py:85 in on_train_epoch_end │
│ │
│ 82 │ │ │ self.current_epoch + 1 │
│ 83 │ │ ) % self.update_subtomo_missing_wedges_every_n_epochs == 0: # │
│ 84 │ │ │ self.update_subtomo_missing_wedges() │
│ ❱ 85 │ │ │ self.update_normalization() │
│ 86 │ │
│ 87 │ def configure_optimizers(self): │
│ 88 │ │ optimizer = torch.optim.Adam(self.parameters(), **self.adam_pa │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/ddw/utils/une │
│ t.py:166 in update_normalization │
│ │
│ 163 │ │ # update normalization in hparams │
│ 164 │ │ self.unet_params["normalization_loc"] = loc │
│ 165 │ │ self.unet_params["normalization_scale"] = scale │
│ ❱ 166 │ │ self.update_hparam("unet_params", self.unet_params) │
│ 167 │ │ self.log("normalization/loc", loc) │
│ 168 │ │ self.log("normalization/scale", scale) │
│ 169 │
│ │
│ /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3 │
│ _python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/ddw/utils/une │
│ t.py:178 in update_hparam │
│ │
│ 175 │ │ logdir = f"{logger.save_dir}/{logger.name}/version_{logger.ver │
│ 176 │ │ hparams_file = f"{logdir}/hparams.yaml" │
│ 177 │ │ hparams = yaml.safe_load(open(hparams_file, "r")) │
│ ❱ 178 │ │ hparams[hparam] = value │
│ 179 │ │ with open(hparams_file, "w") as f: │
│ 180 │ │ │ yaml.dump(hparams, f) │
│ 181 │
╰──────────────────────────────────────────────────────────────────────────────╯
TypeError: 'NoneType' object does not support item assignment
Command exited with non-zero status 1
45074.13user 2356.74system 1:47:50elapsed 733%CPU (0avgtext+0avgdata 3734392maxresident)k
0inputs+98117120outputs (136948422major+146981054minor)pagefaults 0swaps
srun: error: sge01: task 1: Exited with exit code 1
[rank0]:[E ProcessGroupNCCL.cpp:523] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=32728, OpType=ALLREDUCE, NumelIn=276641, NumelOut=276641, Timeout(ms)=1800000) ran for 1800387 milliseconds before timing out.
[rank0]:[E ProcessGroupNCCL.cpp:537] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank0]:[E ProcessGroupNCCL.cpp:543] To avoid data inconsistency, we are taking the entire process down.
[rank0]:[E ProcessGroupNCCL.cpp:1182] [Rank 0] NCCL watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=32728, OpType=ALLREDUCE, NumelIn=276641, NumelOut=276641, Timeout(ms)=1800000) ran for 1800387 milliseconds before timing out.
Exception raised from checkTimeout at /opt/conda/conda-bld/pytorch_1711403380909/work/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:525 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x1461aaf80d87 in /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3_python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1e6 (0x14615f2384d6 in /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3_python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x19d (0x14615f23ba2d in /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3_python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x119 (0x14615f23c629 in /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3_python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xdbbf4 (0x1461aaac7bf4 in /scicore/projects/scicore-p-structsoft/ubuntu/software/Miniconda3/miniconda3_python3.12.1/envs/DeepDeWedge-ub/lib/python3.10/site-packages/torch/lib/../../../.././libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x1461b333eac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: <unknown function> + 0x126850 (0x1461b33d0850 in /lib/x86_64-linux-gnu/libc.so.6)
Command terminated by signal 6
5867.11user 1972.00system 2:17:12elapsed 95%CPU (0avgtext+0avgdata 3658520maxresident)k
368inputs+98025688outputs (49378770major+75166635minor)pagefaults 0swaps
Originally posted by @Phaips in #8