Skip to content

Commit

Permalink
Add callback to catch NaNs in the train loss (#97)
Browse files Browse the repository at this point in the history
  • Loading branch information
coryMosaicML authored Nov 13, 2023
1 parent 3db3411 commit 3122b81
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
2 changes: 2 additions & 0 deletions diffusion/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
"""Custom callbacks for Diffusion."""

from diffusion.callbacks.log_diffusion_images import LogDiffusionImages
from diffusion.callbacks.nan_catcher import NaNCatcher
from diffusion.callbacks.scheduled_garbage_collector import ScheduledGarbageCollector

# Names exported by this package; each entry corresponds to an import above.
__all__ = [
'LogDiffusionImages',
'NaNCatcher',
'ScheduledGarbageCollector',
]
30 changes: 30 additions & 0 deletions diffusion/callbacks/nan_catcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2022 MosaicML Diffusion authors
# SPDX-License-Identifier: Apache-2.0

"""Callback for catching loss NaNs."""

from typing import Dict, Sequence

import torch
from composer import Callback, Logger, State


class NaNCatcher(Callback):
    """Catches NaNs in the loss and raises an error if one is found.

    The check is implemented in the ``after_loss`` hook and inspects
    ``state.loss``, which may be a single tensor, a sequence of tensors, or a
    dict mapping names to tensors.
    """

    def after_loss(self, state: State, logger: Logger):
        """Check if loss is NaN and raise an error if so.

        Args:
            state (State): Training state; only ``state.loss`` is read.
            logger (Logger): Unused; accepted to match the callback hook signature.

        Raises:
            RuntimeError: If any element of the loss (or of any tensor in a
                sequence/dict loss) is NaN. For dict losses the message names
                the offending key.
            TypeError: If ``state.loss`` is not a tensor, a sequence, or a dict.
        """
        loss = state.loss
        if isinstance(loss, torch.Tensor):
            # A loss tensor may hold several elements; one NaN is enough to fail.
            if torch.isnan(loss).any():
                raise RuntimeError('Train loss contains a NaN.')
        elif isinstance(loss, Sequence):
            for element in loss:
                if torch.isnan(element).any():
                    raise RuntimeError('Train loss contains a NaN.')
        elif isinstance(loss, Dict):
            for name, value in loss.items():
                if torch.isnan(value).any():
                    raise RuntimeError(f'Train loss {name} contains a NaN.')
        else:
            # The message lists every accepted container, including dicts.
            raise TypeError(f'Loss is of type {type(loss)}, but should be a tensor, '
                            'a sequence of tensors, or a dict of tensors')

0 comments on commit 3122b81

Please sign in to comment.