Merge branch 'main' of github.com:pytorch/torchcodec into eqnfqef

NicolasHug · NicolasHug · commit 79295d2bc933 · 2025-09-04T09:42:00.000+01:00
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -275,7 +275,32 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   }
 
   torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);
-  nppCtx_->hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
+
+  // Create a CUDA event and record it on the AVFrame's CUDA stream. That's the
+  // NVDEC stream, i.e. the CUDA stream that the frame was decoded on.
+  // The NPP stream is then made to wait on this event before the NPP
+  // functions run, to ensure NVDEC has finished decoding the frame before
+  // the NPP color-conversion starts.
+  // Note that our code is generic and assumes that the NVDEC's stream can be
+  // arbitrary, but unfortunately we know it's hardcoded to be the default
+  // stream by FFmpeg:
+  // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
+  TORCH_CHECK(
+      hwFramesCtx->device_ctx != nullptr,
+      "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
+  auto cudaDeviceCtx =
+      static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
+  at::cuda::CUDAEvent nvdecDoneEvent;
+  at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
+      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
+  nvdecDoneEvent.record(nvdecStream);
+
+  // Don't start NPP work before NVDEC is done decoding the frame!
+  at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
+  nvdecDoneEvent.block(nppStream);
+
+  // Point the NPP context at the stream the NPP work will run on.
+  nppCtx_->hStream = nppStream.stream();
   cudaError_t err =
       cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
   TORCH_CHECK(
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -61,7 +61,15 @@ int getNumChannels(const UniqueAVFrame& avFrame) {
     (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
   return avFrame->ch_layout.nb_channels;
 #else
-  return av_get_channel_layout_nb_channels(avFrame->channel_layout);
+  int numChannels = av_get_channel_layout_nb_channels(avFrame->channel_layout);
+  // Work around an FFmpeg 4 bug where channel_layout is 0 or unset, which
+  // makes numChannels 0. Fall back to avFrame->channels, which appears to be
+  // correct, and set a default layout so that SwrContext can be initialized.
+  if (numChannels == 0 && avFrame->channels > 0) {
+    avFrame->channel_layout = av_get_default_channel_layout(avFrame->channels);
+    numChannels = avFrame->channels;
+  }
+  return numChannels;
 #endif
 }
 
diff --git a/test/resources/sine_mono_s16.wav.stream0.all_frames.pt b/test/resources/sine_mono_s16.wav.stream0.all_frames.pt
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -1682,26 +1682,25 @@ def test_downsample_empty_frame(self):
             frames_44100_to_8000.data, frames_8000.data, atol=0.03, rtol=0
         )
 
-    def test_s16_ffmpeg4_bug(self):
-        # s16 fails on FFmpeg4 but can be decoded on other versions.
-        # Debugging logs show that we're hitting:
-        # [SWR @ 0x560a7abdaf80] Input channel count and layout are unset
-        # which seems to point to:
-        # https://github.com/FFmpeg/FFmpeg/blob/40a6963fbd0c47be358a3760480180b7b532e1e9/libswresample/swresample.c#L293-L305
-        # ¯\_(ツ)_/¯
+    def test_decode_s16_ffmpeg4(self):
+        # Non-regression test for https://github.com/pytorch/torchcodec/issues/843
+        # Ensures that decoding s16 on FFmpeg4 handles
+        # unset input channel count and layout
 
         asset = SINE_MONO_S16
         decoder = AudioDecoder(asset.path)
         assert decoder.metadata.sample_rate == asset.sample_rate
         assert decoder.metadata.sample_format == asset.sample_format
 
-        cm = (
-            pytest.raises(RuntimeError, match="The frame has 0 channels, expected 1.")
-            if get_ffmpeg_major_version() == 4
-            else contextlib.nullcontext()
+        test_samples = decoder.get_samples_played_in_range()
+        assert test_samples.data.shape[0] == decoder.metadata.num_channels
+        assert test_samples.sample_rate == decoder.metadata.sample_rate
+        reference_frames = asset.get_frame_data_by_range(
+            start=0, stop=1, stream_index=0
+        )
+        torch.testing.assert_close(
+            test_samples.data[0], reference_frames, atol=0, rtol=0
         )
-        with cm:
-            decoder.get_samples_played_in_range()
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     @pytest.mark.parametrize("sample_rate", (None, 8000, 16_000, 44_1000))