-
Notifications
You must be signed in to change notification settings - Fork 62
Use cuda filters to support 10-bit videos #899
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -83,6 +83,24 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput( | |
enum AVPixelFormat frameFormat = | ||
static_cast<enum AVPixelFormat>(avFrame->format); | ||
|
||
// This is an early-return optimization: if the format is already what we | ||
// need, and the dimensions are also what we need, we don't need to call | ||
// swscale or filtergraph. We can just convert the AVFrame to a tensor. | ||
if (frameFormat == AV_PIX_FMT_RGB24 && | ||
avFrame->width == expectedOutputWidth && | ||
avFrame->height == expectedOutputHeight) { | ||
outputTensor = toTensor(avFrame); | ||
if (preAllocatedOutputTensor.has_value()) { | ||
// We have already validated that preAllocatedOutputTensor and | ||
// outputTensor have the same shape. | ||
preAllocatedOutputTensor.value().copy_(outputTensor); | ||
frameOutput.data = preAllocatedOutputTensor.value(); | ||
} else { | ||
frameOutput.data = outputTensor; | ||
} | ||
return; | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dvrogozh QQ - do we expect frames to come out as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change here in CPU device interface is due to CPU fallback in CUDA device interface to handle ffmpeg-4.4 10-bit streams: Above being said, I think there might be 2 cases in the future where we might see RGB24 coming out of decoders:
|
||
// By default, we want to use swscale for color conversion because it is | ||
// faster. However, it has width requirements, so we may need to fall back | ||
// to filtergraph. We also need to respect what was requested from the | ||
|
@@ -159,7 +177,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput( | |
std::make_unique<FilterGraph>(filtersContext, videoStreamOptions); | ||
prevFiltersContext_ = std::move(filtersContext); | ||
} | ||
outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame); | ||
outputTensor = toTensor(filterGraphContext_->convert(avFrame)); | ||
|
||
// Similarly to above, if this check fails it means the frame wasn't | ||
// reshaped to its expected dimensions by filtergraph. | ||
|
@@ -208,23 +226,20 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwsScale( | |
return resultHeight; | ||
} | ||
|
||
torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph( | ||
const UniqueAVFrame& avFrame) { | ||
UniqueAVFrame filteredAVFrame = filterGraphContext_->convert(avFrame); | ||
|
||
TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24); | ||
torch::Tensor CpuDeviceInterface::toTensor(const UniqueAVFrame& avFrame) { | ||
TORCH_CHECK_EQ(avFrame->format, AV_PIX_FMT_RGB24); | ||
|
||
auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get()); | ||
auto frameDims = getHeightAndWidthFromResizedAVFrame(*avFrame.get()); | ||
int height = frameDims.height; | ||
int width = frameDims.width; | ||
std::vector<int64_t> shape = {height, width, 3}; | ||
std::vector<int64_t> strides = {filteredAVFrame->linesize[0], 3, 1}; | ||
AVFrame* filteredAVFramePtr = filteredAVFrame.release(); | ||
auto deleter = [filteredAVFramePtr](void*) { | ||
UniqueAVFrame avFrameToDelete(filteredAVFramePtr); | ||
std::vector<int64_t> strides = {avFrame->linesize[0], 3, 1}; | ||
AVFrame* avFrameClone = av_frame_clone(avFrame.get()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We weren't calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, that was not a mistake. The function has changed. Previously it combined 2 operations: 1) conversion of the frame with filtergraph, 2) creating a tensor. The frame converted by filtergraph lived locally inside
In the new code I moved frame conversion with filtergraph outside of the function. And function signature has changed - it started to accept just a constant reference to a frame without knowing whether it will be further needed or not. But we still need to pass a reference to the
|
||
auto deleter = [avFrameClone](void*) { | ||
UniqueAVFrame avFrameToDelete(avFrameClone); | ||
}; | ||
return torch::from_blob( | ||
filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8}); | ||
avFrameClone->data[0], shape, strides, deleter, {torch::kUInt8}); | ||
} | ||
|
||
void CpuDeviceInterface::createSwsContext( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -199,12 +199,127 @@ void CudaDeviceInterface::initializeContext(AVCodecContext* codecContext) { | |
return; | ||
} | ||
|
||
std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext( | ||
const VideoStreamOptions& videoStreamOptions, | ||
const UniqueAVFrame& avFrame, | ||
const AVRational& timeBase) { | ||
// We need FFmpeg filters to handle those conversion cases which are not | ||
// directly implemented in CUDA or CPU device interface (in case of a | ||
// fallback). | ||
enum AVPixelFormat frameFormat = | ||
static_cast<enum AVPixelFormat>(avFrame->format); | ||
|
||
// Input frame is on CPU, we will just pass it to CPU device interface, so | ||
// skipping filters context as CPU device interface will handle everything for | ||
// us. | ||
if (avFrame->format != AV_PIX_FMT_CUDA) { | ||
return nullptr; | ||
} | ||
|
||
TORCH_CHECK( | ||
avFrame->hw_frames_ctx != nullptr, | ||
"The AVFrame does not have a hw_frames_ctx. " | ||
"That's unexpected, please report this to the TorchCodec repo."); | ||
|
||
auto hwFramesCtx = | ||
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data); | ||
AVPixelFormat actualFormat = hwFramesCtx->sw_format; | ||
|
||
// NV12 conversion is implemented directly with NPP, no need for filters. | ||
if (actualFormat == AV_PIX_FMT_NV12) { | ||
return nullptr; | ||
} | ||
|
||
auto frameDims = | ||
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame); | ||
int height = frameDims.height; | ||
int width = frameDims.width; | ||
|
||
AVPixelFormat outputFormat; | ||
std::stringstream filters; | ||
|
||
unsigned version_int = avfilter_version(); | ||
if (version_int < AV_VERSION_INT(8, 0, 103)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't love that we're doing FFmpeg versions checks here, but we also do this in several other places in There's no action to take based on this comment, I'm just pointing out it's an awkward situation. I might end up refactoring it in some of my decoder-native transform work. |
||
// Color conversion support ('format=' option) was added to scale_cuda from | ||
// n5.0. With the earlier version of ffmpeg we have no choice but use CPU | ||
// filters. See: | ||
// https://github.com/FFmpeg/FFmpeg/commit/62dc5df941f5e196164c151691e4274195523e95 | ||
outputFormat = AV_PIX_FMT_RGB24; | ||
|
||
auto actualFormatName = av_get_pix_fmt_name(actualFormat); | ||
TORCH_CHECK( | ||
actualFormatName != nullptr, | ||
"The actual format of a frame is unknown to FFmpeg. " | ||
"That's unexpected, please report this to the TorchCodec repo."); | ||
|
||
filters << "hwdownload,format=" << actualFormatName; | ||
filters << ",scale=" << width << ":" << height; | ||
filters << ":sws_flags=bilinear"; | ||
} else { | ||
// Actual output color format will be set via filter options | ||
outputFormat = AV_PIX_FMT_CUDA; | ||
|
||
filters << "scale_cuda=" << width << ":" << height; | ||
filters << ":format=nv12:interp_algo=bilinear"; | ||
} | ||
|
||
return std::make_unique<FiltersContext>( | ||
avFrame->width, | ||
avFrame->height, | ||
frameFormat, | ||
avFrame->sample_aspect_ratio, | ||
width, | ||
height, | ||
outputFormat, | ||
filters.str(), | ||
timeBase, | ||
av_buffer_ref(avFrame->hw_frames_ctx)); | ||
} | ||
|
||
void CudaDeviceInterface::convertAVFrameToFrameOutput( | ||
const VideoStreamOptions& videoStreamOptions, | ||
[[maybe_unused]] const AVRational& timeBase, | ||
UniqueAVFrame& avFrame, | ||
UniqueAVFrame& avInputFrame, | ||
FrameOutput& frameOutput, | ||
std::optional<torch::Tensor> preAllocatedOutputTensor) { | ||
std::unique_ptr<FiltersContext> newFiltersContext = | ||
initializeFiltersContext(videoStreamOptions, avInputFrame, timeBase); | ||
UniqueAVFrame avFilteredFrame; | ||
if (newFiltersContext) { | ||
// We need to compare the current filter context with our previous filter | ||
// context. If they are different, then we need to re-create a filter | ||
// graph. We create a filter graph late so that we don't have to depend | ||
// on the unreliable metadata in the header. And we sometimes re-create | ||
// it because it's possible for frame resolution to change mid-stream. | ||
// Finally, we want to reuse the filter graph as much as possible for | ||
// performance reasons. | ||
if (!filterGraph_ || *filtersContext_ != *newFiltersContext) { | ||
filterGraph_ = | ||
std::make_unique<FilterGraph>(*newFiltersContext, videoStreamOptions); | ||
filtersContext_ = std::move(newFiltersContext); | ||
} | ||
avFilteredFrame = filterGraph_->convert(avInputFrame); | ||
|
||
// If this check fails it means the frame wasn't | ||
// reshaped to its expected dimensions by filtergraph. | ||
TORCH_CHECK( | ||
(avFilteredFrame->width == filtersContext_->outputWidth) && | ||
(avFilteredFrame->height == filtersContext_->outputHeight), | ||
"Expected frame from filter graph of ", | ||
filtersContext_->outputWidth, | ||
"x", | ||
filtersContext_->outputHeight, | ||
", got ", | ||
avFilteredFrame->width, | ||
"x", | ||
avFilteredFrame->height); | ||
} | ||
|
||
UniqueAVFrame& avFrame = (avFilteredFrame) ? avFilteredFrame : avInputFrame; | ||
|
||
// The filtered frame might be on CPU if CPU fallback has happened on filter | ||
// graph level. For example, that's how we handle color format conversion | ||
// on FFmpeg 4.4 where scale_cuda did not have this support implemented yet. | ||
if (avFrame->format != AV_PIX_FMT_CUDA) { | ||
// The frame's format is AV_PIX_FMT_CUDA if and only if its content is on | ||
// the GPU. In this branch, the frame is on the CPU: this is what NVDEC | ||
|
@@ -232,8 +347,6 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( | |
// Above we checked that the AVFrame was on GPU, but that's not enough, we | ||
// also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits), | ||
// because this is what the NPP color conversion routines expect. | ||
// TODO: we should investigate how we can perform color conversion for | ||
// non-8bit videos. This is supported on CPU. | ||
TORCH_CHECK( | ||
avFrame->hw_frames_ctx != nullptr, | ||
"The AVFrame does not have a hw_frames_ctx. " | ||
|
@@ -242,16 +355,14 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( | |
auto hwFramesCtx = | ||
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data); | ||
AVPixelFormat actualFormat = hwFramesCtx->sw_format; | ||
|
||
TORCH_CHECK( | ||
actualFormat == AV_PIX_FMT_NV12, | ||
"The AVFrame is ", | ||
(av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat) | ||
: "unknown"), | ||
", but we expected AV_PIX_FMT_NV12. This typically happens when " | ||
"the video isn't 8bit, which is not supported on CUDA at the moment. " | ||
"Try using the CPU device instead. " | ||
"If the video is 10bit, we are tracking 10bit support in " | ||
"https://github.com/pytorch/torchcodec/issues/776"); | ||
", but we expected AV_PIX_FMT_NV12. " | ||
"That's unexpected, please report this to the TorchCodec repo."); | ||
|
||
auto frameDims = | ||
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame); | ||
|
Uh oh!
There was an error while loading. Please reload this page.