-
Notifications
You must be signed in to change notification settings - Fork 62
BETA CUDA interface: support for approximate mode and time-based APIs #917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
NicolasHug
merged 33 commits into
meta-pytorch:main
from
NicolasHug:nvdec-rework-frame-ordering
Oct 3, 2025
+207
−148
Merged
Changes from all commits
Commits
Show all changes
33 commits
Select commit
Hold shift + click to select a range
78ab058
Let's just commit 3k loc in a single commit
NicolasHug b45decc
Fixes
NicolasHug 316f218
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug d0192ec
GetCache -> getCache
NicolasHug 515deb5
Make UniqueCUvideodecoder a pointer on CUvideodecoder, not void
NicolasHug 13fad10
Make device and device_variant have a default instead of being std::o…
NicolasHug eb8de72
Remove old registerDeviceInterface
NicolasHug 4f7a4fb
Call std::memset
NicolasHug dcf3124
remove unnecessary cuda_runtime.h include, update cmake accordingly
NicolasHug 0ad7370
abstract frameBuffer_ into a FrameBuffer class
NicolasHug aad142e
Cleanup BSF logic
NicolasHug 2592888
Return int in callback instead of unsigned char
NicolasHug b5fe9bc
define width and height as unsigned int
NicolasHug 5605c90
Rework frame ordering and pts matching
NicolasHug 7494259
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug 560b376
Fix cuda context initialization
NicolasHug 88196c5
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug 2a78b84
Renaming
NicolasHug 5d194e5
Comment
NicolasHug d1e51b3
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug f9c7297
Skip equality check on ffmpeg 4
NicolasHug b7bbfb2
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug 390fd7c
Refac, simplify
NicolasHug f55dcc0
Update comment
NicolasHug 7e4dd10
Define constant, add TODO for AVRational
NicolasHug f614846
Use uint32_t types
NicolasHug aa6e253
Create packet.reset() and add P0 TODO
NicolasHug 186eaa4
Add TODO
NicolasHug 1cb4890
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug c5b32a4
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-rewor…
NicolasHug 70873bf
lint
NicolasHug 799f1dd
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-rewor…
NicolasHug 8cc80e5
Use auto
NicolasHug File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,16 +35,20 @@ static bool g_cuda_beta = registerDeviceInterface( | |
|
||
static int CUDAAPI | ||
pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) { | ||
BetaCudaDeviceInterface* decoder = | ||
static_cast<BetaCudaDeviceInterface*>(pUserData); | ||
auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData); | ||
return decoder->streamPropertyChange(videoFormat); | ||
} | ||
|
||
static int CUDAAPI | ||
pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* pPicParams) { | ||
BetaCudaDeviceInterface* decoder = | ||
static_cast<BetaCudaDeviceInterface*>(pUserData); | ||
return decoder->frameReadyForDecoding(pPicParams); | ||
pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* picParams) { | ||
auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData); | ||
return decoder->frameReadyForDecoding(picParams); | ||
} | ||
|
||
static int CUDAAPI | ||
pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) { | ||
auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData); | ||
return decoder->frameReadyInDisplayOrder(dispInfo); | ||
} | ||
|
||
static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) { | ||
|
@@ -142,7 +146,7 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device) | |
|
||
BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { | ||
// TODONVDEC P0: we probably need to free the frames that have been decoded by | ||
// NVDEC but not yet "mapped" - i.e. those that are still in frameBuffer_? | ||
// NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_? | ||
|
||
if (decoder_) { | ||
NVDECCache::getCache(device_.index()) | ||
|
@@ -218,7 +222,7 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) { | |
parserParams.pUserData = this; | ||
parserParams.pfnSequenceCallback = pfnSequenceCallback; | ||
parserParams.pfnDecodePicture = pfnDecodePictureCallback; | ||
parserParams.pfnDisplayPicture = nullptr; | ||
parserParams.pfnDisplayPicture = pfnDisplayPictureCallback; | ||
|
||
CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams); | ||
TORCH_CHECK( | ||
|
@@ -274,10 +278,6 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) { | |
cuvidPacket.flags = CUVID_PKT_TIMESTAMP; | ||
cuvidPacket.timestamp = packet->pts; | ||
|
||
// Like DALI: store packet PTS in queue to later assign to frames as they | ||
// come out | ||
packetsPtsQueue.push(packet->pts); | ||
|
||
} else { | ||
// End of stream packet | ||
cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM; | ||
|
@@ -329,70 +329,38 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) { | |
// ready to be decoded, i.e. the parser received all the necessary packets for a | ||
// given frame. It means we can send that frame to be decoded by the hardware | ||
// NVDEC decoder by calling cuvidDecodePicture which is non-blocking. | ||
int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* pPicParams) { | ||
int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) { | ||
if (isFlushing_) { | ||
return 0; | ||
} | ||
|
||
TORCH_CHECK(pPicParams != nullptr, "Invalid picture parameters"); | ||
TORCH_CHECK(picParams != nullptr, "Invalid picture parameters"); | ||
TORCH_CHECK(decoder_, "Decoder not initialized before picture decode"); | ||
|
||
// Send frame to be decoded by NVDEC - non-blocking call. | ||
CUresult result = cuvidDecodePicture(*decoder_.get(), pPicParams); | ||
if (result != CUDA_SUCCESS) { | ||
return 0; // Yes, you're reading that right, 0 mean error. | ||
} | ||
CUresult result = cuvidDecodePicture(*decoder_.get(), picParams); | ||
|
||
// The frame was sent to be decoded on the NVDEC hardware. Now we store some | ||
// relevant info into our frame buffer so that we can retrieve the decoded | ||
// frame later when receiveFrame() is called. | ||
// Importantly we need to 'guess' the PTS of that frame. The heuristic we use | ||
// (like in DALI) is that the frames are ready to be decoded in the same order | ||
// as the packets were sent to the parser. So we assign the PTS of the frame | ||
// by popping the PTS of the oldest packet in our packetsPtsQueue (note: | ||
// oldest doesn't necessarily mean lowest PTS!). | ||
// Yes, you're reading that right, 0 means error, 1 means success | ||
return (result == CUDA_SUCCESS); | ||
} | ||
|
||
TORCH_CHECK( | ||
// TODONVDEC P0 the queue may be empty, handle that. | ||
!packetsPtsQueue.empty(), | ||
"PTS queue is empty when decoding a frame"); | ||
int64_t guessedPts = packetsPtsQueue.front(); | ||
packetsPtsQueue.pop(); | ||
|
||
// Field values taken from DALI | ||
CUVIDPARSERDISPINFO dispInfo = {}; | ||
dispInfo.picture_index = pPicParams->CurrPicIdx; | ||
dispInfo.progressive_frame = !pPicParams->field_pic_flag; | ||
dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1; | ||
dispInfo.repeat_first_field = 0; | ||
dispInfo.timestamp = guessedPts; | ||
|
||
FrameBuffer::Slot* slot = frameBuffer_.findEmptySlot(); | ||
slot->dispInfo = dispInfo; | ||
slot->guessedPts = guessedPts; | ||
slot->occupied = true; | ||
|
||
return 1; | ||
int BetaCudaDeviceInterface::frameReadyInDisplayOrder( | ||
CUVIDPARSERDISPINFO* dispInfo) { | ||
readyFrames_.push(*dispInfo); | ||
return 1; // success | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To clarify for my understanding, when the … Are the function signatures for this and other callbacks defined somewhere in documentation? |
||
} | ||
|
||
// Moral equivalent of avcodec_receive_frame(). Here, we look for a decoded | ||
// frame with the exact desired PTS in our frame buffer. This logic is only | ||
// valid in exact seek_mode, for now. | ||
int BetaCudaDeviceInterface::receiveFrame( | ||
UniqueAVFrame& avFrame, | ||
int64_t desiredPts) { | ||
FrameBuffer::Slot* slot = frameBuffer_.findFrameWithExactPts(desiredPts); | ||
if (slot == nullptr) { | ||
// Moral equivalent of avcodec_receive_frame(). | ||
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { | ||
if (readyFrames_.empty()) { | ||
// No frame found, instruct caller to try again later after sending more | ||
// packets. | ||
return AVERROR(EAGAIN); | ||
} | ||
|
||
slot->occupied = false; | ||
slot->guessedPts = -1; | ||
CUVIDPARSERDISPINFO dispInfo = readyFrames_.front(); | ||
readyFrames_.pop(); | ||
|
||
CUVIDPROCPARAMS procParams = {}; | ||
CUVIDPARSERDISPINFO dispInfo = slot->dispInfo; | ||
procParams.progressive_frame = dispInfo.progressive_frame; | ||
procParams.top_field_first = dispInfo.top_field_first; | ||
procParams.unpaired_field = dispInfo.repeat_first_field < 0; | ||
|
@@ -452,7 +420,7 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( | |
avFrame->width = width; | ||
avFrame->height = height; | ||
avFrame->format = AV_PIX_FMT_CUDA; | ||
avFrame->pts = dispInfo.timestamp; // == guessedPts | ||
avFrame->pts = dispInfo.timestamp; | ||
|
||
// TODONVDEC P0: Zero division error!!! | ||
// TODONVDEC P0: Move AVRational arithmetic to FFMPEGCommon, and put the | ||
|
@@ -518,13 +486,8 @@ void BetaCudaDeviceInterface::flush() { | |
|
||
isFlushing_ = false; | ||
|
||
for (auto& slot : frameBuffer_) { | ||
slot.occupied = false; | ||
slot.guessedPts = -1; | ||
} | ||
|
||
std::queue<int64_t> empty; | ||
packetsPtsQueue.swap(empty); | ||
std::queue<CUVIDPARSERDISPINFO> emptyQueue; | ||
std::swap(readyFrames_, emptyQueue); | ||
|
||
eofSent_ = false; | ||
} | ||
|
@@ -544,26 +507,4 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( | |
avFrame, frameOutput, preAllocatedOutputTensor); | ||
} | ||
|
||
BetaCudaDeviceInterface::FrameBuffer::Slot* | ||
BetaCudaDeviceInterface::FrameBuffer::findEmptySlot() { | ||
for (auto& slot : frameBuffer_) { | ||
if (!slot.occupied) { | ||
return &slot; | ||
} | ||
} | ||
frameBuffer_.emplace_back(); | ||
return &frameBuffer_.back(); | ||
} | ||
|
||
BetaCudaDeviceInterface::FrameBuffer::Slot* | ||
BetaCudaDeviceInterface::FrameBuffer::findFrameWithExactPts( | ||
int64_t desiredPts) { | ||
for (auto& slot : frameBuffer_) { | ||
if (slot.occupied && slot.guessedPts == desiredPts) { | ||
return &slot; | ||
} | ||
} | ||
return nullptr; | ||
} | ||
|
||
} // namespace facebook::torchcodec |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the key difference, correct? That is, by registering this callback, we get the new behavior and can delete all of the relevant code?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes that's correct