Skip to content

Support encoding to file-like object #754

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/torchcodec/_core/AVIOContextHolder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ void AVIOContextHolder::createAVIOContext(
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
bool isForWriting,
int bufferSize) {
TORCH_CHECK(
bufferSize > 0,
Expand All @@ -23,14 +24,18 @@ void AVIOContextHolder::createAVIOContext(
buffer != nullptr,
"Failed to allocate buffer of size " + std::to_string(bufferSize));

TORCH_CHECK(
(seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
"seek method must be defined, and either write or read must be defined. "
"But not both!")
TORCH_CHECK(seek != nullptr, "seek method must be defined");

if (isForWriting) {
TORCH_CHECK(write != nullptr, "write method must be defined for writing");
} else {
TORCH_CHECK(read != nullptr, "read method must be defined for reading");
}

avioContext_.reset(avioAllocContext(
buffer,
bufferSize,
/*write_flag=*/write != nullptr,
/*write_flag=*/isForWriting,
heldData,
read,
write,
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/_core/AVIOContextHolder.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class AVIOContextHolder {
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
bool isForWriting,
int bufferSize = defaultBufferSize);

private:
Expand Down
26 changes: 21 additions & 5 deletions src/torchcodec/_core/AVIOFileLikeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,29 @@

namespace facebook::torchcodec {

AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike, bool isForWriting)
: fileLike_{UniquePyObject(new py::object(fileLike))} {
{
// TODO: Is it necessary to acquire the GIL here? Is it maybe even
// harmful? At the moment, this is only called from within a pybind
// function, and pybind guarantees we have the GIL.
py::gil_scoped_acquire gil;
TORCH_CHECK(
py::hasattr(fileLike, "read"),
"File like object must implement a read method.");

if (isForWriting) {
TORCH_CHECK(
py::hasattr(fileLike, "write"),
"File like object must implement a write method for writing.");
} else {
TORCH_CHECK(
py::hasattr(fileLike, "read"),
"File like object must implement a read method for reading.");
}

TORCH_CHECK(
py::hasattr(fileLike, "seek"),
"File like object must implement a seek method.");
}
createAVIOContext(&read, nullptr, &seek, &fileLike_);
createAVIOContext(&read, &write, &seek, &fileLike_, isForWriting);
}

int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
Expand Down Expand Up @@ -77,4 +85,12 @@ int64_t AVIOFileLikeContext::seek(void* opaque, int64_t offset, int whence) {
return py::cast<int64_t>((*fileLike)->attr("seek")(offset, whence));
}

int AVIOFileLikeContext::write(void* opaque, const uint8_t* buf, int buf_size) {
auto fileLike = static_cast<UniquePyObject*>(opaque);
py::gil_scoped_acquire gil;
py::bytes bytes_obj(reinterpret_cast<const char*>(buf), buf_size);

return py::cast<int64_t>((*fileLike)->attr("write")(bytes_obj));
}

} // namespace facebook::torchcodec
3 changes: 2 additions & 1 deletion src/torchcodec/_core/AVIOFileLikeContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ namespace facebook::torchcodec {
// and seek calls back up to the methods on the Python object.
class AVIOFileLikeContext : public AVIOContextHolder {
public:
explicit AVIOFileLikeContext(py::object fileLike);
explicit AVIOFileLikeContext(py::object fileLike, bool isForWriting);

private:
static int read(void* opaque, uint8_t* buf, int buf_size);
static int64_t seek(void* opaque, int64_t offset, int whence);
static int write(void* opaque, const uint8_t* buf, int buf_size);

// Note that we dynamically allocate the Python object because we need to
// strictly control when its destructor is called. We must hold the GIL
Expand Down
6 changes: 4 additions & 2 deletions src/torchcodec/_core/AVIOTensorContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,14 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
TORCH_CHECK(data.numel() > 0, "data must not be empty");
TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
createAVIOContext(&read, nullptr, &seek, &tensorContext_);
createAVIOContext(
&read, nullptr, &seek, &tensorContext_, /*isForWriting=*/false);
}

AVIOToTensorContext::AVIOToTensorContext()
: tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
createAVIOContext(nullptr, &write, &seek, &tensorContext_);
createAVIOContext(
nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
}

torch::Tensor AVIOToTensorContext::getOutputTensor() {
Expand Down
8 changes: 5 additions & 3 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ AudioEncoder::AudioEncoder(
const torch::Tensor& samples,
int sampleRate,
std::string_view formatName,
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
std::unique_ptr<AVIOContextHolder> avioContextHolder,
const AudioStreamOptions& audioStreamOptions)
: samples_(validateSamples(samples)),
inSampleRate_(sampleRate),
Expand Down Expand Up @@ -248,9 +248,10 @@ void AudioEncoder::initializeEncoder(
torch::Tensor AudioEncoder::encodeToTensor() {
TORCH_CHECK(
avioContextHolder_ != nullptr,
"Cannot encode to tensor, avio context doesn't exist.");
"Cannot encode to tensor, avio tensor context doesn't exist.");
encode();
return avioContextHolder_->getOutputTensor();
return dynamic_cast<AVIOToTensorContext*>(avioContextHolder_.get())
->getOutputTensor();
}

void AudioEncoder::encode() {
Expand Down Expand Up @@ -501,6 +502,7 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
// Drains any remaining buffered audio at the end of encoding.
void AudioEncoder::flushBuffers() {
  AutoAVPacket autoAVPacket;
  // First flush samples that may still be buffered by the resampler
  // (see maybeFlushSwrBuffers), encoding them into packets.
  maybeFlushSwrBuffers(autoAVPacket);

  // Then send a null frame, which in FFmpeg conventionally puts the encoder
  // in draining mode so it emits any packets it is still holding.
  encodeFrame(autoAVPacket, UniqueAVFrame(nullptr));
}
} // namespace facebook::torchcodec
10 changes: 6 additions & 4 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
#include <torch/types.h>
#include "src/torchcodec/_core/AVIOTensorContext.h"
#include "src/torchcodec/_core/AVIOContextHolder.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/StreamOptions.h"

Expand All @@ -14,13 +14,16 @@ class AudioEncoder {
int sampleRate,
std::string_view fileName,
const AudioStreamOptions& audioStreamOptions);

AudioEncoder(
const torch::Tensor& samples,
int sampleRate,
std::string_view formatName,
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
std::unique_ptr<AVIOContextHolder> avioContextHolder,
const AudioStreamOptions& audioStreamOptions);

void encode();

torch::Tensor encodeToTensor();

private:
Expand Down Expand Up @@ -49,8 +52,7 @@ class AudioEncoder {

UniqueAVAudioFifo avAudioFifo_;

// Stores the AVIOContext for the output tensor buffer.
std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
std::unique_ptr<AVIOContextHolder> avioContextHolder_;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a simpler approach is to only store and accept a AVIOContextHolder, but when you need to access AVIOToTensorContext specific methods, do a std::dynamic_cast<AVIOToTensorContext*>. Dynamic casts are also ugly, but it localizes the ugliness to the one place we need it, and avoids having parallel variables and parallel constructors.

A dynamic cast, in C++, can be used to say "I only have a pointer to Base, but I know that this object is actually type Derived: return me a pointer to Derived." If it fails, you get back nullptr.

This might solve the linking-visibility problem above, I'm not sure. The reason I think it might is that we remove AVIOFileLikeContext from the declaration of Encoder. It will only appear inside the implementation.

bool encodeWasCalled_ = false;
int64_t lastEncodedAVFramePts_ = 0;
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
create_from_file_like,
create_from_tensor,
encode_audio_to_file,
encode_audio_to_file_like,
encode_audio_to_tensor,
get_ffmpeg_library_versions,
get_frame_at_index,
Expand Down
56 changes: 56 additions & 0 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,62 @@ def create_from_file_like(
return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))


def encode_audio_to_file_like(
    samples: torch.Tensor,
    sample_rate: int,
    format: str,
    file_like: Union[io.RawIOBase, io.BufferedIOBase],
    bit_rate: Optional[int] = None,
    num_channels: Optional[int] = None,
    desired_sample_rate: Optional[int] = None,
) -> None:
    """Encode audio samples to a file-like object.

    Args:
        samples: Audio samples tensor. Must have dtype torch.float32; it is
            made contiguous before being handed to the C++ encoder.
        sample_rate: Sample rate of ``samples`` in Hz.
        format: Audio format (e.g., "wav", "mp3", "flac").
        file_like: File-like object that supports write() and seek() methods.
        bit_rate: Optional bit rate for encoding.
        num_channels: Optional number of output channels.
        desired_sample_rate: Optional desired sample rate for the output.

    Raises:
        ValueError: If ``samples`` is not of dtype torch.float32.
    """
    assert _pybind_ops is not None

    if samples.dtype != torch.float32:
        raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")

    # We're having the same problem as with the decoder's create_from_file_like:
    # We should be able to pass a tensor directly, but this leads to a pybind
    # error. In order to work around this, we pass the pointer to the tensor's
    # data, and its shape, in order to re-construct it in C++. For this to work:
    # - the tensor must be float32
    # - the tensor must be contiguous, which is why we call contiguous().
    #   In theory we could avoid this restriction by also passing the strides?
    # - IMPORTANT: the input samples tensor and its underlying data must be
    #   alive during the call.
    #
    # A more elegant solution would be to cast the tensor into a py::object, but
    # casting the py::object back to a tensor in C++ seems to lead to the same
    # pybind error.

    samples = samples.contiguous()
    _pybind_ops.encode_audio_to_file_like(
        samples.data_ptr(),
        list(samples.shape),
        sample_rate,
        format,
        file_like,
        bit_rate,
        num_channels,
        desired_sample_rate,
    )

    # This check is useless but it's critical to keep it to ensure that samples
    # is still alive during the call to encode_audio_to_file_like: touching the
    # tensor after the C++ call prevents it from being garbage-collected (and
    # its data pointer from dangling) while the encoder is running.
    assert samples.is_contiguous()
Copy link
Member Author

@NicolasHug NicolasHug Jul 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hate that we have to do this but I do not see any other obvious way to keep the input samples alive for the duration of the call.
Claude is saying that we could just pass samples as a py::object. We won't be able to turn it back to a tensor (as mentioned in the code comment above), but claude claims that passing it as a parameter will ensure that pybind will keep it alive. I cannot verify this.

@scotts, any thoughts?

Copy link
Contributor

@scotts scotts Jul 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the keep-alive part, I believe Claude is right. If we pass something as a py::object, that gets properly reference-counted which will keep the object alive. When we launder a pointer as an int, there's no reference counting.

Of course, we would ideally just pass the tensor - but we run into problems passing tensors as tensors into the pybind11 code. The next simplest thing that we probably can't do for performance reasons is to copy the tensor into either bytes or a list, and then pass those as py::object. But since samples will be large, I don't think we want to do that.

Most workarounds I can think of are worse. One that might be just as bad, but could potentially apply to both this situation and decoder creation:

  1. On the pybind11 side, we only create the AVIOFileLikeContext. We don't create the encoder or decoder. We do still accept the file-like objects, and they are still stored in the AVIOFileLikeContext.
  2. We return an int from the C++ side to the Python side where that int is a pointer to the AVIOFileLikeContext.
  3. On the PyTorch custom ops side, we have functions for create-from-file-like and encode-to-file-like that accept the int value and do a reinterpret_cast<AVIOFileLikeContext*> in the C++. Those are then passed to the decoder or encode.

As it is right now, we're doing a lot of ugly pointer casting with tensors. The above may actually be better, as then the pybind11 code is only really concerned with creating AVIOFileLikeContext objects. It doesn't even need to know about encoders and decoders.



# ==============================
# Abstract impl for the operators. Needed by torch.compile.
# ==============================
Expand Down
50 changes: 49 additions & 1 deletion src/torchcodec/_core/pybind_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
#include <string>

#include "src/torchcodec/_core/AVIOFileLikeContext.h"
#include "src/torchcodec/_core/Encoder.h"
#include "src/torchcodec/_core/SingleStreamDecoder.h"
#include "src/torchcodec/_core/StreamOptions.h"

namespace py = pybind11;

Expand All @@ -31,19 +33,65 @@ int64_t create_from_file_like(
realSeek = seekModeFromString(seek_mode.value());
}

auto avioContextHolder = std::make_unique<AVIOFileLikeContext>(file_like);
auto avioContextHolder =
std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);

SingleStreamDecoder* decoder =
new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
return reinterpret_cast<int64_t>(decoder);
}

void encode_audio_to_file_like(
int64_t data_ptr,
const std::vector<int64_t>& shape,
int64_t sample_rate,
std::string_view format,
py::object file_like,
std::optional<int64_t> bit_rate = std::nullopt,
std::optional<int64_t> num_channels = std::nullopt,
std::optional<int64_t> desired_sample_rate = std::nullopt) {
// We assume float32 *and* contiguity, this must be enforced by the caller.
auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we keep this technique, we can probably allow all dtypes by passing in the dtype from the Python side as ints. I assume the Python and C++ enums agree on values, but even if they don't, we can figure out the mapping. Ugly, but possible.

auto samples = torch::from_blob(
reinterpret_cast<void*>(data_ptr), shape, tensor_options);

// TODO Fix implicit int conversion:
// https://github.com/pytorch/torchcodec/issues/679
// same for sample_rate parameter below
AudioStreamOptions audioStreamOptions;
audioStreamOptions.bitRate = bit_rate;
audioStreamOptions.numChannels = num_channels;
audioStreamOptions.sampleRate = desired_sample_rate;

auto avioContextHolder =
std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);

AudioEncoder encoder(
samples,
static_cast<int>(sample_rate),
format,
std::move(avioContextHolder),
audioStreamOptions);
encoder.encode();
}

#ifndef PYBIND_OPS_MODULE_NAME
#error PYBIND_OPS_MODULE_NAME must be defined!
#endif

// Module definition for the pybind11-exposed ops.
PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
  m.def("create_from_file_like", &create_from_file_like);
  // Use py::arg(...) to name the parameters: bare string literals passed to
  // m.def() are interpreted by pybind11 as docstrings, not argument names, so
  // they would silently do nothing. py::arg also lets us expose the C++
  // defaults for the trailing optional parameters.
  m.def(
      "encode_audio_to_file_like",
      &encode_audio_to_file_like,
      py::arg("data_ptr"),
      py::arg("shape"),
      py::arg("sample_rate"),
      py::arg("format"),
      py::arg("file_like"),
      py::arg("bit_rate") = std::nullopt,
      py::arg("num_channels") = std::nullopt,
      py::arg("desired_sample_rate") = std::nullopt);
}

} // namespace facebook::torchcodec
39 changes: 39 additions & 0 deletions src/torchcodec/encoders/_audio_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,42 @@ def to_tensor(
num_channels=num_channels,
desired_sample_rate=sample_rate,
)

def to_file_like(
self,
file_like,
format: str,
*,
bit_rate: Optional[int] = None,
num_channels: Optional[int] = None,
sample_rate: Optional[int] = None,
) -> None:
"""Encode samples into a file-like object.

Args:
file_like: A file-like object that supports ``write()`` and
``seek()`` methods, such as io.BytesIO(), an open file in binary
write mode, etc. Methods must have the following signature:
``write(data: bytes) -> int`` and ``seek(offset: int, whence:
int = 0) -> int``.
format (str): The format of the encoded samples, e.g. "mp3", "wav"
or "flac".
bit_rate (int, optional): The output bit rate. Encoders typically
support a finite set of bit rate values, so ``bit_rate`` will be
matched to one of those supported values. The default is chosen
by FFmpeg.
num_channels (int, optional): The number of channels of the encoded
output samples. By default, the number of channels of the input
``samples`` is used.
sample_rate (int, optional): The sample rate of the encoded output.
By default, the sample rate of the input ``samples`` is used.
"""
_core.encode_audio_to_file_like(
samples=self._samples,
sample_rate=self._sample_rate,
format=format,
file_like=file_like,
bit_rate=bit_rate,
num_channels=num_channels,
desired_sample_rate=sample_rate,
)
Loading
Loading