Add crf to VideoEncoder API (#1031)

Dan-Flores · web-flow · commit c69739facec7 · 2025-11-13T11:06:09.000-05:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -5,6 +5,7 @@
 #include "torch/types.h"
 
 extern "C" {
+#include <libavutil/opt.h>
 #include <libavutil/pixdesc.h>
 }
 
@@ -568,6 +569,43 @@ AVPixelFormat validatePixelFormat(
   }
   TORCH_CHECK(false, errorMsg.str());
 }
+
+void validateDoubleOption(
+    const AVCodec& avCodec,
+    const char* optionName,
+    double value) {
+  if (!avCodec.priv_class) {
+    return;
+  }
+  const AVOption* option = av_opt_find2(
+      // Convert obj arg from const AVClass* const* to non-const void*
+      // First cast to remove const, then cast to void*
+      const_cast<void*>(static_cast<const void*>(&avCodec.priv_class)),
+      optionName,
+      nullptr,
+      0,
+      AV_OPT_SEARCH_FAKE_OBJ,
+      nullptr);
+  // If the option was not found, let FFmpeg handle it later
+  if (!option) {
+    return;
+  }
+  if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
+      option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
+    TORCH_CHECK(
+        value >= option->min && value <= option->max,
+        optionName,
+        "=",
+        value,
+        " is out of valid range [",
+        option->min,
+        ", ",
+        option->max,
+        "] for this codec. For more details, run 'ffmpeg -h encoder=",
+        avCodec.name,
+        "'");
+  }
+}
 } // namespace
 
 VideoEncoder::~VideoEncoder() {
@@ -700,6 +738,7 @@ void VideoEncoder::initializeEncoder(
   // Apply videoStreamOptions
   AVDictionary* options = nullptr;
   if (videoStreamOptions.crf.has_value()) {
+    validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
     av_dict_set(
         &options,
         "crf",
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -47,7 +47,7 @@ struct VideoStreamOptions {
   // Encoding options
   // TODO-VideoEncoder: Consider adding other optional fields here
   // (bit rate, gop size, max b frames, preset)
-  std::optional<int> crf;
+  std::optional<double> crf;
 
   // Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
   // If not specified, uses codec's default format.
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, int? crf=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, int? crf=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, int? crf=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -604,7 +604,7 @@ void encode_video_to_file(
     int64_t frame_rate,
     std::string_view file_name,
     std::optional<std::string> pixel_format = std::nullopt,
-    std::optional<int64_t> crf = std::nullopt) {
+    std::optional<double> crf = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
@@ -621,7 +621,7 @@ at::Tensor encode_video_to_tensor(
     int64_t frame_rate,
     std::string_view format,
     std::optional<std::string> pixel_format = std::nullopt,
-    std::optional<int64_t> crf = std::nullopt) {
+    std::optional<double> crf = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.pixelFormat = pixel_format;
@@ -641,7 +641,7 @@ void _encode_video_to_file_like(
     std::string_view format,
     int64_t file_like_context,
     std::optional<std::string> pixel_format = std::nullopt,
-    std::optional<int64_t> crf = std::nullopt) {
+    std::optional<double> crf = std::nullopt) {
   auto fileLikeContext =
       reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
   TORCH_CHECK(
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -213,7 +213,7 @@ def encode_video_to_file_like(
     frame_rate: int,
     format: str,
     file_like: Union[io.RawIOBase, io.BufferedIOBase],
-    crf: Optional[int] = None,
+    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
 ) -> None:
     """Encode video frames to a file-like object.
@@ -322,7 +322,7 @@ def encode_video_to_file_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     filename: str,
-    crf: Optional[int] = None,
+    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
 ) -> None:
     return
@@ -333,7 +333,7 @@ def encode_video_to_tensor_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     format: str,
-    crf: Optional[int] = None,
+    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
@@ -345,7 +345,7 @@ def _encode_video_to_file_like_abstract(
     frame_rate: int,
     format: str,
     file_like_context: int,
-    crf: Optional[int] = None,
+    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
 ) -> None:
     return
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
@@ -37,6 +37,7 @@ def to_file(
         dest: Union[str, Path],
         *,
         pixel_format: Optional[str] = None,
+        crf: Optional[Union[int, float]] = None,
     ) -> None:
         """Encode frames into a file.
 
@@ -46,27 +47,35 @@ def to_file(
                 container format.
             pixel_format (str, optional): The pixel format for encoding (e.g.,
                 "yuv420p", "yuv444p"). If not specified, uses codec's default format.
+            crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
+                mean better quality. Valid range depends on the encoder (commonly 0-51).
+                Defaults to None (which will use encoder's default).
         """
         _core.encode_video_to_file(
             frames=self._frames,
             frame_rate=self._frame_rate,
             filename=str(dest),
             pixel_format=pixel_format,
+            crf=crf,
         )
 
     def to_tensor(
         self,
         format: str,
         *,
         pixel_format: Optional[str] = None,
+        crf: Optional[Union[int, float]] = None,
     ) -> Tensor:
         """Encode frames into raw bytes, as a 1D uint8 Tensor.
 
         Args:
             format (str): The container format of the encoded frames, e.g. "mp4", "mov",
-            "mkv", "avi", "webm", "flv", or "gif"
+                "mkv", "avi", "webm", "flv", etc.
             pixel_format (str, optional): The pixel format to encode frames into (e.g.,
                 "yuv420p", "yuv444p"). If not specified, uses codec's default format.
+            crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
+                mean better quality. Valid range depends on the encoder (commonly 0-51).
+                Defaults to None (which will use encoder's default).
 
         Returns:
             Tensor: The raw encoded bytes as 4D uint8 Tensor.
@@ -76,6 +85,7 @@ def to_tensor(
             frame_rate=self._frame_rate,
             format=format,
             pixel_format=pixel_format,
+            crf=crf,
         )
 
     def to_file_like(
@@ -84,6 +94,7 @@ def to_file_like(
         format: str,
         *,
         pixel_format: Optional[str] = None,
+        crf: Optional[Union[int, float]] = None,
     ) -> None:
         """Encode frames into a file-like object.
 
@@ -94,14 +105,18 @@ def to_file_like(
                 ``write(data: bytes) -> int`` and ``seek(offset: int, whence:
                 int = 0) -> int``.
             format (str): The container format of the encoded frames, e.g. "mp4", "mov",
-                "mkv", "avi", "webm", "flv", or "gif".
+                "mkv", "avi", "webm", "flv", etc.
             pixel_format (str, optional): The pixel format for encoding (e.g.,
                 "yuv420p", "yuv444p"). If not specified, uses codec's default format.
+            crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
+                mean better quality. Valid range depends on the encoder (commonly 0-51).
+                Defaults to None (which will use encoder's default).
         """
         _core.encode_video_to_file_like(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
             file_like=file_like,
             pixel_format=pixel_format,
+            crf=crf,
         )
diff --git a/test/test_encoders.py b/test/test_encoders.py
diff --git a/test/test_ops.py b/test/test_ops.py