Add preset to VideoEncoder API (#1042)

Dan-Flores · web-flow · commit e170212b5922 · 2025-11-13T14:40:28.000-05:00
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -745,6 +745,10 @@ void VideoEncoder::initializeEncoder(
         std::to_string(videoStreamOptions.crf.value()).c_str(),
         0);
   }
+  if (videoStreamOptions.preset.has_value()) {
+    av_dict_set(
+        &options, "preset", videoStreamOptions.preset.value().c_str(), 0);
+  }
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
   av_dict_free(&options);
 
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
@@ -45,13 +45,11 @@ struct VideoStreamOptions {
   std::string_view deviceVariant = "ffmpeg";
 
   // Encoding options
-  // TODO-VideoEncoder: Consider adding other optional fields here
-  // (bit rate, gop size, max b frames, preset)
-  std::optional<double> crf;
-
   // Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
   // If not specified, uses codec's default format.
   std::optional<std::string> pixelFormat;
+  std::optional<double> crf;
+  std::optional<std::string> preset;
 };
 
 struct AudioStreamOptions {
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -603,11 +603,13 @@ void encode_video_to_file(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view file_name,
-    std::optional<std::string> pixel_format = std::nullopt,
-    std::optional<double> crf = std::nullopt) {
+    std::optional<std::string_view> pixel_format = std::nullopt,
+    std::optional<double> crf = std::nullopt,
+    std::optional<std::string_view> preset = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
+  videoStreamOptions.preset = preset;
   VideoEncoder(
       frames,
       validateInt64ToInt(frame_rate, "frame_rate"),
@@ -620,12 +622,14 @@ at::Tensor encode_video_to_tensor(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view format,
-    std::optional<std::string> pixel_format = std::nullopt,
-    std::optional<double> crf = std::nullopt) {
+    std::optional<std::string_view> pixel_format = std::nullopt,
+    std::optional<double> crf = std::nullopt,
+    std::optional<std::string_view> preset = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
+  videoStreamOptions.preset = preset;
   return VideoEncoder(
              frames,
              validateInt64ToInt(frame_rate, "frame_rate"),
@@ -640,8 +644,9 @@ void _encode_video_to_file_like(
     int64_t frame_rate,
     std::string_view format,
     int64_t file_like_context,
-    std::optional<std::string> pixel_format = std::nullopt,
-    std::optional<double> crf = std::nullopt) {
+    std::optional<std::string_view> pixel_format = std::nullopt,
+    std::optional<double> crf = std::nullopt,
+    std::optional<std::string_view> preset = std::nullopt) {
   auto fileLikeContext =
       reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
   TORCH_CHECK(
@@ -651,6 +656,7 @@ void _encode_video_to_file_like(
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.pixelFormat = pixel_format;
   videoStreamOptions.crf = crf;
+  videoStreamOptions.preset = preset;
 
   VideoEncoder encoder(
       frames,
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -215,6 +215,7 @@ def encode_video_to_file_like(
     file_like: Union[io.RawIOBase, io.BufferedIOBase],
     crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
+    preset: Optional[str] = None,
 ) -> None:
     """Encode video frames to a file-like object.
 
@@ -225,6 +226,7 @@ def encode_video_to_file_like(
         file_like: File-like object that supports write() and seek() methods
         crf: Optional constant rate factor for encoding quality
         pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
+        preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
     """
     assert _pybind_ops is not None
 
@@ -235,6 +237,7 @@ def encode_video_to_file_like(
         _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
         pixel_format,
         crf,
+        preset,
     )
 
 
@@ -322,8 +325,9 @@ def encode_video_to_file_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     filename: str,
-    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
+    preset: Optional[str] = None,
 ) -> None:
     return
 
@@ -333,8 +337,9 @@ def encode_video_to_tensor_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     format: str,
-    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
+    preset: Optional[str] = None,
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
@@ -345,8 +350,9 @@ def _encode_video_to_file_like_abstract(
     frame_rate: int,
     format: str,
     file_like_context: int,
-    crf: Optional[Union[int, float]] = None,
     pixel_format: Optional[str] = None,
+    crf: Optional[Union[int, float]] = None,
+    preset: Optional[str] = None,
 ) -> None:
     return
 
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
@@ -38,6 +38,7 @@ def to_file(
         *,
         pixel_format: Optional[str] = None,
         crf: Optional[Union[int, float]] = None,
+        preset: Optional[Union[str, int]] = None,
     ) -> None:
         """Encode frames into a file.
 
@@ -50,13 +51,19 @@ def to_file(
             crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
                 mean better quality. Valid range depends on the encoder (commonly 0-51).
                 Defaults to None (which will use encoder's default).
+            preset (str or int, optional): Encoder option that controls the tradeoff between
+                encoding speed and compression. Valid values depend on the encoder (commonly
+                a string: "fast", "medium", "slow"). Defaults to None
+                (which will use encoder's default).
         """
+        preset = str(preset) if isinstance(preset, int) else preset
         _core.encode_video_to_file(
             frames=self._frames,
             frame_rate=self._frame_rate,
             filename=str(dest),
             pixel_format=pixel_format,
             crf=crf,
+            preset=preset,
         )
 
     def to_tensor(
@@ -65,6 +72,7 @@ def to_tensor(
         *,
         pixel_format: Optional[str] = None,
         crf: Optional[Union[int, float]] = None,
+        preset: Optional[Union[str, int]] = None,
     ) -> Tensor:
         """Encode frames into raw bytes, as a 1D uint8 Tensor.
 
@@ -76,16 +84,22 @@ def to_tensor(
             crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
                 mean better quality. Valid range depends on the encoder (commonly 0-51).
                 Defaults to None (which will use encoder's default).
+            preset (str or int, optional): Encoder option that controls the tradeoff between
+                encoding speed and compression. Valid values depend on the encoder (commonly
+                a string: "fast", "medium", "slow"). Defaults to None
+                (which will use encoder's default).
 
         Returns:
             Tensor: The raw encoded bytes as 1D uint8 Tensor.
         """
+        preset_value = str(preset) if isinstance(preset, int) else preset
         return _core.encode_video_to_tensor(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
             pixel_format=pixel_format,
             crf=crf,
+            preset=preset_value,
         )
 
     def to_file_like(
@@ -95,6 +109,7 @@ def to_file_like(
         *,
         pixel_format: Optional[str] = None,
         crf: Optional[Union[int, float]] = None,
+        preset: Optional[Union[str, int]] = None,
     ) -> None:
         """Encode frames into a file-like object.
 
@@ -111,12 +126,18 @@ def to_file_like(
             crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
                 mean better quality. Valid range depends on the encoder (commonly 0-51).
                 Defaults to None (which will use encoder's default).
+            preset (str or int, optional): Encoder option that controls the tradeoff between
+                encoding speed and compression. Valid values depend on the encoder (commonly
+                a string: "fast", "medium", "slow"). Defaults to None
+                (which will use encoder's default).
         """
+        preset = str(preset) if isinstance(preset, int) else preset
         _core.encode_video_to_file_like(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
             file_like=file_like,
             pixel_format=pixel_format,
             crf=crf,
+            preset=preset,
         )
diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -617,6 +617,12 @@ def test_bad_input_parameterized(self, tmp_path, method):
             )
             getattr(encoder, method)(**valid_params, crf=-10)
 
+        with pytest.raises(
+            RuntimeError,
+            match=r"avcodec_open2 failed: Invalid argument",
+        ):
+            encoder.to_tensor(format="mp4", preset="fake_preset")
+
     @pytest.mark.parametrize("method", ["to_file", "to_tensor", "to_file_like"])
     @pytest.mark.parametrize("crf", [23, 23.5, -0.9])
     def test_crf_valid_values(self, method, crf, tmp_path):
@@ -829,13 +835,26 @@ def test_against_to_file(self, tmp_path, format, method):
             pytest.param("webm", marks=pytest.mark.slow),
         ),
     )
-    @pytest.mark.parametrize("pixel_format", ("yuv444p", "yuv420p"))
-    def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format):
+    @pytest.mark.parametrize(
+        "encode_params",
+        [
+            {"pixel_format": "yuv444p", "crf": 0, "preset": None},
+            {"pixel_format": "yuv420p", "crf": 30, "preset": None},
+            {"pixel_format": "yuv420p", "crf": None, "preset": "ultrafast"},
+            {"pixel_format": "yuv420p", "crf": None, "preset": None},
+        ],
+    )
+    def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, encode_params):
         ffmpeg_version = get_ffmpeg_major_version()
         if format == "webm" and (
             ffmpeg_version == 4 or (IS_WINDOWS and ffmpeg_version in (6, 7))
         ):
             pytest.skip("Codec for webm is not available in this FFmpeg installation.")
+
+        pixel_format = encode_params["pixel_format"]
+        crf = encode_params["crf"]
+        preset = encode_params["preset"]
+
         if format in ("avi", "flv") and pixel_format == "yuv444p":
             pytest.skip(f"Default codec for {format} does not support {pixel_format}")
 
@@ -848,8 +867,7 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format):
 
         ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_output.{format}")
         frame_rate = 30
-        crf = 0
-        # Some codecs (ex. MPEG4) do not support CRF.
+        # Some codecs (e.g. MPEG4) do not support CRF or preset.
         # Flags not supported by the selected codec will be ignored.
         ffmpeg_cmd = [
             "ffmpeg",
@@ -864,18 +882,26 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format):
             str(frame_rate),
             "-i",
             temp_raw_path,
-            "-pix_fmt",
-            pixel_format,  # Output format
-            "-crf",
-            str(crf),
-            ffmpeg_encoded_path,
         ]
+        if pixel_format is not None:  # Output format
+            ffmpeg_cmd.extend(["-pix_fmt", pixel_format])
+        if preset is not None:
+            ffmpeg_cmd.extend(["-preset", preset])
+        if crf is not None:
+            ffmpeg_cmd.extend(["-crf", str(crf)])
+        # Output path must be last
+        ffmpeg_cmd.append(ffmpeg_encoded_path)
         subprocess.run(ffmpeg_cmd, check=True)
 
         # Encode with our video encoder
         encoder_output_path = str(tmp_path / f"encoder_output.{format}")
         encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
-        encoder.to_file(dest=encoder_output_path, pixel_format=pixel_format, crf=crf)
+        encoder.to_file(
+            dest=encoder_output_path,
+            pixel_format=pixel_format,
+            crf=crf,
+            preset=preset,
+        )
 
         ffmpeg_frames = self.decode(ffmpeg_encoded_path).data
         encoder_frames = self.decode(encoder_output_path).data