FX graph visualization #3528

Open · wants to merge 13 commits into base: main · Changes from all commits
12 changes: 11 additions & 1 deletion core/runtime/TRTEngine.cpp
@@ -281,6 +281,16 @@ void TRTEngine::enable_profiling() {
exec_ctx->setProfiler(trt_engine_profiler.get());
}

+void TRTEngine::set_profile_format(std::string format) {
+  if (format == "trex") {
+    this->trt_engine_profiler->set_profile_format(TraceFormat::kTREX);
+  } else if (format == "perfetto") {
+    this->trt_engine_profiler->set_profile_format(TraceFormat::kPERFETTO);
+  } else {
+    TORCHTRT_THROW_ERROR("Invalid profile format: " + format);
+  }
+}
+
std::string TRTEngine::get_engine_layer_info() {
auto inspector = cuda_engine->createEngineInspector();
return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON);
@@ -315,7 +325,7 @@ void TRTEngine::set_profiling_paths() {
output_profile_path = std::filesystem::path{profile_path_prefix + "/" + name + "_output_profile.trace"}.string();
enqueue_profile_path = std::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string();
trt_engine_profile_path =
-      std::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"}.string();
+      std::filesystem::path{profile_path_prefix + "/" + name + "_engine_execution_profile.trace"}.string();
cuda_graph_debug_path = std::filesystem::path{profile_path_prefix + "/" + name + "_cudagraph.dot"}.string();
}

1 change: 1 addition & 0 deletions core/runtime/TRTEngine.h
@@ -147,6 +147,7 @@ struct TRTEngine : torch::CustomClassHolder {
std::string to_str() const;
static void verify_serialization_fmt(const std::vector<std::string>& serialized_info);
void enable_profiling();
+  void set_profile_format(std::string profile_format);
void disable_profiling();
std::string get_engine_layer_info();

31 changes: 23 additions & 8 deletions core/runtime/TRTEngineProfiler.cpp
@@ -32,25 +32,40 @@ TRTEngineProfiler::TRTEngineProfiler(const std::string& name, const std::vector<
}
}

+void TRTEngineProfiler::set_profile_format(TraceFormat format) {
+  this->profile_format = format;
+}
+
void dump_trace(const std::string& path, const TRTEngineProfiler& value) {
std::stringstream out;
out << "[" << std::endl;
double ts = 0.0;
+  double running_time = 0.0;
+  for (size_t i = 0; i < value.layer_names.size(); i++) {
+    auto layer_name = value.layer_names[i];
+    auto elem = value.profile.at(layer_name);
+    ts += elem.time;
+  }
for (size_t i = 0; i < value.layer_names.size(); i++) {
auto layer_name = value.layer_names[i];
auto elem = value.profile.at(layer_name);

    out << "  {" << std::endl;
    out << "    \"name\": \"" << layer_name << "\"," << std::endl;
-    out << "    \"ph\": \"X\"," << std::endl;
-    out << "    \"ts\": " << ts * 1000 << "," << std::endl;
-    out << "    \"dur\": " << elem.time * 1000 << "," << std::endl;
-    out << "    \"tid\": 1," << std::endl;
-    out << "    \"pid\": \"" << value.name << " Engine Execution\"," << std::endl;
-    out << "    \"args\": {}" << std::endl;
+    if (value.profile_format == TraceFormat::kPERFETTO) {
+      out << "    \"ph\": \"X\"," << std::endl;
+      out << "    \"ts\": " << running_time * 1000 << "," << std::endl;
+      out << "    \"dur\": " << elem.time * 1000 << "," << std::endl;
+      out << "    \"tid\": 1," << std::endl;
+      out << "    \"pid\": \"" << value.name << " Engine Execution\"," << std::endl;
+      out << "    \"args\": {}" << std::endl;
+    } else { // kTREX
+      out << "    \"timeMs\": " << elem.time << "," << std::endl;
+      out << "    \"averageMs\": " << elem.time / elem.count << "," << std::endl;
+      out << "    \"percentage\": " << (elem.time * 100.0 / ts) << std::endl;
+    }
    out << "  }," << std::endl;

-    ts += elem.time;
+    running_time += elem.time;
}
out.seekp(-2, out.cur);
out << "\n]" << std::endl;
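For reference, here is a minimal Python sketch of the two trace shapes dump_trace now emits (layer names and timings below are invented; the real values come from reportLayerTime). The kPERFETTO branch lays Chrome-trace "complete" events end to end on a single timeline in microseconds, while the kTREX branch records per-layer totals, averages, and each layer's share of total engine time for consumption by TensorRT Engine Explorer (trex):

import json

# Hypothetical per-layer records, mirroring TRTEngineProfiler::Record
profile = {"conv1": (0.8, 2), "relu1": (0.2, 2)}  # name -> (total ms, call count)
total_ms = sum(ms for ms, _ in profile.values())  # the `ts` computed by the first loop

perfetto, trex, running_time = [], [], 0.0
for name, (ms, count) in profile.items():
    # kPERFETTO: Chrome trace "complete" event; ts/dur are in microseconds
    perfetto.append({"name": name, "ph": "X", "ts": running_time * 1000,
                     "dur": ms * 1000, "tid": 1,
                     "pid": "my_engine Engine Execution", "args": {}})
    # kTREX: per-layer summary rows
    trex.append({"name": name, "timeMs": ms, "averageMs": ms / count,
                 "percentage": ms * 100.0 / total_ms})
    running_time += ms

print(json.dumps(perfetto, indent=1))
print(json.dumps(trex, indent=1))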
5 changes: 4 additions & 1 deletion core/runtime/TRTEngineProfiler.h
@@ -10,12 +10,14 @@ namespace torch_tensorrt {
namespace core {
namespace runtime {

+enum TraceFormat { kPERFETTO, kTREX };
+
struct TRTEngineProfiler : public nvinfer1::IProfiler {
struct Record {
float time{0};
int count{0};
};

+  void set_profile_format(TraceFormat format);
virtual void reportLayerTime(const char* layerName, float ms) noexcept;
TRTEngineProfiler(
const std::string& name,
@@ -27,6 +29,7 @@ struct TRTEngineProfiler : public nvinfer1::IProfiler {
std::string name;
std::vector<std::string> layer_names;
std::map<std::string, Record> profile;
+  TraceFormat profile_format = TraceFormat::kPERFETTO;
};

} // namespace runtime
1 change: 1 addition & 0 deletions core/runtime/register_jit_hooks.cpp
@@ -82,6 +82,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
.def("__repr__", &TRTEngine::to_str)
.def("__obj_flatten__", &TRTEngine::__obj_flatten__)
.def("enable_profiling", &TRTEngine::enable_profiling)
+        .def("set_profile_format", &TRTEngine::set_profile_format)
.def("disable_profiling", &TRTEngine::disable_profiling)
.def_readwrite("profile_path_prefix", &TRTEngine::profile_path_prefix)
.def("dump_engine_layer_info_to_file", &TRTEngine::dump_engine_layer_info_to_file)
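With this binding registered, the method is reachable from Python wherever the TRTEngine custom class is exposed. A hedged sketch (the torch.classes.tensorrt.Engine handle below is an assumption based on the registration above; in practice the new Debugger drives these calls rather than user code):

def profile_engine(engine, fmt: str = "trex", prefix: str = "/tmp/trt_profiles") -> None:
    # `engine` is assumed to be a torch.classes.tensorrt.Engine instance held
    # by a compiled Torch-TensorRT module; this helper is illustrative only.
    engine.profile_path_prefix = prefix  # where trace files get written
    engine.enable_profiling()
    engine.set_profile_format(fmt)  # "trex" or "perfetto"; anything else throws

Calling disable_profiling() afterward would presumably flush the <name>_engine_execution_profile.trace file named in set_profiling_paths above.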
1 change: 1 addition & 0 deletions py/torch_tensorrt/dynamo/__init__.py
@@ -19,3 +19,4 @@
from ._settings import CompilationSettings
from ._SourceIR import SourceIR
from ._tracer import trace
+from .debug._Debugger import Debugger
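This re-export makes the new context manager available from the public package namespace:

from torch_tensorrt.dynamo import Debugger  # same object as the private module path above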
80 changes: 61 additions & 19 deletions py/torch_tensorrt/dynamo/_compiler.py
@@ -2,6 +2,7 @@

import collections.abc
import logging
+import os
import platform
import warnings
from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union
@@ -31,6 +32,8 @@
from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
DYNAMO_CONVERTERS as CONVERTERS,
)
+from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig
+from torch_tensorrt.dynamo.debug._supports_debugger import fn_supports_debugger
from torch_tensorrt.dynamo.lowering import (
get_decompositions,
post_lowering,
@@ -42,7 +45,6 @@
get_output_metadata,
parse_graph_io,
prepare_inputs,
-    set_log_level,
to_torch_device,
to_torch_tensorrt_device,
)
@@ -64,7 +66,6 @@ def cross_compile_for_windows(
Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]
] = _defaults.ENABLED_PRECISIONS,
engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY,
-    debug: bool = _defaults.DEBUG,
num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS,
workspace_size: int = _defaults.WORKSPACE_SIZE,
dla_sram_size: int = _defaults.DLA_SRAM_SIZE,
@@ -138,7 +139,6 @@ def cross_compile_for_windows(
assume_dynamic_shape_support (bool): Setting this to true enables the converters to work for both dynamic and static shapes. Default: False
sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
enabled_precisions (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
-        debug (bool): Enable debuggable engine
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
workspace_size (int): Maximum size of workspace given to TensorRT
@@ -185,8 +185,12 @@ def cross_compile_for_windows(
f"Cross compile for windows is only supported on x86-64 Linux architecture, current platform: {platform.system()=}, {platform.architecture()[0]=}"
)

-    if debug:
-        set_log_level(logger.parent, logging.DEBUG)
+    if kwargs.get("debug", False):
+        warnings.warn(
+            "`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality.",
+            DeprecationWarning,
+            stacklevel=2,
+        )

if "truncate_long_and_double" in kwargs.keys():
if truncate_double is not _defaults.TRUNCATE_DOUBLE:
@@ -297,7 +301,6 @@ def cross_compile_for_windows(
"enabled_precisions": (
enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS
        ),
-        "debug": debug,
"device": device,
"assume_dynamic_shape_support": assume_dynamic_shape_support,
"workspace_size": workspace_size,
@@ -399,7 +402,6 @@ def compile(
Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]
] = _defaults.ENABLED_PRECISIONS,
engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY,
-    debug: bool = _defaults.DEBUG,
num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS,
workspace_size: int = _defaults.WORKSPACE_SIZE,
dla_sram_size: int = _defaults.DLA_SRAM_SIZE,
@@ -475,7 +477,6 @@ def compile(
assume_dynamic_shape_support (bool): Setting this to true enables the converters to work for both dynamic and static shapes. Default: False
sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
enabled_precisions (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
-        debug (bool): Enable debuggable engine
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
workspace_size (int): Maximum size of workspace given to TensorRT
@@ -518,8 +519,13 @@ def compile(
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
"""

-    if debug:
-        set_log_level(logger.parent, logging.DEBUG)
+    if kwargs.get("debug", False):
+        warnings.warn(
+            "`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
if "truncate_long_and_double" in kwargs.keys():
if truncate_double is not _defaults.TRUNCATE_DOUBLE:
raise ValueError(
@@ -641,7 +647,6 @@ def compile(
"enabled_precisions": (
enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS
        ),
-        "debug": debug,
"device": device,
"assume_dynamic_shape_support": assume_dynamic_shape_support,
"workspace_size": workspace_size,
@@ -715,12 +720,15 @@ def compile(
return trt_gm


+@fn_supports_debugger
def compile_module(
gm: torch.fx.GraphModule,
sample_arg_inputs: Sequence[Input],
sample_kwarg_inputs: Optional[dict[Any, Any]] = None,
settings: CompilationSettings = CompilationSettings(),
engine_cache: Optional[BaseEngineCache] = None,
+    *,
+    _debugger_config: Optional[DebuggerConfig] = None,
) -> torch.fx.GraphModule:
"""Compile a traced FX module

@@ -744,7 +752,7 @@

# Check the number of supported operations in the graph
num_supported_ops, total_ops = partitioning.get_graph_converter_support(
-        gm, settings.debug, settings.torch_executed_ops
+        gm, settings.torch_executed_ops
)

dryrun_tracker.total_ops_in_graph = total_ops
@@ -796,7 +804,6 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
logger.info("Partitioning the graph via the fast partitioner")
partitioned_module, supported_ops = partitioning.fast_partition(
gm,
-            verbose=settings.debug,
min_block_size=settings.min_block_size,
torch_executed_ops=settings.torch_executed_ops,
require_full_compilation=settings.require_full_compilation,
@@ -817,7 +824,6 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
logger.info("Partitioning the graph via the global partitioner")
partitioned_module, supported_ops = partitioning.global_partition(
gm,
-            verbose=settings.debug,
min_block_size=settings.min_block_size,
torch_executed_ops=settings.torch_executed_ops,
require_full_compilation=settings.require_full_compilation,
@@ -925,6 +931,41 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:

trt_modules[name] = trt_module

+        if _debugger_config:
+
+            if _debugger_config.save_engine_profile:
+                if settings.use_python_runtime:
+                    if _debugger_config.profile_format != "cudagraph":
+                        raise ValueError(
+                            "Profiling with TREX can only be enabled when using the C++ runtime. Python runtime profiling only supports cudagraph visualization."
+                        )
+                    else:
+                        trt_module.enable_profiling()
+                else:
+                    if _debugger_config.profile_format == "cudagraph":
+                        raise ValueError(
+                            "Profiling with Cudagraph can only be enabled when using the Python runtime. C++ runtime profiling only supports TREX/Perfetto visualization."
+                        )
+                    else:
+                        path = os.path.join(
+                            _debugger_config.logging_dir,
+                            "engine_visualization_profile",
+                        )
+                        os.makedirs(path, exist_ok=True)
+                        trt_module.enable_profiling(
+                            profiling_results_dir=path,
+                            profile_format=_debugger_config.profile_format,
+                        )
+
+            if _debugger_config.save_layer_info:
+                with open(
+                    os.path.join(
+                        _debugger_config.logging_dir, "engine_layer_info.json"
+                    ),
+                    "w",
+                ) as f:
+                    f.write(trt_module.get_layer_info())
+
# Parse the graph I/O and store it in dryrun tracker
parse_graph_io(gm, dryrun_tracker)

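The _debugger_config block above keys off four DebuggerConfig fields. A hedged sketch of how they fit together, assuming DebuggerConfig is a keyword-constructible dataclass-style config (normally the Debugger context manager builds it and reaches compile_module through the fn_supports_debugger hook, not by hand):

from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig

cfg = DebuggerConfig(
    save_engine_profile=True,
    profile_format="trex",  # "trex"/"perfetto" require the C++ runtime;
                            # "cudagraph" requires the Python runtime
    save_layer_info=True,
    logging_dir="/tmp/torch_tensorrt/debug_logs",  # illustrative path
)
# With this config, the loop above would write, per TRT module:
#   <logging_dir>/engine_visualization_profile/  (engine execution traces)
#   <logging_dir>/engine_layer_info.json         (from trt_module.get_layer_info())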
@@ -952,7 +993,6 @@ def convert_exported_program_to_serialized_trt_engine(
enabled_precisions: (
Set[torch.dtype | dtype] | Tuple[torch.dtype | dtype]
) = _defaults.ENABLED_PRECISIONS,
-    debug: bool = _defaults.DEBUG,
assume_dynamic_shape_support: bool = _defaults.ASSUME_DYNAMIC_SHAPE_SUPPORT,
workspace_size: int = _defaults.WORKSPACE_SIZE,
min_block_size: int = _defaults.MIN_BLOCK_SIZE,
@@ -1014,7 +1054,6 @@ def convert_exported_program_to_serialized_trt_engine(
torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings
]
enabled_precisions (Optional[Set[torch.dtype | _enums.dtype]]): The set of datatypes that TensorRT can use
-        debug (bool): Whether to print out verbose debugging information
workspace_size (int): Workspace TRT is allowed to use for the module (0 is default)
min_block_size (int): Minimum number of operators per TRT-Engine Block
torch_executed_ops (Set[str]): Set of operations to run in Torch, regardless of converter coverage
@@ -1054,8 +1093,12 @@ def convert_exported_program_to_serialized_trt_engine(
Returns:
bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
"""
-    if debug:
-        set_log_level(logger.parent, logging.DEBUG)
+    if kwargs.get("debug", False):
+        warnings.warn(
+            "`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality.",
+            DeprecationWarning,
+            stacklevel=2,
+        )

if "truncate_long_and_double" in kwargs.keys():
if truncate_double is not _defaults.TRUNCATE_DOUBLE:
@@ -1139,7 +1182,6 @@ def convert_exported_program_to_serialized_trt_engine(
    compilation_options = {
        "assume_dynamic_shape_support": assume_dynamic_shape_support,
        "enabled_precisions": enabled_precisions,
-        "debug": debug,
"workspace_size": workspace_size,
"min_block_size": min_block_size,
"torch_executed_ops": torch_executed_ops,
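Taken together, all three entry points (compile, cross_compile_for_windows, and convert_exported_program_to_serialized_trt_engine) now deprecate debug=True in favor of the Debugger context manager. A minimal migration sketch (the Debugger keyword arguments shown are assumptions based on the warning text and the DebuggerConfig fields, not confirmed signatures):

import torch
import torch_tensorrt

class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x)

model = TinyModel().eval().cuda()
inputs = [torch.randn(1, 3, 224, 224).cuda()]

# Before (deprecated):
#   trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs, debug=True)
# After (hypothetical keyword arguments):
with torch_tensorrt.dynamo.Debugger(
    log_level="debug",
    save_engine_profile=True,
    logging_dir="/tmp/torch_tensorrt/debug_logs",
):
    trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)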
5 changes: 4 additions & 1 deletion py/torch_tensorrt/dynamo/_defaults.py
@@ -1,12 +1,12 @@
import os
+import pwd
import tempfile

import torch
from torch_tensorrt._Device import Device
from torch_tensorrt._enums import EngineCapability, dtype

ENABLED_PRECISIONS = {dtype.f32}
-DEBUG = False
DEVICE = None
DISABLE_TF32 = False
ASSUME_DYNAMIC_SHAPE_SUPPORT = False
@@ -50,6 +50,9 @@
L2_LIMIT_FOR_TILING = -1
USE_DISTRIBUTED_MODE_TRACE = False
OFFLOAD_MODULE_TO_CPU = False
+DEBUG_LOGGING_DIR = os.path.join(
+    tempfile.gettempdir(), pwd.getpwuid(os.getuid())[0], "torch_tensorrt/debug_logs"
+)


def default_device() -> Device:
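A note on the new default: pwd.getpwuid(os.getuid())[0] is the current user's login name, and the pwd module is POSIX-only, so DEBUG_LOGGING_DIR assumes a Unix-like host. On a typical Linux box it resolves to /tmp/<user>/torch_tensorrt/debug_logs:

import os
import pwd  # POSIX-only; unavailable on native Windows
import tempfile

print(os.path.join(tempfile.gettempdir(), pwd.getpwuid(os.getuid())[0],
                   "torch_tensorrt/debug_logs"))
# e.g. /tmp/alice/torch_tensorrt/debug_logs  (user name illustrative)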