
Commit 9ac07a9

Fixed the comments
1 parent 6a879b2 commit 9ac07a9

6 files changed: 57 additions, 29 deletions


examples/dynamo/low_cpu_memory_compilation.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ def forward(self, x):
     "min_block_size": 1,
     "immutable_weights": True,
     "reuse_cached_engines": False,
+    "enable_resource_partitioning": True,
     "cpu_memory_budget": 2 * 1024 * 1024 * 1024,  # 2 GiB in bytes
 }
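For context, a minimal sketch of how the example's options dict might be passed through the dynamo path. The toy model, input, and the export/compile calls here are assumptions for illustration; only the option keys above come from the diff.

import torch
import torch_tensorrt

# Toy model standing in for the example's real workload (assumption).
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval().cuda()
example_input = torch.randn(1, 8).cuda()
exp_program = torch.export.export(model, (example_input,))

compilation_options = {
    "min_block_size": 1,
    "immutable_weights": True,
    "reuse_cached_engines": False,
    "enable_resource_partitioning": True,  # new flag introduced by this commit
    "cpu_memory_budget": 2 * 1024 * 1024 * 1024,  # 2 GiB in bytes
}
trt_module = torch_tensorrt.dynamo.compile(
    exp_program, inputs=[example_input], **compilation_options
)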

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 15 additions & 7 deletions
@@ -108,7 +108,8 @@ def cross_compile_for_windows(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
-    cpu_memory_budget: int = _defaults.CPU_MEMORY_BUDGET,
+    enable_resource_partitioning: bool = _defaults.ENABLE_RESOURCE_PARTITIONING,
+    cpu_memory_budget: Optional[int] = _defaults.CPU_MEMORY_BUDGET,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -183,7 +184,8 @@ def cross_compile_for_windows(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
-        cpu_memory_budget (int): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail. If set to -1, the compilation will use all available CPU memory.
+        enable_resource_partitioning (bool): Enable resource-aware partitioning. This is useful when the model is large and the CPU memory is limited.
+        cpu_memory_budget (Optional[int]): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -339,6 +341,7 @@ def cross_compile_for_windows(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "enable_resource_partitioning": enable_resource_partitioning,
         "cpu_memory_budget": cpu_memory_budget,
     }
@@ -441,7 +444,8 @@ def compile(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
-    cpu_memory_budget: int = _defaults.CPU_MEMORY_BUDGET,
+    cpu_memory_budget: Optional[int] = _defaults.CPU_MEMORY_BUDGET,
+    enable_resource_partitioning: bool = _defaults.ENABLE_RESOURCE_PARTITIONING,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -519,6 +523,8 @@ def compile(
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        enable_resource_partitioning (bool): Enable resource-aware partitioning. This is useful when the model is large and the CPU memory is limited.
+        cpu_memory_budget (Optional[int]): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -688,6 +694,7 @@ def compile(
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "enable_resource_partitioning": enable_resource_partitioning,
         "cpu_memory_budget": cpu_memory_budget,
     }
     logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
@@ -862,10 +869,11 @@ def preserve_module_specs(
         require_full_compilation=settings.require_full_compilation,
     )

-    partitioned_module = resource_partition(
-        partitioned_module,
-        cpu_memory_budget=settings.cpu_memory_budget,
-    )
+    if settings.enable_resource_partitioning:
+        partitioned_module = resource_partition(
+            partitioned_module,
+            cpu_memory_budget=settings.cpu_memory_budget,
+        )

     dryrun_tracker.unsupported_ops = supported_ops.unsupported_operators

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,6 @@
 import platform
 import tempfile

-import psutil
 import torch
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import EngineCapability, dtype
@@ -58,7 +57,8 @@
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
-CPU_MEMORY_BUDGET = psutil.virtual_memory().available
+ENABLE_RESOURCE_PARTITIONING = False
+CPU_MEMORY_BUDGET = None

 if platform.system() == "Linux":
     import pwd

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,7 @@
     DRYRUN,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    ENABLE_RESOURCE_PARTITIONING,
     ENABLE_WEIGHT_STREAMING,
     ENABLED_PRECISIONS,
     ENGINE_CAPABILITY,
@@ -141,6 +142,7 @@ class CompilationSettings:
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    enable_resource_partitioning: bool = ENABLE_RESOURCE_PARTITIONING
     cpu_memory_budget: int = CPU_MEMORY_BUDGET

     def __getstate__(self) -> dict[str, Any]:
@@ -174,7 +176,7 @@ def __setstate__(self, state: dict[str, Any]) -> None:
     "enable_weight_streaming",
     "tiling_optimization_level",
     "l2_limit_for_tiling",
-    "cpu_memory_budget",
+    "enable_resource_partitioning",
 )
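A hedged sketch of how the new field surfaces on CompilationSettings, mirroring the way the tests later in this commit build settings from an options dict; the specific values are illustrative, not prescriptive.

from torch_tensorrt.dynamo._settings import CompilationSettings

# enable_resource_partitioning defaults to False; cpu_memory_budget defaults to
# None, in which case the partitioner falls back to the host's available memory.
settings = CompilationSettings(
    min_block_size=1,
    enable_resource_partitioning=True,
    cpu_memory_budget=2 * 1024 * 1024 * 1024,  # 2 GiB, illustrative
)
assert settings.enable_resource_partitioning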

py/torch_tensorrt/dynamo/partitioning/_resource_partitioner.py

Lines changed: 28 additions & 16 deletions
@@ -46,7 +46,7 @@
 """

 import logging
-from typing import Dict, List, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 import psutil
 import torch
@@ -59,6 +59,8 @@

 logger = logging.getLogger(__name__)

+MAX_NUM_OF_ENGINES = 40
+

 class ResourcePartitioner(_SplitterBase):  # type: ignore
     """Refine capability-based subgraphs to meet host CPU memory constraints.
@@ -78,14 +80,19 @@ class ResourcePartitioner(_SplitterBase):  # type: ignore
     def __init__(
         self,
         module: torch.fx.GraphModule,
-        cpu_memory_budget: int,
+        cpu_memory_budget: Optional[int],
         submodule_name: str,
     ):

         assert isinstance(module, torch.fx.GraphModule)

         self.module = module
-        self.cpu_memory_budget = cpu_memory_budget
+        self.cpu_memory_budget = (
+            cpu_memory_budget
+            if cpu_memory_budget is not None
+            else psutil.virtual_memory().available
+        )
+        self.not_set_limit = cpu_memory_budget is None
         self.resource_split_count = 0
         self.submodule_name = submodule_name
         self.deps = self.find_deps()
@@ -148,10 +155,6 @@ def put_nodes_into_subgraphs(self) -> list[Subgraph]:
         subgraphs = [Subgraph(is_acc=True, nodes=nodes)]
         self.fusion_patterns = get_node_in_fusion_pattern(self.module.graph)

-        assert self.check_topological_order(
-            subgraphs
-        ), "The subgraphs are not topologically ordered"
-
         return subgraphs

     def check_topological_order(self, subgraphs: List[Subgraph]) -> bool:
@@ -186,7 +189,11 @@ def calculate_size_budget(
         """

         used_rss: int = psutil.Process().memory_info().rss
-        available_rss = self.cpu_memory_budget - used_rss
+        available_rss = (
+            self.cpu_memory_budget
+            if self.not_set_limit
+            else self.cpu_memory_budget - used_rss
+        )
         return available_rss // engine_compilation_memory_usage_multiplier

     def break_subgraphs(
@@ -214,12 +221,17 @@ def break_subgraphs(
         # We throw an error if the remaining memory is almost empty compared to the model size.
         # i.e. if the remaining memory is 4G (budget is 1G) the model size is greater than 40G, we stop the compilation.
         sizes = self.size_of_subgraphs(subgraphs)
-        if sum(sizes) > subgraph_size_budget * 40:
-            raise ValueError(
-                "CPU memory budget or available memory is too small to compile the model. "
-                + f"CPU memory budget: {self.cpu_memory_budget // (1024 * 1024)} MB, Model size: {sum(sizes) // (1024 * 1024)} MB. "
-                + "Consider setting cpu_memory_budget to a larger value or disable offload_module_to_cpu to save more CPU memory."
-            )
+        if sum(sizes) > subgraph_size_budget * MAX_NUM_OF_ENGINES:
+            if self.not_set_limit:
+                raise ValueError(
+                    "The system memory is too constrained to compile the model without severe perf degradation. Consider setting offload_module_to_cpu=False to save more CPU memory."
+                )
+            else:
+                raise ValueError(
+                    "CPU memory budget is too small to compile the model. "
+                    + f"CPU memory budget: {self.cpu_memory_budget // (1024 * 1024)} MB, Model size: {sum(sizes) // (1024 * 1024)} MB. "
+                    + "Consider setting cpu_memory_budget to a larger value."
+                )
         for subgraph, size in zip(subgraphs, sizes):

             while size > subgraph_size_budget:
@@ -233,11 +245,11 @@ def break_subgraphs(
         if len(subgraph.nodes) != 0:
             new_subgraphs.append(subgraph)

-        self._varify_all_fusion_nodes_in_same_subgraph(new_subgraphs)
+        self._verify_all_fusion_nodes_in_same_subgraph(new_subgraphs)

         return new_subgraphs

-    def _varify_all_fusion_nodes_in_same_subgraph(
+    def _verify_all_fusion_nodes_in_same_subgraph(
         self, subgraphs: List[Subgraph]
     ) -> None:
         """Assert that every fusion group is contained in exactly one subgraph."""

tests/py/dynamo/partitioning/test_resource_partitioning.py

Lines changed: 8 additions & 3 deletions
@@ -63,6 +63,7 @@ def forward(self, x):
             "min_block_size": 1,
             "immutable_weights": True,
             "reuse_cached_engines": False,
+            "enable_resource_partitioning": True,
         }
         settings = CompilationSettings(**compilation_options)
@@ -144,6 +145,7 @@ def forward(self, x):
             "immutable_weights": True,
             "reuse_cached_engines": False,
             "torch_executed_ops": {"torch.ops.aten.max_pool2d.default"},
+            "enable_resource_partitioning": True,
         }
         settings = CompilationSettings(**compilation_options)
@@ -175,8 +177,8 @@ def forward(self, x):
                     if "_run_on_acc" in name
                 ]
             )
-            == 5
-        ), "The graph should have 5 accelerated subgraphs"
+            > 3
+        ), "The graph should have more than 3 accelerated subgraphs"
         assert (
             len(
                 [
@@ -275,6 +277,7 @@ def forward(self, x):
             "immutable_weights": True,
             "reuse_cached_engines": False,
             "torch_executed_ops": {"torch.ops.aten.max_pool2d.default"},
+            "enable_resource_partitioning": True,
         }
         settings = CompilationSettings(**compilation_options)
@@ -355,6 +358,7 @@ def forward(self, x):
             "min_block_size": 1,
             "immutable_weights": True,
             "reuse_cached_engines": False,
+            "enable_resource_partitioning": True,
         }
         settings = CompilationSettings(**compilation_options)
@@ -409,7 +413,7 @@ def forward(self, x):
         assert broken_fusion

         # The fusion should be fixed after the step
-        partitioner._varify_all_fusion_nodes_in_same_subgraph(new_subgraphs)
+        partitioner._verify_all_fusion_nodes_in_same_subgraph(new_subgraphs)

         break
@@ -463,6 +467,7 @@ def forward(self, x):
             "immutable_weights": True,
             "reuse_cached_engines": False,
             "torch_executed_ops": {"torch.ops.aten.max_pool2d.default"},
+            "enable_resource_partitioning": True,
         }
         settings = CompilationSettings(**compilation_options)