Skip to content

Commit dda5832

Browse files
committed
[ModelSuite] Refactor TorchBench for ModelSuite inheritance
This PR integrates operator benchmarking into the Model Suite by having it inherit from TorchBenchTestSuite. The suite now extracts operator lists from model configs and benchmarks those operators using TorchBench data before running end-to-end model tests. This approach aligns with the core goal of BackendBench: testing operators. The Model Suite is designed with the assumption that for a given set of ops, users can provide kernel implementations, and the suite will benchmark both the individual ops and the full model using those implementations. The long-term vision is to make this process seamless—allowing users to run both operator and model benchmarking with a single command. TorchBench is used here because it provides the strongest guarantee that running the suite benchmarks all operators required for a specific model configuration. Its dataset is easily extensible and includes realistic tensor shapes derived from actual models. The main design drawback is that this integration makes supporting kernel fusions with models more complex. However, it is preferable to handle kernel fusions in a separate suite regardless. 
### Testing Running `uv run python BackendBench/scripts/main.py --suite model --backend directory --topn 1` with a working mm kernel and other kernels being watermarked yields the expected result (below) ```bash Successfully registered 36 custom operators [2025-10-02 07:21:23][INFO][main.py] ============================================================ [2025-10-02 07:21:23][INFO][main.py] MODEL EVALUATION RESULTS [2025-10-02 07:21:23][INFO][main.py] ============================================================ [2025-10-02 07:21:23][INFO][model.py] Model: ToyCoreOpsModel [2025-10-02 07:21:23][INFO][model.py] Status: ✗ Failed (0/3 tests) [2025-10-02 07:21:23][INFO][model.py] ✗ small_batch [2025-10-02 07:21:23][INFO][model.py] Error: Model ToyCoreOpsModel::small_batch failed: Expected number of channels in input to be divisible by num_groups, but got input of shape [2, 3, 32, 32] and num_groups=8 [2025-10-02 07:21:23][INFO][model.py] ✗ medium_batch [2025-10-02 07:21:23][INFO][model.py] Error: Model ToyCoreOpsModel::medium_batch failed: Expected number of channels in input to be divisible by num_groups, but got input of shape [4, 3, 64, 64] and num_groups=8 [2025-10-02 07:21:23][INFO][model.py] ✗ large_input [2025-10-02 07:21:23][INFO][model.py] Error: Model ToyCoreOpsModel::large_input failed: Expected number of channels in input to be divisible by num_groups, but got input of shape [2, 3, 128, 128] and num_groups=8 [2025-10-02 07:21:23][INFO][model.py] Model: SmokeTestModel [2025-10-02 07:21:23][INFO][model.py] Status: ✓ Passed (3/3 tests) [2025-10-02 07:21:23][INFO][model.py] ✓ small_batch [2025-10-02 07:21:23][INFO][model.py] Output match: ✓ Gradients match: ✓ (4 gradients) [2025-10-02 07:21:23][INFO][model.py] ✓ medium_batch [2025-10-02 07:21:23][INFO][model.py] Output match: ✓ Gradients match: ✓ (4 gradients) [2025-10-02 07:21:23][INFO][model.py] ✓ large_batch [2025-10-02 07:21:23][INFO][model.py] Output match: ✓ Gradients match: ✓ (4 gradients) [2025-10-02 
07:21:23][INFO][main.py] ============================================================ [2025-10-02 07:21:23][INFO][output.py] Full results saved to generated_kernels/full_results.json [2025-10-02 07:21:23][INFO][output.py] Operator summary CSV saved to generated_kernels/operator_summary.csv [2025-10-02 07:21:23][INFO][output.py] Failed operations log saved to generated_kernels/failed_tests.json [2025-10-02 07:21:23][INFO][output.py] Overall summary saved to generated_kernels/OVERALL_SUMMARY.md [2025-10-02 07:21:23][INFO][output.py] Results saved to directory: /home/dev/sapling_repos/BackendBench/generated_kernels Results saved to directory: /home/dev/sapling_repos/BackendBench/generated_kernels Overall summary saved to: /home/dev/sapling_repos/BackendBench/generated_kernels/OVERALL_SUMMARY.md ``` ### Future work with Model Suite #181
1 parent 4409ee2 commit dda5832

File tree

3 files changed

+51
-14
lines changed

3 files changed

+51
-14
lines changed

BackendBench/scripts/main.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,6 @@ def cli(
184184
p,
185185
):
186186
if suite != "torchbench":
187-
if topn_inputs is not None:
188-
raise ValueError("topn-inputs is only supported for torchbench suite")
189187
if check_overhead_dominated_ops:
190188
raise ValueError("check-overhead-dominated-ops is only supported for torchbench suite")
191189

@@ -198,6 +196,10 @@ def cli(
198196
if suite != "model" and model_filter is not None:
199197
raise ValueError("--model-filter is only supported for model suite")
200198

199+
if suite != "model" and suite != "torchbench":
200+
if topn_inputs is not None:
201+
raise ValueError("topn-inputs is only supported for torchbench suite")
202+
201203
setup_logging(log_level)
202204
if ops:
203205
ops = ops.split(",")
@@ -225,7 +227,7 @@ def cli(
225227
torch.bfloat16,
226228
filter=ops,
227229
),
228-
"model": lambda: ModelSuite(filter=model_filter),
230+
"model": lambda: ModelSuite(filter=model_filter, topn=topn_inputs),
229231
}[suite]()
230232

231233
backend_name = backend
@@ -259,11 +261,6 @@ def cli(
259261
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
260262
log_dir = f"backendbench_output_{timestamp}"
261263

262-
if suite.name == "model":
263-
_test_full_models(suite, backend)
264-
# currently model suite does not support op testing so now we're done
265-
return
266-
267264
overall_correctness = []
268265
overall_performance = []
269266
all_correctness_results = []
@@ -332,6 +329,9 @@ def cli(
332329
f"perf@p score (rate of correct samples with a speedup greater than p, p={p}): {perf_at_p_score:.2f}"
333330
)
334331

332+
if suite.name == "model":
333+
_test_full_models(suite, backend)
334+
335335
command = "python -m BackendBench.scripts.main " + " ".join(sys.argv[1:])
336336

337337
# Save results if not disabled

BackendBench/suite/model.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55
# LICENSE file in the root directory of this source tree.
66

77
"""
8-
Model Suite for testing models defined in configs.
8+
Model Suite for testing operators defined in toy model configs.
9+
10+
This suite extends TorchBenchTestSuite by reading operator lists from
11+
model configs, validating they exist in the TorchBench dataset, then
12+
filtering to include only those operators.
913
"""
1014

1115
import importlib.util
@@ -16,6 +20,8 @@
1620

1721
from BackendBench.eval_model import eval_model_correctness_test
1822

23+
from .torchbench import TorchBenchTestSuite
24+
1925
logger = logging.getLogger(__name__)
2026

2127

@@ -89,29 +95,52 @@ def load_models(
8995
return models
9096

9197

92-
class ModelSuite:
93-
"""Model Suite for end-to-end model testing."""
98+
class ModelSuite(TorchBenchTestSuite):
99+
"""Model Suite that filters TorchBench operators based on model configs.
100+
101+
This suite reads operator lists from model configs, validates they exist
102+
in the TorchBench dataset, then creates a filtered suite containing only
103+
those operators.
104+
"""
94105

95106
def __init__(
96107
self,
97108
name: str = "model",
98109
filter: Optional[List[str]] = None,
110+
topn: Optional[int] = None,
99111
):
100112
"""Initialize ModelSuite.
101113
102114
Args:
103115
name: Suite name (default: "model")
104116
filter: Optional list of model names to load
117+
topn: Optional limit on number of tests per operator
105118
"""
106119
models_dir = os.path.join(os.path.dirname(__file__), "models")
107120

108121
# Load models
109122
models = load_models(models_dir=models_dir, filter=filter)
110123
logger.info(f"ModelSuite: Loaded {len(models)} models from {models_dir}")
111-
112-
# Store loaded models
124+
model_ops = self.get_model_ops(models)
125+
filter = list(model_ops)
126+
# Store loaded models for evaluation
113127
self.models = models
114-
self.name = name
128+
129+
self._initialize_torchbench_suite(name, None, filter, topn, False)
130+
131+
def get_model_ops(self, models: List[Dict[str, Any]]) -> List[str]:
132+
# Extract operators from model configs
133+
model_ops = set()
134+
for model in models:
135+
config_ops = model["config"]["ops"]
136+
ops_list = config_ops["forward"]
137+
ops_list.extend(config_ops["backward"])
138+
139+
model_ops.update(ops_list)
140+
logger.info(f"Model {model['name']}: {len(ops_list)} operators defined in config")
141+
142+
logger.info(f"ModelSuite: Total {len(model_ops)} unique operators across all models")
143+
return model_ops
115144

116145
def eval_model(self, model_dict: Dict[str, Any], backend) -> Dict[str, Any]:
117146
"""Run evaluation on a single model.

BackendBench/suite/torchbench.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,13 @@ def __init__(
7878
filter=None,
7979
topn=None,
8080
check_overhead_dominated_ops=False,
81+
):
82+
self._initialize_torchbench_suite(
83+
name, filename, filter, topn, check_overhead_dominated_ops
84+
)
85+
86+
def _initialize_torchbench_suite(
87+
self, name, filename, filter, topn, check_overhead_dominated_ops
8188
):
8289
self.name = name
8390
self.topn = topn
@@ -87,6 +94,7 @@ def __init__(
8794
format="auto", # Auto-detect based on file extension
8895
filter=filter,
8996
)
97+
9098
if check_overhead_dominated_ops:
9199
# Only include ops which are overhead dominated (this is useful as a performance canary)
92100
ops_list = [op for op in ops_list if op.get("is_overhead_dominated_op", False)]

0 commit comments

Comments
 (0)