
Commit 0ef64c6

[Model Suite] Add model correctness testing
This PR adds end-to-end model correctness testing to the model suite by comparing outputs and gradients (after a backward pass) over one iteration of the model. We also integrate it into CI.

### Testing

Running `uv run python BackendBench/scripts/main.py --suite model --backend directory` with a working mm kernel and a watermarked kernel for everything else yields:

```bash
[2025-10-02 07:16:13][INFO][main.py] ============================================================
[2025-10-02 07:16:13][INFO][main.py] MODEL EVALUATION RESULTS
[2025-10-02 07:16:13][INFO][main.py] ============================================================
[2025-10-02 07:16:13][INFO][model.py] Model: ToyCoreOpsModel
[2025-10-02 07:16:13][INFO][model.py] Status: ✗ Failed (0/3 tests)
[2025-10-02 07:16:13][INFO][model.py]   ✗ small_batch
[2025-10-02 07:16:13][INFO][model.py]     Error: Model ToyCoreOpsModel::small_batch failed: Expected number of channels in input to be divisible by num_groups, but got input of shape [2, 3, 32, 32] and num_groups=8
[2025-10-02 07:16:13][INFO][model.py]   ✗ medium_batch
[2025-10-02 07:16:13][INFO][model.py]     Error: Model ToyCoreOpsModel::medium_batch failed: Expected number of channels in input to be divisible by num_groups, but got input of shape [4, 3, 64, 64] and num_groups=8
[2025-10-02 07:16:13][INFO][model.py]   ✗ large_input
[2025-10-02 07:16:13][INFO][model.py]     Error: Model ToyCoreOpsModel::large_input failed: Expected number of channels in input to be divisible by num_groups, but got input of shape [2, 3, 128, 128] and num_groups=8
[2025-10-02 07:16:13][INFO][model.py] Model: SmokeTestModel
[2025-10-02 07:16:13][INFO][model.py] Status: ✓ Passed (3/3 tests)
[2025-10-02 07:16:13][INFO][model.py]   ✓ small_batch
[2025-10-02 07:16:13][INFO][model.py]     Output match: ✓ Gradients match: ✓ (4 gradients)
[2025-10-02 07:16:13][INFO][model.py]   ✓ medium_batch
[2025-10-02 07:16:13][INFO][model.py]     Output match: ✓ Gradients match: ✓ (4 gradients)
[2025-10-02 07:16:13][INFO][model.py]   ✓ large_batch
[2025-10-02 07:16:13][INFO][model.py]     Output match: ✓ Gradients match: ✓ (4 gradients)
[2025-10-02 07:16:13][INFO][main.py] ============================================================
```

### Future work with Model Suite

#181
1 parent 07fca91 commit 0ef64c6

3 files changed: +356 −2 lines

BackendBench/eval_model.py

Lines changed: 241 additions & 0 deletions
@@ -0,0 +1,241 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""Model-level evaluation utilities for testing full model correctness."""

import logging
import random
import traceback
from contextlib import nullcontext
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

import torch

import BackendBench
from BackendBench.eval import allclose
from BackendBench.utils import deserialize_args

logger = logging.getLogger(__name__)


@dataclass
class ModelCorrectnessTestResult:
    """Result from testing a model configuration."""

    model_name: str
    test_name: str
    is_correct: bool = False
    error_msg: str = ""
    error_type: str = ""
    traceback: str = ""
    output_match: bool = False
    gradients_match: bool = False
    num_gradients: int = 0


def eval_model_correctness_test(
    model_name: str,
    model_class: type,
    model_config: Dict[str, Any],
    test_name: str,
    test_args: str,
    kernel_dir: str = None,
    atol: float = 1e-2,
    rtol: float = 1e-2,
) -> ModelCorrectnessTestResult:
    """Evaluate model correctness by comparing eager vs backend execution.

    Similar to eval_correctness_test in eval.py, but for full models instead of individual ops.

    Args:
        model_name: Name of the model being tested
        model_class: Model class to instantiate
        model_config: Model configuration dict with init_args
        test_name: Name of this test configuration
        test_args: Serialized arguments string for forward pass
        kernel_dir: Optional directory containing kernels for backend
        atol: Absolute tolerance for allclose
        rtol: Relative tolerance for allclose

    Returns:
        ModelCorrectnessTestResult with detailed comparison results
    """
    try:
        # Generate a single seed to use for both eager and backend runs
        # This ensures both runs use the same model initialization
        seed = random.randint(0, 2**32 - 1)

        # Run in eager mode (reference)
        eager_out, eager_grads = _run_model(
            model_class,
            model_config,
            test_args,
            backend_enabled=False,
            kernel_dir=None,
            seed=seed,
        )

        # Run with backend (implementation)
        backend_out, backend_grads = _run_model(
            model_class,
            model_config,
            test_args,
            backend_enabled=True,
            kernel_dir=kernel_dir,
            seed=seed,
        )

        # Compare outputs
        output_match = allclose(eager_out, backend_out, atol=atol, rtol=rtol)

        # Compare gradients
        gradients_match = True
        if len(eager_grads) != len(backend_grads):
            gradients_match = False
        else:
            for eager_grad, backend_grad in zip(eager_grads, backend_grads):
                if not allclose(eager_grad, backend_grad, atol=atol, rtol=rtol):
                    gradients_match = False
                    break

        is_correct = output_match and gradients_match

        return ModelCorrectnessTestResult(
            model_name=model_name,
            test_name=test_name,
            is_correct=is_correct,
            output_match=output_match,
            gradients_match=gradients_match,
            num_gradients=len(eager_grads),
        )

    except Exception as e:
        error_msg = f"Model {model_name}::{test_name} failed: {e}"
        logger.error(error_msg)
        return ModelCorrectnessTestResult(
            model_name=model_name,
            test_name=test_name,
            is_correct=False,
            error_msg=error_msg,
            error_type=str(type(e)),
            traceback=traceback.format_exc(),
        )


def _move_model_to_input_device(
    model: torch.nn.Module, args: List[Any], kwargs: Dict[str, Any]
) -> torch.nn.Module:
    """Move model to the same device as input tensor.

    Args:
        model: Model to move
        args: Positional arguments list
        kwargs: Keyword arguments dict

    Returns:
        Model on input device (or original model if no input tensor found)
    """
    # this is specific to our configs atm, we should generalize this
    input_tensor = kwargs["x"]
    if input_tensor is not None:
        device = input_tensor.device
        model = model.to(device)
    return model


def _collect_gradients(
    model: torch.nn.Module, args: List[Any], kwargs: Dict[str, Any]
) -> List[torch.Tensor]:
    """Collect gradients from input and model parameters.

    Args:
        model: Model with computed gradients
        args: Positional arguments list
        kwargs: Keyword arguments dict

    Returns:
        List of gradient tensors [input_grad, param1_grad, ...]
    """
    grads = []

    # Input gradient - check both args and kwargs
    input_grad = None
    if args and isinstance(args[0], torch.Tensor) and args[0].grad is not None:
        input_grad = args[0].grad
    elif "x" in kwargs and isinstance(kwargs["x"], torch.Tensor) and kwargs["x"].grad is not None:
        input_grad = kwargs["x"].grad

    if input_grad is not None:
        grads.append(input_grad.clone())

    # Parameter gradients
    for param in model.parameters():
        if param.grad is not None:
            grads.append(param.grad.clone())

    return grads


def _run_model(
    model_class: type,
    model_config: Dict[str, Any],
    test_args: str,
    backend_enabled: bool,
    kernel_dir: str = "generated_kernels",
    seed: int = None,
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    """Run model with or without backend enabled.

    Args:
        model_class: Model class to instantiate
        model_config: Model configuration dict with init_args
        test_args: Serialized arguments string for forward pass
        backend_enabled: If True, use BackendBench context manager
        kernel_dir: Optional directory containing kernels
        seed: Random seed for reproducibility. If None, generates a random seed.

    Returns:
        Tuple of (output, gradients) where:
        - output: Model output tensor (detached)
        - gradients: List of gradient tensors [input_grad, param1_grad, ...]
    """
    # Generate seed dynamically and set for deterministic behavior
    # IMPORTANT: Must set seed BEFORE deserializing args, because deserialization
    # may create random tensors!
    if seed is None:
        seed = random.randint(0, 2**32 - 1)
    torch.manual_seed(seed)

    # Deserialize test arguments (now uses the seed we just set)
    args, kwargs = deserialize_args(test_args)

    # Extract model initialization args
    init_args = model_config.get("init_args", {}).copy()

    # Create fresh model instance
    model = model_class(**init_args)
    model.train()

    # Move model to same device as input
    model = _move_model_to_input_device(model, args, kwargs)

    ctx = (
        BackendBench.BackendBench.enable(kernel_dir=kernel_dir)
        if backend_enabled
        else nullcontext()
    )

    # Run forward + backward with or without backend
    with ctx:
        output = model(*args, **kwargs)
        loss = output.sum()
        loss.backward()

    # Collect gradients
    grads = _collect_gradients(model, args, kwargs)

    return output.detach(), grads
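For readers skimming the diff, the following is a minimal, self-contained sketch of the seed-pinning comparison that `eval_model_correctness_test` and `_run_model` perform above. It uses a toy `torch.nn.Linear` and plain `torch.allclose` instead of a suite model and BackendBench's `allclose`, and the second run merely stands in for the backend-enabled run; everything in it is illustrative rather than part of this PR.

```python
import torch

def run_once(seed: int):
    # Pin the seed before building the model and inputs so both runs
    # start from identical weights and data (mirrors _run_model).
    torch.manual_seed(seed)
    model = torch.nn.Linear(4, 4)
    x = torch.randn(2, 4, requires_grad=True)
    out = model(x)
    out.sum().backward()
    grads = [x.grad.clone()] + [p.grad.clone() for p in model.parameters()]
    return out.detach(), grads

seed = 1234
ref_out, ref_grads = run_once(seed)    # eager reference run
test_out, test_grads = run_once(seed)  # in this PR, the second run is wrapped in the backend context

assert torch.allclose(ref_out, test_out, atol=1e-2, rtol=1e-2)
assert len(ref_grads) == len(test_grads)
assert all(torch.allclose(a, b, atol=1e-2, rtol=1e-2) for a, b in zip(ref_grads, test_grads))
```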

BackendBench/scripts/main.py

Lines changed: 20 additions & 2 deletions
@@ -41,6 +41,21 @@ def setup_logging(log_level):
     )
 
 
+# Helper function as model suite gets fleshed out
+def _test_full_models(suite, backend):
+    assert suite.name == "model"
+    all_results = []
+    for model in suite.models:
+        results = suite.eval_model(model, backend)
+        all_results.append(results)
+    logger.info("=" * 60)
+    logger.info("MODEL EVALUATION RESULTS")
+    logger.info("=" * 60)
+    for result in all_results:
+        suite.print_results(result)
+    logger.info("=" * 60)
+
+
 @click.command()
 @click.option(
     "--log-level",
@@ -179,8 +194,6 @@ def cli(
         raise ValueError(
             "--ops filter is not supported for model suite. Use --model-filter instead"
         )
-        # remove this in later PR as model suite is supported
-        raise NotImplementedError("Model suite is not supported yet")
 
     if suite != "model" and model_filter is not None:
         raise ValueError("--model-filter is only supported for model suite")
@@ -246,6 +259,11 @@ def cli(
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     log_dir = f"backendbench_output_{timestamp}"
 
+    if suite.name == "model":
+        _test_full_models(suite, backend)
+        # currently model suite does not support op testing so now we're done
+        return
+
     overall_correctness = []
     overall_performance = []
     all_correctness_results = []

BackendBench/suite/model.py

Lines changed: 95 additions & 0 deletions
@@ -14,6 +14,8 @@
 import os
 from typing import Any, Dict, List, Optional
 
+from BackendBench.eval_model import eval_model_correctness_test
+
 logger = logging.getLogger(__name__)
 
 
@@ -110,3 +112,96 @@ def __init__(
         # Store loaded models
         self.models = models
         self.name = name
+
+    def eval_model(self, model_dict: Dict[str, Any], backend) -> Dict[str, Any]:
+        """Run evaluation on a single model.
+
+        Args:
+            model_dict: Dictionary with keys 'name', 'class', 'config'
+            backend: Backend to use for evaluation
+
+        Returns:
+            Dictionary with evaluation results including correctness and performance
+        """
+        model_class = model_dict["class"]
+        model_name = model_dict["name"]
+        config = model_dict["config"]
+
+        # Extract model configuration and tests
+        model_config = config.get("model_config", {})
+        model_tests = config.get("model_tests", {})
+
+        if not model_tests:
+            return {
+                "model_name": model_name,
+                "passed": False,
+                "error": "No model_tests found in config",
+                "test_results": [],
+            }
+
+        # Get kernel_dir from backend if available
+        kernel_dir = getattr(backend, "ops_dir", None)
+
+        # Run each test
+        test_results = []
+        for test_name, test_args in model_tests.items():
+            result = eval_model_correctness_test(
+                model_name=model_name,
+                model_class=model_class,
+                model_config=model_config,
+                test_name=test_name,
+                test_args=test_args,
+                kernel_dir=kernel_dir,
+            )
+            test_results.append(result)
+
+        # Aggregate results
+        all_passed = all(r.is_correct for r in test_results)
+        num_passed = sum(1 for r in test_results if r.is_correct)
+        num_total = len(test_results)
+
+        return {
+            "model_name": model_name,
+            "passed": all_passed,
+            "num_passed": num_passed,
+            "num_total": num_total,
+            "test_results": test_results,
+        }
+
+    def print_results(self, results: Dict[str, Any]) -> None:
+        """Print model evaluation results.
+
+        Args:
+            results: Dictionary with evaluation results from eval_model
+        """
+        model_name = results.get("model_name", "Unknown")
+        passed = results.get("passed", False)
+        num_passed = results.get("num_passed", 0)
+        num_total = results.get("num_total", 0)
+
+        logger.info(f"\nModel: {model_name}")
+        logger.info(
+            f"Status: {'✓ Passed' if passed else '✗ Failed'} ({num_passed}/{num_total} tests)"
+        )
+
+        # Print details for each test
+        test_results = results.get("test_results", [])
+        for result in test_results:
+            status = "✓" if result.is_correct else "✗"
+            logger.info(f"  {status} {result.test_name}")
+
+            if not result.is_correct:
+                if result.error_msg:
+                    logger.info(f"    Error: {result.error_msg}")
+                else:
+                    # Show what failed
+                    if not result.output_match:
+                        logger.info("    Output mismatch")
+                    if not result.gradients_match:
+                        logger.info(f"    Gradient mismatch ({result.num_gradients} gradients)")
+            else:
+                # Show success details
+                logger.info(
+                    f"    Output match: ✓ Gradients match: ✓ ({result.num_gradients} gradients)"
+                )
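To make the data flow above concrete, here is a hypothetical shape of a per-model `config` entry as `eval_model` reads it. The key names come from the code in this diff; the `model_tests` values are placeholders only, since their exact serialization is whatever `BackendBench.utils.deserialize_args` expects and is defined outside this file.

```python
# Hypothetical example; the test-args strings below are placeholders,
# not the real serialization format consumed by deserialize_args.
config = {
    "model_config": {
        "init_args": {"num_groups": 8},  # forwarded to the model constructor in _run_model
    },
    "model_tests": {
        # test_name -> serialized (args, kwargs); kwargs["x"] is treated as the input tensor
        "small_batch": "<serialized args for a [2, 3, 32, 32] input>",
        "medium_batch": "<serialized args for a [4, 3, 64, 64] input>",
    },
}
```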
