10 changes: 10 additions & 0 deletions garden_ai/backend_client.py
@@ -6,6 +6,10 @@

from garden_ai.constants import GardenConstants
from garden_ai.gardens import Garden
from garden_ai.schemas.benchmark import (
    BenchmarkResultCreateRequest,
    BenchmarkResultResponse,
)
from garden_ai.schemas.garden import GardenMetadata
from garden_ai.schemas.hpc import HpcInvocationCreateRequest
from garden_ai.schemas.modal import (
@@ -182,3 +186,9 @@ def search_gardens(self, payload: dict) -> dict:
    def create_hpc_invocation(self, payload: HpcInvocationCreateRequest) -> dict:
        response = self._post("/hpc/invocations", payload.model_dump(mode="json"))
        return response

    def publish_benchmark_result(
        self, payload: BenchmarkResultCreateRequest
    ) -> BenchmarkResultResponse:
        response = self._post("/benchmarks", payload.model_dump(mode="json"))
        return BenchmarkResultResponse(**response)
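The `BenchmarkResultCreateRequest` and `BenchmarkResultResponse` models are imported from `garden_ai/schemas/benchmark.py`, which is not part of this diff. A minimal sketch of what that module might look like, inferred only from how the models are used above (any field beyond `benchmark_name`, `benchmark_task_name`, and `metrics` is an assumption):

```python
# Hypothetical sketch of garden_ai/schemas/benchmark.py (not shown in this diff).
from typing import Any, Dict

from pydantic import BaseModel


class BenchmarkResultCreateRequest(BaseModel):
    benchmark_name: str
    benchmark_task_name: str
    metrics: Dict[str, Any]  # unified blob: metric values plus run_metadata


class BenchmarkResultResponse(BaseModel):
    id: int  # assumed: publish_benchmark_result's docstring promises a result ID
    benchmark_name: str
    benchmark_task_name: str
    metrics: Dict[str, Any]
```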
110 changes: 110 additions & 0 deletions garden_ai/benchmarks/__init__.py
@@ -0,0 +1,110 @@
"""Garden AI benchmarking framework.

This module provides interfaces for running standardized benchmarks on
models hosted in Garden AI or developed locally.

Available benchmarks:
- MatbenchDiscovery: Materials discovery benchmark suite
"""

from typing import Any, Dict, Optional

from garden_ai.client import GardenClient
from garden_ai.schemas.benchmark import BenchmarkResultCreateRequest

from .matbench_discovery.enums import DatasetSize, MatbenchTask
from .matbench_discovery.tasks import MatbenchDiscovery

__all__ = [
    "MatbenchDiscovery",
    "MatbenchTask",
    "DatasetSize",
    "publish_benchmark_result",
]


def publish_benchmark_result(
    result: Dict[str, Any],
    model_name: str,
    garden_doi: Optional[str] = None,
    benchmark_name: Optional[str] = None,
    task_name: Optional[str] = None,
) -> Dict[str, Any]:
    """Publish benchmark results to the Garden AI backend.

    This function takes the output from a benchmark task (e.g., MatbenchDiscovery.IS2RE.remote())
    and publishes it to the Garden backend for tracking and leaderboard purposes.

    Args:
        result: The output dictionary from a benchmark task. Should contain:
            - 'metrics': Dictionary of benchmark metrics (F1, DAF, MAE, etc.)
            - 'run_metadata': Optional run metadata (hardware, timing, cost)
            - '_benchmark_info': Auto-injected benchmark/task info (if from a wrapped method)
        model_name: The specific name/variant of the model (e.g., "mace-mp-0-medium", "chgnet-v0.3.0").
            This is required to identify the model on the leaderboard.
        garden_doi: Optional DOI for the Garden publication associated with this benchmark result.
        benchmark_name: Override for the benchmark name (defaults to the name auto-detected from the result).
        task_name: Override for the task name (defaults to the name auto-detected from the result).

    Returns:
        Dictionary containing the response from the backend, including the result ID.

    Raises:
        ValueError: If benchmark_name or task_name cannot be determined.
        requests.HTTPError: If the backend request fails.

    Example:
        ```python
        from garden_ai.benchmarks import MatbenchDiscovery, publish_benchmark_result

        # Run a benchmark
        output = MatbenchDiscovery.IS2RE.remote(...)

        # Publish the results
        response = publish_benchmark_result(output, model_name="mace-medium", garden_doi="10.26311/example.doi")
        print(f"Published with ID: {response['id']}")
        ```
    """
    # Extract benchmark info from the result or use the provided overrides
    benchmark_info = result.get("_benchmark_info", {})

    final_benchmark_name = benchmark_name or benchmark_info.get("benchmark_name")
    final_task_name = task_name or benchmark_info.get("task_name")

    if not final_benchmark_name:
        raise ValueError(
            "benchmark_name is required. Either pass it explicitly or use a result "
            "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
        )

    if not final_task_name:
        raise ValueError(
            "task_name is required. Either pass it explicitly or use a result "
            "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
        )

    # Inject the model name into run_metadata
    if "run_metadata" not in result:
        result["run_metadata"] = {}
    if "model" not in result["run_metadata"]:
        result["run_metadata"]["model"] = {}

    result["run_metadata"]["model"]["variant"] = model_name

    # Inject garden_doi if provided
    if garden_doi:
        result["run_metadata"]["garden_doi"] = garden_doi

    # Create the request payload.
    # Note: we pass the modified result (containing metrics and metadata) as 'metrics'.
    # This assumes the backend handles the unified blob, per the schema field description.
    payload = BenchmarkResultCreateRequest(
        benchmark_name=final_benchmark_name,
        benchmark_task_name=final_task_name,
        metrics=result,
    )

    # Get authenticated client and publish
    client = GardenClient()
    response = client.backend_client.publish_benchmark_result(payload)
    return response.model_dump()
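Because `_benchmark_info` is only injected by the wrapped task methods, a result assembled outside of `MatbenchDiscovery` needs the benchmark and task names passed explicitly. A small sketch of that path (the metric values, names, and DOI below are placeholders, not values confirmed by this PR):

```python
from garden_ai.benchmarks import publish_benchmark_result

# Hypothetical result dict assembled by hand (e.g., from a run that bypassed
# the MatbenchDiscovery wrappers, so no '_benchmark_info' was injected).
result = {
    "metrics": {"F1": 0.81, "DAF": 3.5, "MAE": 0.035},             # placeholder values
    "run_metadata": {"hardware": "1x A100", "wall_time_s": 5400},  # placeholder values
}

response = publish_benchmark_result(
    result,
    model_name="mace-mp-0-medium",
    benchmark_name="matbench_discovery",  # required here: nothing to auto-detect from
    task_name="IS2RE",                    # assumed to match MatbenchTask.IS2RE.value
    garden_doi="10.26311/example.doi",    # optional
)
print(f"Published with ID: {response['id']}")
```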
10 changes: 10 additions & 0 deletions garden_ai/benchmarks/matbench_discovery/__init__.py
@@ -0,0 +1,10 @@
"""Matbench Discovery benchmark adapter for Garden AI."""

from .enums import DatasetSize, MatbenchTask
from .tasks import MatbenchDiscovery

__all__ = [
    "MatbenchDiscovery",
    "MatbenchTask",
    "DatasetSize",
]
53 changes: 53 additions & 0 deletions garden_ai/benchmarks/matbench_discovery/enums.py
@@ -0,0 +1,53 @@
"""Enums for Matbench Discovery benchmark tasks."""

from enum import Enum


class MatbenchTask(Enum):
    """Available Matbench Discovery benchmark tasks."""

    IS2RE = "IS2RE"  # Initial Structure to Relaxed Energy
    RS2RE = "RS2RE"  # Relaxed Structure to Relaxed Energy
    S2EFS = "S2EFS"  # Structure to Energy, Forces, Stress
    S2EF = "S2EF"  # Structure to Energy, Forces
    S2EFSM = "S2EFSM"  # Structure to Energy, Forces, Stress, Magmoms
    IS2E = "IS2E"  # Initial Structure to Energy
    S2E = "S2E"  # Structure to Energy
    S2RE = "S2RE"  # Structure to Relaxed Energy
    RP2RE = "RP2RE"  # Relaxed Prototype to Relaxed Energy
    IP2E = "IP2E"  # Initial Prototype to Energy


class DatasetSize(str, Enum):
    """Predefined dataset sizes for Matbench Discovery benchmarks.

    These correspond to different subsets of the WBM test set that are commonly
    used for evaluating materials discovery models.
    """

    FULL = "full"
    """Full WBM test set (~257k structures)"""

    UNIQUE_PROTOS = "unique_protos"
    """Unique prototypes subset (~215k structures) - removes duplicate prototypes"""

    RANDOM_10K = "random_10k"
    """Random 10k structures from the unique prototypes subset (fixed seed)"""

    RANDOM_100 = "random_100"
    """Random 100 structures for quick testing (fixed seed)"""

    def seed(self, seed: int) -> "DatasetConfig":
        """Return a configuration with a custom random seed."""
        return DatasetConfig(self, seed)


class DatasetConfig:
    """Configuration for a dataset subset with a specific random seed."""

    def __init__(self, subset: DatasetSize, seed: int):
        self.subset = subset
        self.seed = seed

    def __repr__(self):
        return f"{self.subset.name}(seed={self.seed})"
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - Local Execution Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_mattersim_model(device):
    from mattersim.forcefield import MatterSimCalculator

    return MatterSimCalculator(device=device)


print("Running MatterSim benchmark locally...")

# Run the IS2RE task locally.
# Note: MatterSim needs a GPU or an MPS device; CPU works only if specified and supported.
output = MatbenchDiscovery.IS2RE.local(
    model_factory=create_mattersim_model,
    model_packages="mattersim",
    num_structures="random_100",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output.get("metrics"))
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - EquiformerV2 Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_equiformerv2_model(device):
    from fairchem.core.calculate.ase_calculator import Calculator  # type: ignore

    # Use pre-trained checkpoint - will auto-download from HuggingFace
    return Calculator(
        model_name="EquiformerV2-31M-S2EF-OC20-All+MD", cpu=(device == "cpu")
    )


# Run S2EFS task (structure to energy/forces/stress)
output = MatbenchDiscovery.S2EFS.remote(
    endpoint="anvil",
    account="your-account-here",
    model_factory=create_equiformerv2_model,
    model_packages="fairchem-core",
    num_structures="random_10k",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output.get("metrics"))
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - MACE Multi-GPU Example"""

from rich import print

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_mace_model(device):
    from mace.calculators import mace_mp

    return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")


print("Running MACE benchmark on endpoint anvil...")

results = MatbenchDiscovery.IS2RE.remote(
    endpoint="anvil",
    account="cis250461-gpu",
    model_factory=create_mace_model,
    model_packages=[
        "mace-torch",
        "cuequivariance",
        "cuequivariance-torch",
        "cuequivariance-ops-torch-cu12",
    ],
    num_structures="random_100",
)

if "error" in results.get("metrics", {}):
    print(f"Error: {results['metrics']['error']}")
else:
    print("Benchmark Results:", results)
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - MatterSim Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_mattersim_model(device):
    from mattersim.forcefield import MatterSimCalculator

    return MatterSimCalculator(device=device)


output = MatbenchDiscovery.IS2RE.remote(
    endpoint="anvil",
    account="your-account-here",
    model_factory=create_mattersim_model,
    model_packages="mattersim",
    num_structures="random_100",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output)
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - SevenNet Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_sevennet_model(device):
    from sevenn.calculator import SevenNetCalculator

    return SevenNetCalculator(model="7net-0", device=device)


output = MatbenchDiscovery.IS2RE.remote(
    endpoint="anvil",
    account="your-account-here",
    model_factory=create_sevennet_model,
    model_packages="sevenn",
    num_structures="random_100",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output)