10 changes: 10 additions & 0 deletions garden_ai/backend_client.py
@@ -6,6 +6,10 @@

from garden_ai.constants import GardenConstants
from garden_ai.gardens import Garden
from garden_ai.schemas.benchmark import (
    BenchmarkResultCreateRequest,
    BenchmarkResultResponse,
)
from garden_ai.schemas.garden import GardenMetadata
from garden_ai.schemas.hpc import HpcInvocationCreateRequest
from garden_ai.schemas.modal import (
@@ -182,3 +186,9 @@ def search_gardens(self, payload: dict) -> dict:
    def create_hpc_invocation(self, payload: HpcInvocationCreateRequest) -> dict:
        response = self._post("/hpc/invocations", payload.model_dump(mode="json"))
        return response

    def publish_benchmark_result(
        self, payload: BenchmarkResultCreateRequest
    ) -> BenchmarkResultResponse:
        response = self._post("/benchmarks", payload.model_dump(mode="json"))
        return BenchmarkResultResponse(**response)
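The `BenchmarkResultCreateRequest` and `BenchmarkResultResponse` models are imported from `garden_ai/schemas/benchmark.py`, which is not part of this diff. A minimal sketch of what that module might look like, inferred only from how the models are used above (any field beyond `benchmark_name`, `benchmark_task_name`, and `metrics` is an assumption):

```python
# Hypothetical sketch of garden_ai/schemas/benchmark.py (not shown in this diff).
from typing import Any, Dict

from pydantic import BaseModel


class BenchmarkResultCreateRequest(BaseModel):
    benchmark_name: str
    benchmark_task_name: str
    metrics: Dict[str, Any]  # unified blob: metric values plus run_metadata


class BenchmarkResultResponse(BaseModel):
    id: int  # assumed: publish_benchmark_result's docstring promises a result ID
    benchmark_name: str
    benchmark_task_name: str
    metrics: Dict[str, Any]
```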
110 changes: 110 additions & 0 deletions garden_ai/benchmarks/__init__.py
@@ -0,0 +1,110 @@
"""Garden AI benchmarking framework.

This module provides interfaces for running standardized benchmarks on
models hosted in Garden AI or developed locally.

Available benchmarks:
- MatbenchDiscovery: Materials discovery benchmark suite
"""

from typing import Any, Dict, Optional

from garden_ai.client import GardenClient
from garden_ai.schemas.benchmark import BenchmarkResultCreateRequest

from .matbench_discovery.enums import DatasetSize, MatbenchTask
from .matbench_discovery.tasks import MatbenchDiscovery

__all__ = [
    "MatbenchDiscovery",
    "MatbenchTask",
    "DatasetSize",
    "publish_benchmark_result",
]


def publish_benchmark_result(
    result: Dict[str, Any],
    model_name: str,
    garden_doi: Optional[str] = None,
    benchmark_name: Optional[str] = None,
    task_name: Optional[str] = None,
) -> Dict[str, Any]:
    """Publish benchmark results to the Garden AI backend.

    This function takes the output from a benchmark task (e.g., MatbenchDiscovery.IS2RE.remote())
    and publishes it to the Garden backend for tracking and leaderboard purposes.

    Args:
        result: The output dictionary from a benchmark task. Should contain:
            - 'metrics': Dictionary of benchmark metrics (F1, DAF, MAE, etc.)
            - 'run_metadata': Optional run metadata (hardware, timing, cost)
            - '_benchmark_info': Auto-injected benchmark/task info (if from a wrapped method)
        model_name: The specific name/variant of the model (e.g., "mace-mp-0-medium", "chgnet-v0.3.0").
            This is required to identify the model on the leaderboard.
        garden_doi: Optional DOI for the Garden publication associated with this benchmark result.
        benchmark_name: Override for the benchmark name (defaults to the name auto-detected from the result).
        task_name: Override for the task name (defaults to the name auto-detected from the result).

    Returns:
        Dictionary containing the response from the backend, including the result ID.

    Raises:
        ValueError: If benchmark_name or task_name cannot be determined.
        requests.HTTPError: If the backend request fails.

    Example:
        ```python
        from garden_ai.benchmarks import MatbenchDiscovery, publish_benchmark_result

        # Run a benchmark
        output = MatbenchDiscovery.IS2RE.remote(...)

        # Publish the results
        response = publish_benchmark_result(output, model_name="mace-medium", garden_doi="10.26311/example.doi")
        print(f"Published with ID: {response['id']}")
        ```
    """
    # Extract benchmark info from the result or use the provided overrides
    benchmark_info = result.get("_benchmark_info", {})

    final_benchmark_name = benchmark_name or benchmark_info.get("benchmark_name")
    final_task_name = task_name or benchmark_info.get("task_name")

    if not final_benchmark_name:
        raise ValueError(
            "benchmark_name is required. Either pass it explicitly or use a result "
            "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
        )

    if not final_task_name:
        raise ValueError(
            "task_name is required. Either pass it explicitly or use a result "
            "from a MatbenchDiscovery task method (e.g., MatbenchDiscovery.IS2RE.remote())."
        )

    # Inject the model name into run_metadata
    if "run_metadata" not in result:
        result["run_metadata"] = {}
    if "model" not in result["run_metadata"]:
        result["run_metadata"]["model"] = {}

    result["run_metadata"]["model"]["variant"] = model_name

    # Inject garden_doi if provided
    if garden_doi:
        result["run_metadata"]["garden_doi"] = garden_doi

    # Create the request payload.
    # Note: we pass the modified result (containing metrics and metadata) as 'metrics'.
    # This assumes the backend handles the unified blob, per the schema field description.
    payload = BenchmarkResultCreateRequest(
        benchmark_name=final_benchmark_name,
        benchmark_task_name=final_task_name,
        metrics=result,
    )

    # Get authenticated client and publish
    client = GardenClient()
    response = client.backend_client.publish_benchmark_result(payload)
    return response.model_dump()
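Because `_benchmark_info` is only injected by the wrapped task methods, a result assembled outside of `MatbenchDiscovery` needs the benchmark and task names passed explicitly. A small sketch of that path (the metric values, names, and DOI below are placeholders, not values confirmed by this PR):

```python
from garden_ai.benchmarks import publish_benchmark_result

# Hypothetical result dict assembled by hand (e.g., from a run that bypassed
# the MatbenchDiscovery wrappers, so no '_benchmark_info' was injected).
result = {
    "metrics": {"F1": 0.81, "DAF": 3.5, "MAE": 0.035},             # placeholder values
    "run_metadata": {"hardware": "1x A100", "wall_time_s": 5400},  # placeholder values
}

response = publish_benchmark_result(
    result,
    model_name="mace-mp-0-medium",
    benchmark_name="matbench_discovery",  # required here: nothing to auto-detect from
    task_name="IS2RE",                    # assumed to match MatbenchTask.IS2RE.value
    garden_doi="10.26311/example.doi",    # optional
)
print(f"Published with ID: {response['id']}")
```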
10 changes: 10 additions & 0 deletions garden_ai/benchmarks/matbench_discovery/__init__.py
@@ -0,0 +1,10 @@
"""Matbench Discovery benchmark adapter for Garden AI."""

from .enums import DatasetSize, MatbenchTask
from .tasks import MatbenchDiscovery

__all__ = [
    "MatbenchDiscovery",
    "MatbenchTask",
    "DatasetSize",
]
53 changes: 53 additions & 0 deletions garden_ai/benchmarks/matbench_discovery/enums.py
@@ -0,0 +1,53 @@
"""Enums for Matbench Discovery benchmark tasks."""

from enum import Enum


class MatbenchTask(Enum):
    """Available Matbench Discovery benchmark tasks."""

    IS2RE = "IS2RE"  # Initial Structure to Relaxed Energy
    RS2RE = "RS2RE"  # Relaxed Structure to Relaxed Energy
    S2EFS = "S2EFS"  # Structure to Energy, Forces, Stress
    S2EF = "S2EF"  # Structure to Energy, Forces
    S2EFSM = "S2EFSM"  # Structure to Energy, Forces, Stress, Magmoms
    IS2E = "IS2E"  # Initial Structure to Energy
    S2E = "S2E"  # Structure to Energy
    S2RE = "S2RE"  # Structure to Relaxed Energy
    RP2RE = "RP2RE"  # Relaxed Prototype to Relaxed Energy
    IP2E = "IP2E"  # Initial Prototype to Energy


class DatasetSize(str, Enum):
    """Predefined dataset sizes for Matbench Discovery benchmarks.

    These correspond to different subsets of the WBM test set that are commonly
    used for evaluating materials discovery models.
    """

    FULL = "full"
    """Full WBM test set (~257k structures)"""

    UNIQUE_PROTOS = "unique_protos"
    """Unique prototypes subset (~215k structures) - removes duplicate prototypes"""

    RANDOM_10K = "random_10k"
    """Random 10k structures from the unique prototypes subset (fixed seed)"""

    RANDOM_100 = "random_100"
    """Random 100 structures for quick testing (fixed seed)"""

    def seed(self, seed: int) -> "DatasetConfig":
        """Return a configuration with a custom random seed."""
        return DatasetConfig(self, seed)


class DatasetConfig:
    """Configuration for a dataset subset with a specific random seed."""

    def __init__(self, subset: DatasetSize, seed: int):
        self.subset = subset
        self.seed = seed

    def __repr__(self):
        return f"{self.subset.name}(seed={self.seed})"
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - Local Execution Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_mattersim_model(device):
    from mattersim.forcefield import MatterSimCalculator

    return MatterSimCalculator(device=device)


print("Running MatterSim benchmark locally...")

# Run the IS2RE task locally.
# Note: MatterSim needs a GPU or an MPS device; CPU works only if specified and supported.
output = MatbenchDiscovery.IS2RE.local(
    model_factory=create_mattersim_model,
    model_packages="mattersim",
    num_structures="random_100",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output.get("metrics"))
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - EquiformerV2 Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_equiformerv2_model(device):
    from fairchem.core.calculate.ase_calculator import Calculator  # type: ignore

    # Use pre-trained checkpoint - will auto-download from HuggingFace
    return Calculator(
        model_name="EquiformerV2-31M-S2EF-OC20-All+MD", cpu=(device == "cpu")
    )


# Run S2EFS task (structure to energy/forces/stress)
output = MatbenchDiscovery.S2EFS.remote(
    endpoint="anvil",
    account="your-account-here",
    model_factory=create_equiformerv2_model,
    model_packages="fairchem-core",
    num_structures="random_10k",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output.get("metrics"))
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - MACE Multi-GPU Example"""

from rich import print

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_mace_model(device):
    from mace.calculators import mace_mp

    return mace_mp(model="medium-mpa-0", device=device, default_dtype="float64")


print("Running MACE benchmark on endpoint anvil...")

results = MatbenchDiscovery.IS2RE.remote(
    endpoint="anvil",
    account="cis250461-gpu",
    model_factory=create_mace_model,
    model_packages=[
        "mace-torch",
        "cuequivariance",
        "cuequivariance-torch",
        "cuequivariance-ops-torch-cu12",
    ],
    num_structures="random_100",
)

if "error" in results.get("metrics", {}):
    print(f"Error: {results['metrics']['error']}")
else:
    print("Benchmark Results:", results)
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - MatterSim Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_mattersim_model(device):
    from mattersim.forcefield import MatterSimCalculator

    return MatterSimCalculator(device=device)


output = MatbenchDiscovery.IS2RE.remote(
    endpoint="anvil",
    account="your-account-here",
    model_factory=create_mattersim_model,
    model_packages="mattersim",
    num_structures="random_100",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output)
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Matbench Discovery Benchmark - SevenNet Example"""

from garden_ai.benchmarks.matbench_discovery import MatbenchDiscovery


def create_sevennet_model(device):
    from sevenn.calculator import SevenNetCalculator

    return SevenNetCalculator(model="7net-0", device=device)


output = MatbenchDiscovery.IS2RE.remote(
    endpoint="anvil",
    account="your-account-here",
    model_factory=create_sevennet_model,
    model_packages="sevenn",
    num_structures="random_100",
)

if "error" in output.get("metrics", {}):
    print(f"Error: {output['metrics']['error']}")
else:
    print("Benchmark Results:", output)