Skip to content

Commit d891cdc

Browse files
committed
[Test] Add integration tests to validate support for GB200.
1 parent d764202 commit d891cdc

File tree

12 files changed

+964
-1
lines changed

12 files changed

+964
-1
lines changed

tests/integration-tests/clusters_factory.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818

1919
import boto3
2020
import yaml
21+
from assertpy import assert_that
2122
from framework.credential_providers import run_pcluster_command
23+
from remote_command_executor import RemoteCommandExecutor
2224
from retrying import retry
2325
from time_utils import minutes, seconds
2426
from utils import (
@@ -34,6 +36,15 @@
3436
retry_if_subprocess_error,
3537
)
3638

39+
from tests.common.utils import read_remote_file
40+
41+
TAG_CLUSTER_NAME = "parallelcluster:cluster-name"
42+
TAG_NODE_TYPE = "parallelcluster:node-type"
43+
TAG_QUEUE_NAME = "parallelcluster:queue-name"
44+
TAG_COMPUTE_RESOURCE_NAME = "parallelcluster:compute-resource-name"
45+
46+
LAUNCH_TEMPLATES_CONFIG_FILE = "/opt/parallelcluster/shared/launch-templates-config.json"
47+
3748

3849
def suppress_and_log_exception(func):
3950
@functools.wraps(func)
@@ -253,6 +264,63 @@ def describe_cluster_instances(self, node_type=None, queue_name=None):
253264
logging.error("Failed when getting cluster instances with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
254265
raise
255266

267+
@retry(wait_fixed=seconds(5), stop_max_delay=minutes(1))
def get_compute_nodes(
    self,
    queue_name: str = None,
    compute_resource_name: str = None,
    state: list = None,
    expected_num_nodes: int = None,
):
    """Return the EC2 instance details for compute nodes matching the provided criteria.

    :param queue_name: if set, restrict to nodes tagged with this queue name.
    :param compute_resource_name: if set, restrict to nodes tagged with this compute resource name.
    :param state: list of EC2 instance states to match; defaults to ["running"].
    :param expected_num_nodes: if set, assert exactly this many nodes are found
        (the @retry decorator re-polls until the assertion holds or 1 minute elapses).
    :return: list of EC2 instance dicts as returned by ec2.describe_instances.
    """
    state = ["running"] if state is None else state
    ec2 = boto3.client("ec2", region_name=self.region)
    filters = [
        {"Name": f"tag:{TAG_CLUSTER_NAME}", "Values": [self.cfn_name]},
        {"Name": f"tag:{TAG_NODE_TYPE}", "Values": ["Compute"]},
        {"Name": "instance-state-name", "Values": state},
    ]

    if queue_name:
        filters.append({"Name": f"tag:{TAG_QUEUE_NAME}", "Values": [queue_name]})
    if compute_resource_name:
        filters.append({"Name": f"tag:{TAG_COMPUTE_RESOURCE_NAME}", "Values": [compute_resource_name]})

    instances = []
    for reservation in ec2.describe_instances(Filters=filters).get("Reservations"):
        instances.extend(reservation.get("Instances", []))

    # Explicit None check: a truthiness test would silently skip the assertion
    # when the caller expects exactly zero nodes (expected_num_nodes=0).
    if expected_num_nodes is not None:
        assert_that(instances).is_length(expected_num_nodes)

    return instances
297+
298+
def get_compute_nodes_private_ip(
    self,
    queue_name: str = None,
    compute_resource_name: str = None,
    state: list = None,
    expected_num_nodes: int = None,
):
    """Return the private IP address of compute nodes matching the provided criteria."""
    matching_nodes = self.get_compute_nodes(
        queue_name=queue_name,
        compute_resource_name=compute_resource_name,
        state=state,
        expected_num_nodes=expected_num_nodes,
    )
    return [node.get("PrivateIpAddress") for node in matching_nodes]
308+
309+
def get_compute_nodes_launch_template_logical_id(self, queue_name: str, compute_resource_name: str):
    """Return the launch template logical id of compute nodes matching the provided criteria."""
    raw_config = read_remote_file(RemoteCommandExecutor(self), LAUNCH_TEMPLATES_CONFIG_FILE)
    launch_templates_config = json.loads(raw_config)
    logging.info(f"Read launch template config from {LAUNCH_TEMPLATES_CONFIG_FILE}: {launch_templates_config}")
    # Walk Queues -> <queue> -> ComputeResources -> <resource> -> LaunchTemplate -> LogicalId,
    # tolerating missing keys at every level (returns None in that case).
    queue_config = launch_templates_config.get("Queues", {}).get(queue_name, {})
    compute_resource_config = queue_config.get("ComputeResources", {}).get(compute_resource_name, {})
    return compute_resource_config.get("LaunchTemplate", {}).get("LogicalId")
323+
256324
def get_cluster_instance_ids(self, node_type=None, queue_name=None):
257325
"""Run pcluster describe-cluster-instances and collect instance ids."""
258326
instances = self.describe_cluster_instances(node_type=node_type, queue_name=queue_name)

tests/integration-tests/configs/develop.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,13 @@ test-suites:
280280
instances: [{{ common.instance("instance_type_1") }}]
281281
oss: [{{ OS_X86_6 }}]
282282
schedulers: [ "slurm" ]
283+
ultraserver:
284+
test_gb200.py::test_gb200:
285+
dimensions:
286+
- regions: [ "us-east-1" ]
287+
instances: [ "g4dn.2xlarge" ]
288+
oss: [ "alinux2023" ]
289+
schedulers: [ "slurm" ]
283290
health_checks:
284291
test_gpu_health_checks.py::test_cluster_with_gpu_health_checks:
285292
dimensions:

tests/integration-tests/conftest.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,32 @@ def test_datadir(request, datadir):
579579
return datadir / "{0}/{1}".format(class_name, function_name)
580580

581581

582+
@pytest.fixture()
def file_reader(test_datadir, request, vpc_stack):
    """
    Provide a callable that renders Jinja2 file templates stored in the running test's data dir.

    The template file for a given test is a generic file stored in the configs_datadir folder
    and may use the Jinja2 template syntax.

    :return: a _file_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template
    """

    def _file_renderer(input_file: str = "script.sh", output_file: str = "script_rendered.sh", **kwargs):
        template_path = test_datadir / input_file
        if not os.path.isfile(template_path):
            raise FileNotFoundError(f"Input file not found in the expected dir {template_path}")
        # When output_file is falsy, render in place over the template file itself.
        rendered_path = (test_datadir / output_file) if output_file else template_path
        jinja_env = SandboxedEnvironment(loader=FileSystemLoader(str(test_datadir)))
        # Caller-supplied kwargs take precedence over the default template values.
        template_values = {**_get_default_template_values(vpc_stack, request), **kwargs}
        rendered_path.write_text(jinja_env.get_template(input_file).render(**template_values))
        return rendered_path

    return _file_renderer
606+
607+
582608
@pytest.fixture()
583609
def pcluster_config_reader(test_datadir, vpc_stack, request, region, instance, architecture):
584610
"""

tests/integration-tests/tests/common/assertions.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1010
# limitations under the License.
1111
import logging
12+
import re
1213
import time
1314
from typing import List, Union
1415

1516
import boto3
1617
import pytest
1718
from assertpy import assert_that, soft_assertions
19+
from clusters_factory import Cluster
1820
from constants import NodeType
1921
from remote_command_executor import RemoteCommandExecutor
2022
from retrying import RetryError, retry
@@ -28,7 +30,7 @@
2830
)
2931

3032
from tests.common.scaling_common import get_compute_nodes_allocation
31-
from tests.common.utils import get_ddb_item
33+
from tests.common.utils import get_ddb_item, read_remote_file
3234

3335

3436
@retry(wait_fixed=seconds(20), stop_max_delay=minutes(6))
@@ -199,6 +201,19 @@ def wait_for_num_instances_in_queue(cluster_name, region, desired, queue):
199201
return assert_num_instances_in_queue(cluster_name, region, desired, queue)
200202

201203

204+
@retry(wait_fixed=seconds(20), stop_max_delay=minutes(10))
def wait_for_instances_in_compute_resource(
    cluster: Cluster, queue: str, compute_resource: str, state: list, desired: int
):
    """Wait (polling every 20s, up to 10 minutes) until the compute resource has `desired` nodes.

    Returns the matching EC2 instance descriptions once the count is reached;
    the retry decorator re-raises the assertion failure after the timeout.
    """
    matching_instances = cluster.get_compute_nodes(queue, compute_resource, state)
    assert_that(matching_instances).is_length(desired)
    logging.info(
        f"Cluster {cluster.name} has {desired} compute nodes "
        f"in queue {queue} and compute resource {compute_resource}: {matching_instances}"
    )
    return matching_instances
215+
216+
202217
def assert_num_instances_in_queue(cluster_name, region, desired, queue):
203218
instances = get_cluster_nodes_instance_ids(cluster_name, region, node_type="Compute", queue_name=queue)
204219
assert_that(instances).is_length(desired)
@@ -422,3 +437,10 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p
422437
time.sleep(poll_interval)
423438

424439
pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})")
440+
441+
442+
def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str, negate: bool = True):
    """Assert the absence (negate=True, default) or presence (negate=False) of a regex in a remote file.

    The file is read from the given compute node and matched case-insensitively.
    """
    rce = RemoteCommandExecutor(cluster, compute_node_ip)
    file_content = read_remote_file(rce, file_name)
    assertion = assert_that(bool(re.search(pattern, file_content, re.IGNORECASE)))
    # Fixed: the non-negated branch called the non-existent is_fals() (AttributeError);
    # when negate is False the pattern is expected to be present, so assert is_true().
    assertion.is_false() if negate else assertion.is_true()

tests/integration-tests/tests/common/schedulers_common.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,24 @@ def get_unique_static_nodes(self):
440440
logging.info("All running nodes: %s", result.stdout)
441441
return result.stdout.splitlines()
442442

443+
def get_nodename_from_ip(self, ip: str):
    """Get the nodename from IP address"""
    # jq filter: pick the node whose address equals the given IP and emit its hostname.
    jq_filter = ".nodes[] | select(.address == $ip) | .hostname"
    command = f"scontrol show nodes --json | jq -r --arg ip \"{ip}\" '{jq_filter}'"  # noqa: W605
    result = self._remote_command_executor.run_remote_command(command)
    logging.info(f"Nodename for {ip} is: {result.stdout}")
    return result.stdout
453+
454+
def get_batch_host_for_job(self, job_id: str):
    """Return the batch host (the node running the batch script) for a given job.

    Fixed docstring/log: the previous text claimed to return a "node list" / "Nodename",
    but the jq filter extracts .jobs[].batch_host.
    """
    command = f"scontrol show jobs {job_id} --json | jq -r '.jobs[].batch_host'"  # noqa: W605
    result = self._remote_command_executor.run_remote_command(command)
    logging.info(f"Batch host for job {job_id} is: {result.stdout}")
    return result.stdout
460+
443461
@retry(retry_on_result=lambda result: "drain" not in result, wait_fixed=seconds(3), stop_max_delay=minutes(5))
444462
def wait_for_locked_node(self): # noqa: D102
445463
return self._remote_command_executor.run_remote_command("sinfo -h -o '%t'").stdout

tests/integration-tests/tests/common/utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,13 @@ def run_system_analyzer(cluster, scheduler_commands_factory, request, partition=
416416
logging.info("Compute node system information correctly retrieved.")
417417

418418

419+
def is_existing_remote_file(rce: RemoteCommandExecutor, file_path: str):
    """Return true if the file exists, false otherwise.

    Uses `test -f` rather than `cat`, so existence is checked without streaming
    the whole file content over the remote connection; the path is quoted to
    tolerate spaces.
    """
    logging.info(f"Checking if remote file exists {file_path}")
    result = rce.run_remote_command(f'test -f "{file_path}"', raise_on_error=False)
    return not result.failed
424+
425+
419426
@retry(stop_max_attempt_number=5, wait_fixed=seconds(3))
420427
def read_remote_file(remote_command_executor, file_path):
421428
"""Reads the content of a remote file."""
@@ -536,3 +543,12 @@ def write_file(dirname, filename, content):
536543
f.write(content)
537544
logging.info(f"File written: {filepath}")
538545
return filepath
546+
547+
548+
def terminate_nodes_manually(instance_ids, region):
    """Terminate the given EC2 instances one at a time, asserting each reaches a terminating state."""
    ec2_client = boto3.client("ec2", region_name=region)
    for instance_id in instance_ids:
        response = ec2_client.terminate_instances(InstanceIds=[instance_id])
        termination = response.get("TerminatingInstances")[0]
        assert_that(termination.get("InstanceId")).is_equal_to(instance_id)
        assert_that(termination.get("CurrentState").get("Name")).is_in("shutting-down", "terminated")
    logging.info("Terminated nodes: {}".format(instance_ids))

0 commit comments

Comments
 (0)