Skip to content

Commit 9457ba4

Browse files
authored
Add error to point user to slurm resume log (#676)
* Add error to point user to slurm resume log (cherry picked from commit 84ec039)
* Fix unit tests (cherry picked from commit c84aeb5)
* Fix code linter (cherry picked from commit bdc8706)
* Update CHANGELOG
* Fix linter errors
* Add unit test for logs in clustermgtd
1 parent e45dfd8 commit 9457ba4

File tree

6 files changed

+28
-3
lines changed

6 files changed

+28
-3
lines changed

.flake8

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ ignore =
1818
W503,
1919
# N818: exception name should be named with an Error suffix
2020
N818
21+
# B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`.
22+
# Affected by false positive, https://github.com/PyCQA/flake8-bugbear/issues/525
23+
B042
2124
exclude =
2225
.tox,
2326
.git,

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
33

44
This file is used to list changes made in each version of the aws-parallelcluster-node package.
55

6+
3.15.0
7+
------
8+
9+
**CHANGES**
10+
- Direct users to slurm_resume log to see EC2 error codes if no instances are launched.
11+
612
3.14.0
713
------
814

src/slurm_plugin/clustermgtd.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
12621262
return
12631263
log.info(
12641264
"The following compute resources are in down state due to insufficient capacity: %s, "
1265-
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
1265+
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
1266+
"Check the slurm_resume log for EC2 error codes.",
12661267
self._insufficient_capacity_compute_resources,
12671268
self._config.insufficient_capacity_timeout,
12681269
)

src/slurm_plugin/resume.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,11 @@ def _resume(arg_nodes, resume_config, slurm_resume):
227227
print_with_count(failed_nodes),
228228
)
229229
for error_code, node_list in instance_manager.failed_nodes.items():
230-
_handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
230+
_handle_failed_nodes(
231+
node_list,
232+
reason=f"(Code:{error_code})Failure when resuming nodes - "
233+
f"Check the slurm_resume log for EC2 error codes",
234+
)
231235

232236
event_publisher = ClusterEventPublisher.create_with_default_publisher(
233237
event_logger,

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3533,6 +3533,13 @@ def test_reset_timeout_expired_compute_resources(
35333533
assert_that(cluster_manager._insufficient_capacity_compute_resources).is_equal_to(
35343534
expected_insufficient_capacity_compute_resources
35353535
)
3536+
3537+
if expected_insufficient_capacity_compute_resources:
3538+
assert (
3539+
"compute resources will be reset after insufficient capacity timeout (20 seconds) expired. "
3540+
"Check the slurm_resume log for EC2 error codes."
3541+
) in caplog.text
3542+
35363543
if expected_power_save_node_list:
35373544
power_save_mock.assert_called_with(
35383545
expected_power_save_node_list, reason="Enabling node since insufficient capacity timeout expired"

tests/slurm_plugin/test_resume.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,11 @@ def test_resume_launch(
448448
if expected_failed_nodes:
449449
for error_code, nodeset in expected_failed_nodes.items():
450450
mock_handle_failed_nodes_calls.append(
451-
call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes")
451+
call(
452+
nodeset,
453+
reason=f"(Code:{error_code})Failure when resuming nodes - "
454+
f"Check the slurm_resume log for EC2 error codes",
455+
)
452456
)
453457
mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)
454458
mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size)

0 commit comments

Comments
 (0)