Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
b94c921
DAOS-17427 control: Restart evicted rank after suicide
tanabarr Apr 17, 2025
af7f056
implement suicide event handlers
tanabarr Mar 19, 2026
4ce711f
add unit testing and documentation
tanabarr Mar 19, 2026
550ef12
fix docs and unit tests
tanabarr Mar 25, 2026
1f61b98
revise unit test for suicide handler
tanabarr Mar 26, 2026
8a79efb
fixup tests
tanabarr Mar 27, 2026
f643187
Merge branch 'master' into tanabarr/control-engine-suicide-restart
tanabarr Mar 31, 2026
e9c6896
rename suicide to self terminated
tanabarr Apr 1, 2026
d903017
rename registerFollowerSubscriptions to registerSubscriptions
tanabarr Apr 1, 2026
2b8b16f
add flag to disable automatic engine restart
tanabarr Apr 1, 2026
a593f54
fix intermittent test fails with delay before txt comp
tanabarr Apr 1, 2026
3656620
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr Apr 1, 2026
394ab67
Merge branch 'tanabarr/control-engine-suicide-restart' of github.com:…
tanabarr Apr 1, 2026
f445e51
implement basic rate limiting
tanabarr Apr 1, 2026
2cea5aa
improve naming consistency and fix config unit tests
tanabarr Apr 2, 2026
f6ae57e
add rate-limiting unit test
tanabarr Apr 2, 2026
180cc0c
documentation updates
tanabarr Apr 2, 2026
468507f
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr Apr 3, 2026
f1cdf0a
Q a single restart request if received within timeout period
tanabarr Apr 3, 2026
2c30c0d
address review comments from mjmac and kjacque
tanabarr Apr 20, 2026
7e95333
use channel-based restart manager for rate-limiting
tanabarr Apr 20, 2026
6252edf
fix handleEngineSelfTerminated unit tests
tanabarr Apr 21, 2026
de033c1
add unit tests for engine restart manager
tanabarr Apr 22, 2026
d58bce2
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr Apr 23, 2026
09cc634
remove deprecated code
tanabarr Apr 23, 2026
47883eb
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr Apr 23, 2026
d89a387
DRY-up unit tests for engine restart manager
tanabarr Apr 24, 2026
aff68c3
Merge branch 'master' into tanabarr/control-engine-suicide-restart
tanabarr Apr 30, 2026
6b4528d
DAOS-17427 test: Auto-restart after self-terminate tests (#18006)
tanabarr May 6, 2026
bdfde05
fix server package unit test helpers
tanabarr May 7, 2026
417b33b
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr May 7, 2026
c76507f
Revert "fix server package unit test helpers"
tanabarr May 7, 2026
8d5a1da
fix server package unit test helpers
tanabarr May 7, 2026
4a2938f
addressed review comments from kjacque pt1
tanabarr May 7, 2026
049509c
allow restart manager to close and open again
tanabarr May 7, 2026
bd15522
Revert "allow restart manager to close and open again"
tanabarr May 9, 2026
06f3f6e
address some review comments from kjacque
tanabarr May 11, 2026
8f82660
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr May 12, 2026
5feadfe
comment one start/stop per process lifetime
tanabarr May 12, 2026
969c837
address more review comments from kjacque
tanabarr May 12, 2026
835156b
pylint fixes
tanabarr May 12, 2026
95c061a
using self.register_cleanup (#18240)
tanabarr May 13, 2026
9ab86a9
Apply suggestion from @daltonbohning
tanabarr May 13, 2026
1913fd9
more ftest related review comment updates
tanabarr May 13, 2026
9dc146c
f-string updates and remove step comments in log_step calls use Comma…
tanabarr May 13, 2026
48ae917
Merge remote-tracking branch 'origin/master' into tanabarr/control-en…
tanabarr May 13, 2026
28e4905
Update src/tests/ftest/control/engine_auto_restart.yaml
tanabarr May 13, 2026
69c7327
Update src/tests/ftest/control/engine_auto_restart_disabled.yaml
tanabarr May 13, 2026
f1c54c9
Update src/tests/ftest/control/engine_auto_restart_disabled.yaml
tanabarr May 13, 2026
335590e
Update src/tests/ftest/control/engine_auto_restart_advanced.yaml
tanabarr May 13, 2026
031e5af
Update src/tests/ftest/control/engine_auto_restart_disabled.py
tanabarr May 13, 2026
d6b7993
Update src/tests/ftest/control/engine_auto_restart.yaml
tanabarr May 13, 2026
de28a9f
Update src/tests/ftest/control/engine_auto_restart_advanced.yaml
tanabarr May 13, 2026
42441e3
Update src/tests/ftest/control/engine_auto_restart_disabled.py
tanabarr May 13, 2026
8c14077
fail if delay > 200% of expected
tanabarr May 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 99 additions & 10 deletions docs/admin/administration.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ severity, message, description, and cause.
| engine\_died| STATE\_CHANGE| ERROR| DAOS engine <idx\> exited unexpectedly: <error\> | Indicates engine instance <idx\> exited unexpectedly. <error\> describes the exit state returned from the exited daos\_engine process.| N/A |
| engine\_asserted| STATE\_CHANGE| ERROR| TBD| Indicates engine instance <idx\> threw a runtime assertion, causing a crash. | An unexpected internal state resulted in assert failure. |
| engine\_clock\_drift| INFO\_ONLY | ERROR| clock drift detected| Indicates CART comms layer has detected clock skew between engines.| NTP may not be syncing clocks across DAOS system. |
| engine\_self\_terminated| INFO\_ONLY| NOTICE| excluded rank self terminated detected| Indicates that a DAOS engine rank has performed a self-termination due to having been excluded from the system's group map. The rank is automatically restarted by the control plane with rate-limiting (default: 5 minute minimum delay between restarts per rank) to prevent restart storms. | An engine was found to be in a transient non-functional state and excluded from the group map. The control plane monitors for this event and automatically restarts the affected engine so it can rejoin the system. Restarts are rate-limited per rank using the `engine_auto_restart_min_delay` configuration parameter. |
| engine\_join\_failed| INFO\_ONLY| ERROR | DAOS engine <idx\> (rank <rank\>) was not allowed to join the system | Join operation failed for the given engine instance ID and rank (if assigned). | Reason should be provided in the extended info field of the event data. |
| pool\_corruption\_detected| INFO\_ONLY| ERROR | Data corruption detected| Indicates a corruption in pool data has been detected. The event fields will contain pool and container UUIDs. | A corruption was found by the checksum scrubber. |
| pool\_rebuild\_started| INFO\_ONLY| NOTICE | Pool rebuild started.| Indicates a pool rebuild has started. The event data field contains pool map version and pool operation identifier. | When a pool rank becomes unavailable a rebuild will be triggered. |
| pool\_rebuild\_finished| INFO\_ONLY| NOTICE| Pool rebuild finished.| Indicates a pool rebuild has finished successfully. The event data field includes the pool map version and pool operation identifier. | N/A|
Expand All @@ -69,7 +71,6 @@ severity, message, description, and cause.
| device\_plugged| INFO\_ONLY| NOTICE| Detected hot plugged device: <bdev-name\> | Indicates device was physically inserted into host. | NVMe SSD physically added to host. |
| device\_replace| INFO\_ONLY| NOTICE or ERROR| Replaced device: <uuid\> with device: <uuid\> [failed: <rc\>] | Indicates that a faulty device was replaced with a new device and if the operation failed. The old and new device IDs as well as any non-zero return code are specified in the event data. | Device was replaced using DMG nvme replace command. |
| system\_fabric\_provider\_changed| INFO\_ONLY| NOTICE| System fabric provider has changed: <old-provider\> -> <new-provider\>| Indicates that the system-wide fabric provider has been updated. No other specific information is included in event data.| A system-wide fabric provider change has been intentionally applied to all joined ranks.|
| engine\_join\_failed| INFO\_ONLY| ERROR | DAOS engine <idx\> (rank <rank\>) was not allowed to join the system | Join operation failed for the given engine instance ID and rank (if assigned). | Reason should be provided in the extended info field of the event data. |
| device\_link\_speed\_changed| INFO\_ONLY| NOTICE or WARNING| NVMe PCIe device at <pci-address\> port-<idx\>: link speed changed to <transfer-rate\> (max <transfer-rate\>)| Indicates that an NVMe device link speed has changed. The negotiated and maximum device link speeds are indicated in the event message field and the severity is set to warning if the negotiated speed is not at maximum capability (and notice level severity if at maximum). No other specific information is included in the event data.| Either device link speed was previously downgraded and has returned to maximum or link speed has downgraded to a value that is less than its maximum capability.|
| device\_link\_width\_changed| INFO\_ONLY| NOTICE or WARNING| NVMe PCIe device at <pci-address\> port-<idx\>: link width changed to <pcie-link-lanes\> (max <pcie-link-lanes\>)| Indicates that an NVMe device link width has changed. The negotiated and maximum device link widths are indicated in the event message field and the severity is set to warning if the negotiated width is not at maximum capability (and notice level severity if at maximum). No other specific information is included in the event data.| Either device link width was previously downgraded and has returned to maximum or link width has downgraded to a value that is less than its maximum capability.|
| device\_led\_set| INFO\_ONLY| NOTICE| LED on device <device\> set to state <state\>| Indicates that the LED state has been changed on a device. Device identifier and LED state are specified in the event message.| LED control command was issued to change device LED state for visual identification or fault indication.|
Expand Down Expand Up @@ -1007,6 +1008,94 @@ specified on the command line:
If the ranks were excluded from pools (e.g., unclean shutdown), they will need to
be reintegrated. Please see the pool operation section for more information.

### Engine Auto-Restart

DAOS automatically restarts engines that self-terminate after being excluded from
the system. This feature improves system availability by recovering from transient
failures without administrator intervention.

#### How It Works

When an engine is excluded (e.g., due to network issues detected by SWIM), the
engine detects the exclusion and performs a self-termination. The control plane
monitors for these events and automatically restarts the affected engine after
clearing the exclusion state, allowing it to rejoin the system.

The automatic restart includes rate-limiting to prevent restart storms. By default,
an engine must wait 5 minutes between automatic restarts.

#### Configuration

Control auto-restart behavior in `daos_server.yml`:

```yaml
# Disable automatic restart (default: enabled)
disable_engine_auto_restart: false

# Minimum delay between automatic restarts per rank (default: 300 seconds)
engine_auto_restart_min_delay: 300
```

#### Manual Operations

Manual `dmg system stop` and `dmg system start` operations are never affected by
the rate-limiting mechanism. Administrators can always immediately stop and start
ranks regardless of recent automatic restart activity.

```bash
# Manual operations always work immediately
$ dmg system stop --ranks=0,1,2
$ dmg system start --ranks=0,1,2
```

When you manually stop or start ranks, the restart history for those ranks is
automatically cleared, ensuring no delays from previous automatic restarts.

#### Monitoring

The `engine_self_terminated` RAS event is logged when an engine self-terminates
and triggers an automatic restart:

```
&&& RAS EVENT id: [engine_self_terminated] ... msg: [excluded rank self terminated detected]
```

Use `dmg system query` to check rank status and incarnation numbers. The
incarnation number increments each time a rank restarts, helping track restart
events:

```bash
$ dmg system query --ranks=0
Rank UUID Control Address Fault Domain State Reason Incarnation
---- ---- --------------- ------------- ----- ------ -----------
0 12345678-1234-1234-1234-123456789012 10.0.0.1:10001 /node1 Joined 3
```

#### Best Practices

- **Leave enabled**: Automatic restart improves availability for transient failures
- **Adjust timing**: For frequent exclusions, consider increasing `engine_auto_restart_min_delay`
- **Monitor events**: Watch for repeated `engine_self_terminated` events indicating persistent issues
- **Manual control**: Use `dmg system stop/start` for maintenance without worrying about delays

#### Troubleshooting

**Problem**: Rank keeps self-terminating and restarting

**Solution**: Investigate root cause:
1. Check network connectivity (SWIM may be detecting real failures)
2. Review engine logs for errors
3. Verify hardware health
4. Consider disabling auto-restart temporarily for investigation

**Problem**: Need immediate restart but recently auto-restarted

**Solution**: Use manual operations (not affected by rate-limiting):
```bash
$ dmg system stop --ranks=X
$ dmg system start --ranks=X
```

### Storage Reformat

To reformat the system after a controlled shutdown, run the command:
Expand Down Expand Up @@ -1052,15 +1141,15 @@ the storage server has not changed the old rank can be "reused" by formatting us

An example workflow would be:

- `daos_server` is running and PMem NVDIMM fails causing an engine to enter excluded state.
- `daos_server` is stopped, storage server powered down, faulty PMem NVDIMM is replaced.
- After powering up storage server, `daos_server scm prepare` command is used to repair PMem.
- Storage server is rebooted after running `daos_server scm prepare` and command is run again.
- Now PMem is intact, clear with `wipefs -a /dev/pmemX` where "X" refers to the repaired PMem ID.
- `daos_server` can be started again. On start-up repaired engine prompts for "SCM format required".
- Run `dmg storage format --replace` to rejoin with existing rank (if --replace isn't used, a new
rank will be created).
- Formatted engine will join using the existing (old) rank which is mapped to the engine's hardware.
1. `daos_server` is running and PMem NVDIMM fails causing an engine to enter excluded state.
2. `daos_server` is stopped, storage server powered down, faulty PMem NVDIMM is replaced.
3. After powering up storage server, `daos_server scm prepare` command is used to repair PMem.
4. Storage server is rebooted after running `daos_server scm prepare` and command is run again.
5. Now PMem is intact, clear with `wipefs -a /dev/pmemX` where "X" refers to the repaired PMem ID.
6. `daos_server` can be started again. On start-up repaired engine prompts for "SCM format required".
7. Run `dmg storage format --replace` to rejoin with existing rank (if --replace isn't used, a new
rank will be created).
8. Formatted engine will join using the existing (old) rank which is mapped to the engine's hardware.

!!! note
`dmg storage format --replace` can be used to replace a rank in `AdminExcluded` state. The
Expand Down
65 changes: 65 additions & 0 deletions docs/overview/fault.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,68 @@ can now read from the rebuilt object shards.

This rebuild process is executed online while applications continue accessing
and updating objects.

### Engine Self-Termination and Automatic Restart

A DAOS engine may be excluded from the group map, for example because of
inactivity. When an engine becomes aware of its removal from the group map,
it will self-terminate to protect data integrity and system stability.

When an engine self-terminates, it raises an `engine_self_terminated` RAS event
(INFO_ONLY, NOTICE severity) containing the rank and incarnation information.
The control plane automatically handles this event by:

1. Detecting the engine self terminated event through the RAS event system
2. Identifying the engine instance associated with the rank
3. Waiting for the engine process to fully stop
4. Automatically restarting the engine to rejoin the system

This automatic restart mechanism is implemented in all control servers to ensure
local engine recovery happens regardless of management service leadership state.
The restarted engine will rejoin the system with a new incarnation number and
resume normal operations.

This self-healing mechanism allows DAOS to automatically recover system
membership state from transient engine failures without administrator
intervention, improving overall system availability.

#### Rate Limiting

To prevent restart storms and ensure system stability, automatic engine restarts
are rate-limited on a per-rank basis. By default, a minimum delay of 300 seconds
(5 minutes) is enforced between consecutive restart attempts for the same rank.

When an engine self-terminates within the minimum delay period, the control plane
schedules a deferred restart that will automatically trigger when the delay expires.
If multiple self-termination events occur for the same rank during the delay period
(this would be unexpected), only the most recent event triggers a deferred restart.
This ensures the engine is restarted exactly once after the delay, regardless of
how many self-termination events occur.

The rate-limiting interval can be customized by setting the
`engine_auto_restart_min_delay` configuration option (in seconds) in the
daos_server.yml file. For example:

```yaml
engine_auto_restart_min_delay: 600 # 10 minutes between restarts
```

This protection mechanism prevents scenarios where:
- Repeated transient failures cause excessive restart cycling
- A misconfigured engine continuously self-terminates
- Cascading failures overwhelm the control plane with restart requests

#### Disabling Automatic Restart

The automatic restart behavior can be completely disabled by setting the
`disable_engine_auto_restart` configuration option to `true` in the
daos_server.yml file:

```yaml
disable_engine_auto_restart: true
```

When auto restart is disabled, engines that self-terminate will not be
automatically restarted by the control plane, requiring manual intervention
to restart the affected engine instances. This setting may be useful for
debugging scenarios or when custom external restart management is preferred.
1 change: 1 addition & 0 deletions src/control/cmd/dmg/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ mgmt_svc_replicas:
- hostX:10002
fault_cb: ""
hyperthreads: false
disable_engine_auto_restart: false
`
)

Expand Down
1 change: 1 addition & 0 deletions src/control/events/ras.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ const (
RASUnknownEvent RASID = C.RAS_UNKNOWN_EVENT
RASEngineFormatRequired RASID = C.RAS_ENGINE_FORMAT_REQUIRED // notice
RASEngineDied RASID = C.RAS_ENGINE_DIED // error
RASEngineSelfTerminated RASID = C.RAS_ENGINE_SELF_TERMINATED // notice
RASPoolRepsUpdate RASID = C.RAS_POOL_REPS_UPDATE // info
RASSwimRankAlive RASID = C.RAS_SWIM_RANK_ALIVE // info
RASSwimRankDead RASID = C.RAS_SWIM_RANK_DEAD // info
Expand Down
4 changes: 3 additions & 1 deletion src/control/lib/control/event.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//
// (C) Copyright 2021-2024 Intel Corporation.
// (C) Copyright 2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -170,7 +171,8 @@ func newEventLogger(logBasic logging.Logger, newSyslogger newSysloggerFn) *Event
}

// NewEventLogger returns an initialized EventLogger capable of writing to the
// supplied logger in addition to syslog.
// supplied logger in addition to syslog. It should only be used in production
// code; use MockEventLogger in unit tests.
Comment thread
kjacque marked this conversation as resolved.
func NewEventLogger(log logging.Logger) *EventLogger {
// Delegate to the unexported constructor, passing syslog.NewLogger as its
// newSyslogger argument.
return newEventLogger(log, syslog.NewLogger)
}
10 changes: 9 additions & 1 deletion src/control/lib/control/mocks.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -30,6 +30,7 @@ import (
"github.com/daos-stack/daos/src/control/common/test"
"github.com/daos-stack/daos/src/control/lib/hostlist"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/logging"
"github.com/daos-stack/daos/src/control/server/config"
"github.com/daos-stack/daos/src/control/server/engine"
"github.com/daos-stack/daos/src/control/server/storage"
Expand Down Expand Up @@ -945,3 +946,10 @@ func MockHostFabricMap(t *testing.T, scans ...*MockFabricScan) HostFabricMap {

return hfm
}

// MockEventLogger builds an EventLogger for use in tests. Only the basic logger
// is populated, so the returned value has no syslog handlers registered.
func MockEventLogger(logBasic logging.Logger) *EventLogger {
	el := new(EventLogger)
	el.log = logBasic

	return el
}
21 changes: 20 additions & 1 deletion src/control/server/config/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ type Server struct {
Path string `yaml:"-"` // path to config file

// Behavior flags
AutoFormat bool `yaml:"-"`
AutoFormat bool `yaml:"-"`
DisableEngineAutoRestart bool `yaml:"disable_engine_auto_restart"`
EngineAutoRestartMinDelay int `yaml:"engine_auto_restart_min_delay,omitempty"`
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There doesn't appear to be any validation of this parameter, e.g. in the Validate() method:

       if cfg.EngineAutoRestartMinDelay < 0 {
               return errors.Errorf("engine_auto_restart_min_delay must be >= 0 (got %d)",
                       cfg.EngineAutoRestartMinDelay)
       }

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


deprecatedParams `yaml:",inline"`
}
Expand Down Expand Up @@ -362,6 +364,18 @@ func (cfg *Server) WithTelemetryPort(port int) *Server {
return cfg
}

// WithDisableEngineAutoRestart sets the disable_engine_auto_restart flag on the
// config. Passing true turns automatic engine restart off; passing false leaves
// it on. The receiver is returned so that calls can be chained.
func (cfg *Server) WithDisableEngineAutoRestart(disabled bool) *Server {
	cfg.DisableEngineAutoRestart = disabled

	return cfg
}

// WithEngineAutoRestartMinDelay sets the minimum delay, in seconds, between
// automatic engine restarts. The receiver is returned so that calls can be
// chained.
//
// The value is stored in an int field, so an input larger than the maximum int
// is clamped to that maximum rather than being allowed to wrap around to a
// negative number (which config validation would then reject).
func (cfg *Server) WithEngineAutoRestartMinDelay(secs uint) *Server {
	// Largest value representable by int on this platform.
	const maxInt = int(^uint(0) >> 1)

	if secs > uint(maxInt) {
		secs = uint(maxInt)
	}
	cfg.EngineAutoRestartMinDelay = int(secs)

	return cfg
}

// DefaultServer creates a new instance of configuration struct
// populated with defaults.
func DefaultServer() *Server {
Expand Down Expand Up @@ -837,6 +851,11 @@ func (cfg *Server) Validate(log logging.Logger) (err error) {
return FaultConfigSysRsvdZero
}

if cfg.EngineAutoRestartMinDelay < 0 {
return errors.Errorf("engine_auto_restart_min_delay must be >= 0 (got %d)",
cfg.EngineAutoRestartMinDelay)
}

// A config without engines is valid when initially discovering hardware prior to adding
// per-engine sections with device allocations.
if len(cfg.Engines) == 0 {
Expand Down
2 changes: 2 additions & 0 deletions src/control/server/config/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,8 @@ func TestServerConfig_Constructed(t *testing.T) {
WithSystemRamReserved(5).
WithAllowNumaImbalance(true).
WithAllowTHP(true).
WithDisableEngineAutoRestart(true).
WithEngineAutoRestartMinDelay(120).
WithKernelConfigPath("/host/boot/config")

// add engines explicitly to test functionality applied in WithEngines()
Expand Down
2 changes: 1 addition & 1 deletion src/control/server/ctl_check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func TestServer_ControlService_CheckEngineRepair(t *testing.T) {
t.Fatalf("setup error - wrong type for Engine (%T)", e)
}

setupTestEngine(t, srv, uint32(i), rankNums[i])
setupTestEngine(t, srv, rankNums[i])

drpcCfg := new(mockDrpcClientConfig)
drpcCfg.ConnectError = tc.drpcErr
Expand Down
25 changes: 24 additions & 1 deletion src/control/server/ctl_ranks_rpc.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//
// (C) Copyright 2020-2024 Intel Corporation.
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -153,6 +153,21 @@ func (svc *ControlService) memberStateResults(instances []Engine, tgtState syste
return results, nil
}

// clearRankRestartHistory clears the restart history held by mgr for each of the
// supplied engine instances that reports a rank. It is called after manual
// stop/start requests so that rate-limiting does not interfere with manual
// operations and vice versa.
func clearRankRestartHistory(mgr *engineRestartManager, instances []Engine) {
	toClear := make([]ranklist.Rank, 0, len(instances))

	for _, inst := range instances {
		rank, err := inst.GetRank()
		if err != nil {
			// Skip instances whose rank cannot be read.
			continue
		}
		toClear = append(toClear, rank)
	}

	// Nothing to do when no instance reported a rank.
	if len(toClear) == 0 {
		return
	}

	mgr.clearRankRestartHistory(toClear)
}

// StopRanks implements the method defined for the Management Service.
//
// Stop data-plane instance(s) managed by control-plane identified by unique
Expand Down Expand Up @@ -206,6 +221,10 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) (
return nil, err
}

// clear state history for stopped ranks, instances have already been filtered by
// FilterInstancesByRankSet() to match req.GetRanks()
clearRankRestartHistory(svc.restartMgr, instances)

return resp, nil
}

Expand Down Expand Up @@ -319,6 +338,10 @@ func (svc *ControlService) StartRanks(ctx context.Context, req *ctlpb.RanksReq)
return nil, err
}

// clear state history for started ranks, instances have already been filtered by
// FilterInstancesByRankSet() to match req.GetRanks()
clearRankRestartHistory(svc.restartMgr, instances)

return resp, nil
}

Expand Down
Loading
Loading