Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions sn-manager/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ Auto-update checks run every 10 minutes when enabled.

## Version Update Scenarios

The auto-updater follows stable-only, same-major update rules and defers updates while the gateway is busy. Summary:
The auto-updater follows stable-only, same-major update rules and schedules updates around gateway activity to minimize disruption without letting an update stall indefinitely. Summary:

| Current | Available | Auto-Upgrade Enabled | Auto Updates? | Manual Option |
|---|---|---|---|---|
Expand All @@ -211,17 +211,23 @@ The auto-updater follows stable-only, same-major update rules and defers updates
| v1.7.4 | v1.7.4 (stable) | Yes | ❌ | — |
| v1.7.5 | v1.7.4 (stable) | Yes | ❌ | — |
| Any | Any | No | ❌ | `sn-manager get [version] && sn-manager use [version]` |
| Any | Any | Yes, but gateway busy | ❌ (deferred) | Manual allowed |
| Any | Any | Yes, but gateway busy | ⏳ Deferred (max 1 hour), then ✅ | Manual allowed |

Mechanics and notes:
- Stable-only: auto-updater targets latest stable GitHub release (non-draft, non-prerelease).
- Same-major only: SuperNode and sn-manager auto-update only when the latest is the same major version (the number before the first dot). Example: 1.7 → 1.8 = allowed; 1.x → 2.0 = manual.
- Gateway-aware: updates are applied only when the gateway reports no running tasks; otherwise they are deferred.
- Gateway errors: repeated check failures over a 5-minute window request a clean SuperNode restart (no version change) to recover.
- Gateway idle: updates are applied when the gateway reports no running tasks.
- Gateway busy: if tasks are running, updates are deferred for up to 1 hour for the target version; after that hard window, the update proceeds to avoid being stuck.
- Gateway unresponsive: if an update is available, it proceeds immediately to break the deadlock; if no update is available, a clean SuperNode restart is requested via a restart marker.
- Combined tarball: when updating, sn-manager downloads a single tarball once, then updates itself first (if eligible), then installs/activates the new SuperNode version.
- Config is updated to reflect the new `updates.current_version` after a successful SuperNode update.
- Manual installs: you can always override with `sn-manager get <version>` and `sn-manager use <version>`; pre-releases are supported manually.

### Update Timing

- Checks run every 10 minutes when auto-upgrade is enabled.
- On every `sn-manager start`, the updater runs an immediate check and bypasses the gateway check once so that initial updates can be applied even if the gateway is not yet available.

## Start/Stop Behavior

`sn-manager start` and `supernode start` clear the stop marker; `supernode stop` sets it. The following describes how the manager and SuperNode processes behave for each command, including systemd nuances:
Expand Down
55 changes: 33 additions & 22 deletions sn-manager/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ func runStart(cmd *cobra.Command, args []string) error {
var autoUpdater *updater.AutoUpdater
if cfg.Updates.AutoUpgrade {
autoUpdater = updater.New(home, cfg, appVersion)
// On every manager start, bypass the gateway check once so an
// update can be applied even if the gateway isn't up yet.
autoUpdater.SkipGatewayCheckOnce()
autoUpdater.Start(ctx)
}

Expand Down Expand Up @@ -168,39 +171,47 @@ func runStart(cmd *cobra.Command, args []string) error {
}
}

// ensureBinaryExists ensures there is an installed and active SuperNode binary.
//
// If at least one version is already installed, it delegates to
// ensureCurrentVersionSet, which selects a current version when none is set
// and keeps cfg.Updates.CurrentVersion in sync with it. If nothing is
// installed, it delegates to downloadAndInstallLatest.
//
// It returns any error from listing the installed versions or from the helper
// it delegates to.
func ensureBinaryExists(home string, cfg *config.Config) error {
	versionMgr := version.NewManager(home)

	versions, err := versionMgr.ListVersions()
	if err != nil {
		return err
	}

	// The "pick a current version" fallback lives in ensureCurrentVersionSet;
	// it is deliberately not repeated here.
	if len(versions) > 0 {
		return ensureCurrentVersionSet(versionMgr, cfg, home, versions)
	}
	return downloadAndInstallLatest(versionMgr, home, cfg)
}

// Update config if current version is not set or different
if cfg.Updates.CurrentVersion != current {
cfg.Updates.CurrentVersion = current
configPath := filepath.Join(home, "config.yml")
if err := config.Save(cfg, configPath); err != nil {
return fmt.Errorf("failed to update config with current version: %w", err)
}
// ensureCurrentVersionSet makes sure a current SuperNode version is selected
// and that the sn-manager config records it.
//
// If versionMgr reports no current version (or fails to report one), the first
// entry of versions is activated. When the resulting version differs from
// cfg.Updates.CurrentVersion, cfg is updated in place and saved to
// <home>/config.yml.
//
// It returns an error if a fallback is needed but versions is empty, if the
// version cannot be activated, or if the config cannot be saved.
func ensureCurrentVersionSet(versionMgr *version.Manager, cfg *config.Config, home string, versions []string) error {
	current, err := versionMgr.GetCurrentVersion()
	if err != nil || current == "" {
		// A lookup error is treated the same as "nothing selected": fall back
		// to the first installed version. Guard the index so an empty list
		// yields an error instead of a panic.
		if len(versions) == 0 {
			return fmt.Errorf("failed to set current version: no installed versions available")
		}
		current = versions[0]
		if err := versionMgr.SetCurrentVersion(current); err != nil {
			return fmt.Errorf("failed to set current version: %w", err)
		}
	}

	// Nothing to persist when the config already records this version.
	if cfg.Updates.CurrentVersion == current {
		return nil
	}

	cfg.Updates.CurrentVersion = current
	configPath := filepath.Join(home, "config.yml")
	if err := config.Save(cfg, configPath); err != nil {
		return fmt.Errorf("failed to update config with current version: %w", err)
	}
	return nil
}

// No versions installed, download latest tarball and extract supernode
// downloadAndInstallLatest downloads, installs, and activates the latest SuperNode.
// downloadAndInstallLatest downloads the combined release, extracts supernode,
// installs the version, activates it, and syncs config.
func downloadAndInstallLatest(versionMgr *version.Manager, home string, cfg *config.Config) error {
fmt.Println("No SuperNode binary found. Downloading latest version...")

client := github.NewClient(config.GitHubRepo)
Expand Down
179 changes: 10 additions & 169 deletions sn-manager/internal/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ import (
"github.com/LumeraProtocol/supernode/v2/sn-manager/internal/config"
)

// Constants for process management
const (
	// DefaultShutdownTimeout is a 30-second timeout.
	// NOTE(review): presumably the maximum wait for a graceful SuperNode
	// shutdown; its use is not visible in this part of the file — confirm.
	DefaultShutdownTimeout = 30 * time.Second
	// ProcessCheckInterval is the period of the ticker that drives Monitor's
	// periodic marker and health checks.
	ProcessCheckInterval   = 5 * time.Second
	// CrashBackoffDelay is the pause Monitor takes before restarting
	// SuperNode after it exits, and between stop and start during an
	// update-triggered restart.
	CrashBackoffDelay      = 2 * time.Second
	// StopMarkerFile is the name of the file in the manager home directory
	// whose presence tells Monitor not to start or restart SuperNode.
	StopMarkerFile         = ".stop_requested"
	// RestartMarkerFile is the name of the file in the manager home directory
	// whose presence tells Monitor to restart a running SuperNode.
	RestartMarkerFile      = ".needs_restart"
)

// Manager handles the SuperNode process lifecycle
type Manager struct {
config *config.Config
Expand Down Expand Up @@ -171,177 +180,9 @@ func (m *Manager) cleanup() {
}
}

// Constants for process management
const (
	// DefaultShutdownTimeout is a 30-second timeout.
	// NOTE(review): presumably the maximum wait for a graceful SuperNode
	// shutdown; its use is not visible in this part of the file — confirm.
	DefaultShutdownTimeout = 30 * time.Second
	// ProcessCheckInterval is the period of the ticker that drives Monitor's
	// periodic marker and health checks.
	ProcessCheckInterval   = 5 * time.Second
	// CrashBackoffDelay is the pause Monitor takes before restarting
	// SuperNode after it exits, and between stop and start during an
	// update-triggered restart.
	CrashBackoffDelay      = 2 * time.Second
	// StopMarkerFile is the name of the file in the manager home directory
	// whose presence tells Monitor not to start or restart SuperNode.
	StopMarkerFile         = ".stop_requested"
	// RestartMarkerFile is the name of the file in the manager home directory
	// whose presence tells Monitor to restart a running SuperNode.
	RestartMarkerFile      = ".needs_restart"
)

// Monitor continuously supervises the SuperNode process until ctx is cancelled.
//
// On entry it starts SuperNode unless the stop marker file exists in the
// manager home directory. It then loops, reacting to three events:
//
//   - ctx cancellation: returns ctx.Err().
//   - process exit: cleans up internal state and, unless the stop marker is
//     present, restarts SuperNode after CrashBackoffDelay.
//   - every ProcessCheckInterval: starts SuperNode if it is not running and no
//     stop marker exists; restarts a running SuperNode if the restart marker
//     exists; and clears internal state if the tracked process no longer
//     responds to signal 0.
//
// Start/stop failures are logged and retried on later iterations; the only
// value Monitor ever returns is ctx.Err().
func (m *Manager) Monitor(ctx context.Context) error {
	// Create ticker for periodic checks
	ticker := time.NewTicker(ProcessCheckInterval)
	defer ticker.Stop()

	// processExitCh always refers to the exit channel of the most recently
	// armed process wait.
	processExitCh := make(chan error, 1)

	// armProcessWait starts a goroutine that waits for the current process to
	// exit and reports the result on a fresh channel. The goroutine sends on
	// its own captured channel (not on the shared processExitCh variable), so
	// a late exit notification from a previous process can never be delivered
	// on the channel of its successor. The buffer of one lets the goroutine
	// finish even if nobody receives from an abandoned channel.
	armProcessWait := func() {
		exitCh := make(chan error, 1)
		processExitCh = exitCh
		go func() {
			if err := m.Wait(); err != nil {
				exitCh <- err
			} else {
				exitCh <- nil
			}
		}()
	}

	// Initial check and start if needed
	stopMarkerPath := filepath.Join(m.homeDir, StopMarkerFile)
	if _, err := os.Stat(stopMarkerPath); os.IsNotExist(err) {
		// No stop marker, ensure SuperNode is running
		if !m.IsRunning() {
			log.Println("Starting SuperNode...")
			if err := m.Start(ctx); err != nil {
				log.Printf("Failed to start SuperNode: %v", err)
			} else {
				armProcessWait()
			}
		} else {
			// Already running, arm the wait
			armProcessWait()
		}
	} else {
		log.Println("Stop marker present, SuperNode will not be started")
	}

	// Main supervision loop
	for {
		select {
		case <-ctx.Done():
			// Context cancelled, stop monitoring
			return ctx.Err()

		case err := <-processExitCh:
			// SuperNode process exited
			if err != nil {
				log.Printf("SuperNode exited with error: %v", err)
			} else {
				log.Printf("SuperNode exited normally")
			}

			// Cleanup internal state after exit
			m.mu.Lock()
			m.cleanup()
			m.mu.Unlock()

			// Check if we should restart
			if _, err := os.Stat(stopMarkerPath); err == nil {
				log.Println("Stop marker present, not restarting SuperNode")
				continue
			}

			// Apply backoff to prevent rapid restart loops, but do not
			// restart at all if shutdown was requested in the meantime.
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(CrashBackoffDelay):
			}

			// Restart SuperNode
			log.Println("Restarting SuperNode after crash...")
			if err := m.Start(ctx); err != nil {
				log.Printf("Failed to restart SuperNode: %v", err)
				continue
			}
			armProcessWait()
			log.Println("SuperNode restarted successfully")

		case <-ticker.C:
			// Periodic check for various conditions

			// 1. Check if stop marker was removed and we should start
			if !m.IsRunning() {
				if _, err := os.Stat(stopMarkerPath); os.IsNotExist(err) {
					log.Println("Stop marker removed, starting SuperNode...")
					if err := m.Start(ctx); err != nil {
						log.Printf("Failed to start SuperNode: %v", err)
					} else {
						armProcessWait()
						log.Println("SuperNode started")
					}
				}
			}

			// 2. Check if binary was updated and needs restart
			restartMarkerPath := filepath.Join(m.homeDir, RestartMarkerFile)
			if _, err := os.Stat(restartMarkerPath); err == nil {
				if m.IsRunning() {
					log.Println("Binary update detected, restarting SuperNode...")

					// Remove the restart marker
					if err := os.Remove(restartMarkerPath); err != nil && !os.IsNotExist(err) {
						log.Printf("Warning: failed to remove restart marker: %v", err)
					}

					// Create temporary stop marker for clean restart. This is
					// best-effort: a failure is reported but does not abort
					// the restart.
					if err := os.WriteFile(stopMarkerPath, []byte("update"), 0644); err != nil {
						log.Printf("Warning: failed to write temporary stop marker: %v", err)
					}

					// Stop current process
					if err := m.Stop(); err != nil {
						log.Printf("Failed to stop for update: %v", err)
						// Do not leave the temporary marker behind, or the
						// process would stay stopped after it exits.
						if err := os.Remove(stopMarkerPath); err != nil && !os.IsNotExist(err) {
							log.Printf("Warning: failed to remove stop marker: %v", err)
						}
						continue
					}

					// Brief pause
					time.Sleep(CrashBackoffDelay)

					// Remove temporary stop marker
					if err := os.Remove(stopMarkerPath); err != nil && !os.IsNotExist(err) {
						log.Printf("Warning: failed to remove stop marker: %v", err)
					}

					// Start with new binary
					log.Println("Starting with updated binary...")
					if err := m.Start(ctx); err != nil {
						log.Printf("Failed to start updated binary: %v", err)
					} else {
						armProcessWait()
						log.Println("SuperNode restarted with new binary")
					}
				}
			}

			// 3. Health check - ensure process is actually alive
			if m.IsRunning() {
				// Process thinks it's running, verify it really is
				m.mu.RLock()
				proc := m.process
				m.mu.RUnlock()

				if proc != nil {
					// Signal 0 delivers nothing; it only reports whether the
					// process can still be signalled.
					if err := proc.Signal(syscall.Signal(0)); err != nil {
						// Process is dead but not cleaned up
						log.Println("Detected stale process, cleaning up...")
						m.mu.Lock()
						m.cleanup()
						m.mu.Unlock()
					}
				}
			}
		}
	}
}
// Monitor and related helpers moved to monitor.go

// GetConfig returns the configuration held by the manager.
// The returned pointer is the manager's own config value, not a copy, so
// changes made through it are visible to the manager.
func (m *Manager) GetConfig() *config.Config {
	return m.config
}

Loading