Skip to content

Commit 9a05074

Browse files
committed
Add unit tests in updater & refactor
1 parent 38964e9 commit 9a05074

File tree

11 files changed

+810
-383
lines changed

11 files changed

+810
-383
lines changed

sn-manager/README.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ Auto-update checks run every 10 minutes when enabled.
200200

201201
## Version Update Scenarios
202202

203-
The auto-updater follows stable-only, same-major update rules and defers updates while the gateway is busy. Summary:
203+
The auto-updater follows stable-only, same-major update rules and coordinates updates around the gateway state to minimize disruption while avoiding being stuck. Summary:
204204

205205
| Current | Available | Auto-Upgrade Enabled | Auto Updates? | Manual Option |
206206
|---|---|---|---|---|
@@ -211,17 +211,23 @@ The auto-updater follows stable-only, same-major update rules and defers updates
211211
| v1.7.4 | v1.7.4 (stable) | Yes | ❌ (already up to date) | |
212212
| v1.7.5 | v1.7.4 (stable) | Yes | ❌ (no downgrade) | |
213213
| Any | Any | No | ❌ | `sn-manager get [version] && sn-manager use [version]` |
214-
| Any | Any | Yes, but gateway busy | ❌ (deferred) | Manual allowed |
214+
| Any | Any | Yes, but gateway busy | ⏳ Deferred (max 1 hour), then ✅ | Manual allowed |
215215

216216
Mechanics and notes:
217217
- Stable-only: auto-updater targets latest stable GitHub release (non-draft, non-prerelease).
218218
- Same-major only: SuperNode and sn-manager auto-update only when the latest is the same major version (the number before the first dot). Example: 1.7 → 1.8 = allowed; 1.x → 2.0 = manual.
219-
- Gateway-aware: updates are applied only when the gateway reports no running tasks; otherwise they are deferred.
220-
- Gateway errors: repeated check failures over a 5-minute window request a clean SuperNode restart (no version change) to recover.
219+
- Gateway idle: updates are applied when the gateway reports no running tasks.
220+
- Gateway busy: if tasks are running, updates are deferred for up to 1 hour for the target version; after that hard window, the update proceeds to avoid being stuck.
221+
- Gateway unresponsive: if an update is available, it proceeds immediately to break the deadlock; if no update is available, a clean SuperNode restart is requested via a restart marker.
221222
- Combined tarball: when updating, sn-manager downloads a single tarball once, then updates itself first (if eligible), then installs/activates the new SuperNode version.
222223
- Config is updated to reflect the new `updates.current_version` after a successful SuperNode update.
223224
- Manual installs: you can always override with `sn-manager get <version>` and `sn-manager use <version>`; pre-releases are supported manually.
224225

226+
### Update Timing
227+
228+
- Checks run every 10 minutes when auto-upgrade is enabled.
229+
- On every `sn-manager start`, the updater runs an immediate check and bypasses the gateway check once so that initial updates can be applied even if the gateway is not yet available.
230+
225231
## Start/Stop Behavior
226232

227233
`sn-manager start` and `supernode start` clear the stop marker; `supernode stop` sets it. The following describes how the manager and SuperNode processes behave for each command, plus systemd nuances:

sn-manager/cmd/start.go

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ func runStart(cmd *cobra.Command, args []string) error {
125125
var autoUpdater *updater.AutoUpdater
126126
if cfg.Updates.AutoUpgrade {
127127
autoUpdater = updater.New(home, cfg, appVersion)
128+
// On every manager start, bypass the gateway check once so an
129+
// update can be applied even if the gateway isn't up yet.
130+
autoUpdater.SkipGatewayCheckOnce()
128131
autoUpdater.Start(ctx)
129132
}
130133

@@ -168,39 +171,47 @@ func runStart(cmd *cobra.Command, args []string) error {
168171
}
169172
}
170173

171-
// ensureBinaryExists ensures we have at least one SuperNode binary
174+
// ensureBinaryExists ensures there is an installed and active SuperNode binary.
175+
// Steps:
176+
// 1) If versions exist: ensure a current symlink and sync config.
177+
// 2) If no versions exist: download, install, and activate the latest release.
172178
func ensureBinaryExists(home string, cfg *config.Config) error {
173179
versionMgr := version.NewManager(home)
174-
175-
// Check if we have any versions installed
176180
versions, err := versionMgr.ListVersions()
177181
if err != nil {
178182
return err
179183
}
180-
181184
if len(versions) > 0 {
182-
// We have versions, make sure current is set
183-
current, err := versionMgr.GetCurrentVersion()
184-
if err != nil || current == "" {
185-
// Set the first available version as current
186-
if err := versionMgr.SetCurrentVersion(versions[0]); err != nil {
187-
return fmt.Errorf("failed to set current version: %w", err)
188-
}
189-
current = versions[0]
190-
}
185+
return ensureCurrentVersionSet(versionMgr, cfg, home, versions)
186+
}
187+
return downloadAndInstallLatest(versionMgr, home, cfg)
188+
}
191189

192-
// Update config if current version is not set or different
193-
if cfg.Updates.CurrentVersion != current {
194-
cfg.Updates.CurrentVersion = current
195-
configPath := filepath.Join(home, "config.yml")
196-
if err := config.Save(cfg, configPath); err != nil {
197-
return fmt.Errorf("failed to update config with current version: %w", err)
198-
}
190+
// ensureCurrentVersionSet sets a current version and syncs config if needed.
191+
// It ensures a current version is set (falling back to the first listed version) and
192+
// persists it to the sn-manager config when changed.
193+
func ensureCurrentVersionSet(versionMgr *version.Manager, cfg *config.Config, home string, versions []string) error {
194+
current, err := versionMgr.GetCurrentVersion()
195+
if err != nil || current == "" {
196+
if err := versionMgr.SetCurrentVersion(versions[0]); err != nil {
197+
return fmt.Errorf("failed to set current version: %w", err)
198+
}
199+
current = versions[0]
200+
}
201+
if cfg.Updates.CurrentVersion != current {
202+
cfg.Updates.CurrentVersion = current
203+
configPath := filepath.Join(home, "config.yml")
204+
if err := config.Save(cfg, configPath); err != nil {
205+
return fmt.Errorf("failed to update config with current version: %w", err)
199206
}
200-
return nil
201207
}
208+
return nil
209+
}
202210

203-
// No versions installed, download latest tarball and extract supernode
211+
// downloadAndInstallLatest downloads, installs, and activates the latest SuperNode.
212+
// It downloads the combined release, extracts supernode,
213+
// installs the version, activates it, and syncs config.
214+
func downloadAndInstallLatest(versionMgr *version.Manager, home string, cfg *config.Config) error {
204215
fmt.Println("No SuperNode binary found. Downloading latest version...")
205216

206217
client := github.NewClient(config.GitHubRepo)

sn-manager/internal/manager/manager.go

Lines changed: 10 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ import (
1414
"github.com/LumeraProtocol/supernode/v2/sn-manager/internal/config"
1515
)
1616

17+
// Constants for process management
18+
const (
19+
DefaultShutdownTimeout = 30 * time.Second
20+
ProcessCheckInterval = 5 * time.Second
21+
CrashBackoffDelay = 2 * time.Second
22+
StopMarkerFile = ".stop_requested"
23+
RestartMarkerFile = ".needs_restart"
24+
)
25+
1726
// Manager handles the SuperNode process lifecycle
1827
type Manager struct {
1928
config *config.Config
@@ -171,177 +180,9 @@ func (m *Manager) cleanup() {
171180
}
172181
}
173182

174-
// Constants for process management
175-
const (
176-
DefaultShutdownTimeout = 30 * time.Second
177-
ProcessCheckInterval = 5 * time.Second
178-
CrashBackoffDelay = 2 * time.Second
179-
StopMarkerFile = ".stop_requested"
180-
RestartMarkerFile = ".needs_restart"
181-
)
182-
183-
// Monitor continuously supervises the SuperNode process
184-
// It ensures SuperNode is always running unless a stop marker is present
185-
func (m *Manager) Monitor(ctx context.Context) error {
186-
187-
// Create ticker for periodic checks
188-
ticker := time.NewTicker(ProcessCheckInterval)
189-
defer ticker.Stop()
190-
191-
// Channel to monitor process exits
192-
processExitCh := make(chan error, 1)
193-
194-
// Function to arm the process wait goroutine
195-
armProcessWait := func() {
196-
processExitCh = make(chan error, 1)
197-
go func() {
198-
if err := m.Wait(); err != nil {
199-
processExitCh <- err
200-
} else {
201-
processExitCh <- nil
202-
}
203-
}()
204-
}
205-
206-
// Initial check and start if needed
207-
stopMarkerPath := filepath.Join(m.homeDir, StopMarkerFile)
208-
if _, err := os.Stat(stopMarkerPath); os.IsNotExist(err) {
209-
// No stop marker, ensure SuperNode is running
210-
if !m.IsRunning() {
211-
log.Println("Starting SuperNode...")
212-
if err := m.Start(ctx); err != nil {
213-
log.Printf("Failed to start SuperNode: %v", err)
214-
} else {
215-
armProcessWait()
216-
}
217-
} else {
218-
// Already running, arm the wait
219-
armProcessWait()
220-
}
221-
} else {
222-
log.Println("Stop marker present, SuperNode will not be started")
223-
}
224-
225-
// Main supervision loop
226-
for {
227-
select {
228-
case <-ctx.Done():
229-
// Context cancelled, stop monitoring
230-
return ctx.Err()
231-
232-
case err := <-processExitCh:
233-
// SuperNode process exited
234-
if err != nil {
235-
log.Printf("SuperNode exited with error: %v", err)
236-
} else {
237-
log.Printf("SuperNode exited normally")
238-
}
239-
240-
// Cleanup internal state after exit
241-
m.mu.Lock()
242-
m.cleanup()
243-
m.mu.Unlock()
244-
245-
// Check if we should restart
246-
if _, err := os.Stat(stopMarkerPath); err == nil {
247-
log.Println("Stop marker present, not restarting SuperNode")
248-
continue
249-
}
250-
251-
// Apply backoff to prevent rapid restart loops
252-
time.Sleep(CrashBackoffDelay)
253-
254-
// Restart SuperNode
255-
log.Println("Restarting SuperNode after crash...")
256-
if err := m.Start(ctx); err != nil {
257-
log.Printf("Failed to restart SuperNode: %v", err)
258-
continue
259-
}
260-
armProcessWait()
261-
log.Println("SuperNode restarted successfully")
262-
263-
case <-ticker.C:
264-
// Periodic check for various conditions
265-
266-
// 1. Check if stop marker was removed and we should start
267-
if !m.IsRunning() {
268-
if _, err := os.Stat(stopMarkerPath); os.IsNotExist(err) {
269-
log.Println("Stop marker removed, starting SuperNode...")
270-
if err := m.Start(ctx); err != nil {
271-
log.Printf("Failed to start SuperNode: %v", err)
272-
} else {
273-
armProcessWait()
274-
log.Println("SuperNode started")
275-
}
276-
}
277-
}
278-
279-
// 2. Check if binary was updated and needs restart
280-
restartMarkerPath := filepath.Join(m.homeDir, RestartMarkerFile)
281-
if _, err := os.Stat(restartMarkerPath); err == nil {
282-
if m.IsRunning() {
283-
log.Println("Binary update detected, restarting SuperNode...")
284-
285-
// Remove the restart marker
286-
if err := os.Remove(restartMarkerPath); err != nil && !os.IsNotExist(err) {
287-
log.Printf("Warning: failed to remove restart marker: %v", err)
288-
}
289-
290-
// Create temporary stop marker for clean restart
291-
tmpStopMarker := []byte("update")
292-
os.WriteFile(stopMarkerPath, tmpStopMarker, 0644)
293-
294-
// Stop current process
295-
if err := m.Stop(); err != nil {
296-
log.Printf("Failed to stop for update: %v", err)
297-
if err := os.Remove(stopMarkerPath); err != nil && !os.IsNotExist(err) {
298-
log.Printf("Warning: failed to remove stop marker: %v", err)
299-
}
300-
continue
301-
}
302-
303-
// Brief pause
304-
time.Sleep(CrashBackoffDelay)
305-
306-
// Remove temporary stop marker
307-
if err := os.Remove(stopMarkerPath); err != nil && !os.IsNotExist(err) {
308-
log.Printf("Warning: failed to remove stop marker: %v", err)
309-
}
310-
311-
// Start with new binary
312-
log.Println("Starting with updated binary...")
313-
if err := m.Start(ctx); err != nil {
314-
log.Printf("Failed to start updated binary: %v", err)
315-
} else {
316-
armProcessWait()
317-
log.Println("SuperNode restarted with new binary")
318-
}
319-
}
320-
}
321-
322-
// 3. Health check - ensure process is actually alive
323-
if m.IsRunning() {
324-
// Process thinks it's running, verify it really is
325-
m.mu.RLock()
326-
proc := m.process
327-
m.mu.RUnlock()
328-
329-
if proc != nil {
330-
if err := proc.Signal(syscall.Signal(0)); err != nil {
331-
// Process is dead but not cleaned up
332-
log.Println("Detected stale process, cleaning up...")
333-
m.mu.Lock()
334-
m.cleanup()
335-
m.mu.Unlock()
336-
}
337-
}
338-
}
339-
}
340-
}
341-
}
183+
// Monitor and related helpers moved to monitor.go
342184

343185
// GetConfig returns the manager configuration
344186
func (m *Manager) GetConfig() *config.Config {
345187
return m.config
346188
}
347-

0 commit comments

Comments
 (0)