tarantool · dmyger · Apr 3, 2025 · Apr 1, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,10 +8,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
-  - `tt aeon connect` added tests for connect file/app.
 
+- `tt aeon connect`: added tests for connect file/app.
 - `tt pack `: support `.packignore` file to specify files that should not be included
   in package (works the same as `.gitignore`).
+- `tt tcm start`: added the tcm command.
+- `tt tcm start` OR `tt tcm start --path /path/to/tcm`: added the capability to run TCM in interactive mode.
+- `tt tcm start --watchdog`: implemented Watchdog mode for automatic restarting of TCM upon unexpected termination.
 
 ### Changed
 
@@ -708,4 +711,4 @@ Additionally, several fixes were implemented to improve stability.
 - Module ``tt create``, to create an application from a template.
 - Module ``tt build``, to build an application.
 - Module ``tt install``, to install tarantool/tt.
-- Module ``tt remove``, to remove tarantool/tt.
+- Module ``tt remove``, to remove tarantool/tt.
diff --git a/cli/cmd/root.go b/cli/cmd/root.go
@@ -211,6 +211,7 @@ After that tt will be able to manage the application using 'replicaset_example'
 		NewLogCmd(),
 		NewEnableCmd(),
 		NewAeonCmd(),
+		NewTcmCmd(),
 	)
 	if err := injectCmds(rootCmd); err != nil {
 		panic(err.Error())

diff --git a/cli/cmd/tcm.go b/cli/cmd/tcm.go
@@ -0,0 +1,97 @@
+package cmd
+
+import (
+	"errors"
+	"os"
+	"os/exec"
+	"time"
+
+	"github.com/spf13/cobra"
+	"github.com/tarantool/tt/cli/cmdcontext"
+	"github.com/tarantool/tt/cli/modules"
+	tcmCmd "github.com/tarantool/tt/cli/tcm"
+	"github.com/tarantool/tt/cli/util"
+)
+
+// tcmCtx stores the values of the `tt tcm start` flags (--path, --watchdog).
+var tcmCtx = tcmCmd.TcmCtx{}
+
+func newTcmStartCmd() *cobra.Command {
+	var tcmCmd = &cobra.Command{
+		Use:   "start",
+		Short: "Start tcm application",
+		Long: `Start to the tcm.
+		tt tcm start --watchdog
+		tt tcm start --path`,
+		Run: func(cmd *cobra.Command, args []string) {
+			cmdCtx.CommandName = cmd.Name()
+			err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalStartTcm, args)
+			util.HandleCmdErr(cmd, err)
+
+		},
+	}
+	tcmCmd.Flags().StringVar(&tcmCtx.Executable, "path", "", "the path to the tcm binary file")
+	tcmCmd.Flags().BoolVar(&tcmCtx.Watchdog, "watchdog", false, "enables the watchdog")
+
+	return tcmCmd
+}
+
+func NewTcmCmd() *cobra.Command {
+	var tcmCmd = &cobra.Command{
+		Use:   "tcm",
+		Short: "Manage tcm application",
+	}
+	tcmCmd.AddCommand(
+		newTcmStartCmd(),
+	)
+	return tcmCmd
+}
+
+// startTcmInteractive runs the binary named by tcmCtx.Executable in the
+// foreground, forwarding its stdout and stderr to those of the current
+// process. It returns the error reported by the run, if any.
+func startTcmInteractive() error {
+	app := exec.Command(tcmCtx.Executable)
+	app.Stdout, app.Stderr = os.Stdout, os.Stderr
+
+	return app.Run()
+}
+
+func startTcmUnderWatchDog() error {
+	wd, err := tcmCmd.NewWatchdog(5 * time.Second)
+	if err != nil {
+		return err
+	}
+
+	if err := wd.Start(tcmCtx.Executable); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// internalStartTcm is the handler of `tt tcm start`.
+//
+// It requires a tarantool binary to be configured, resolves the tcm binary and
+// then runs it in exactly one mode: under the watchdog when --watchdog is set,
+// in the foreground otherwise. args is not used.
+//
+// It returns an error when the tarantool or tcm binary is unknown, or the
+// error reported by the selected run mode.
+func internalStartTcm(cmdCtx *cmdcontext.CmdCtx, args []string) error {
+	if cmdCtx.Cli.TarantoolCli.Executable == "" {
+		return errors.New("cannot start: tarantool binary is not found")
+	}
+
+	// A binary given explicitly with --path takes precedence; the binary from
+	// the command context is only a fallback. Overwriting unconditionally
+	// would silently discard the flag value.
+	if tcmCtx.Executable == "" {
+		tcmCtx.Executable = cmdCtx.Cli.TcmCli.Executable
+	}
+	if tcmCtx.Executable == "" {
+		return errors.New("cannot start: tcm binary is not found")
+	}
+
+	// The two modes are mutually exclusive: returning here keeps the watchdog
+	// from being started after the interactive run finishes.
+	if tcmCtx.Watchdog {
+		return startTcmUnderWatchDog()
+	}
+
+	return startTcmInteractive()
+}
diff --git a/cli/tcm/tcm.go b/cli/tcm/tcm.go
@@ -0,0 +1,6 @@
+package tcm
+
+// TcmCtx holds the options of the tcm command.
+type TcmCtx struct {
+	// Executable is the path to the tcm binary file.
+	Executable string
+	// Watchdog enables the watchdog mode.
+	Watchdog   bool
+}
diff --git a/cli/tcm/watchdog.go b/cli/tcm/watchdog.go
@@ -0,0 +1,158 @@
+package tcm
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// Watchdog manages the lifecycle of a process: Start launches it and launches
+// it again after every exit until Stop is called.
+type Watchdog struct {
+	// The most recently created command; Stop signals its process.
+	cmd *exec.Cmd
+	// Pause between the exit of the process and its next start.
+	restartTimeout time.Duration
+	// Set by Stop; Start checks it before each launch and after each exit.
+	shouldStop bool
+	// Mutex to protect access to shouldStop.
+	stopMutex sync.Mutex
+	// Held by Start for its whole run; Stop waits on it.
+	doneBarrier sync.WaitGroup
+	// Path of the file the process PID is written to.
+	pidFile string
+}
+
+// NewWatchdog returns a Watchdog that pauses for restartTimeout between
+// restarts and records the process PID in "tcm/pidFile.pid". The returned
+// error is always nil.
+func NewWatchdog(restartTimeout time.Duration) (*Watchdog, error) {
+	wd := new(Watchdog)
+	wd.pidFile = "tcm/pidFile.pid"
+	wd.restartTimeout = restartTimeout
+
+	return wd, nil
+}
+
+// Start launches bin with args and keeps it running: every time the process
+// exits, Start pauses for restartTimeout and launches it again. The PID of
+// each launched process is written to the PID file.
+//
+// Start blocks. It returns nil once Stop has been requested (directly or by
+// the SIGINT/SIGTERM handler started here). It returns an error when the
+// process cannot be started, its PID cannot be recorded, or waiting for it
+// fails with something other than a non-zero exit status.
+func (wd *Watchdog) Start(bin string, args ...string) error {
+	wd.doneBarrier.Add(1)
+	defer wd.doneBarrier.Done()
+
+	signalCtx, signalCancel := context.WithCancel(context.Background())
+	defer signalCancel()
+
+	go wd.handleSignals(signalCtx, signalCancel)
+
+	for {
+		cmd := exec.Command(bin, args...)
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+
+		// Checking the stop flag, starting the process and publishing it in
+		// wd.cmd form one critical section. Stop reads wd.cmd under the same
+		// mutex, so it either runs before the launch (and prevents it) or sees
+		// the freshly started process and can signal it.
+		wd.stopMutex.Lock()
+		if wd.shouldStop {
+			wd.stopMutex.Unlock()
+			return nil
+		}
+		log.Println("(INFO): Starting process...")
+		if err := cmd.Start(); err != nil {
+			wd.stopMutex.Unlock()
+			log.Printf("(ERROR): Failed to start process: %v\n", err)
+			return err
+		}
+		wd.cmd = cmd
+		wd.stopMutex.Unlock()
+
+		if err := wd.writePIDToFile(); err != nil {
+			log.Printf("(ERROR): Failed to write PID to file: %v\n", err)
+			// Start is about to give up, so do not leave an unsupervised,
+			// unreaped child behind. Errors are ignored on purpose: the
+			// process may already be gone and the original failure is the one
+			// worth reporting.
+			_ = cmd.Process.Kill()
+			_ = cmd.Wait()
+			return err
+		}
+
+		err := cmd.Wait()
+		if err != nil {
+			var exitErr *exec.ExitError
+			if errors.As(err, &exitErr) {
+				// A non-zero exit or a signal is exactly what the watchdog is
+				// for: log it and restart.
+				log.Printf("(WARN): Process exited with error: %v\n", exitErr)
+			} else {
+				log.Printf("(ERROR): Process failed: %v\n", err)
+				return err
+			}
+		} else {
+			log.Println("(INFO): Process completed successfully.")
+		}
+
+		// Skip the restart pause when the exit was caused by Stop.
+		wd.stopMutex.Lock()
+		if wd.shouldStop {
+			wd.stopMutex.Unlock()
+			return nil
+		}
+		wd.stopMutex.Unlock()
+
+		log.Printf("(INFO): Waiting for %s before restart...\n", wd.restartTimeout)
+		time.Sleep(wd.restartTimeout)
+	}
+}
+
+// Stop asks the watchdog to shut down: it marks the watchdog as stopping,
+// sends SIGTERM to the current process (if any), waits for Start to return
+// and then removes the PID file.
+func (wd *Watchdog) Stop() {
+	wd.stopMutex.Lock()
+	wd.shouldStop = true
+	if wd.cmd != nil && wd.cmd.Process != nil {
+		log.Println("(INFO): Stopping process...")
+		if err := wd.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+			log.Printf("(ERROR): Failed to stop process: %v\n", err)
+		}
+	}
+	wd.stopMutex.Unlock()
+
+	wd.doneBarrier.Wait()
+
+	// Remove only the PID file itself. Recursively deleting its parent
+	// directory would wipe unrelated data whenever the PID file lives in a
+	// shared location such as /tmp or the current directory.
+	if err := os.Remove(wd.pidFile); err != nil && !errors.Is(err, os.ErrNotExist) {
+		log.Printf("(ERROR): Failed to remove PID file: %v\n", err)
+	}
+	// Best-effort cleanup of the directory: os.Remove refuses to delete a
+	// non-empty directory, so anything else stored there is left untouched
+	// and the resulting error is intentionally ignored.
+	_ = os.Remove(filepath.Dir(wd.pidFile))
+
+	log.Println("(INFO): Watchdog stopped.")
+}
+
+// handleSignals waits for SIGINT or SIGTERM and stops the Watchdog when one
+// arrives. It returns without doing anything once ctx is done.
+//
+// cancel is invoked after Stop has returned.
+func (wd *Watchdog) handleSignals(ctx context.Context, cancel context.CancelFunc) {
+	signalChan := make(chan os.Signal, 1)
+	signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
+	// Undo the registration on exit; otherwise the signals keep being relayed
+	// to a channel nobody reads after this goroutine is gone.
+	defer signal.Stop(signalChan)
+
+	select {
+	case <-signalChan:
+		log.Println("(INFO): Received stop signal.")
+		wd.Stop()
+		cancel()
+	case <-ctx.Done():
+		return
+	}
+}
+
+// writePIDToFile writes the PID of the current process, as a decimal string
+// without a trailing newline, to the PID file. The parent directory is created
+// if missing and an existing file is truncated.
+//
+// It returns an error when no process has been started or when the directory
+// or file cannot be created, written or closed. Underlying errors are wrapped
+// and stay available to errors.Is / errors.As.
+func (wd *Watchdog) writePIDToFile() error {
+	if wd.cmd == nil || wd.cmd.Process == nil {
+		return errors.New("process is not running")
+	}
+
+	pid := wd.cmd.Process.Pid
+
+	dir := filepath.Dir(wd.pidFile)
+	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
+		return fmt.Errorf("failed to create PID file directory: %w", err)
+	}
+
+	file, err := os.Create(wd.pidFile)
+	if err != nil {
+		return fmt.Errorf("failed to create PID file: %w", err)
+	}
+
+	if _, err := fmt.Fprintf(file, "%d", pid); err != nil {
+		// The write error is the one to report; a Close failure adds nothing.
+		_ = file.Close()
+		return fmt.Errorf("failed to write PID file: %w", err)
+	}
+	// Close is checked explicitly: for a written file it can report an error
+	// that the write itself did not.
+	if err := file.Close(); err != nil {
+		return fmt.Errorf("failed to close PID file: %w", err)
+	}
+
+	log.Printf("(INFO): PID %d written to %s\n", pid, wd.pidFile)
+	return nil
+}
diff --git a/cli/tcm/watchdog_test.go b/cli/tcm/watchdog_test.go
@@ -0,0 +1,69 @@
+package tcm
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestWatchdogStartProcess checks that a started watchdog creates the PID file
+// and that Start returns without error once Stop is called.
+func TestWatchdogStartProcess(t *testing.T) {
+	watchdog, err := NewWatchdog(1 * time.Second)
+	require.NoError(t, err)
+
+	// The result of Start is handed back over a channel: require must only be
+	// called from the goroutine running the test.
+	startErr := make(chan error, 1)
+	go func() {
+		startErr <- watchdog.Start("sleep", "5")
+	}()
+
+	time.Sleep(2 * time.Second)
+
+	// Stop before asserting so that a failed assertion does not leave the
+	// child process and the watchdog goroutine running.
+	_, statErr := os.Stat(watchdog.pidFile)
+	watchdog.Stop()
+
+	require.NoError(t, statErr)
+	require.NoError(t, <-startErr)
+}
+
+// TestWatchdogRestartProcess checks that the watchdog launches the process
+// again after it exits: the PID recorded for the first run must differ from
+// the PID recorded after the restart.
+func TestWatchdogRestartProcess(t *testing.T) {
+	watchdog, err := NewWatchdog(1 * time.Second)
+	require.NoError(t, err)
+
+	// The result of Start is handed back over a channel: require must only be
+	// called from the goroutine running the test.
+	startErr := make(chan error, 1)
+	go func() {
+		startErr <- watchdog.Start("sleep", "1")
+	}()
+
+	// First run: started at ~0s, exits at ~1s.
+	time.Sleep(500 * time.Millisecond)
+	firstPID, firstErr := os.ReadFile(watchdog.pidFile)
+
+	// Second run: started at ~2s (1s run + 1s restart timeout).
+	time.Sleep(2500 * time.Millisecond)
+	secondPID, secondErr := os.ReadFile(watchdog.pidFile)
+
+	// Stop before asserting so that a failed assertion does not leave the
+	// child process and the watchdog goroutine running.
+	watchdog.Stop()
+
+	require.NoError(t, firstErr)
+	require.NoError(t, secondErr)
+	require.NotEqual(t, string(firstPID), string(secondPID))
+	require.NoError(t, <-startErr)
+}
+
+// TestWritePIDToFile checks that writePIDToFile stores the PID of the started
+// command as a decimal string.
+func TestWritePIDToFile(t *testing.T) {
+	// A per-test temporary directory avoids clashes with other tests or runs
+	// and is removed automatically.
+	pidFile := t.TempDir() + string(os.PathSeparator) + "watchdog_test.pid"
+
+	cmd := exec.Command("sleep", "1")
+	err := cmd.Start()
+	require.NoError(t, err)
+	defer func() {
+		// Kill and reap the helper process. Errors are ignored: the process
+		// may already have exited, and Wait reports the kill as an error.
+		_ = cmd.Process.Kill()
+		_ = cmd.Wait()
+	}()
+
+	watchdog := &Watchdog{
+		cmd:     cmd,
+		pidFile: pidFile,
+	}
+
+	err = watchdog.writePIDToFile()
+	require.NoError(t, err)
+
+	pidData, err := os.ReadFile(pidFile)
+	require.NoError(t, err)
+
+	expectedPID := fmt.Sprintf("%d", cmd.Process.Pid)
+	require.Equal(t, expectedPID, string(pidData))
+}