Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support --wal-dir in keeper #865

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions cmd/keeper/cmd/keeper.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ type config struct {

uid string
dataDir string
walDir string
debug bool
pgListenAddress string
pgAdvertiseAddress string
Expand Down Expand Up @@ -126,6 +127,7 @@ func init() {
CmdKeeper.PersistentFlags().StringVar(&cfg.uid, "id", "", "keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.")
CmdKeeper.PersistentFlags().StringVar(&cfg.uid, "uid", "", "keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.")
CmdKeeper.PersistentFlags().StringVar(&cfg.dataDir, "data-dir", "", "data directory")
CmdKeeper.PersistentFlags().StringVar(&cfg.walDir, "wal-dir", "", "wal directory")
CmdKeeper.PersistentFlags().StringVar(&cfg.pgListenAddress, "pg-listen-address", "", "postgresql instance listening address, local address used for the postgres instance. For all network interface, you can set the value to '*'.")
CmdKeeper.PersistentFlags().StringVar(&cfg.pgAdvertiseAddress, "pg-advertise-address", "", "postgresql instance address from outside. Use it to expose ip different than local ip with a NAT networking config")
CmdKeeper.PersistentFlags().StringVar(&cfg.pgPort, "pg-port", "5432", "postgresql instance listening port")
Expand Down Expand Up @@ -471,6 +473,7 @@ type PostgresKeeper struct {
bootUUID string

dataDir string
walDir string
pgListenAddress string
pgAdvertiseAddress string
pgPort string
Expand Down Expand Up @@ -522,6 +525,7 @@ func NewPostgresKeeper(cfg *config, end chan error) (*PostgresKeeper, error) {
bootUUID: common.UUID(),

dataDir: dataDir,
walDir: cfg.walDir,

pgListenAddress: cfg.pgListenAddress,
pgAdvertiseAddress: cfg.pgAdvertiseAddress,
Expand Down Expand Up @@ -823,7 +827,7 @@ func (p *PostgresKeeper) Start(ctx context.Context) {

// TODO(sgotti) reconfigure the various configurations options
// (RequestTimeout) after a changed cluster config
pgm := pg.NewManager(p.pgBinPath, p.dataDir, p.getLocalConnParams(), p.getLocalReplConnParams(), p.pgSUAuthMethod, p.pgSUUsername, p.pgSUPassword, p.pgReplAuthMethod, p.pgReplUsername, p.pgReplPassword, p.requestTimeout)
pgm := pg.NewManager(p.pgBinPath, p.dataDir, p.walDir, p.getLocalConnParams(), p.getLocalReplConnParams(), p.pgSUAuthMethod, p.pgSUUsername, p.pgSUPassword, p.pgReplAuthMethod, p.pgReplUsername, p.pgReplPassword, p.requestTimeout)
p.pgm = pgm

_ = p.pgm.StopIfStarted(true)
Expand Down Expand Up @@ -916,7 +920,7 @@ func (p *PostgresKeeper) resync(db, masterDB, followedDB *cluster.DB, tryPgrewin
replSlot = common.StolonName(db.UID)
}

if err := pgm.RemoveAll(); err != nil {
if err := pgm.RemoveAllIfInitialized(); err != nil {
return fmt.Errorf("failed to remove the postgres data dir: %v", err)
}
if slog.IsDebug() {
Expand Down Expand Up @@ -1115,7 +1119,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
}

// Clean up cluster db datadir
if err = pgm.RemoveAll(); err != nil {
if err = pgm.RemoveAllIfInitialized(); err != nil {
log.Errorw("failed to remove the postgres data dir", zap.Error(err))
return
}
Expand Down Expand Up @@ -1174,7 +1178,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
log.Errorw("failed to stop pg instance", zap.Error(err))
return
}
if err = pgm.RemoveAll(); err != nil {
if err = pgm.RemoveAllIfInitialized(); err != nil {
log.Errorw("failed to remove the postgres data dir", zap.Error(err))
return
}
Expand Down Expand Up @@ -1236,7 +1240,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
log.Errorw("failed to stop pg instance", zap.Error(err))
return
}
if err = pgm.RemoveAll(); err != nil {
if err = pgm.RemoveAllIfInitialized(); err != nil {
log.Errorw("failed to remove the postgres data dir", zap.Error(err))
return
}
Expand Down
1 change: 1 addition & 0 deletions doc/commands/stolon-keeper.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ stolon-keeper [flags]
--store-skip-tls-verify skip store certificate verification (insecure!!!)
--store-timeout duration store request timeout (default 5s)
--uid string keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.
--wal-dir string wal directory
```

###### Auto generated by spf13/cobra on 24-Feb-2021
2 changes: 1 addition & 1 deletion doc/pitr.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Note: the `\"` is needed by json to put double quotes inside strings. We aren't
When initializing a cluster in pitr init mode, a random registered keeper will be chosen and it'll start restoring the database with these steps:

* Remove the current data directory
* Call the `dataRestoreCommand` expanding every %d to the data directory full path. If it exits with a non zero exit code then stop here since something went wrong.
* Call the `dataRestoreCommand` expanding every %d to the data directory full path and every %w to the wal directory full path (if wal directory is provided to the keeper). If it exits with a non zero exit code then stop here since something went wrong.
* Create a `recovery.conf` with the right parameters and with `restore_command` set to `restoreCommand`.
* Start the postgres instance and wait for the archive recovery.

Expand Down
112 changes: 107 additions & 5 deletions internal/postgresql/postgresql.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"fmt"
"io"
"io/fs"
"io/ioutil"
"os"
"os/exec"
Expand Down Expand Up @@ -66,6 +67,7 @@ type PGManager interface {
type Manager struct {
pgBinPath string
dataDir string
walDir string
parameters common.Parameters
recoveryOptions *RecoveryOptions
hba []string
Expand Down Expand Up @@ -133,10 +135,11 @@ func SetLogger(l *zap.SugaredLogger) {
log = l
}

func NewManager(pgBinPath string, dataDir string, localConnParams, replConnParams ConnParams, suAuthMethod, suUsername, suPassword, replAuthMethod, replUsername, replPassword string, requestTimeout time.Duration) *Manager {
func NewManager(pgBinPath string, dataDir, walDir string, localConnParams, replConnParams ConnParams, suAuthMethod, suUsername, suPassword, replAuthMethod, replUsername, replPassword string, requestTimeout time.Duration) *Manager {
return &Manager{
pgBinPath: pgBinPath,
dataDir: filepath.Join(dataDir, "postgres"),
walDir: walDir,
parameters: make(common.Parameters),
recoveryOptions: NewRecoveryOptions(),
curParameters: make(common.Parameters),
Expand Down Expand Up @@ -222,6 +225,13 @@ func (p *Manager) Init(initConfig *InitConfig) error {
}
log.Debugw("execing cmd", "cmd", cmd)

// initdb supports configuring a separate wal directory via symlinks. Normally this
// parameter might be part of the initConfig, but it will also be required whenever we
// fall-back to a pg_basebackup during a re-sync, which is why it's a Manager field.
if p.walDir != "" {
cmd.Args = append(cmd.Args, "--waldir", p.walDir)
}

if initConfig.Locale != "" {
cmd.Args = append(cmd.Args, "--locale", initConfig.Locale)
}
Expand All @@ -240,7 +250,9 @@ func (p *Manager) Init(initConfig *InitConfig) error {
}
// remove the dataDir, so we don't end with an half initialized database
if err != nil {
os.RemoveAll(p.dataDir)
if cleanupErr := p.RemoveAll(); cleanupErr != nil {
log.Errorf("failed to cleanup database: %v", cleanupErr)
}
return err
}
return nil
Expand All @@ -250,7 +262,7 @@ func (p *Manager) Restore(command string) error {
var err error
var cmd *exec.Cmd

command = expand(command, p.dataDir)
command = expandRecoveryCommand(command, p.dataDir, p.walDir)

if err = os.MkdirAll(p.dataDir, 0700); err != nil {
err = fmt.Errorf("cannot create data dir: %v", err)
Expand All @@ -269,7 +281,9 @@ func (p *Manager) Restore(command string) error {
// On every error remove the dataDir, so we don't end with an half initialized database
out:
if err != nil {
os.RemoveAll(p.dataDir)
if cleanupErr := p.RemoveAll(); cleanupErr != nil {
log.Errorf("failed to cleanup database: %v", cleanupErr)
}
return err
}
return nil
Expand All @@ -286,10 +300,84 @@ func (p *Manager) StartTmpMerged() error {
return p.start("-c", fmt.Sprintf("config_file=%s", tmpPostgresConfPath))
}

func (p *Manager) moveWal() (err error) {
var curPath string
var desiredPath string
var tmpPath string
symlinkPath := filepath.Join(p.dataDir, "pg_wal")
if curPath, err = filepath.EvalSymlinks(symlinkPath); err != nil {
log.Errorf("could not evaluate symlink %s: %e", symlinkPath, err)
return err
}
if p.walDir == "" {
desiredPath = symlinkPath
tmpPath = filepath.Join(p.dataDir, "pg_wal_new")
} else {
desiredPath = p.walDir
tmpPath = p.walDir
}
if curPath == desiredPath {
return nil
}
if p.walDir == "" {
log.Infof("moving WAL from %s to %s first and then to %s", curPath, tmpPath, desiredPath)
} else {
log.Infof("moving WAL from %s to new location %s", curPath, desiredPath)
}
// We use tmpPath here first and (if needed) mv tmpPath to desiredPath when all is copied.
// This allows stolon-keeper to re-read symlink dest and continue should stolon-keeper be restarted while copying.
if err = moveDirRecursive(curPath, tmpPath); err != nil {
return err
}

var symlinkStat fs.FileInfo
if symlinkStat, err = os.Lstat(symlinkPath); errors.Is(err, os.ErrNotExist) {
// File or folder already removed
} else if err != nil {
log.Errorf("could not get info on current pg_wal folder/symlink %s: %e", symlinkPath, err)
return err
} else if symlinkStat.Mode()&os.ModeSymlink != 0 {
if err = os.Remove(symlinkPath); err != nil {
log.Errorf("could not remove current pg_wal symlink %s: %e", symlinkPath, err)
return err
}
} else if symlinkStat.IsDir() {
if err = syscall.Rmdir(symlinkPath); err != nil {
log.Errorf("could not remove current folder %s: %e", symlinkPath, err)
return err
}
} else {
err = fmt.Errorf("location %s is no symlink and no dir, so please check and resolve by hand", symlinkPath)
log.Error(err)
return err
}
if p.walDir == "" {
// So we were moving WAL files back into PGDATA. Let's rename the tmpDir now holding all WAL files and use that
// as PGDATA/pg_wal
if err = os.Rename(tmpPath, desiredPath); err != nil {
sebasmannem marked this conversation as resolved.
Show resolved Hide resolved
log.Errorf("cannot move %s to %s: %e", tmpPath, desiredPath, err)
return err
}
} else {
log.Infof("symlinking %s to %s", symlinkPath, desiredPath)
if err = os.Symlink(desiredPath, symlinkPath); err != nil {
// We were copying WAL files from PGDATA (or another location) to a location outside of PGDATA and
// pointing the symlink in the right direction failed.
log.Errorf("could not create symlink %s to %s: %e", symlinkPath, desiredPath, err)
return err
}
}
log.Infof("moving pg_wal from %s to %s is succesful", curPath, desiredPath)
return nil
}

// Start writes the configuration files, makes sure the WAL directory is at its
// configured location and then starts the instance. The first failing step aborts
// the sequence and its error is returned.
func (p *Manager) Start() error {
	err := p.writeConfs(false)
	if err == nil {
		// Only relocate the WAL directory once the configuration was written.
		err = p.moveWal()
	}
	if err != nil {
		return err
	}
	return p.start()
}

Expand Down Expand Up @@ -967,6 +1055,9 @@ func (p *Manager) SyncFromFollowed(followedConnParams ConnParams, replSlot strin
if replSlot != "" {
args = append(args, "--slot", replSlot)
}
if p.walDir != "" {
args = append(args, "--waldir", p.walDir)
}
cmd := exec.Command(name, args...)

cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
Expand Down Expand Up @@ -1000,7 +1091,7 @@ func (p *Manager) SyncFromFollowed(followedConnParams ConnParams, replSlot strin
return nil
}

func (p *Manager) RemoveAll() error {
func (p *Manager) RemoveAllIfInitialized() error {
initialized, err := p.IsInitialized()
if err != nil {
return fmt.Errorf("failed to retrieve instance state: %v", err)
Expand All @@ -1016,6 +1107,17 @@ func (p *Manager) RemoveAll() error {
if started {
return fmt.Errorf("cannot remove postregsql database. Instance is active")
}

return p.RemoveAll()
}

// RemoveAll entirely cleans up the data directory, including any wal directory if that
// exists outside of the data directory.
//
// Removing the wal directory is best effort: a failure there is logged instead of being
// silently dropped, but it does not prevent the data directory from being removed. The
// returned error is the result of removing the data directory only.
func (p *Manager) RemoveAll() error {
	if p.walDir != "" {
		if err := os.RemoveAll(p.walDir); err != nil {
			log.Errorf("failed to remove wal dir %s: %v", p.walDir, err)
		}
	}

	return os.RemoveAll(p.dataDir)
}

Expand Down
Loading