Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
/.vendor/
.idea/
*.tmp

tools/
*__failpoint_binding__.go
*.go__failpoint_stash__
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ require (
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
github.com/pingcap/errors v0.11.5-0.20260310054046-9c8b3586e4b2 // indirect
github.com/pingcap/failpoint v0.0.0-20260521055755-e7642935314f // indirect
github.com/pingcap/log v1.1.1-0.20260227082333-572e590d08f1 // indirect
github.com/pingcap/tidb/pkg/parser v0.0.0-20260504140133-511dba1dbe17 // indirect
github.com/pkg/errors v0.9.1 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb h1:3pSi4EDG6hg0o
github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb/go.mod h1:X2r9ueLEUZgtx2cIogM0v4Zj5uvvzhuuiu7Pn8HzMPg=
github.com/pingcap/errors v0.11.5-0.20260310054046-9c8b3586e4b2 h1:cLgCk5mwDG9lDH+dPK8TmEliTjyGJwwKN0qevWAl8IY=
github.com/pingcap/errors v0.11.5-0.20260310054046-9c8b3586e4b2/go.mod h1:ktAJCA9lxrHHjVyVl2pKJFvzBnq2eZbb+CUOjBRPlXo=
github.com/pingcap/failpoint v0.0.0-20260521055755-e7642935314f h1:cDo4qNgaQc2POMWTXjNrMA7yySdIF/d1AaW8kOA7qOs=
github.com/pingcap/failpoint v0.0.0-20260521055755-e7642935314f/go.mod h1:jimwlLpI/XtwQdlZML15HS+j4rirvwZM0GLY07wwgOo=
github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 h1:2SOzvGvE8beiC1Y4g9Onkvu6UmuBBOeWRGQEjJaT/JY=
github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4=
github.com/pingcap/log v1.1.1-0.20260227082333-572e590d08f1 h1:A2bEfgSb7hLwR9mxDszgGKweF+xY9YoTDG+8RjdFjDE=
Expand Down
35 changes: 34 additions & 1 deletion go/base/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (
"github.com/github/gh-ost/go/metrics"
"github.com/github/gh-ost/go/mysql"
"github.com/github/gh-ost/go/sql"
"github.com/openark/golib/log"

"github.com/go-ini/ini"
"github.com/openark/golib/log"
"github.com/pingcap/failpoint"
)

// RowsEstimateMethod is the type of row number estimation
Expand Down Expand Up @@ -312,6 +313,8 @@ type MigrationContext struct {
DrainGTID mysql.BinlogCoordinates // Source @@gtid_executed captured immediately after the source RENAME TABLE; the applier drains until it reaches this coordinate (move-tables only).
}

UnsafeFailPointsEnabled bool

Log Logger
}

Expand Down Expand Up @@ -1206,3 +1209,33 @@ func SendWithContext[T any](ctx context.Context, ch chan<- T, val T) error {
return ctx.Err()
}
}

type failPointOpts struct {
wait time.Duration
}

type FailPointOpt func(*failPointOpts)

// WithFailPointWait sets the time for a fail point to wait before exiting.
func WithFailPointWait(wait time.Duration) FailPointOpt {
return func(opts *failPointOpts) {
opts.wait = wait
}
}

func (mctx *MigrationContext) NewFailPoint(name string, opts ...FailPointOpt) {
if mctx.UnsafeFailPointsEnabled {
var fpo failPointOpts
for _, opt := range opts {
opt(&fpo)
}

failpoint.Inject(name, func(_ failpoint.Value) {
mctx.Log.Debugf("[TEST] Encountered fail point: '%s'", name)
if fpo.wait > 0 {
time.Sleep(fpo.wait)
}
panic(fmt.Sprintf("[TEST] Encountered fail point: '%s'", name))
})
}
Comment thread
chriskirkland marked this conversation as resolved.
}
7 changes: 6 additions & 1 deletion go/cmd/gh-ost/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ func main() {
flag.StringVar(&migrationContext.MoveTables.TargetDatabase, "target-database", "", "Target MySQL database name for --move-tables mode. If not provided, uses the same database name as the source connection")
flag.BoolVar(&migrationContext.MoveTables.AllowOnSourcePrimary, "allow-on-source-primary", false, "allow --move-tables to read (schema, row copy, binlog) from the source cluster's primary. By default gh-ost stops if --host is the primary; prefer pointing --host at a replica to spare the primary the copy load.")

// unsafe fail points, for integration testing purposes
flag.BoolVar(&migrationContext.UnsafeFailPointsEnabled, "unsafe-fail-points-enabled", false, "UNSAFE: Enable fail points for integration testing purposes. Do not use in production.")

flag.CommandLine.SetOutput(os.Stdout)
flag.Parse()
cutOverLockTimeoutUserSpecified := false
Expand Down Expand Up @@ -345,7 +348,9 @@ func main() {
if *storageEngine == "rocksdb" {
migrationContext.Log.Warning("RocksDB storage engine support is experimental")
}
if migrationContext.CheckpointIntervalSeconds < 10 {
// ignore low checkpoint intervals in unsafe mode as frequent checkpoints are required to reliably
// reduce test duration
if migrationContext.CheckpointIntervalSeconds < 10 && !migrationContext.UnsafeFailPointsEnabled {
migrationContext.Log.Fatalf("--checkpoint-seconds should be >=10")
}
if migrationContext.CountTableRows && migrationContext.PanicOnWarnings {
Expand Down
7 changes: 7 additions & 0 deletions go/logic/migrator.go
Comment thread
chriskirkland marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,9 @@ func (mgtr *Migrator) moveTablesCutOver() (err error) {
}
}

mgtr.migrationContext.NewFailPoint("move-tables-panic-before-drain-completion", base.WithFailPointWait(2*time.Second))

// ------ T3: draining applier to drain GTID -----------
if err := mgtr.drainMoveTablesCutOver(drainGTID); err != nil {
return err
}
Expand All @@ -1403,6 +1406,8 @@ func (mgtr *Migrator) moveTablesCutOver() (err error) {
atomic.StoreInt64(&mgtr.migrationContext.CutOverCompleteFlag, 1)
mgtr.migrationContext.Log.Debugf("T4: CutOverCompleteFlag set")

mgtr.migrationContext.NewFailPoint("move-tables-panic-before-on-success-hook", base.WithFailPointWait(2*time.Second))

// ----- T5: on-success hook -----
// Hook unlocks user_rw@target via db-user-management and flips the
// write_cutover? feature flag. Standard env vars only — GH_OST_DRAIN_GTID +
Expand Down Expand Up @@ -2509,6 +2514,8 @@ func (mgtr *Migrator) iterateChunks() error {
}
return terminateRowIteration(err)
}

mgtr.migrationContext.NewFailPoint("move-tables-panic-after-row-copy", base.WithFailPointWait(2*time.Second))
}
}

Expand Down
107 changes: 72 additions & 35 deletions localtests/move-tables-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ original_sql_mode=
current_gtid_mode=
test_timeout=120
test_failure_log_tail_lines=50
tables_to_migrate=()

OPTIND=1
while getopts "b:s:dg" OPTION; do
Expand Down Expand Up @@ -175,8 +176,13 @@ build_ghost_command() {
# Build gh-ost command with all standard options
#
# expected $1 to be a comma-separated list of tables to move

# build comma-separated list of tables to move
move_tables_arg=$(IFS=, ; echo "${tables_to_migrate[*]}")

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[reviewer note] moved this into build_ghost_command so custom test logic (i.e. test.sh) don't have to repeat it.


# NOTE(chriskirkland): fully qualified package name + failpoint name
cmd="GOTRACEBACK=crash $ghost_binary \
--move-tables=$1 \
--move-tables=$move_tables_arg \
--user=root \
--password=opensesame \
--host=$source_replica_host \
Expand All @@ -197,7 +203,13 @@ build_ghost_command() {
--stack \
--checkpoint \
--postpone-cut-over-flag-file=$postpone_cutover_flag_file \
--checkpoint-seconds=1 \
--unsafe-fail-points-enabled \
--execute ${extra_args[@]}"

if [ -n "$GO_FAILPOINTS" ]; then
cmd="GO_FAILPOINTS=\"$GO_FAILPOINTS\" $cmd"
fi
Comment thread
chriskirkland marked this conversation as resolved.
}

print_log_excerpt() {
Expand Down Expand Up @@ -348,46 +360,47 @@ test_single() {
wait $test_pid 2>/dev/null
execution_result=$?
return $execution_result
fi

# kick off the on_test script for the test. this enables arbitrary custom logic
# concurrent with the gh-ost process. this enables additional scenarios like
# streaming of writes prior to the write cutover.
#
# IMPORTANT: The on-test script is executed in the background and will be killed as soon
# as the gh-ost process terminates.
if [ -f $tests_path/$test_name/on_test.sh ]; then
$tests_path/$test_name/on_test.sh &> /dev/null &
on_test_pid=$!
fi
else

# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover
(
sleep 1;
echo "⏩ Sending unpostpone cutover"
rm $postpone_cutover_flag_file &> /dev/null;
) &
# kick off the on_test script for the test. this enables arbitrary custom logic
# concurrent with the gh-ost process. this enables additional scenarios like
# streaming of writes prior to the write cutover.
#
# IMPORTANT: The on-test script is executed in the background and will be killed as soon
# as the gh-ost process terminates.
if [ -f $tests_path/$test_name/on_test.sh ]; then
$tests_path/$test_name/on_test.sh &> /dev/null &
on_test_pid=$!
fi

# Build and execute gh-ost command
move_tables_arg=$(IFS=, ; echo "${tables_to_migrate[*]}")
build_ghost_command "$move_tables_arg"
echo_dot
echo $cmd >$exec_command_file
echo_dot
timeout $test_timeout bash $exec_command_file >$test_logfile 2>&1
# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover
(
sleep 1;
echo "⏩ Sending unpostpone cutover"
rm $postpone_cutover_flag_file &> /dev/null;
) &

execution_result=$?
# Build and execute gh-ost command
build_ghost_command
echo_dot
echo $cmd >$exec_command_file
echo_dot
timeout $test_timeout bash $exec_command_file >$test_logfile 2>&1

if [ -n "$on_test_pid" ]; then
kill -KILL $on_test_pid &>/dev/null
fi
execution_result=$?

# Check for timeout (exit code 124)
if [ $execution_result -eq 124 ]; then
echo
echo "ERROR $test_name execution timed out"
print_log_excerpt
return 1
if [ -n "$on_test_pid" ]; then
kill -KILL $on_test_pid &>/dev/null
fi

# Check for timeout (exit code 124)
if [ $execution_result -eq 124 ]; then
echo
echo "ERROR $test_name execution timed out"
print_log_excerpt
return 1
fi
fi

if [ -f $tests_path/$test_name/sql_mode ]; then
Expand Down Expand Up @@ -453,7 +466,29 @@ test_single() {
done
}

enable_failpoint() {
mkdir -p $repo_root/tools/bin
if [ ! -f $repo_root/tools/bin/failpoint-ctl ]; then
echo "⚙️ Installing failpoint"
GOBIN=$repo_root/tools/bin go install github.com/pingcap/failpoint/failpoint-ctl@v0.0.0-20220801062533-2eaa32854a6c
fi

echo "⚙️ Enabling failpoint"
$repo_root/tools/bin/failpoint-ctl enable go

echo "✅ Successfully enabled failpoint"
}

disable_failpoint() {
echo "⚙️ Disabling failpoint"
$repo_root/tools/bin/failpoint-ctl disable go

echo "✅ Successfully disabled failpoint"
}

build_binary() {
enable_failpoint

@chriskirkland chriskirkland Jun 22, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[reviewer note] only has to be enabled while the code is built. It actually modifies the code on disk, so we need to make sure to disable it to keep git history clean.


echo "Building"
rm -f $default_ghost_binary
[ "$ghost_binary" == "" ] && ghost_binary="$default_ghost_binary"
Expand All @@ -468,6 +503,8 @@ build_binary() {
echo "Build failure"
exit 1
fi

disable_failpoint
}

test_all() {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
drop table if exists gh_ost_test;
create table gh_ost_test (
id bigint(20) NOT NULL AUTO_INCREMENT,
column1 int(11) NOT NULL,
column2 smallint(5) unsigned NOT NULL,
column3 mediumint(8) unsigned NOT NULL,
column4 tinyint(3) unsigned NOT NULL,
column5 int(11) NOT NULL,
column6 int(11) NOT NULL,
PRIMARY KEY (id),
KEY c12_ix (column1, column2)
) auto_increment=1;

insert into gh_ost_test values
(NULL, 1001, 100, 500000, 10, 1700000001, 1700000002),
(NULL, 1002, 200, 600000, 20, 1700000003, 1700000004),
(NULL, 1003, 300, 700000, 30, 1700000005, 1700000006),
(NULL, 1004, 400, 800000, 40, 1700000007, 1700000008),
(NULL, 1005, 500, 900000, 50, 1700000009, 1700000010),
(NULL, 1006, 600, 1000000, 60, 1700000011, 1700000012),
(NULL, 1007, 700, 1100000, 70, 1700000013, 1700000014),
(NULL, 1008, 800, 1200000, 80, 1700000015, 1700000016),
(NULL, 1009, 900, 1300000, 90, 1700000017, 1700000018),
(NULL, 1010, 1000, 1400000, 100, 1700000019, 1700000020),
(NULL, 1011, 1100, 1500000, 110, 1700000021, 1700000022),
(NULL, 1012, 1200, 1600000, 120, 1700000023, 1700000024),
(NULL, 1013, 1300, 1700000, 130, 1700000025, 1700000026),
(NULL, 1014, 1400, 1800000, 140, 1700000027, 1700000028),
(NULL, 1015, 1500, 1900000, 150, 1700000029, 1700000030),
(NULL, 1016, 1600, 2000000, 160, 1700000031, 1700000032),
(NULL, 1017, 1700, 2100000, 170, 1700000033, 1700000034),
(NULL, 1018, 1800, 2200000, 180, 1700000035, 1700000036),
(NULL, 1019, 1900, 2300000, 190, 1700000037, 1700000038),
(NULL, 1020, 2000, 2400000, 200, 1700000039, 1700000040);
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gh_ost_test
Loading
Loading