diff --git a/migrate/dual-write-and-backfill/dual-write-from-postgres.md b/migrate/dual-write-and-backfill/dual-write-from-postgres.md
index 4eb1761cd2..73d2f9db96 100644
--- a/migrate/dual-write-and-backfill/dual-write-from-postgres.md
+++ b/migrate/dual-write-and-backfill/dual-write-from-postgres.md
@@ -18,6 +18,7 @@ import SourceTargetNote from "versionContent/_partials/_migrate_source_target_no
 import DumpDatabaseRoles from "versionContent/_partials/_migrate_dual_write_dump_database_roles.mdx";
 import Step6eTurnOnCompressionPolicies from "versionContent/_partials/_migrate_dual_write_6e_turn_on_compression_policies.mdx";
 import Step6aThroughc from "versionContent/_partials/_migrate_dual_write_6a_through_c.mdx";
+import ExplainPgDumpFlags from "versionContent/_partials/_migrate_explain_pg_dump_flags.mdx";
 
 # Dual-write and backfill from PostgreSQL database
 
diff --git a/migrate/index.md b/migrate/index.md
index ae1d89fafc..a614c5a60b 100644
--- a/migrate/index.md
+++ b/migrate/index.md
@@ -35,8 +35,9 @@ migrating from PostgreSQL.
 
 If you're using PostgreSQL, you may also have heard of logical replication
 being the recommended strategy for migrations with low downtime. Currently,
-TimescaleDB doesn't work with logical replication, so this is not a viable
-option, but we are actively working on making this possible.
+TimescaleDB supports logical replication only from sources that are using
+PostgreSQL without Timescale. We are actively working on supporting
+Timescale sources.
 
 If you're migrating from something other than PostgreSQL, and don't want to use
 the dual-write and backfill approach, then the easiest way to move your data to
diff --git a/migrate/live-migration/index.md b/migrate/live-migration/index.md
new file mode 100644
index 0000000000..c4aba0855d
--- /dev/null
+++ b/migrate/live-migration/index.md
@@ -0,0 +1,45 @@
+---
+title: Live migration
+excerpt: Migrate an entire database with low downtime
+products: [cloud]
+keywords: [migration, low-downtime, backup]
+tags: [recovery, logical backup, replication]
+---
+
+# Live migration
+
+Live migration is a migration strategy to move a large amount of data
+(100 GB to 10 TB+) with low downtime (on the order of minutes). It is
+significantly more complicated to execute than a migration with downtime
+using [pg_dump/restore][pg-dump-and-restore], but it supports more use cases
+and has fewer requirements than the [dual-write and backfill] method.
+
+Live migration leverages PostgreSQL's built-in replication functionality to
+provide a seamless migration with very little application downtime.
+
+Roughly, it consists of four steps:
+
+1. Prepare and create a replication slot in the source database.
+2. Copy the schema from source to target, optionally enabling hypertables.
+3. Copy data from source to target while capturing changes.
+4. Apply the captured changes from source to target.
+
+Currently, live migration only supports migrating from PostgreSQL, but we are
+actively working on supporting TimescaleDB sources.
+
+Live migration works well when:
+- Large, busy tables have primary keys (a query to check this is shown
+  below), or don't have many `UPDATE` or `DELETE` statements.
+- The insert workload does not exceed 20,000 rows per second, and
+  inserts are batched. If your application exceeds this, you should use
+  the [dual-write and backfill] migration method.
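+
+If you're unsure whether the primary-key condition holds, a catalog query
+along these lines lists the ordinary tables that lack one. This is only a
+sketch that checks all non-system schemas; it is not part of the migration
+tooling:
+
+```sql
+-- List user tables that have no primary key constraint
+SELECT n.nspname AS schema_name, c.relname AS table_name
+FROM pg_class c
+JOIN pg_namespace n ON n.oid = c.relnamespace
+WHERE c.relkind IN ('r', 'p')
+  AND n.nspname NOT IN ('pg_catalog', 'information_schema')
+  AND NOT EXISTS (
+    SELECT 1
+    FROM pg_constraint con
+    WHERE con.conrelid = c.oid
+      AND con.contype = 'p'
+  )
+ORDER BY 1, 2;
+```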
+
+For more information, consult the step-by-step migration guide:
+
+- [Live migration from PostgreSQL][from-postgres]
+
+[from-postgres]: /migrate/:currentVersion:/live-migration/live-migration-from-postgres/
+[pg-dump-and-restore]: /migrate/:currentVersion:/pg-dump-and-restore/
+[dual-write and backfill]: /migrate/:currentVersion:/dual-write-and-backfill/
\ No newline at end of file
diff --git a/migrate/live-migration/live-migration-from-postgres.md b/migrate/live-migration/live-migration-from-postgres.md
new file mode 100644
index 0000000000..9a1e27be4b
--- /dev/null
+++ b/migrate/live-migration/live-migration-from-postgres.md
@@ -0,0 +1,415 @@
+---
+title: Migrate from PostgreSQL using live migration
+excerpt: Migrate from a PostgreSQL database using the low-downtime live migration method
+products: [cloud]
+keywords: [migration, low-downtime]
+tags: [migration, logical backup, replication]
+---
+
+import GettingHelp from "versionContent/_partials/_migrate_dual_write_backfill_getting_help.mdx";
+import SourceTargetNote from "versionContent/_partials/_migrate_source_target_note.mdx";
+import StepOne from "versionContent/_partials/_migrate_dual_write_step1.mdx";
+import DumpDatabaseRoles from "versionContent/_partials/_migrate_dual_write_dump_database_roles.mdx";
+import ExplainPgDumpFlags from "versionContent/_partials/_migrate_explain_pg_dump_flags.mdx";
+
+# Live migration from PostgreSQL database with pgcopydb
+
+This document provides detailed step-by-step instructions to migrate data using
+[pgcopydb][pgcopydb] to perform a live migration from a source PostgreSQL
+database to Timescale.
+
+You should provision a dedicated instance to run the migration steps from,
+ideally an AWS EC2 instance in the same region as the Timescale target
+service. For an ingestion load of 10,000 transactions per second, and assuming
+that the historical data copy takes 2 days, we recommend 4 CPUs with 4 to
+8 GiB of RAM and 1.2 TiB of storage.
+
+Before beginning the migration process, ensure that the tools `psql`,
+`pg_dump`, `pg_dumpall`, and `pgcopydb` are installed and available on the
+system that performs the migration.
+
+For Debian and Ubuntu systems, you can install all the tools with:
+
+```sh
+sudo apt update
+sudo apt install -y postgresql-common
+sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh
+sudo apt install -y pgcopydb
+```
+
+On other distributions, you can use the following installation instructions:
+
+- `pgcopydb`: Installation instructions can be found in the [official
+  repository][install-pgcopydb]. When installing from package managers like
+  `apt`, `yum`, or `dnf`, the other required tools are usually also installed
+  as dependencies of `pgcopydb`.
+
+- `psql`, `pg_dump`, and `pg_dumpall`: These can be installed by following the
+  instructions in the [How to Install psql on Mac, Ubuntu, Debian, Windows][install-psql]
+  blog post. Although the instructions specifically mention `psql`, following
+  them also installs `pg_dump` and `pg_dumpall`.
+
+In detail, the migration process consists of the following steps:
+
+1. Set up a target database instance in Timescale.
+1. Prepare the source database for the live migration.
+1. Set up a replication slot and snapshot.
+1. Migrate roles and schema from source to target.
+1. Enable hypertables.
+1. Migrate initial data from source to target.
+1. Apply the replicated changes from the source.
+1. Promote the target database as the new primary.
+
+## 2. Prepare the source database for the live migration
+
+It's important to ensure that the `old_snapshot_threshold` value is set to the
+default value of `-1` in your source database. This prevents PostgreSQL from
+treating the data in a snapshot as outdated. If this value is set to anything
+other than `-1`, it might affect the existing data migration step.
+
+To check the current value of `old_snapshot_threshold`, run the command:
+
+```sh
+psql -X -d $SOURCE -c 'show old_snapshot_threshold'
+```
+
+If the query returns something other than `-1`, you must change it.
+
+If you have a superuser on a self-hosted database, run the following command:
+
+```sh
+psql -X -d $SOURCE -c 'alter system set old_snapshot_threshold=-1'
+```
+
+Otherwise, if you are using a managed service, use your cloud provider's
+configuration mechanism to set `old_snapshot_threshold` to `-1`.
+
+Next, you should set `wal_level` to `logical` so that the write-ahead log (WAL)
+records information that is needed for logical decoding.
+
+To check the current value of `wal_level`, run the command:
+
+```sh
+psql -X -d $SOURCE -c 'show wal_level'
+```
+
+If the query returns something other than `logical`, you must change it.
+
+If you have a superuser on a self-hosted database, run the following command:
+
+```sh
+psql -X -d $SOURCE -c 'alter system set wal_level=logical'
+```
+
+Otherwise, if you are using a managed service, use your cloud provider's
+configuration mechanism to set `wal_level` to `logical`.
+
+Restart your database for the changes to take effect, and verify that the
+settings are reflected in your database.
+
+## 3. Set up a replication slot and snapshot
+
+The [replication slot][replication-slot] forms the backbone of the replication
+strategy.
+
+> A slot represents a stream of changes that can be replayed to a client in the
+order they were made on the origin server.
+
+The stream of changes emitted by the slot is buffered to disk until it is
+applied on the target. The instance used to orchestrate the migration (the one
+running the commands) should have enough capacity to store the files, and it
+should be actively monitored to prevent any issues that might result from a
+lack of space.
+
+Before starting, there's an important caveat. To replicate `DELETE` and
+`UPDATE` operations, the source table must either have a primary key or
+`REPLICA IDENTITY` set. Replica identity assists logical decoding in
+identifying the rows being modified. It defaults to using the table's primary
+key.
+
+If a table doesn't have a primary key, you'll have to manually set the replica
+identity. One option is to use a unique, non-partial, non-deferrable index that
+includes only columns marked as `NOT NULL`. This can be set as the replica
+identity:
+
+```sql
+ALTER TABLE {table_name} REPLICA IDENTITY USING INDEX {_index_name}
+```
+
+If there's no primary key or viable unique index to use, you'll have to set
+`REPLICA IDENTITY` to `FULL`. If you are expecting a large number of `UPDATE`
+or `DELETE` operations on the table, we **do not recommend** using `FULL`. For
+each `UPDATE` or `DELETE` statement, PostgreSQL will have to read the whole
+table to find all matching rows, which will result in significantly slower
+replication.
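+
+To review how each table is currently configured, you can inspect the
+`relreplident` column of `pg_class`. The following query is only a sketch and
+covers ordinary tables in non-system schemas; `d` means the default (primary
+key), `i` an index, `f` full, and `n` nothing:
+
+```sql
+-- Show the replica identity setting of each user table
+SELECT n.nspname AS schema_name,
+       c.relname AS table_name,
+       c.relreplident AS replica_identity
+FROM pg_class c
+JOIN pg_namespace n ON n.oid = c.relnamespace
+WHERE c.relkind = 'r'
+  AND n.nspname NOT IN ('pg_catalog', 'information_schema')
+ORDER BY 1, 2;
+```
+
+For tables where `FULL` is nevertheless the only option, set it with: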
+
+```sql
+ALTER TABLE {table_name} REPLICA IDENTITY FULL
+```
+
+Once you're sure all your tables are properly configured for replication, use
+`pgcopydb`'s follow command to create a replication slot:
+
+```sh
+pgcopydb follow \
+    --source "$SOURCE" \
+    --target "$TARGET" \
+    --fail-fast \
+    --plugin wal2json
+```
+
+This command is going to be active during most of the migration process. You
+can run it on a separate terminal instance, or start it in the background. To
+start it in the background, append `> pgcopydb_follow.log 2>&1 &` to the
+command. For example:
+
+```sh
+pgcopydb follow \
+    --source "$SOURCE" \
+    --target "$TARGET" \
+    --fail-fast \
+    --plugin wal2json > pgcopydb_follow.log 2>&1 &
+```
+
+The `> pgcopydb_follow.log 2>&1` part redirects all the messages to the
+`pgcopydb_follow.log` file. This is optional but recommended, because the
+`pgcopydb follow` command outputs many messages, and if they are not
+redirected, using the terminal becomes cumbersome due to the constant stream
+of messages.
+
+The `follow` command not only creates the replication slot for streaming
+changes, but also exports a [snapshot][snapshot] ID to `/tmp/pgcopydb/snapshot`.
+This ID can be used to migrate the data that was stored in the database prior
+to the creation of the replication slot.
+
+> A snapshot determines which data is visible to the transaction that is using
+the snapshot. Synchronized snapshots are necessary when two or more sessions
+need to see identical content in the database.
+
+Before the stream of changes can be applied, the schema and data that existed
+prior to the creation of the replication slot must be migrated ([step
+4][step-4]). The point that marks the beginning of the replication and
+buffering of changes is given by the exported snapshot. The larger the
+database, the more time it takes to perform the initial migration, and the
+longer the buffered files need to be stored.
+
+## 4. Migrate roles and schema from source to target
+
+### 4a. Dump the database roles from the source database
+
+### 4b. Dump the database schema from the source database
+
+```sh
+pg_dump -d "$SOURCE" \
+    --format=plain \
+    --quote-all-identifiers \
+    --no-tablespaces \
+    --no-owner \
+    --no-privileges \
+    --schema-only \
+    --file=dump.sql \
+    --snapshot=$(cat /tmp/pgcopydb/snapshot)
+```
+
+- `--schema-only` is used to dump only the object definitions (schema), not
+  data.
+
+- `--snapshot` is used to specify the synchronized [snapshot][snapshot] when
+  making a dump of the database.
+
+### 4c. Load the roles and schema into the target database
+
+```sh
+psql -X -d "$TARGET" \
+    -v ON_ERROR_STOP=1 \
+    --echo-errors \
+    -f roles.sql \
+    -f dump.sql
+```
+
+## 5. Enable hypertables
+
+This is the ideal point to convert regular tables into hypertables. In simple
+terms, you'll want to convert the tables that contain time-series data. For
+each table that's going to be converted to a hypertable in the target database,
+run the following command:
+
+```sh
+psql -X -d "$TARGET" \
+    -v ON_ERROR_STOP=1 \
+    -c "SELECT create_hypertable('', '