From 806dbc0e71f3abb99093e5f29011f2872f58f56c Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 05:22:25 +0300 Subject: [PATCH 01/45] Custom user shared invalidation message --- src/backend/utils/cache/inval.c | 49 +++++++++++++++++++++++++++++++++ src/include/storage/sinval.h | 11 ++++++++ src/include/utils/inval.h | 4 +++ 3 files changed, 64 insertions(+) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 0008826f67c..e7f4917613d 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -252,6 +252,7 @@ int debug_discard_caches = 0; #define MAX_SYSCACHE_CALLBACKS 64 #define MAX_RELCACHE_CALLBACKS 10 +#define MAX_USERCACHE_CALLBACKS 10 static struct SYSCACHECALLBACK { @@ -273,6 +274,14 @@ static struct RELCACHECALLBACK static int relcache_callback_count = 0; +static struct USERCACHECALLBACK +{ + UsercacheCallbackFunction function; + Datum arg; +} usercache_callback_list[MAX_USERCACHE_CALLBACKS]; + +static int usercache_callback_count = 0; + /* ---------------------------------------------------------------- * Invalidation subgroup support functions * ---------------------------------------------------------------- @@ -683,6 +692,19 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) else if (msg->sn.dbId == MyDatabaseId) InvalidateCatalogSnapshot(); } + else if (msg->id == SHAREDINVALUSERCACHE_ID) + { + int i; + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + msg->usr.arg1, + msg->usr.arg2, + msg->usr.arg3); + } + } else elog(FATAL, "unrecognized SI message ID: %d", msg->id); } @@ -726,6 +748,17 @@ InvalidateSystemCachesExtended(bool debug_discard) ccitem->function(ccitem->arg, InvalidOid); } + + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + InvalidOid, + 
InvalidOid, + InvalidOid); + } + } @@ -1570,6 +1603,22 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, ++relcache_callback_count; } +/* + * CacheRegisterUsercacheCallback + */ +void +CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg) +{ + if (usercache_callback_count >= MAX_USERCACHE_CALLBACKS) + elog(FATAL, "out of usercache_callback_list slots"); + + usercache_callback_list[usercache_callback_count].function = func; + usercache_callback_list[usercache_callback_count].arg = arg; + + ++usercache_callback_count; +} + /* * CallSyscacheCallbacks * diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h index 0721e4d2058..ef748bfe1e1 100644 --- a/src/include/storage/sinval.h +++ b/src/include/storage/sinval.h @@ -110,6 +110,16 @@ typedef struct Oid relId; /* relation ID */ } SharedInvalSnapshotMsg; +#define SHAREDINVALUSERCACHE_ID (-6) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid arg1; /* user-specific values */ + Oid arg2; + Oid arg3; +} SharedInvalUserMsg; + typedef union { int8 id; /* type field --- must be first */ @@ -119,6 +129,7 @@ typedef union SharedInvalSmgrMsg sm; SharedInvalRelmapMsg rm; SharedInvalSnapshotMsg sn; + SharedInvalUserMsg usr; } SharedInvalidationMessage; diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 14b4eac0630..a57f834d01d 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -22,6 +22,7 @@ extern PGDLLIMPORT int debug_discard_caches; typedef void (*SyscacheCallbackFunction) (Datum arg, int cacheid, uint32 hashvalue); typedef void (*RelcacheCallbackFunction) (Datum arg, Oid relid); +typedef void (*UsercacheCallbackFunction) (Datum arg, Oid arg1, Oid arg2, Oid arg3); extern void AcceptInvalidationMessages(void); @@ -59,6 +60,9 @@ extern void CacheRegisterSyscacheCallback(int cacheid, extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, Datum arg); +extern void 
CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg); + extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue); extern void InvalidateSystemCaches(void); From 9bba509c519c8b91aedc085ea5dd7108cde83985 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 05:24:57 +0300 Subject: [PATCH 02/45] CacheInvalidateRelcacheByDbidRelid() --- src/backend/utils/cache/inval.c | 19 +++++++++++++++++++ src/include/utils/inval.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index e7f4917613d..b7a5f0c48a0 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1465,6 +1465,25 @@ CacheInvalidateRelcacheByRelid(Oid relid) ReleaseSysCache(tup); } +/* + * CacheInvalidateRelcacheByDbidRelid + */ +void +CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid) +{ + SharedInvalidationMessage msg; + + PrepareInvalidationState(); + + msg.rc.id = SHAREDINVALRELCACHE_ID; + msg.rc.dbId = dbid; + msg.rc.relId = relid; + /* check AddCatcacheInvalidationMessage() for an explanation */ + VALGRIND_MAKE_MEM_DEFINED(&msg, sizeof(msg)); + + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheInvalidateSmgr diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index a57f834d01d..1461271bbe6 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -49,6 +49,8 @@ extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple); extern void CacheInvalidateRelcacheByRelid(Oid relid); +extern void CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid); + extern void CacheInvalidateSmgr(RelFileLocatorBackend rlocator); extern void CacheInvalidateRelmap(Oid databaseId); From a2309b3fd4f8b076a16e0491d1f1601c2b82e64b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 20:26:34 +0300 Subject: [PATCH 03/45] CommitSeqNo data type --- src/include/access/transam.h | 16 ++++++++++++++++ src/include/c.h | 4 +++- 2 
files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index f5af6d30556..f7bcb4a8822 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -196,6 +196,22 @@ FullTransactionIdAdvance(FullTransactionId *dest) #define FirstUnpinnedObjectId 12000 #define FirstNormalObjectId 16384 +#define COMMITSEQNO_INPROGRESS UINT64CONST(0x0) +#define COMMITSEQNO_NON_DELETED UINT64CONST(0x1) +#define COMMITSEQNO_ABORTED UINT64CONST(0x2) +#define COMMITSEQNO_FROZEN UINT64CONST(0x3) +#define COMMITSEQNO_COMMITTING UINT64CONST(0x4) +#define COMMITSEQNO_FIRST_NORMAL UINT64CONST(0x5) +#define COMMITSEQNO_MAX_NORMAL UINT64CONST(0x7FFFFFFFFFFFFFFF) + +#define COMMITSEQNO_IS_INPROGRESS(csn) ((csn) == COMMITSEQNO_INPROGRESS || (csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_NON_DELETED(csn) ((csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_ABORTED(csn) ((csn) == COMMITSEQNO_ABORTED) +#define COMMITSEQNO_IS_FROZEN(csn) ((csn) == COMMITSEQNO_FROZEN) +#define COMMITSEQNO_IS_NORMAL(csn) ((csn) >= COMMITSEQNO_FIRST_NORMAL) +#define COMMITSEQNO_IS_COMMITTING(csn) ((csn) == COMMITSEQNO_COMMITTING) +#define COMMITSEQNO_IS_COMMITTED(csn) ((csn) >= COMMITSEQNO_FROZEN) + /* * VariableCache is a data structure in shared memory that is used to track * OID and XID assignment state. For largely historical reasons, there is diff --git a/src/include/c.h b/src/include/c.h index f69d739be57..024d376e9fa 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -621,7 +621,7 @@ typedef double float8; /* * Oid, RegProcedure, TransactionId, SubTransactionId, MultiXactId, - * CommandId + * CommandId, CommitSeqNo */ /* typedef Oid is in postgres_ext.h */ @@ -652,6 +652,8 @@ typedef uint32 CommandId; #define FirstCommandId ((CommandId) 0) #define InvalidCommandId (~(CommandId)0) +typedef uint64 CommitSeqNo; + /* ---------------- * Variable-length datatypes all share the 'struct varlena' header. 
From 47a39c7b4d4e8dab5b195102aca6a8a6c5ebf445 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 20:36:18 +0300 Subject: [PATCH 04/45] Custom TOAST --- contrib/pageinspect/heapfuncs.c | 1 + contrib/test_decoding/test_decoding.c | 2 +- src/backend/access/common/detoast.c | 42 ++++++++++++++++--- src/backend/access/common/toast_compression.c | 7 +++- src/backend/access/common/toast_internals.c | 4 +- src/backend/access/table/toast_helper.c | 6 +-- src/backend/replication/logical/proto.c | 2 +- src/backend/replication/pgoutput/pgoutput.c | 4 +- src/include/access/detoast.h | 14 +++++++ src/include/varatt.h | 33 ++++++++++++++- src/test/regress/regress.c | 2 +- 11 files changed, 98 insertions(+), 19 deletions(-) diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 0f0252558c5..d1ac2fd85ee 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -364,6 +364,7 @@ tuple_data_split_internal(Oid relid, char *tupdata, */ if (VARATT_IS_EXTERNAL(tupdata + off) && !VARATT_IS_EXTERNAL_ONDISK(tupdata + off) && + !VARATT_IS_EXTERNAL_ORIOLEDB(tupdata + off) && !VARATT_IS_EXTERNAL_INDIRECT(tupdata + off)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index 12d1d0505d7..dedc4be074f 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -578,7 +578,7 @@ tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple, bool skip_ /* print data */ if (isnull) appendStringInfoString(s, "null"); - else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval)) + else if (typisvarlena && (VARATT_IS_EXTERNAL_ONDISK(origval) || VARATT_IS_EXTERNAL_ORIOLEDB(origval))) appendStringInfoString(s, "unchanged-toast-datum"); else if (!typisvarlena) print_literal(s, typid, diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c index 
108e0126a14..8af80c80865 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -26,7 +26,6 @@ static struct varlena *toast_fetch_datum(struct varlena *attr); static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); -static struct varlena *toast_decompress_datum(struct varlena *attr); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); /* ---------- @@ -46,7 +45,7 @@ detoast_external_attr(struct varlena *attr) { struct varlena *result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an external stored plain value @@ -115,7 +114,7 @@ detoast_external_attr(struct varlena *attr) struct varlena * detoast_attr(struct varlena *attr) { - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an externally stored datum --- fetch it back from there @@ -332,6 +331,20 @@ detoast_attr_slice(struct varlena *attr, return result; } +static ToastFunc o_detoast_func = NULL; + +void +register_o_detoast_func(ToastFunc func) +{ + o_detoast_func = func; +} + +void +deregister_o_detoast_func() +{ + o_detoast_func = NULL; +} + /* ---------- * toast_fetch_datum - * @@ -347,6 +360,17 @@ toast_fetch_datum(struct varlena *attr) struct varatt_external toast_pointer; int32 attrsize; + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + if (o_detoast_func != NULL) + { + result = o_detoast_func(attr); + if (result == NULL) + elog(ERROR, "unexpected NULL detoast result"); + return result; + } + } + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums"); @@ -467,7 +491,7 @@ toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, * * Decompress a compressed version of a varlena datum */ -static struct varlena * +struct varlena * toast_decompress_datum(struct 
varlena *attr) { ToastCompressionId cmid; @@ -547,11 +571,17 @@ toast_raw_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->raw_size + VARHDRSZ; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { - /* va_rawsize is the size of the original datum -- including header */ struct varatt_external toast_pointer; + /* va_rawsize is the size of the original datum -- including header */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); result = toast_pointer.va_rawsize; } diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 4cf956a759c..4b281ed438d 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -262,7 +262,12 @@ toast_get_compression_id(struct varlena *attr) * the external toast pointer. If compressed inline, fetch it from the * toast compression header. 
*/ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + cmid = toasted->formatFlags >> ORIOLEDB_EXT_FORMAT_FLAGS_BITS; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 588825ed85d..9b6a5d9091c 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -240,7 +240,7 @@ toast_save_datum(Relation rel, Datum value, { struct varatt_external old_toast_pointer; - Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal) || VARATT_IS_EXTERNAL_ORIOLEDB(oldexternal)); /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) @@ -396,7 +396,7 @@ toast_delete_datum(Relation rel, Datum value, bool is_speculative) int validIndex; SnapshotData SnapshotToast; - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!VARATT_IS_EXTERNAL_ONDISK(attr) && !VARATT_IS_EXTERNAL_ORIOLEDB(attr)) return; /* Must copy to access aligned fields */ diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index b5cfeb21aab..2afcd4830d5 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -72,10 +72,10 @@ toast_tuple_init(ToastTupleContext *ttc) * we have to delete it later. 
*/ if (att->attlen == -1 && !ttc->ttc_oldisnull[i] && - VARATT_IS_EXTERNAL_ONDISK(old_value)) + (VARATT_IS_EXTERNAL_ONDISK(old_value) || VARATT_IS_EXTERNAL_ORIOLEDB(old_value))) { if (ttc->ttc_isnull[i] || - !VARATT_IS_EXTERNAL_ONDISK(new_value) || + !(VARATT_IS_EXTERNAL_ONDISK(new_value) || VARATT_IS_EXTERNAL_ORIOLEDB(new_value)) || memcmp((char *) old_value, (char *) new_value, VARSIZE_EXTERNAL(old_value)) != 0) { @@ -331,7 +331,7 @@ toast_delete_external(Relation rel, Datum *values, bool *isnull, if (isnull[i]) continue; - else if (VARATT_IS_EXTERNAL_ONDISK(value)) + else if (VARATT_IS_EXTERNAL_ONDISK(value) || VARATT_IS_EXTERNAL_ORIOLEDB(value)) toast_delete_datum(rel, value, is_speculative); } } diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 504f94d4a77..03f9a54f587 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -814,7 +814,7 @@ logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, continue; } - if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + if (att->attlen == -1 && (VARATT_IS_EXTERNAL_ONDISK(values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(values[i]))) { /* * Unchanged toasted datum. (Note that we don't promise to detect diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index c57c5ed8de9..18f8824d5a3 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -1315,8 +1315,8 @@ pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, * VARTAG_INDIRECT. See ReorderBufferToastReplace. 
*/ if (att->attlen == -1 && - VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && - !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + (VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(new_slot->tts_values[i])) && + !(VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(old_slot->tts_values[i])) ) { if (!tmp_new_slot) { diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h index 908e1fc6919..26ef91e23df 100644 --- a/src/include/access/detoast.h +++ b/src/include/access/detoast.h @@ -63,6 +63,13 @@ extern struct varlena *detoast_attr_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); +/* ---------- + * toast_decompress_datum - + * + * Decompress a compressed version of a varlena datum + */ +extern struct varlena *toast_decompress_datum(struct varlena *attr); + /* ---------- * toast_raw_datum_size - * @@ -79,4 +86,11 @@ extern Size toast_raw_datum_size(Datum value); */ extern Size toast_datum_size(Datum value); +/* + * for in_memory module + */ +typedef struct varlena* (*ToastFunc) (struct varlena *attr); +extern void register_o_detoast_func(ToastFunc func); +extern void deregister_o_detoast_func(void); + #endif /* DETOAST_H */ diff --git a/src/include/varatt.h b/src/include/varatt.h index e34870526ba..0128c2d1778 100644 --- a/src/include/varatt.h +++ b/src/include/varatt.h @@ -38,6 +38,23 @@ typedef struct varatt_external Oid va_toastrelid; /* RelID of TOAST table containing it */ } varatt_external; +typedef struct OToastExternal +{ + uint16 data_size; /* length of OToastExternal data */ + int16 attnum; + int32 raw_size; /* original data size */ + int32 toasted_size; /* compressed original data size */ + /* for fetching data from TOAST tree */ + CommitSeqNo csn; + /* for finding TOAST tree */ + Oid datoid; + Oid relid; + Oid relnode; + /* for storing primary index tuple */ + uint8 formatFlags; /* primary index tuple flags */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* 
data (primary index tuple) */ +} OToastExternal; + /* * These macros define the "saved size" portion of va_extinfo. Its remaining * two high-order bits identify the compression method. @@ -86,17 +103,21 @@ typedef enum vartag_external VARTAG_INDIRECT = 1, VARTAG_EXPANDED_RO = 2, VARTAG_EXPANDED_RW = 3, - VARTAG_ONDISK = 18 + VARTAG_ONDISK = 18, + VARTAG_ORIOLEDB = 34 } vartag_external; /* this test relies on the specific tag values above */ #define VARTAG_IS_EXPANDED(tag) \ (((tag) & ~1) == VARTAG_EXPANDED_RO) +#define O_TOAST_EXTERNAL_SZ offsetof(OToastExternal, data) + #define VARTAG_SIZE(tag) \ ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ + (tag) == VARTAG_ORIOLEDB ? O_TOAST_EXTERNAL_SZ : \ (AssertMacro(false), 0)) /* @@ -282,11 +303,16 @@ typedef struct #define VARDATA_SHORT(PTR) VARDATA_1B(PTR) #define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)) \ + + (VARATT_IS_EXTERNAL_ORIOLEDB(PTR) ? 
\ + *((uint16 *) VARDATA_1B_E(PTR)) \ + : 0)) + #define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) #define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) #define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) + #define VARATT_IS_EXTERNAL_ONDISK(PTR) \ (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) #define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ @@ -299,6 +325,9 @@ typedef struct (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) #define VARATT_IS_EXTERNAL_NON_EXPANDED(PTR) \ (VARATT_IS_EXTERNAL(PTR) && !VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) +#define VARATT_IS_EXTERNAL_ORIOLEDB(PTR) \ + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ORIOLEDB) + #define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) #define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index bcbc6d910f1..fb75cf0905f 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -606,7 +606,7 @@ make_tuple_indirect(PG_FUNCTION_ARGS) continue; /* copy datum, so it still lives later */ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) attr = detoast_external_attr(attr); else { From 7862dff7a4e49f366aa023145a80ec10aff64083 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 23 Mar 2023 00:12:00 +0300 Subject: [PATCH 05/45] Allow locking updated tuples in tuple_update() and tuple_delete() Discussion: https://postgr.es/m/CAPpHfdua-YFw3XTprfutzGp28xXLigFtzNbuFY8yPhqeq6X5kg%40mail.gmail.com Reviewed-by: Aleksander Alekseev, Pavel Borisov, Vignesh C, Mason Sharp Reviewed-by: Andres Freund, Chris Travers --- src/backend/access/heap/heapam.c | 205 ++++++++++---- src/backend/access/heap/heapam_handler.c | 94 +++++-- src/backend/access/table/tableam.c | 26 +- src/backend/commands/trigger.c | 55 +--- src/backend/executor/execReplication.c | 19 +- src/backend/executor/nodeModifyTable.c | 332 +++++++++-------------- 
src/include/access/heapam.h | 19 +- src/include/access/tableam.h | 69 +++-- src/include/commands/trigger.h | 4 +- 9 files changed, 476 insertions(+), 347 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 64f84a2e4bd..43d2bbcf84b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2512,10 +2512,11 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) } /* - * heap_delete - delete a tuple + * heap_delete - delete a tuple, optionally fetching it into a slot * * See table_tuple_delete() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. + * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -2524,8 +2525,9 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) */ TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + CommandId cid, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2603,7 +2605,7 @@ heap_delete(Relation relation, ItemPointer tid, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to delete invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -2744,7 +2746,30 @@ heap_delete(Relation relation, ItemPointer tid, tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated 
tuple, we just fetch the + existing tuple. That lets the caller save some resources on + placing the lock. + */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); if (vmbuffer != InvalidBuffer) @@ -2918,8 +2943,24 @@ heap_delete(Relation relation, ItemPointer tid, */ CacheInvalidateHeapTuple(relation, &tp, NULL); - /* Now we can release the buffer */ - ReleaseBuffer(buffer); + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } /* * Release the lmgr tuple lock, if we had it. @@ -2951,8 +2992,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) result = heap_delete(relation, tid, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, false /* changingPart */ , NULL); switch (result) { case TM_SelfModified: @@ -2979,10 +3020,11 @@ simple_heap_delete(Relation relation, ItemPointer tid) } /* - * heap_update - replace a tuple + * heap_update - replace a tuple, optionally fetching it into a slot * * See table_tuple_update() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. 
+ * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -2991,9 +3033,9 @@ simple_heap_delete(Relation relation, ItemPointer tid) */ TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -3170,7 +3212,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); /* see below about the "no wait" case */ - Assert(result != TM_BeingModified || wait); + Assert(result != TM_BeingModified || (options & TABLE_MODIFY_WAIT)); if (result == TM_Invisible) { @@ -3179,7 +3221,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to update invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -3383,7 +3425,30 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated tuple, we just fetch the + * existing tuple. That lets the caller save some resources on + * placing the lock. 
+ */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); if (vmbuffer != InvalidBuffer) @@ -3862,7 +3927,26 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, /* Now we can release the buffer(s) */ if (newbuf != buffer) ReleaseBuffer(newbuf); - ReleaseBuffer(buffer); + + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } + if (BufferIsValid(vmbuffer_new)) ReleaseBuffer(vmbuffer_new); if (BufferIsValid(vmbuffer)) @@ -4070,8 +4154,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, &lockmode, update_indexes, NULL); switch (result) { case TM_SelfModified: @@ -4134,12 +4218,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * tuples. 
* * Output parameters: - * *tuple: all fields filled in - * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *slot: BufferHeapTupleTableSlot filled with tuple * *tmfd: filled in failure cases (see below) * * Function results are the same as the ones for table_tuple_lock(). * + * If *slot already contains the target tuple, it takes advantage on that by + * skipping the ReadBuffer() call. + * * In the failure cases other than TM_Invisible, the routine fills * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, * if necessary), and t_cmax (the last only for TM_SelfModified, @@ -4150,15 +4236,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * See README.tuplock for a thorough explanation of this mechanism. */ TM_Result -heap_lock_tuple(Relation relation, HeapTuple tuple, +heap_lock_tuple(Relation relation, ItemPointer tid, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, TM_FailureData *tmfd) + bool follow_updates, TM_FailureData *tmfd) { TM_Result result; - ItemPointer tid = &(tuple->t_self); ItemId lp; Page page; + Buffer buffer; Buffer vmbuffer = InvalidBuffer; BlockNumber block; TransactionId xid, @@ -4170,8 +4255,24 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, bool skip_tuple_lock = false; bool have_tuple_lock = false; bool cleared_all_frozen = false; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + HeapTuple tuple = &bslot->base.tupdata; + + Assert(TTS_IS_BUFFERTUPLE(slot)); - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + /* Take advantage if slot already contains the relevant tuple */ + if (!TTS_EMPTY(slot) && + slot->tts_tableOid == relation->rd_id && + ItemPointerCompare(&slot->tts_tid, tid) == 0 && + BufferIsValid(bslot->buffer)) + { + buffer = bslot->buffer; + IncrBufferRefCount(buffer); + } + else + { + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + 
} block = ItemPointerGetBlockNumber(tid); /* @@ -4180,21 +4281,22 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * in the middle of changing this, so we'll need to recheck after we have * the lock. */ - if (PageIsAllVisible(BufferGetPage(*buffer))) + if (PageIsAllVisible(BufferGetPage(buffer))) visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = BufferGetPage(*buffer); + page = BufferGetPage(buffer); lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); Assert(ItemIdIsNormal(lp)); + tuple->t_self = *tid; tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); tuple->t_len = ItemIdGetLength(lp); tuple->t_tableOid = RelationGetRelid(relation); l3: - result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); + result = HeapTupleSatisfiesUpdate(tuple, cid, buffer); if (result == TM_Invisible) { @@ -4223,7 +4325,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, infomask2 = tuple->t_data->t_infomask2; ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* * If any subtransaction of the current top transaction already holds @@ -4375,12 +4477,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. @@ -4415,7 +4517,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. 
@@ -4443,7 +4545,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * No conflict, but if the xmax changed under us in the * meantime, start over. */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4455,7 +4557,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, } else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* if the xmax changed in the meantime, start over */ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || @@ -4483,7 +4585,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, TransactionIdIsCurrentTransactionId(xwait)) { /* ... but if the xmax changed in the meantime, start over */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4505,7 +4607,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (require_sleep && (result == TM_Updated || result == TM_Deleted)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } else if (require_sleep) @@ -4530,7 +4632,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } @@ -4556,7 +4658,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4596,7 +4698,7 @@ heap_lock_tuple(Relation relation, 
HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4622,12 +4724,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * xwait is done, but if xwait had just locked the tuple then some @@ -4649,7 +4751,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * don't check for this in the multixact case, because some * locker transactions might still be running. */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + UpdateXmaxHintBits(tuple->t_data, buffer, xwait); } } @@ -4708,9 +4810,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto l3; } @@ -4773,7 +4875,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, cleared_all_frozen = true; - MarkBufferDirty(*buffer); + MarkBufferDirty(buffer); /* * XLOG stuff. 
You might think that we don't need an XLOG record because @@ -4793,7 +4895,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, XLogRecPtr recptr; XLogBeginInsert(); - XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); xlrec.xmax = xid; @@ -4814,7 +4916,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, result = TM_Ok; out_locked: - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); out_unlocked: if (BufferIsValid(vmbuffer)) @@ -4832,6 +4934,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (have_tuple_lock) UnlockTupleTuplock(relation, tid, mode); + /* Put the target tuple to the slot */ + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + return result; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 5a17112c91e..6de8868a91c 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -45,6 +45,12 @@ #include "utils/builtins.h" #include "utils/rel.h" +static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid, + Snapshot snapshot, TupleTableSlot *slot, + CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd); + static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); @@ -298,23 +304,55 @@ heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, static TM_Result heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { + TM_Result result; + /* * Currently Deleting of index tuples are handled at vacuum, in case if * 
the storage itself is cleaning the dead tuples by itself, it is the * time to call the index tuple deletion also. */ - return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); + result = heap_delete(relation, tid, cid, crosscheck, options, + tmfd, changingPart, oldSlot); + + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * delete should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_delete(). + */ + result = heapam_tuple_lock(relation, tid, snapshot, + oldSlot, cid, LockTupleExclusive, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + + return result; } static TM_Result heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, - LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, + LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); @@ -324,8 +362,8 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - result = heap_update(relation, otid, tuple, cid, crosscheck, wait, - tmfd, lockmode, update_indexes); + result = heap_update(relation, otid, tuple, cid, crosscheck, options, + tmfd, lockmode, update_indexes, oldSlot); ItemPointerCopy(&tuple->t_self, &slot->tts_tid); /* @@ -352,6 +390,31 @@ 
heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, if (shouldFree) pfree(tuple); + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * update should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_update(). + */ + result = heapam_tuple_lock(relation, otid, snapshot, + oldSlot, cid, *lockmode, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + return result; } @@ -363,7 +426,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, { BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; - Buffer buffer; HeapTuple tuple = &bslot->base.tupdata; bool follow_updates; @@ -373,9 +435,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, Assert(TTS_IS_BUFFERTUPLE(slot)); tuple_lock_retry: - tuple->t_self = *tid; - result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, - follow_updates, &buffer, tmfd); + result = heap_lock_tuple(relation, tid, slot, cid, mode, wait_policy, + follow_updates, tmfd); if (result == TM_Updated && (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) @@ -383,8 +444,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* Should not encounter speculative tuple on recheck */ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); - ReleaseBuffer(buffer); - if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) { SnapshotData SnapshotDirty; @@ -406,6 +465,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, InitDirtySnapshot(SnapshotDirty); for (;;) { + Buffer 
buffer = InvalidBuffer; + if (ItemPointerIndicatesMovedPartitions(tid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), @@ -500,7 +561,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* * This is a live tuple, so try to lock it again. */ - ReleaseBuffer(buffer); + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); goto tuple_lock_retry; } @@ -511,7 +572,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, */ if (tuple->t_data == NULL) { - Assert(!BufferIsValid(buffer)); + ReleaseBuffer(buffer); return TM_Deleted; } @@ -564,9 +625,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - /* store in slot, transferring existing pin */ - ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); - return result; } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 771438c8cec..3e88cb82cf0 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -297,16 +297,23 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). 
*/ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) +simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; result = table_tuple_delete(rel, tid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + options, + &tmfd, false /* changingPart */ , + oldSlot); switch (result) { @@ -345,17 +352,24 @@ void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; LockTupleMode lockmode; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; result = table_tuple_update(rel, otid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + options, + &tmfd, &lockmode, update_indexes, + oldSlot); switch (result) { diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 8b1d3b99fe9..508f0ea10e4 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2779,8 +2779,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update) { @@ -2789,20 +2789,11 @@ ExecARDeleteTriggers(EState *estate, if ((trigdesc && trigdesc->trig_delete_after_row) || (transition_capture && 
transition_capture->tcs_delete_old_table)) { - TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); - - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) - GetTupleForTrigger(estate, - NULL, - relinfo, - tupleid, - LockTupleExclusive, - slot, - NULL, - NULL, - NULL); - else + /* + * Put the FDW old tuple to the slot. Otherwise, caller is expected + * to have old tuple already fetched to the slot. + */ + if (fdw_trigtuple != NULL) ExecForceStoreHeapTuple(fdw_trigtuple, slot, false); AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, @@ -3093,18 +3084,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, * Note: 'src_partinfo' and 'dst_partinfo', when non-NULL, refer to the source * and destination partitions, respectively, of a cross-partition update of * the root partitioned table mentioned in the query, given by 'relinfo'. - 'tupleid' in that case refers to the ctid of the "old" tuple in the source - partition, and 'newslot' contains the "new" tuple in the destination - partition. This interface allows to support the requirements of - ExecCrossPartitionUpdateForeignKey(); is_crosspart_update must be true in - that case. + 'oldslot' contains the "old" tuple in the source partition, and 'newslot' + contains the "new" tuple in the destination partition. This interface + allows to support the requirements of ExecCrossPartitionUpdateForeignKey(); + is_crosspart_update must be true in that case. */ void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, @@ -3123,29 +3113,14 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, * separately for DELETE and INSERT to capture transition table rows. 
* In such case, either old tuple or new tuple can be NULL. */ - TupleTableSlot *oldslot; - ResultRelInfo *tupsrc; - Assert((src_partinfo != NULL && dst_partinfo != NULL) || !is_crosspart_update); - tupsrc = src_partinfo ? src_partinfo : relinfo; - oldslot = ExecGetTriggerOldSlot(estate, tupsrc); - - if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) - GetTupleForTrigger(estate, - NULL, - tupsrc, - tupleid, - LockTupleExclusive, - oldslot, - NULL, - NULL, - NULL); - else if (fdw_trigtuple != NULL) + if (fdw_trigtuple != NULL) + { + Assert(oldslot); ExecForceStoreHeapTuple(fdw_trigtuple, oldslot, false); - else - ExecClearTuple(oldslot); + } AfterTriggerSaveEvent(estate, relinfo, src_partinfo, dst_partinfo, diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 25d2868744e..1ed369e5d7b 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -536,6 +536,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { List *recheckIndexes = NIL; TU_UpdateIndexes update_indexes; + TupleTableSlot *oldSlot = NULL; /* Compute stored generated columns */ if (rel->rd_att->constr && @@ -549,8 +550,12 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_after_row) + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, - &update_indexes); + &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, @@ -561,7 +566,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tid, NULL, slot, + NULL, oldSlot, slot, recheckIndexes, NULL, false); list_free(recheckIndexes); @@ 
-595,12 +600,18 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, if (!skip_tuple) { + TupleTableSlot *oldSlot = NULL; + + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_after_row) + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot); + simple_table_tuple_delete(rel, tid, estate->es_snapshot, oldSlot); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, - tid, NULL, NULL, false); + NULL, oldSlot, NULL, false); } } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 27b55334ed4..a45ae763139 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -141,7 +141,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, ItemPointer tupleid, - TupleTableSlot *oldslot, + TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, @@ -578,6 +578,10 @@ ExecInitInsertProjection(ModifyTableState *mtstate, resultRelInfo->ri_newTupleSlot = table_slot_create(resultRelInfo->ri_RelationDesc, &estate->es_tupleTable); + if (node->onConflictAction == ONCONFLICT_UPDATE) + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); /* Build ProjectionInfo if needed (it probably isn't). 
*/ if (need_projection) @@ -1167,7 +1171,7 @@ ExecInsert(ModifyTableContext *context, ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, NULL, - NULL, + resultRelInfo->ri_oldTupleSlot, slot, NULL, mtstate->mt_transition_capture, @@ -1347,7 +1351,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart) + ItemPointer tupleid, bool changingPart, int options, + TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1355,9 +1360,10 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, - changingPart); + changingPart, + oldSlot); } /* @@ -1369,7 +1375,8 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool changingPart) + ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; EState *estate = context->estate; @@ -1387,8 +1394,8 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, { ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, - NULL, NULL, mtstate->mt_transition_capture, + oldtuple, + slot, NULL, NULL, mtstate->mt_transition_capture, false); /* @@ -1399,10 +1406,30 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } /* AFTER ROW DELETE Triggers */ - ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, + ExecARDeleteTriggers(estate, resultRelInfo, oldtuple, slot, ar_delete_trig_tcs, changingPart); } +/* + * Initializes the tuple slot in a ResultRelInfo for DELETE action. 
+ * + * We mark 'projectNewInfoValid' even though the projections themselves + * are not initialized here. + */ +static void +ExecInitDeleteTupleSlot(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + EState *estate = mtstate->ps.state; + + Assert(!resultRelInfo->ri_projectNewInfoValid); + + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + resultRelInfo->ri_projectNewInfoValid = true; +} + /* ---------------------------------------------------------------- * ExecDelete * @@ -1430,6 +1457,7 @@ ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *oldSlot, bool processReturning, bool changingPart, bool canSetTag, @@ -1493,6 +1521,11 @@ ExecDelete(ModifyTableContext *context, } else { + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * delete the tuple * @@ -1503,7 +1536,8 @@ ExecDelete(ModifyTableContext *context, * transaction-snapshot mode transactions. */ ldelete: - result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart); + result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart, + options, oldSlot); if (tmresult) *tmresult = result; @@ -1550,7 +1584,6 @@ ExecDelete(ModifyTableContext *context, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; if (IsolationUsesXactSnapshot()) @@ -1559,87 +1592,29 @@ ExecDelete(ModifyTableContext *context, errmsg("could not serialize access due to concurrent update"))); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. 
*/ - EvalPlanQualBegin(context->epqstate); - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); - - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - LockTupleExclusive, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; - switch (result) + /* + * If requested, skip delete and pass back the updated + * row. + */ + if (epqreturnslot) { - case TM_Ok: - Assert(context->tmfd.traversed); - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* - * If requested, skip delete and pass back the - * updated row. - */ - if (epqreturnslot) - { - *epqreturnslot = epqslot; - return NULL; - } - else - goto ldelete; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously updated by this - * command, ignore the delete, otherwise error - * out. - * - * See also TM_SelfModified response to - * table_tuple_delete() above. 
- */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - default: - - /* - * TM_Invisible should be impossible because we're - * waiting for updated row versions, and would - * already have errored out if the first version - * is invisible. - * - * TM_Updated should be impossible, because we're - * locking the latest version via - * TUPLE_LOCK_FLAG_FIND_LAST_VERSION. - */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; + *epqreturnslot = epqslot; + return NULL; } - - Assert(false); - break; + else + goto ldelete; } case TM_Deleted: @@ -1673,7 +1648,8 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, changingPart); + ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, + oldSlot, changingPart); /* Process RETURNING if present and if requested */ if (processReturning && resultRelInfo->ri_projectReturning) @@ -1691,17 +1667,13 @@ ExecDelete(ModifyTableContext *context, } else { + /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); if (oldtuple != NULL) - { ExecForceStoreHeapTuple(oldtuple, slot, false); - } else - { - if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid, - SnapshotAny, slot)) - elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); - } + ExecCopySlot(slot, oldSlot); + Assert(!TupIsNull(slot)); } rslot = ExecProcessReturning(resultRelInfo, slot, context->planSlot); @@ -1801,12 +1773,16 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, MemoryContextSwitchTo(oldcxt); } 
+ /* Make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + /* * Row movement, part 1. Delete the tuple, but skip RETURNING processing. * We want to return rows from INSERT. */ ExecDelete(context, resultRelInfo, - tupleid, oldtuple, + tupleid, oldtuple, resultRelInfo->ri_oldTupleSlot, false, /* processReturning */ true, /* changingPart */ false, /* canSetTag */ @@ -1847,21 +1823,13 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, return true; else { - /* Fetch the most recent version of old tuple. */ - TupleTableSlot *oldSlot; - - /* ... but first, make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(mtstate, resultRelInfo); - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - /* and project the new tuple to retry the UPDATE with */ + /* + * ExecDelete already fetches the most recent version of old tuple + * to resultRelInfo->ri_oldTupleSlot. So, just project the new + * tuple to retry the UPDATE with. 
+ */ *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot, - oldSlot); + resultRelInfo->ri_oldTupleSlot); return false; } } @@ -1980,7 +1948,8 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag, UpdateContext *updateCxt) + bool canSetTag, int options, TupleTableSlot *oldSlot, + UpdateContext *updateCxt) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2073,7 +2042,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ExecCrossPartitionUpdateForeignKey(context, resultRelInfo, insert_destrel, - tupleid, slot, + tupleid, + resultRelInfo->ri_oldTupleSlot, inserted_tuple); return TM_Ok; @@ -2116,9 +2086,10 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, &updateCxt->lockmode, - &updateCxt->updateIndexes); + &updateCxt->updateIndexes, + oldSlot); if (result == TM_Ok) updateCxt->updated = true; @@ -2134,7 +2105,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static void ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, ResultRelInfo *resultRelInfo, ItemPointer tupleid, - HeapTuple oldtuple, TupleTableSlot *slot) + HeapTuple oldtuple, TupleTableSlot *slot, + TupleTableSlot *oldSlot) { ModifyTableState *mtstate = context->mtstate; List *recheckIndexes = NIL; @@ -2150,7 +2122,7 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, slot, + oldtuple, oldSlot, slot, recheckIndexes, mtstate->operation == CMD_INSERT ? 
mtstate->mt_oc_transition_capture : @@ -2239,7 +2211,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, /* Perform the root table's triggers. */ ExecARUpdateTriggers(context->estate, rootRelInfo, sourcePartInfo, destPartInfo, - tupleid, NULL, newslot, NIL, NULL, true); + NULL, oldslot, newslot, NIL, NULL, true); } /* ---------------------------------------------------------------- @@ -2261,6 +2233,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, * NULL when the foreign table has no relevant triggers. * * slot contains the new tuple value to be stored. + * oldSlot is the slot to store the old tuple. * planSlot is the output of the ModifyTable's subplan; we use it * to access values from other input tables (for RETURNING), * row-ID junk columns, etc. @@ -2273,7 +2246,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag) + TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2326,6 +2299,11 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } else { + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!locked && !IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * If we generate a new candidate tuple after EvalPlanQual testing, we * must loop back here to try again. 
(We don't need to redo triggers, @@ -2335,7 +2313,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ redo_act: result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot, - canSetTag, &updateCxt); + canSetTag, options, oldSlot, &updateCxt); /* * If ExecUpdateAct reports that a cross-partition update was done, @@ -2386,88 +2364,30 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; - TupleTableSlot *oldSlot; if (IsolationUsesXactSnapshot()) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + Assert(!locked); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. */ - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); - - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - updateCxt.lockmode, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) - { - case TM_Ok: - Assert(context->tmfd.traversed); - - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* Make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(context->mtstate, - resultRelInfo); - - /* Fetch the most recent version of old tuple. 
*/ - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - slot = ExecGetUpdateNewTuple(resultRelInfo, - epqslot, oldSlot); - goto redo_act; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously modified by - * this command, ignore the redundant update, - * otherwise error out. - * - * See also TM_SelfModified response to - * table_tuple_update() above. - */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be updated was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - default: - /* see table_tuple_lock call in ExecDelete() */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; - } + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... 
*/ + return NULL; + slot = ExecGetUpdateNewTuple(resultRelInfo, + epqslot, + oldSlot); + goto redo_act; } break; @@ -2491,7 +2411,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, (estate->es_processed)++; ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, - slot); + slot, oldSlot); /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) @@ -2709,7 +2629,8 @@ ExecOnConflictUpdate(ModifyTableContext *context, *returning = ExecUpdate(context, resultRelInfo, conflictTid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, - canSetTag); + existing, + canSetTag, true); /* * Clear out existing tuple, as there might not be another conflict among @@ -2913,7 +2834,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, break; /* concurrent update/delete */ } result = ExecUpdateAct(context, resultRelInfo, tupleid, NULL, - newslot, canSetTag, &updateCxt); + newslot, canSetTag, TABLE_MODIFY_WAIT, NULL, + &updateCxt); /* * As in ExecUpdate(), if ExecUpdateAct() reports that a @@ -2931,7 +2853,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok && updateCxt.updated) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot); + tupleid, NULL, newslot, + resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } break; @@ -2945,11 +2868,12 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return true; /* "do nothing" */ break; /* concurrent update/delete */ } - result = ExecDeleteAct(context, resultRelInfo, tupleid, false); + result = ExecDeleteAct(context, resultRelInfo, tupleid, false, + TABLE_MODIFY_WAIT, NULL); if (result == TM_Ok) { ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, - false); + resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } break; @@ -3881,12 +3805,18 @@ ExecModifyTable(PlanState *pstate) /* Now apply the update. 
*/ slot = ExecUpdate(&context, resultRelInfo, tupleid, oldtuple, - slot, node->canSetTag); + slot, resultRelInfo->ri_oldTupleSlot, + node->canSetTag, false); break; case CMD_DELETE: + /* Initialize slot for DELETE to fetch the old tuple */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitDeleteTupleSlot(node, resultRelInfo); + slot = ExecDelete(&context, resultRelInfo, tupleid, oldtuple, - true, false, node->canSetTag, NULL, NULL, NULL); + resultRelInfo->ri_oldTupleSlot, true, false, + node->canSetTag, NULL, NULL, NULL); break; case CMD_MERGE: diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index c7278219b24..72b4f8e7634 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -242,19 +242,22 @@ extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate); extern TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, bool changingPart); + CommandId cid, Snapshot crosscheck, int options, + struct TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot); extern void heap_finish_speculative(Relation relation, ItemPointer tid); extern void heap_abort_speculative(Relation relation, ItemPointer tid); extern TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, struct TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); -extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, struct TM_FailureData *tmfd); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); +extern TM_Result heap_lock_tuple(Relation relation, ItemPointer tid, + TupleTableSlot *slot, + 
CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool follow_updates, + struct TM_FailureData *tmfd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 5e195fd292f..78106f3a100 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -259,6 +259,11 @@ typedef struct TM_IndexDeleteOp /* Follow update chain and lock latest version of tuple */ #define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1) +/* "options" flag bits for table_tuple_update and table_tuple_delete */ +#define TABLE_MODIFY_WAIT 0x0001 +#define TABLE_MODIFY_FETCH_OLD_TUPLE 0x0002 +#define TABLE_MODIFY_LOCK_UPDATED 0x0004 + /* Typedef for callback function for table_index_build_scan */ typedef void (*IndexBuildCallback) (Relation index, @@ -528,9 +533,10 @@ typedef struct TableAmRoutine CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, - bool changingPart); + bool changingPart, + TupleTableSlot *oldSlot); /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, @@ -539,10 +545,11 @@ typedef struct TableAmRoutine CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, @@ -1457,7 +1464,7 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, } /* - * Delete a tuple. + * Delete a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless prepared to deal with * concurrent-update conditions. Use simple_table_tuple_delete instead. 
@@ -1468,11 +1475,21 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * cid - delete command ID (used for visibility test, and stored into * cmax if successful) * crosscheck - if not InvalidSnapshot, also check tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the delete is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * changingPart - true iff the tuple is being moved to another partition * table due to an update of the partition key. Otherwise, false. + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. * * Normal, successful return value is TM_Ok, which means we did actually * delete it. Failure return codes are TM_SelfModified, TM_Updated, and @@ -1484,16 +1501,18 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, */ static inline TM_Result table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { return rel->rd_tableam->tuple_delete(rel, tid, cid, snapshot, crosscheck, - wait, tmfd, changingPart); + options, tmfd, changingPart, + oldSlot); } /* - * Update a tuple. + * Update a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless you are prepared to deal with * concurrent-update conditions. 
Use simple_table_tuple_update instead. @@ -1505,13 +1524,23 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * cid - update command ID (used for visibility test, and stored into * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * lockmode - filled with lock mode acquired on tuple * update_indexes - in success cases this is set to true if new index entries * are required for this tuple - * + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. + * Normal, successful return value is TM_Ok, which means we did actually * update it. Failure return codes are TM_SelfModified, TM_Updated, and * TM_BeingModified (the last only possible if wait == false). 
@@ -1529,13 +1558,15 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, static inline TM_Result table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { return rel->rd_tableam->tuple_update(rel, otid, slot, cid, snapshot, crosscheck, - wait, tmfd, - lockmode, update_indexes); + options, tmfd, + lockmode, update_indexes, + oldSlot); } /* @@ -2051,10 +2082,12 @@ table_scan_sample_next_tuple(TableScanDesc scan, extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, - Snapshot snapshot); + Snapshot snapshot, + TupleTableSlot *oldSlot); extern void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* ---------------------------------------------------------------------------- diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 430e3ca7ddf..4903b4b7bc2 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -216,8 +216,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, TM_FailureData *tmfd); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update); extern bool ExecIRDeleteTriggers(EState *estate, @@ -240,8 +240,8 @@ extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot 
*oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, From d26db9e0f12c8509684a8fb7b7660ac00631def2 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 22 Mar 2023 16:47:09 -0700 Subject: [PATCH 06/45] Add EvalPlanQual delete returning isolation test Author: Andres Freund Reviewed-by: Pavel Borisov Discussion: https://www.postgresql.org/message-id/flat/CAPpHfdua-YFw3XTprfutzGp28xXLigFtzNbuFY8yPhqeq6X5kg%40mail.gmail.com --- .../isolation/expected/eval-plan-qual-2.out | 37 +++++++++++++++++++ src/test/isolation/isolation_schedule | 1 + .../isolation/specs/eval-plan-qual-2.spec | 30 +++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 src/test/isolation/expected/eval-plan-qual-2.out create mode 100644 src/test/isolation/specs/eval-plan-qual-2.spec diff --git a/src/test/isolation/expected/eval-plan-qual-2.out b/src/test/isolation/expected/eval-plan-qual-2.out new file mode 100644 index 00000000000..117a3d3be8d --- /dev/null +++ b/src/test/isolation/expected/eval-plan-qual-2.out @@ -0,0 +1,37 @@ +Parsed test spec with 3 sessions + +starting permutation: read_u wx2 wb1 c2 c1 read_u read +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 600| 1200 +savings | 600| 1200 +(2 rows) + +step wx2: UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; +balance +------- + 1050 +(1 row) + +step wb1: DELETE FROM accounts WHERE balance = 600 RETURNING *; +step c2: COMMIT; +step wb1: <... 
completed> +accountid|balance|balance2 +---------+-------+-------- +savings | 600| 1200 +(1 row) + +step c1: COMMIT; +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + +step read: SELECT * FROM accounts ORDER BY accountid; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 9b0bb8a29b3..124c170746c 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -36,6 +36,7 @@ test: fk-partitioned-2 test: fk-snapshot test: subxid-overflow test: eval-plan-qual +test: eval-plan-qual-2 test: eval-plan-qual-trigger test: inplace-inval test: intra-grant-inplace diff --git a/src/test/isolation/specs/eval-plan-qual-2.spec b/src/test/isolation/specs/eval-plan-qual-2.spec new file mode 100644 index 00000000000..30447bef24a --- /dev/null +++ b/src/test/isolation/specs/eval-plan-qual-2.spec @@ -0,0 +1,30 @@ +setup +{ + CREATE TABLE accounts (accountid text PRIMARY KEY, balance numeric not null, + balance2 numeric GENERATED ALWAYS AS (balance * 2) STORED); + INSERT INTO accounts VALUES ('checking', 600), ('savings', 600); +} + +teardown +{ + DROP TABLE accounts; +} + +session s1 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wb1 { DELETE FROM accounts WHERE balance = 600 RETURNING *; } +step c1 { COMMIT; } + +session s2 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wx2 { UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; } +step c2 { COMMIT; } + +session s3 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step read { SELECT * FROM accounts ORDER BY accountid; } +step read_u { SELECT * FROM accounts; } + +teardown { COMMIT; } + +permutation read_u wx2 wb1 c2 c1 read_u read From 89e4d47e8ee5d5a5988fbe18fc506ac7573d7545 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 
13 Dec 2021 00:04:21 +0300 Subject: [PATCH 07/45] Improvements to TableAM API --- src/backend/access/common/detoast.c | 20 +- src/backend/access/common/heaptuple.c | 4 + src/backend/access/common/reloptions.c | 6 +- src/backend/access/heap/heapam_handler.c | 343 ++++++++++++++++++- src/backend/access/table/tableam.c | 4 +- src/backend/access/table/tableamapi.c | 23 +- src/backend/catalog/aclchk.c | 2 +- src/backend/commands/analyze.c | 14 +- src/backend/commands/tablecmds.c | 53 +-- src/backend/commands/trigger.c | 237 ++++++++++--- src/backend/executor/execExprInterp.c | 4 +- src/backend/executor/execMain.c | 28 +- src/backend/executor/execReplication.c | 10 +- src/backend/executor/nodeLockRows.c | 17 +- src/backend/executor/nodeModifyTable.c | 402 +++++++---------------- src/backend/executor/nodeTidscan.c | 2 +- src/backend/nodes/read.c | 11 + src/backend/optimizer/plan/planner.c | 16 +- src/backend/optimizer/prep/preptlist.c | 20 +- src/backend/optimizer/util/appendinfo.c | 32 +- src/backend/optimizer/util/inherit.c | 48 ++- src/backend/parser/parse_relation.c | 13 + src/backend/postmaster/autovacuum.c | 4 +- src/backend/rewrite/rewriteHandler.c | 1 + src/backend/utils/adt/ri_triggers.c | 11 +- src/backend/utils/cache/relcache.c | 38 ++- src/backend/utils/sort/tuplestore.c | 30 ++ src/include/access/reloptions.h | 2 + src/include/access/sysattr.h | 3 +- src/include/access/tableam.h | 185 +++++++---- src/include/commands/trigger.h | 4 +- src/include/commands/vacuum.h | 3 + src/include/foreign/fdwapi.h | 6 +- src/include/nodes/execnodes.h | 3 + src/include/nodes/parsenodes.h | 1 + src/include/nodes/plannodes.h | 4 +- src/include/nodes/primnodes.h | 7 + src/include/nodes/readfuncs.h | 1 + src/include/optimizer/appendinfo.h | 5 + src/include/optimizer/planner.h | 3 +- src/include/utils/tuplestore.h | 3 + src/include/varatt.h | 2 + 42 files changed, 1100 insertions(+), 525 deletions(-) diff --git a/src/backend/access/common/detoast.c 
b/src/backend/access/common/detoast.c index 8af80c80865..f54dcb03517 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -28,6 +28,8 @@ static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 slicelength); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); +static ToastFunc o_detoast_func = NULL; + /* ---------- * detoast_external_attr - * @@ -222,7 +224,14 @@ detoast_attr_slice(struct varlena *attr, else if (pg_add_s32_overflow(sliceoffset, slicelength, &slicelimit)) slicelength = slicelimit = -1; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + Assert(o_detoast_func != NULL); + preslice = o_detoast_func(attr); + if (preslice == NULL) + elog(ERROR, "unexpected NULL detoast result"); + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; @@ -331,8 +340,6 @@ detoast_attr_slice(struct varlena *attr, return result; } -static ToastFunc o_detoast_func = NULL; - void register_o_detoast_func(ToastFunc func) { @@ -633,7 +640,12 @@ toast_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->toasted_size - VARHDRSZ; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { /* * Attribute is stored externally - return the extsize whether diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 6bedbdf07ff..75d9c272177 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -756,6 +756,10 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) case TableOidAttributeNumber: result = ObjectIdGetDatum(tup->t_tableOid); break; + case RowIdAttributeNumber: + *isnull = true; + result = 0; + 
break; default: elog(ERROR, "invalid attnum: %d", attnum); result = 0; /* keep compiler quiet */ diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 469de9bb49f..a19499af976 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/spgist_private.h" +#include "access/tableam.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include "commands/tablespace.h" @@ -1379,7 +1380,7 @@ untransformRelOptions(Datum options) */ bytea * extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, - amoptions_function amoptions) + const TableAmRoutine *tableam, amoptions_function amoptions) { bytea *options; bool isnull; @@ -1401,7 +1402,8 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - options = heap_reloptions(classForm->relkind, datum, false); + options = tableam_reloptions(tableam, classForm->relkind, + datum, false); break; case RELKIND_PARTITIONED_TABLE: options = partitioned_table_reloptions(datum, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 6de8868a91c..ea6759e8a7f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -23,6 +23,7 @@ #include "access/heapam.h" #include "access/heaptoast.h" #include "access/multixact.h" +#include "access/reloptions.h" #include "access/rewriteheap.h" #include "access/syncscan.h" #include "access/tableam.h" @@ -45,7 +46,7 @@ #include "utils/builtins.h" #include "utils/rel.h" -static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid, +static TM_Result heapam_tuple_lock(Relation relation, Datum tid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, @@ -75,6 +76,20 @@ 
heapam_slot_callbacks(Relation relation) return &TTSOpsBufferHeapTuple; } +static RowRefType +heapam_get_row_ref_type(Relation rel) +{ + return ROW_REF_TID; +} + +static void +heapam_free_rd_amcache(Relation rel) +{ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; +} + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM @@ -184,7 +199,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, static bool heapam_fetch_row_version(Relation relation, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -193,7 +208,7 @@ heapam_fetch_row_version(Relation relation, Assert(TTS_IS_BUFFERTUPLE(slot)); - bslot->base.tupdata.t_self = *tid; + bslot->base.tupdata.t_self = *DatumGetItemPointer(tupleid); if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) { /* store in slot, transferring existing pin */ @@ -243,7 +258,7 @@ heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, * ---------------------------------------------------------------------------- */ -static void +static TupleTableSlot * heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, int options, BulkInsertState bistate) { @@ -260,6 +275,8 @@ heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, if (shouldFree) pfree(tuple); + + return slot; } static void @@ -302,13 +319,285 @@ heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, pfree(tuple); } +/* + * ExecCheckTupleVisible -- verify tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. 
+ */ +static void +ExecCheckTupleVisible(EState *estate, + Relation rel, + TupleTableSlot *slot) +{ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) + { + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + /* + * We should not raise a serialization failure if the conflict is + * against a tuple inserted by our own transaction, even if it's not + * visible to our snapshot. (This would happen, for example, if + * conflicting keys are proposed for insertion in a single command.) + */ + if (!TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } +} + +/* + * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() + */ +static void +ExecCheckTIDVisible(EState *estate, + Relation rel, + ItemPointer tid, + TupleTableSlot *tempSlot) +{ + /* Redundantly check isolation level */ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_fetch_row_version(rel, PointerGetDatum(tid), + SnapshotAny, tempSlot)) + elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); + ExecCheckTupleVisible(estate, rel, tempSlot); + ExecClearTuple(tempSlot); +} + +static inline TupleTableSlot * +heapam_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + uint32 specToken; + ItemPointerData conflictTid; + bool specConflict; + List *recheckIndexes = NIL; + + while (true) + { + specConflict = false; + if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, 
&conflictTid, + arbiterIndexes)) + { + if (lockedSlot) + { + TM_Result test; + TM_FailureData tmfd; + Datum xminDatum; + TransactionId xmin; + bool isnull; + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + + /* + * Lock tuple for update. Don't follow updates when tuple cannot be + * locked without doing so. A row locking conflict here means our + * previous conclusion that the tuple is conclusively committed is not + * true anymore. + */ + test = table_tuple_lock(rel, PointerGetDatum(&conflictTid), + estate->es_snapshot, + lockedSlot, estate->es_output_cid, + lockmode, LockWaitBlock, 0, + &tmfd); + switch (test) + { + case TM_Ok: + /* success! */ + break; + + case TM_Invisible: + + /* + * This can occur when a just inserted tuple is updated again in + * the same command. E.g. because multiple rows with the same + * conflicting key values are inserted. + * + * This is somewhat similar to the ExecUpdate() TM_SelfModified + * case. We do not want to proceed because it would lead to the + * same row being updated a second time in some unspecified order, + * and in contrast to plain UPDATEs there's no historical behavior + * to break. + * + * It is the user's responsibility to prevent this situation from + * occurring. These problems are why the SQL standard similarly + * specifies that for SQL MERGE, an exception must be raised in + * the event of an attempt to update the same row twice. 
+ */ + xminDatum = slot_getsysattr(lockedSlot, + MinTransactionIdAttributeNumber, + &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + if (TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + /* translator: %s is a SQL command name */ + errmsg("%s command cannot affect row a second time", + "ON CONFLICT DO UPDATE"), + errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); + + /* This shouldn't happen */ + elog(ERROR, "attempted to lock invisible tuple"); + break; + + case TM_SelfModified: + + /* + * This state should never be reached. As a dirty snapshot is used + * to find conflicting tuples, speculative insertion wouldn't have + * seen this row to conflict with. + */ + elog(ERROR, "unexpected self-updated tuple"); + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + + /* + * Tell caller to try again from the very start. + * + * It does not make sense to use the usual EvalPlanQual() style + * loop here, as the new version of the row might not conflict + * anymore, or the conflicting tuple has actually been deleted. 
+ */ + ExecClearTuple(lockedSlot); + return false; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + + /* see TM_Updated case */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + ExecClearTuple(lockedSlot); + return false; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + } + + /* Success, the tuple is locked. */ + + /* + * Verify that the tuple is visible to our MVCC snapshot if the current + * isolation level mandates that. + * + * It's not sufficient to rely on the check within ExecUpdate() as e.g. + * CONFLICT ... WHERE clause may prevent us from reaching that. + * + * This means we only ever continue when a new command in the current + * transaction could see the row, even though in READ COMMITTED mode the + * tuple will not be visible according to the current statement's + * snapshot. This is in line with the way UPDATE deals with newer tuple + * versions. + */ + ExecCheckTupleVisible(estate, rel, lockedSlot); + return NULL; + } + else + { + ExecCheckTIDVisible(estate, rel, &conflictTid, tempSlot); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. 
+ */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + + /* insert the tuple, with the speculative token */ + heapam_tuple_insert_speculative(rel, slot, + estate->es_output_cid, + 0, + NULL, + specToken); + + /* insert index entries for tuple */ + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, true, + &specConflict, + arbiterIndexes, + false); + + /* adjust the tuple's state accordingly */ + heapam_tuple_complete_speculative(rel, slot, + specToken, !specConflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). + */ + if (specConflict) + { + list_free(recheckIndexes); + CHECK_FOR_INTERRUPTS(); + continue; + } + + return slot; + } +} + static TM_Result -heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, +heapam_tuple_delete(Relation relation, Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, bool changingPart, TupleTableSlot *oldSlot) { TM_Result result; + ItemPointer tid = DatumGetItemPointer(tupleid); /* * Currently Deleting of index tuples are handled at vacuum, in case if @@ -331,7 +620,7 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, * heapam_tuple_lock() will take advantage of tuple loaded into * oldSlot by heap_delete(). */ - result = heapam_tuple_lock(relation, tid, snapshot, + result = heapam_tuple_lock(relation, tupleid, snapshot, oldSlot, cid, LockTupleExclusive, (options & TABLE_MODIFY_WAIT) ? 
LockWaitBlock : @@ -348,7 +637,7 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, static TM_Result -heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, +heapam_tuple_update(Relation relation, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, @@ -357,6 +646,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); TM_Result result; + ItemPointer otid = DatumGetItemPointer(tupleid); /* Update the tuple with table oid */ slot->tts_tableOid = RelationGetRelid(relation); @@ -403,7 +693,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, * heapam_tuple_lock() will take advantage of tuple loaded into * oldSlot by heap_update(). */ - result = heapam_tuple_lock(relation, otid, snapshot, + result = heapam_tuple_lock(relation, tupleid, snapshot, oldSlot, cid, *lockmode, (options & TABLE_MODIFY_WAIT) ? 
LockWaitBlock : @@ -419,7 +709,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, } static TM_Result -heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, +heapam_tuple_lock(Relation relation, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) @@ -427,6 +717,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; HeapTuple tuple = &bslot->base.tupdata; + ItemPointer tid = DatumGetItemPointer(tupleid); bool follow_updates; follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; @@ -2594,6 +2885,29 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +static bool +heapam_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + return TransactionIdIsCurrentTransactionId(xmin); +} + +static bytea * +heapam_reloptions(char relkind, Datum reloptions, bool validate) +{ + if (relkind == RELKIND_RELATION || + relkind == RELKIND_TOASTVALUE || + relkind == RELKIND_MATVIEW) + return heap_reloptions(relkind, reloptions, validate); + + return NULL; +} /* ------------------------------------------------------------------------ * Definition of the heap table access method. 
@@ -2604,6 +2918,8 @@ static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, .slot_callbacks = heapam_slot_callbacks, + .get_row_ref_type = heapam_get_row_ref_type, + .free_rd_amcache = heapam_free_rd_amcache, .scan_begin = heap_beginscan, .scan_end = heap_endscan, @@ -2623,8 +2939,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, - .tuple_insert_speculative = heapam_tuple_insert_speculative, - .tuple_complete_speculative = heapam_tuple_complete_speculative, + .tuple_insert_with_arbiter = heapam_tuple_insert_with_arbiter, .multi_insert = heap_multi_insert, .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, @@ -2656,7 +2971,11 @@ static const TableAmRoutine heapam_methods = { .scan_bitmap_next_block = heapam_scan_bitmap_next_block, .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, - .scan_sample_next_tuple = heapam_scan_sample_next_tuple + .scan_sample_next_tuple = heapam_scan_sample_next_tuple, + + .tuple_is_current = heapam_tuple_is_current, + + .reloptions = heapam_reloptions }; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 3e88cb82cf0..c8329db8f34 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -308,7 +308,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_delete(rel, tid, + result = table_tuple_delete(rel, PointerGetDatum(tid), GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, @@ -364,7 +364,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, otid, slot, + result = table_tuple_update(rel, PointerGetDatum(otid), slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, 
options, diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index d7798b6afb6..26aca18dc50 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -75,8 +75,7 @@ GetTableAmRoutine(Oid amhandler) * Could be made optional, but would require throwing error during * parse-analysis. */ - Assert(routine->tuple_insert_speculative != NULL); - Assert(routine->tuple_complete_speculative != NULL); + Assert(routine->tuple_insert_with_arbiter != NULL); Assert(routine->multi_insert != NULL); Assert(routine->tuple_delete != NULL); @@ -104,9 +103,29 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); + Assert(routine->tuple_is_current != NULL); + return routine; } +const TableAmRoutine * +GetTableAmRoutineByAmOid(Oid amoid) +{ + HeapTuple ht_am; + Form_pg_am amrec; + const TableAmRoutine *tableam = NULL; + + ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(amoid)); + if (!HeapTupleIsValid(ht_am)) + elog(ERROR, "cache lookup failed for access method %u", + amoid); + amrec = (Form_pg_am)GETSTRUCT(ht_am); + + tableam = GetTableAmRoutine(amrec->amhandler); + ReleaseSysCache(ht_am); + return tableam; +} + /* check_hook: validate new default_table_access_method */ bool check_default_table_access_method(char **newval, void **extra, GucSource source) diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index a2aad09e6a0..ac39a2c4c0a 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -1617,7 +1617,7 @@ expand_all_col_privileges(Oid table_oid, Form_pg_class classForm, AttrNumber curr_att; Assert(classForm->relnatts - FirstLowInvalidHeapAttributeNumber < num_col_privileges); - for (curr_att = FirstLowInvalidHeapAttributeNumber + 1; + for (curr_att = FirstLowInvalidHeapAttributeNumber + 2; curr_att <= classForm->relnatts; curr_att++) { diff --git a/src/backend/commands/analyze.c 
b/src/backend/commands/analyze.c index bda364552ca..cd4a16a5572 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -97,9 +97,6 @@ static void compute_index_stats(Relation onerel, double totalrows, MemoryContext col_context); static VacAttrStats *examine_attribute(Relation onerel, int attnum, Node *index_expr); -static int acquire_sample_rows(Relation onerel, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, double *totaldeadrows); static int compare_rows(const void *a, const void *b, void *arg); static int acquire_inherited_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, @@ -201,10 +198,7 @@ analyze_rel(Oid relid, RangeVar *relation, if (onerel->rd_rel->relkind == RELKIND_RELATION || onerel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so we'll use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - /* Also get regular table's size */ - relpages = RelationGetNumberOfBlocks(onerel); + table_analyze(onerel, &acquirefunc, &relpages); } else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { @@ -1133,7 +1127,7 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) * block. The previous sampling method put too much credence in the row * density near the start of the table. 
*/ -static int +int acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) @@ -1460,9 +1454,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, if (childrel->rd_rel->relkind == RELKIND_RELATION || childrel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - relpages = RelationGetNumberOfBlocks(childrel); + table_analyze(childrel, &acquirefunc, &relpages); } else if (childrel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b4eeb2523a2..98f4602f449 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -684,6 +684,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, LOCKMODE parentLockmode; const char *accessMethod = NULL; Oid accessMethodId = InvalidOid; + const TableAmRoutine *tableam = NULL; /* * Truncate relname to appropriate length (probably a waste of time, as @@ -819,6 +820,26 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, if (!OidIsValid(ownerId)) ownerId = GetUserId(); + /* + * If the statement hasn't specified an access method, but we're defining + * a type of relation that needs one, use the default. + */ + if (stmt->accessMethod != NULL) + { + accessMethod = stmt->accessMethod; + + if (partitioned) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("specifying a table access method is not supported on a partitioned table"))); + } + else if (RELKIND_HAS_TABLE_AM(relkind)) + accessMethod = default_table_access_method; + + /* look up the access method, verify it is for a table */ + if (accessMethod != NULL) + accessMethodId = get_table_am_oid(accessMethod, false); + /* * Parse and validate reloptions, if any. 
*/ @@ -827,6 +848,12 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, switch (relkind) { + case RELKIND_RELATION: + case RELKIND_TOASTVALUE: + case RELKIND_MATVIEW: + tableam = GetTableAmRoutineByAmOid(accessMethodId); + (void) tableam_reloptions(tableam, relkind, reloptions, true); + break; case RELKIND_VIEW: (void) view_reloptions(reloptions, true); break; @@ -835,6 +862,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, break; default: (void) heap_reloptions(relkind, reloptions, true); + break; } if (stmt->ofTypename) @@ -938,26 +966,6 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, attr->attstorage = GetAttributeStorage(attr->atttypid, colDef->storage_name); } - /* - * If the statement hasn't specified an access method, but we're defining - * a type of relation that needs one, use the default. - */ - if (stmt->accessMethod != NULL) - { - accessMethod = stmt->accessMethod; - - if (partitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("specifying a table access method is not supported on a partitioned table"))); - } - else if (RELKIND_HAS_TABLE_AM(relkind)) - accessMethod = default_table_access_method; - - /* look up the access method, verify it is for a table */ - if (accessMethod != NULL) - accessMethodId = get_table_am_oid(accessMethod, false); - /* * Create the relation. 
Inherited defaults and constraints are passed in * for immediate handling --- since they don't need parsing, they can be @@ -6136,8 +6144,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* Write the tuple out to the new relation */ if (newrel) + { table_tuple_insert(newrel, insertslot, mycid, ti_options, bistate); + } ResetExprContext(econtext); @@ -14435,7 +14445,8 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - (void) heap_reloptions(rel->rd_rel->relkind, newOptions, true); + (void) table_reloptions(rel, rel->rd_rel->relkind, + newOptions, true); break; case RELKIND_PARTITIONED_TABLE: (void) partitioned_table_reloptions(newOptions, true); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 508f0ea10e4..b6f6ebaa624 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -83,7 +83,7 @@ static void SetTriggerFlags(TriggerDesc *trigdesc, Trigger *trigger); static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, TupleTableSlot **epqslot, @@ -2688,7 +2688,7 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -2702,7 +2702,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, bool should_free = false; int i; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -2930,7 +2930,7 @@ ExecASUpdateTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRUpdateTriggers(EState 
*estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, @@ -2950,7 +2950,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, /* Determine lock mode to use */ lockmode = ExecUpdateLockMode(estate, relinfo); - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -3267,7 +3267,7 @@ static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, TupleTableSlot **epqslot, @@ -3292,7 +3292,9 @@ GetTupleForTrigger(EState *estate, */ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(relation, tid, estate->es_snapshot, oldslot, + + test = table_tuple_lock(relation, tupleid, + estate->es_snapshot, oldslot, estate->es_output_cid, lockmode, LockWaitBlock, lockflags, @@ -3388,8 +3390,8 @@ GetTupleForTrigger(EState *estate, * We expect the tuple to be present, thus very simple error handling * suffices. */ - if (!table_tuple_fetch_row_version(relation, tid, SnapshotAny, - oldslot)) + if (!table_tuple_fetch_row_version(relation, tupleid, + SnapshotAny, oldslot)) elog(ERROR, "failed to fetch tuple for trigger"); } @@ -3595,18 +3597,22 @@ typedef SetConstraintStateData *SetConstraintState; * cycles. So we need only ensure that ats_firing_id is zero when attaching * a new event to an existing AfterTriggerSharedData record. 
*/ -typedef uint32 TriggerFlags; +typedef uint64 TriggerFlags; -#define AFTER_TRIGGER_OFFSET 0x07FFFFFF /* must be low-order bits */ -#define AFTER_TRIGGER_DONE 0x80000000 -#define AFTER_TRIGGER_IN_PROGRESS 0x40000000 +#define AFTER_TRIGGER_SIZE UINT64CONST(0xFFFF000000000) /* high-order size bits */ +#define AFTER_TRIGGER_SIZE_SHIFT (36) +#define AFTER_TRIGGER_OFFSET UINT64CONST(0x000000FFFFFFF) /* must be low-order bits */ +#define AFTER_TRIGGER_DONE UINT64CONST(0x0000800000000) +#define AFTER_TRIGGER_IN_PROGRESS UINT64CONST(0x0000400000000) /* bits describing the size and tuple sources of this event */ -#define AFTER_TRIGGER_FDW_REUSE 0x00000000 -#define AFTER_TRIGGER_FDW_FETCH 0x20000000 -#define AFTER_TRIGGER_1CTID 0x10000000 -#define AFTER_TRIGGER_2CTID 0x30000000 -#define AFTER_TRIGGER_CP_UPDATE 0x08000000 -#define AFTER_TRIGGER_TUP_BITS 0x38000000 +#define AFTER_TRIGGER_FDW_REUSE UINT64CONST(0x0000000000000) +#define AFTER_TRIGGER_FDW_FETCH UINT64CONST(0x0000200000000) +#define AFTER_TRIGGER_1CTID UINT64CONST(0x0000100000000) +#define AFTER_TRIGGER_ROWID1 UINT64CONST(0x0000010000000) +#define AFTER_TRIGGER_2CTID UINT64CONST(0x0000300000000) +#define AFTER_TRIGGER_ROWID2 UINT64CONST(0x0000020000000) +#define AFTER_TRIGGER_CP_UPDATE UINT64CONST(0x0000080000000) +#define AFTER_TRIGGER_TUP_BITS UINT64CONST(0x0000380000000) typedef struct AfterTriggerSharedData *AfterTriggerShared; typedef struct AfterTriggerSharedData @@ -3658,6 +3664,9 @@ typedef struct AfterTriggerEventDataZeroCtids } AfterTriggerEventDataZeroCtids; #define SizeofTriggerEvent(evt) \ + (((evt)->ate_flags & AFTER_TRIGGER_SIZE) >> AFTER_TRIGGER_SIZE_SHIFT) + +#define BasicSizeofTriggerEvent(evt) \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_CP_UPDATE ? \ sizeof(AfterTriggerEventData) : \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID ? 
\ @@ -4010,14 +4019,34 @@ afterTriggerCopyBitmap(Bitmapset *src) */ static void afterTriggerAddEvent(AfterTriggerEventList *events, - AfterTriggerEvent event, AfterTriggerShared evtshared) + AfterTriggerEvent event, AfterTriggerShared evtshared, + bytea *rowid1, bytea *rowid2) { - Size eventsize = SizeofTriggerEvent(event); - Size needed = eventsize + sizeof(AfterTriggerSharedData); + Size basiceventsize = MAXALIGN(BasicSizeofTriggerEvent(event)); + Size eventsize; + Size needed; AfterTriggerEventChunk *chunk; AfterTriggerShared newshared; AfterTriggerEvent newevent; + if (SizeofTriggerEvent(event) == 0) + { + eventsize = basiceventsize; + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + eventsize += MAXALIGN(VARSIZE(rowid1)); + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + eventsize += MAXALIGN(VARSIZE(rowid2)); + + event->ate_flags |= eventsize << AFTER_TRIGGER_SIZE_SHIFT; + } + else + { + eventsize = SizeofTriggerEvent(event); + } + + needed = eventsize + sizeof(AfterTriggerSharedData); + /* * If empty list or not enough room in the tail chunk, make a new chunk. * We assume here that a new shared record will always be needed. @@ -4050,7 +4079,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, * sizes used should be MAXALIGN multiples, to ensure that the shared * records will be aligned safely. 
*/ -#define MIN_CHUNK_SIZE 1024 +#define MIN_CHUNK_SIZE (1024*4) #define MAX_CHUNK_SIZE (1024*1024) #if MAX_CHUNK_SIZE > (AFTER_TRIGGER_OFFSET+1) @@ -4069,6 +4098,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, chunksize *= 2; /* okay, double it */ else chunksize /= 2; /* too many shared records */ + chunksize = Max(chunksize, MIN_CHUNK_SIZE); chunksize = Min(chunksize, MAX_CHUNK_SIZE); } chunk = MemoryContextAlloc(afterTriggers.event_cxt, chunksize); @@ -4109,7 +4139,26 @@ afterTriggerAddEvent(AfterTriggerEventList *events, /* Insert the data */ newevent = (AfterTriggerEvent) chunk->freeptr; - memcpy(newevent, event, eventsize); + if (!rowid1 && !rowid2) + { + memcpy(newevent, event, eventsize); + } + else + { + Pointer ptr = chunk->freeptr; + + memcpy(newevent, event, basiceventsize); + ptr += basiceventsize; + + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + memcpy(ptr, rowid1, MAXALIGN(VARSIZE(rowid1))); + ptr += MAXALIGN(VARSIZE(rowid1)); + } + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + memcpy(ptr, rowid2, MAXALIGN(VARSIZE(rowid2))); + } /* ... and link the new event to its shared record */ newevent->ate_flags &= ~AFTER_TRIGGER_OFFSET; newevent->ate_flags |= (char *) newshared - (char *) newevent; @@ -4269,6 +4318,7 @@ AfterTriggerExecute(EState *estate, int tgindx; bool should_free_trig = false; bool should_free_new = false; + Pointer ptr; /* * Locate trigger in trigdesc. 
It might not be present, and in fact the @@ -4304,15 +4354,17 @@ AfterTriggerExecute(EState *estate, { Tuplestorestate *fdw_tuplestore = GetCurrentFDWTuplestore(); - if (!tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot1)) + if (!tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot1)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); if ((evtshared->ats_event & TRIGGER_EVENT_OPMASK) == TRIGGER_EVENT_UPDATE && - !tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot2)) + !tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot2)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); + trig_tuple_slot1->tts_tid = event->ate_ctid1; + trig_tuple_slot2->tts_tid = event->ate_ctid2; } /* fall through */ case AFTER_TRIGGER_FDW_REUSE: @@ -4344,13 +4396,26 @@ AfterTriggerExecute(EState *estate, break; default: - if (ItemPointerIsValid(&(event->ate_ctid1))) + ptr = (Pointer) event + MAXALIGN(BasicSizeofTriggerEvent(event)); + if (ItemPointerIsValid(&(event->ate_ctid1)) || + (event->ate_flags & AFTER_TRIGGER_ROWID1)) { + Datum tupleid; + TupleTableSlot *src_slot = ExecGetTriggerOldSlot(estate, src_relInfo); - if (!table_tuple_fetch_row_version(src_rel, - &(event->ate_ctid1), + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + tupleid = PointerGetDatum(ptr); + ptr += MAXALIGN(VARSIZE(ptr)); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid1)); + } + + if (!table_tuple_fetch_row_version(src_rel, tupleid, SnapshotAny, src_slot)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); @@ -4386,13 +4451,23 @@ AfterTriggerExecute(EState *estate, /* don't touch ctid2 if not there */ if (((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID || (event->ate_flags & AFTER_TRIGGER_CP_UPDATE)) && - ItemPointerIsValid(&(event->ate_ctid2))) + (ItemPointerIsValid(&(event->ate_ctid2)) || + (event->ate_flags & AFTER_TRIGGER_ROWID2))) { + Datum tupleid; + TupleTableSlot 
*dst_slot = ExecGetTriggerNewSlot(estate, dst_relInfo); - if (!table_tuple_fetch_row_version(dst_rel, - &(event->ate_ctid2), + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + { + tupleid = PointerGetDatum(ptr); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid2)); + } + if (!table_tuple_fetch_row_version(dst_rel, tupleid, SnapshotAny, dst_slot)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); @@ -4566,7 +4641,7 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, { deferred_found = true; /* add it to move_list */ - afterTriggerAddEvent(move_list, event, evtshared); + afterTriggerAddEvent(move_list, event, evtshared, NULL, NULL); /* mark original copy "done" so we don't do it again */ event->ate_flags |= AFTER_TRIGGER_DONE; } @@ -4670,6 +4745,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, /* caution: trigdesc could be NULL here */ finfo = rInfo->ri_TrigFunctions; instr = rInfo->ri_TrigInstrument; + if (slot1 != NULL) { ExecDropSingleTupleTableSlot(slot1); @@ -6059,6 +6135,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, int tgtype_level; int i; Tuplestorestate *fdw_tuplestore = NULL; + bytea *rowId1 = NULL; + bytea *rowId2 = NULL; /* * Check state. We use a normal test not Assert because it is possible to @@ -6152,6 +6230,21 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, * if so. This preserves the behavior that statement-level triggers fire * just once per statement and fire after row-level triggers. 
*/ + + /* Determine flags */ + if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) + { + if (row_trigger && event == TRIGGER_EVENT_UPDATE) + { + if (relkind == RELKIND_PARTITIONED_TABLE) + new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; + else + new_event.ate_flags = AFTER_TRIGGER_2CTID; + } + else + new_event.ate_flags = AFTER_TRIGGER_1CTID; + } + switch (event) { case TRIGGER_EVENT_INSERT: @@ -6162,6 +6255,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot != NULL); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(newslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6181,6 +6281,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot == NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(oldslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6196,10 +6303,54 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tgtype_event = TRIGGER_TYPE_UPDATE; if (row_trigger) { + bool src_rowid = false, + dst_rowid = false; Assert(oldslot != NULL); Assert(newslot != NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid2)); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + Relation src_rel = src_partinfo->ri_RelationDesc; + Relation dst_rel = dst_partinfo->ri_RelationDesc; + + src_rowid = table_get_row_ref_type(src_rel) == + ROW_REF_ROWID; + dst_rowid = table_get_row_ref_type(dst_rel) == + ROW_REF_ROWID; + } + else + { + if 
(table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + src_rowid = true; + dst_rowid = true; + } + } + + if (src_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(oldslot, + RowIdAttributeNumber, + &isnull); + rowId1 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + } + + if (dst_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(newslot, + RowIdAttributeNumber, + &isnull); + rowId2 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID2; + } /* * Also remember the OIDs of partitions to fetch these tuples @@ -6237,20 +6388,6 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; } - /* Determine flags */ - if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) - { - if (row_trigger && event == TRIGGER_EVENT_UPDATE) - { - if (relkind == RELKIND_PARTITIONED_TABLE) - new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; - else - new_event.ate_flags = AFTER_TRIGGER_2CTID; - } - else - new_event.ate_flags = AFTER_TRIGGER_1CTID; - } - /* else, we'll initialize ate_flags for each trigger */ tgtype_level = (row_trigger ? 
TRIGGER_TYPE_ROW : TRIGGER_TYPE_STATEMENT); @@ -6416,7 +6553,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, new_shared.ats_modifiedcols = afterTriggerCopyBitmap(modifiedCols); afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth].events, - &new_event, &new_shared); + &new_event, &new_shared, rowId1, rowId2); } /* diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 6b7997465d0..5d1a31566e7 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -4388,7 +4388,9 @@ ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, op->resnull); *op->resvalue = d; /* this ought to be unreachable, but it's cheap enough to check */ - if (unlikely(*op->resnull)) + if (op->d.var.attnum != RowIdAttributeNumber && + op->d.var.attnum != SelfItemPointerAttributeNumber && + unlikely(*op->resnull)) elog(ERROR, "failed to fetch attribute from slot"); } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4c5a7bbf620..334458574ca 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -869,13 +869,15 @@ InitPlan(QueryDesc *queryDesc, int eflags) Oid relid; Relation relation; ExecRowMark *erm; + RangeTblEntry *rangeEntry; /* ignore "parent" rowmarks; they are irrelevant at runtime */ if (rc->isParent) continue; /* get relation's OID (will produce InvalidOid if subquery) */ - relid = exec_rt_fetch(rc->rti, estate)->relid; + rangeEntry = exec_rt_fetch(rc->rti, estate); + relid = rangeEntry->relid; /* open relation, if we need to access it for this mark type */ switch (rc->markType) @@ -908,6 +910,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) erm->prti = rc->prti; erm->rowmarkId = rc->rowmarkId; erm->markType = rc->markType; + if (erm->markType == ROW_MARK_COPY) + erm->refType = ROW_REF_COPY; + else + erm->refType = rangeEntry->reftype; erm->strength = rc->strength; erm->waitPolicy = 
rc->waitPolicy; erm->ermActive = false; @@ -1295,6 +1301,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ChildToRootMap = NULL; resultRelInfo->ri_ChildToRootMapValid = false; resultRelInfo->ri_CopyMultiInsertBuffer = NULL; + + resultRelInfo->ri_RowRefType = table_get_row_ref_type(resultRelationDesc); } /* @@ -2429,17 +2437,28 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) aerm->rowmark = erm; /* Look up the resjunk columns associated with this rowmark */ - if (erm->markType != ROW_MARK_COPY) + if (erm->refType == ROW_REF_TID) { + Assert(erm->markType != ROW_MARK_COPY); /* need ctid for all methods other than COPY */ snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId); aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, resname); if (!AttributeNumberIsValid(aerm->ctidAttNo)) elog(ERROR, "could not find junk %s column", resname); + } else if (erm->refType == ROW_REF_ROWID) + { + Assert(erm->markType != ROW_MARK_COPY); + /* need ctid for all methods other than COPY */ + snprintf(resname, sizeof(resname), "rowid%u", erm->rowmarkId); + aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->ctidAttNo)) + elog(ERROR, "could not find junk %s column", resname); } else { + Assert(erm->markType == ROW_MARK_COPY); /* need wholerow if COPY */ snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId); aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist, @@ -2727,8 +2746,9 @@ EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot) { /* ordinary table, fetch the tuple */ if (!table_tuple_fetch_row_version(erm->relation, - (ItemPointer) DatumGetPointer(datum), - SnapshotAny, slot)) + datum, + SnapshotAny, + slot)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); return true; } diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 1ed369e5d7b..6e2388005fb 100644 --- 
a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -209,7 +209,8 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + GetLatestSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -393,7 +394,8 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + GetLatestSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -528,7 +530,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + PointerGetDatum(tid), NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -595,7 +597,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + PointerGetDatum(tid), NULL, NULL, NULL, NULL); } if (!skip_tuple) diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index e459971d32e..049c9841309 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -27,6 +27,7 @@ #include "executor/nodeLockRows.h" #include "foreign/fdwapi.h" #include "miscadmin.h" +#include "utils/datum.h" #include "utils/rel.h" @@ -157,7 +158,16 @@ ExecLockRows(PlanState *pstate) } /* okay, try to lock (and fetch) the tuple */ - tid = *((ItemPointer) DatumGetPointer(datum)); + if (erm->refType == ROW_REF_TID) + { + tid = *((ItemPointer) DatumGetPointer(datum)); + datum = PointerGetDatum(&tid); + } + else + { 
+ Assert(erm->refType == ROW_REF_ROWID); + datum = datumCopy(datum, false, -1); + } switch (erm->markType) { case ROW_MARK_EXCLUSIVE: @@ -182,12 +192,15 @@ ExecLockRows(PlanState *pstate) if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot, + test = table_tuple_lock(erm->relation, datum, estate->es_snapshot, markSlot, estate->es_output_cid, lockmode, erm->waitPolicy, lockflags, &tmfd); + if (erm->refType == ROW_REF_ROWID) + pfree(DatumGetPointer(datum)); + switch (test) { case TM_WouldBlock: diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index a45ae763139..ef62f283b6f 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -140,12 +140,11 @@ static void ExecPendingInserts(EState *estate); static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning); @@ -158,12 +157,12 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, static TupleTableSlot *ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, bool canSetTag); static void ExecInitMerge(ModifyTableState *mtstate, EState *estate); static bool ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, bool canSetTag); static void ExecMergeNotMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, @@ -278,66 +277,6 @@ ExecProcessReturning(ResultRelInfo *resultRelInfo, return ExecProject(projectReturning); } -/- * ExecCheckTupleVisible 
-- verify tuple is visible - * - * It would not be consistent with guarantees of the higher isolation levels to - * proceed with avoiding insertion (taking speculative insertion's alternative - * path) on the basis of another tuple that is not visible to MVCC snapshot. - * Check for the need to raise a serialization failure, and do so as necessary. - */ -static void -ExecCheckTupleVisible(EState *estate, - Relation rel, - TupleTableSlot *slot) -{ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) - { - Datum xminDatum; - TransactionId xmin; - bool isnull; - - xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - /* - * We should not raise a serialization failure if the conflict is - * against a tuple inserted by our own transaction, even if it's not - * visible to our snapshot. (This would happen, for example, if - * conflicting keys are proposed for insertion in a single command.) 
- */ - if (!TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - } -} - -/* - * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() - */ -static void -ExecCheckTIDVisible(EState *estate, - ResultRelInfo *relinfo, - ItemPointer tid, - TupleTableSlot *tempSlot) -{ - Relation rel = relinfo->ri_RelationDesc; - - /* Redundantly check isolation level */ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot)) - elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); - ExecCheckTupleVisible(estate, rel, tempSlot); - ExecClearTuple(tempSlot); -} - /* * Initialize to compute stored generated columns for a tuple * @@ -1023,12 +962,19 @@ ExecInsert(ModifyTableContext *context, if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) { /* Perform a speculative insertion. */ - uint32 specToken; - ItemPointerData conflictTid; - bool specConflict; List *arbiterIndexes; + TupleTableSlot *existing = NULL, + *returningSlot, + *inserted; + LockTupleMode lockmode = LockTupleExclusive; arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes; + returningSlot = ExecGetReturningSlot(estate, resultRelInfo); + if (onconflict == ONCONFLICT_UPDATE) + { + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + existing = resultRelInfo->ri_onConflict->oc_Existing; + } /* * Do a non-conclusive check for conflicts first. 
@@ -1045,23 +991,29 @@ ExecInsert(ModifyTableContext *context, */ vlock: CHECK_FOR_INTERRUPTS(); - specConflict = false; - if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, - &conflictTid, arbiterIndexes)) + + inserted = table_tuple_insert_with_arbiter(resultRelInfo, + slot, estate->es_output_cid, + 0, NULL, arbiterIndexes, estate, + lockmode, existing, returningSlot); + if (!inserted) { - /* committed conflict tuple found */ if (onconflict == ONCONFLICT_UPDATE) { + TupleTableSlot *returning = NULL; + + if (TTS_EMPTY(existing)) + goto vlock; + /* * In case of ON CONFLICT DO UPDATE, execute the UPDATE * part. Be prepared to retry if the UPDATE fails because * of another concurrent UPDATE/DELETE to the conflict * tuple. */ - TupleTableSlot *returning = NULL; if (ExecOnConflictUpdate(context, resultRelInfo, - &conflictTid, slot, canSetTag, + slot, canSetTag, &returning)) { InstrCountTuples2(&mtstate->ps, 1); @@ -1084,57 +1036,13 @@ ExecInsert(ModifyTableContext *context, * ExecGetReturningSlot() in the DO NOTHING case... */ Assert(onconflict == ONCONFLICT_NOTHING); - ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid, - ExecGetReturningSlot(estate, resultRelInfo)); InstrCountTuples2(&mtstate->ps, 1); return NULL; } } - - /* - * Before we start insertion proper, acquire our "speculative - * insertion lock". Others can use that to wait for us to decide - * if we're going to go ahead with the insertion, instead of - * waiting for the whole transaction to complete. 
- */ - specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); - - /* insert the tuple, with the speculative token */ - table_tuple_insert_speculative(resultRelationDesc, slot, - estate->es_output_cid, - 0, - NULL, - specToken); - - /* insert index entries for tuple */ - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, false, true, - &specConflict, - arbiterIndexes, - false); - - /* adjust the tuple's state accordingly */ - table_tuple_complete_speculative(resultRelationDesc, slot, - specToken, !specConflict); - - /* - * Wake up anyone waiting for our decision. They will re-check - * the tuple, see that it's no longer speculative, and wait on our - * XID as if this was a regularly inserted tuple all along. Or if - * we killed the tuple, they will see it's dead, and proceed as if - * the tuple never existed. - */ - SpeculativeInsertionLockRelease(GetCurrentTransactionId()); - - /* - * If there was a conflict, start from the beginning. We'll do - * the pre-check again, which will now find the conflicting tuple - * (unless it aborts before we get there). 
- */ - if (specConflict) + else { - list_free(recheckIndexes); - goto vlock; + slot = inserted; } /* Since there was no insertion conflict, we're done */ @@ -1142,9 +1050,9 @@ ExecInsert(ModifyTableContext *context, else { /* insert the tuple normally */ - table_tuple_insert(resultRelationDesc, slot, - estate->es_output_cid, - 0, NULL); + slot = table_tuple_insert(resultRelationDesc, slot, + estate->es_output_cid, + 0, NULL); /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) @@ -1320,7 +1228,7 @@ ExecPendingInserts(EState *estate) */ static bool ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot **epqreturnslot, TM_Result *result) { if (result) @@ -1351,7 +1259,7 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart, int options, + Datum tupleid, bool changingPart, int options, TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1375,7 +1283,7 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + HeapTuple oldtuple, TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; @@ -1455,7 +1363,7 @@ ExecInitDeleteTupleSlot(ModifyTableState *mtstate, static TupleTableSlot * ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *oldSlot, bool processReturning, @@ -1648,7 +1556,7 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, + ExecDeleteEpilogue(context, resultRelInfo, oldtuple, 
oldSlot, changingPart); /* Process RETURNING if present and if requested */ @@ -1665,7 +1573,7 @@ ExecDelete(ModifyTableContext *context, /* FDW must have provided a slot containing the deleted row */ Assert(!TupIsNull(slot)); } - else + else if (!slot || TupIsNull(slot)) { /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); @@ -1714,7 +1622,7 @@ ExecDelete(ModifyTableContext *context, static bool ExecCrossPartitionUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, UpdateContext *updateCxt, @@ -1870,7 +1778,7 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, */ static bool ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TM_Result *result) { Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -1947,7 +1855,7 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, */ static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, int options, TupleTableSlot *oldSlot, UpdateContext *updateCxt) { @@ -2104,7 +2012,7 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, - ResultRelInfo *resultRelInfo, ItemPointer tupleid, + ResultRelInfo *resultRelInfo, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *oldSlot) { @@ -2154,7 +2062,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldslot, TupleTableSlot 
*newslot) { @@ -2245,7 +2153,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, */ static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; @@ -2299,10 +2207,14 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } else { - int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + int options = TABLE_MODIFY_WAIT; - if (!locked && !IsolationUsesXactSnapshot()) - options |= TABLE_MODIFY_LOCK_UPDATED; + if (!locked) + { + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + } /* * If we generate a new candidate tuple after EvalPlanQual testing, we @@ -2410,7 +2322,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (canSetTag) (estate->es_processed)++; - ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, + ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, oldtuple, slot, oldSlot); /* Process RETURNING if present */ @@ -2434,144 +2346,26 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning) { ModifyTableState *mtstate = context->mtstate; ExprContext *econtext = mtstate->ps.ps_ExprContext; - Relation relation = resultRelInfo->ri_RelationDesc; ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; - TM_FailureData tmfd; - LockTupleMode lockmode; - TM_Result test; - Datum xminDatum; - TransactionId xmin; - bool isnull; + Datum tupleid; - /* Determine lock mode to 
use */ - lockmode = ExecUpdateLockMode(context->estate, resultRelInfo); - - /* - * Lock tuple for update. Don't follow updates when tuple cannot be - * locked without doing so. A row locking conflict here means our - * previous conclusion that the tuple is conclusively committed is not - * true anymore. - */ - test = table_tuple_lock(relation, conflictTid, - context->estate->es_snapshot, - existing, context->estate->es_output_cid, - lockmode, LockWaitBlock, 0, - &tmfd); - switch (test) + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { - case TM_Ok: - /* success! */ - break; - - case TM_Invisible: - - /* - * This can occur when a just inserted tuple is updated again in - * the same command. E.g. because multiple rows with the same - * conflicting key values are inserted. - * - * This is somewhat similar to the ExecUpdate() TM_SelfModified - * case. We do not want to proceed because it would lead to the - * same row being updated a second time in some unspecified order, - * and in contrast to plain UPDATEs there's no historical behavior - * to break. - * - * It is the user's responsibility to prevent this situation from - * occurring. These problems are why the SQL standard similarly - * specifies that for SQL MERGE, an exception must be raised in - * the event of an attempt to update the same row twice. 
- */ - xminDatum = slot_getsysattr(existing, - MinTransactionIdAttributeNumber, - &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - if (TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - /* translator: %s is a SQL command name */ - errmsg("%s command cannot affect row a second time", - "ON CONFLICT DO UPDATE"), - errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); - - /* This shouldn't happen */ - elog(ERROR, "attempted to lock invisible tuple"); - break; - - case TM_SelfModified: - - /* - * This state should never be reached. As a dirty snapshot is used - * to find conflicting tuples, speculative insertion wouldn't have - * seen this row to conflict with. - */ - elog(ERROR, "unexpected self-updated tuple"); - break; - - case TM_Updated: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - - /* - * As long as we don't support an UPDATE of INSERT ON CONFLICT for - * a partitioned table we shouldn't reach to a case where tuple to - * be lock is moved to another partition due to concurrent update - * of the partition key. - */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - - /* - * Tell caller to try again from the very start. - * - * It does not make sense to use the usual EvalPlanQual() style - * loop here, as the new version of the row might not conflict - * anymore, or the conflicting tuple has actually been deleted. 
- */ - ExecClearTuple(existing); - return false; - - case TM_Deleted: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent delete"))); - - /* see TM_Updated case */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - ExecClearTuple(existing); - return false; - - default: - elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + bool isnull; + tupleid = slot_getsysattr(existing, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&existing->tts_tid); } - - /* Success, the tuple is locked. */ - - /* - * Verify that the tuple is visible to our MVCC snapshot if the current - * isolation level mandates that. - * - * It's not sufficient to rely on the check within ExecUpdate() as e.g. - * CONFLICT ... WHERE clause may prevent us from reaching that. - * - * This means we only ever continue when a new command in the current - * transaction could see the row, even though in READ COMMITTED mode the - * tuple will not be visible according to the current statement's - * snapshot. This is in line with the way UPDATE deals with newer tuple - * versions. 
- */ - ExecCheckTupleVisible(context->estate, relation, existing); /* * Make tuple and any needed join variables available to ExecQual and @@ -2627,7 +2421,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, /* Execute UPDATE with projection */ *returning = ExecUpdate(context, resultRelInfo, - conflictTid, NULL, + tupleid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, existing, canSetTag, true); @@ -2646,7 +2440,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, */ static TupleTableSlot * ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool canSetTag) + Datum tupleid, bool canSetTag) { bool matched; @@ -2693,7 +2487,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * from ExecMergeNotMatched to ExecMergeMatched, there is no risk of a * livelock. */ - matched = tupleid != NULL; + matched = DatumGetPointer(tupleid) != NULL; if (matched) matched = ExecMergeMatched(context, resultRelInfo, tupleid, canSetTag); @@ -2732,7 +2526,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static bool ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool canSetTag) + Datum tupleid, bool canSetTag) { ModifyTableState *mtstate = context->mtstate; TupleTableSlot *newslot; @@ -2853,7 +2647,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok && updateCxt.updated) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot, + NULL, newslot, resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } @@ -2872,7 +2666,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, TABLE_MODIFY_WAIT, NULL); if (result == TM_Ok) { - ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, + ExecDeleteEpilogue(context, resultRelInfo, NULL, resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } @@ -2992,7 +2786,11 @@ 
ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (TupIsNull(epqslot)) return false; - (void) ExecGetJunkAttribute(epqslot, + /* + * Update tupleid to that of the new tuple, for + * the refetch we do at the top. + */ + tupleid = ExecGetJunkAttribute(epqslot, resultRelInfo->ri_RowIdAttNo, &isNull); if (isNull) @@ -3019,10 +2817,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * that the first qualifying WHEN MATCHED action * is executed. * - * Update tupleid to that of the new tuple, for - * the refetch we do at the top. */ - ItemPointerCopy(&context->tmfd.ctid, tupleid); goto lmerge_matched; case TM_Deleted: @@ -3529,10 +3324,10 @@ ExecModifyTable(PlanState *pstate) PlanState *subplanstate; TupleTableSlot *slot; TupleTableSlot *oldSlot; + Datum tupleid; ItemPointerData tuple_ctid; HeapTupleData oldtupdata; HeapTuple oldtuple; - ItemPointer tupleid; CHECK_FOR_INTERRUPTS(); @@ -3581,6 +3376,8 @@ ExecModifyTable(PlanState *pstate) */ for (;;) { + RowRefType refType; + /* * Reset the per-output-tuple exprcontext. This is needed because * triggers expect to use that context as workspace. 
It's a bit ugly @@ -3630,7 +3427,7 @@ ExecModifyTable(PlanState *pstate) { EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); - ExecMerge(&context, node->resultRelInfo, NULL, node->canSetTag); + ExecMerge(&context, node->resultRelInfo, PointerGetDatum(NULL), node->canSetTag); continue; /* no RETURNING support yet */ } @@ -3666,7 +3463,8 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = context.planSlot; - tupleid = NULL; + refType = resultRelInfo->ri_RowRefType; + tupleid = PointerGetDatum(NULL); oldtuple = NULL; /* @@ -3708,16 +3506,32 @@ ExecModifyTable(PlanState *pstate) { EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); - ExecMerge(&context, node->resultRelInfo, NULL, node->canSetTag); + ExecMerge(&context, node->resultRelInfo, + PointerGetDatum(NULL), node->canSetTag); continue; /* no RETURNING support yet */ } elog(ERROR, "ctid is NULL"); } - tupleid = (ItemPointer) DatumGetPointer(datum); - tuple_ctid = *tupleid; /* be sure we don't free ctid!! */ - tupleid = &tuple_ctid; + if (refType == ROW_REF_TID) + { + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "ctid is NULL"); + + tuple_ctid = *((ItemPointer) DatumGetPointer(datum)); /* be sure we don't free ctid!! */ + tupleid = PointerGetDatum(&tuple_ctid); + } + else + { + Assert(refType == ROW_REF_ROWID); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "rowid is NULL"); + + tupleid = datumCopy(datum, false, -1); + } } /* @@ -3794,6 +3608,7 @@ ExecModifyTable(PlanState *pstate) /* Fetch the most recent version of old tuple. 
*/ Relation relation = resultRelInfo->ri_RelationDesc; + Assert(DatumGetPointer(tupleid) != NULL); if (!table_tuple_fetch_row_version(relation, tupleid, SnapshotAny, oldSlot)) @@ -3828,6 +3643,9 @@ ExecModifyTable(PlanState *pstate) break; } + if (refType == ROW_REF_ROWID && DatumGetPointer(tupleid) != NULL) + pfree(DatumGetPointer(tupleid)); + /* * If we got a RETURNING result, return it to caller. We'll continue * the work on next call. @@ -4067,10 +3885,20 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { - resultRelInfo->ri_RowIdAttNo = - ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); - if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) - elog(ERROR, "could not find junk ctid column"); + if (resultRelInfo->ri_RowRefType == ROW_REF_TID) + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk ctid column"); + } + else + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "rowid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk rowid column"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { @@ -4382,6 +4210,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_auxmodifytables = lcons(mtstate, estate->es_auxmodifytables); + + return mtstate; } diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 862bd0330bc..8180a2991c3 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -378,7 +378,7 @@ TidNext(TidScanState *node) if (node->tss_isCurrentOf) table_tuple_get_latest_tid(scan, &tid); - if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot)) + if (table_tuple_fetch_row_version(heapRelation, PointerGetDatum(&tid), 
snapshot, slot)) return slot; /* Bad TID or failed snapshot qual; try next */ diff --git a/src/backend/nodes/read.c b/src/backend/nodes/read.c index 5d76f56e4e8..07df92d813c 100644 --- a/src/backend/nodes/read.c +++ b/src/backend/nodes/read.c @@ -205,6 +205,17 @@ pg_strtok(int *length) return ret_str; } +bool +pg_str_hasfield(void) +{ + const char *local_str = pg_strtok_ptr; + + while (*local_str == ' ' || *local_str == '\n' || *local_str == '\t') + local_str++; + + return (*local_str == ':'); +} + /* * debackslash - * create a palloc'd string holding the given token. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 2ffef1bad78..7198fd4777c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2263,6 +2263,7 @@ preprocess_rowmarks(PlannerInfo *root) RowMarkClause *rc = lfirst_node(RowMarkClause, l); RangeTblEntry *rte = rt_fetch(rc->rti, parse->rtable); PlanRowMark *newrc; + RowRefType refType; /* * Currently, it is syntactically impossible to have FOR UPDATE et al @@ -2285,8 +2286,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, rc->strength); - newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, rc->strength, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = rc->strength; newrc->waitPolicy = rc->waitPolicy; newrc->isParent = false; @@ -2302,6 +2303,7 @@ preprocess_rowmarks(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, l); PlanRowMark *newrc; + RowRefType refType = ROW_REF_TID; i++; if (!bms_is_member(i, rels)) @@ -2310,8 +2312,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = i; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, LCS_NONE); - 
newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, LCS_NONE, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = LCS_NONE; newrc->waitPolicy = LockWaitBlock; /* doesn't matter */ newrc->isParent = false; @@ -2326,11 +2328,13 @@ preprocess_rowmarks(PlannerInfo *root) * Select RowMarkType to use for a given table */ RowMarkType -select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) +select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength, + RowRefType *refType) { if (rte->rtekind != RTE_RELATION) { /* If it's not a table at all, use ROW_MARK_COPY */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else if (rte->relkind == RELKIND_FOREIGN_TABLE) @@ -2341,10 +2345,12 @@ select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) if (fdwroutine->GetForeignRowMarkType != NULL) return fdwroutine->GetForeignRowMarkType(rte, strength); /* Otherwise, use ROW_MARK_COPY by default */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else { + *refType = rte->reftype; /* Regular table, apply the appropriate lock type */ switch (strength) { diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 9d46488ef7c..0d849332904 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -210,7 +210,7 @@ preprocess_targetlist(PlannerInfo *root) if (rc->rti != rc->prti) continue; - if (rc->allMarkTypes & ~(1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_TID)) { /* Need to fetch TID */ var = makeVar(rc->rti, @@ -226,7 +226,23 @@ preprocess_targetlist(PlannerInfo *root) true); tlist = lappend(tlist, tle); } - if (rc->allMarkTypes & (1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_ROWID)) + { + /* Need to fetch RowID */ + var = makeVar(rc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", rc->rowmarkId); + tle = makeTargetEntry((Expr 
*) var, + list_length(tlist) + 1, + pstrdup(resname), + true); + tlist = lappend(tlist, tle); + } + if (rc->allRefTypes & (1 << ROW_REF_COPY)) { /* Need the whole row as a junk var */ var = makeWholeRowVar(rt_fetch(rc->rti, range_table), diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index f456b3b0a44..43af763f1fe 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ b/src/backend/optimizer/util/appendinfo.c @@ -896,17 +896,35 @@ add_row_identity_columns(PlannerInfo *root, Index rtindex, relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { + RowRefType refType = ROW_REF_TID; + + refType = table_get_row_ref_type(target_relation); + /* * Emit CTID so that executor can find the row to merge, update or * delete. */ - var = makeVar(rtindex, - SelfItemPointerAttributeNumber, - TIDOID, - -1, - InvalidOid, - 0); - add_row_identity_var(root, var, rtindex, "ctid"); + if (refType == ROW_REF_TID) + { + var = makeVar(rtindex, + SelfItemPointerAttributeNumber, + TIDOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "ctid"); + } + else + { + Assert(refType == ROW_REF_ROWID); + var = makeVar(rtindex, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "rowid"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index f9d3ff1e7ac..e16e855cf64 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -16,6 +16,7 @@ #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/partition.h" #include "catalog/pg_inherits.h" #include "catalog/pg_type.h" @@ -91,7 +92,7 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, LOCKMODE lockmode; PlanRowMark *oldrc; bool old_isParent = false; - int old_allMarkTypes = 0; + int old_allRefTypes = 0; Assert(rte->inh); /* else caller 
error */ @@ -131,8 +132,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, { old_isParent = oldrc->isParent; oldrc->isParent = true; - /* Save initial value of allMarkTypes before children add to it */ - old_allMarkTypes = oldrc->allMarkTypes; + /* Save initial value of allRefTypes before children add to it */ + old_allRefTypes = oldrc->allRefTypes; } /* Scan the inheritance set and expand it */ @@ -239,15 +240,15 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, */ if (oldrc) { - int new_allMarkTypes = oldrc->allMarkTypes; + int new_allRefTypes = oldrc->allRefTypes; Var *var; TargetEntry *tle; char resname[32]; List *newvars = NIL; /* Add TID junk Var if needed, unless we had it already */ - if (new_allMarkTypes & ~(1 << ROW_MARK_COPY) && - !(old_allMarkTypes & ~(1 << ROW_MARK_COPY))) + if (new_allRefTypes & (1 << ROW_REF_TID) && + !(old_allRefTypes & (1 << ROW_REF_TID))) { /* Need to fetch TID */ var = makeVar(oldrc->rti, @@ -266,8 +267,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, } /* Add whole-row junk Var if needed, unless we had it already */ - if ((new_allMarkTypes & (1 << ROW_MARK_COPY)) && - !(old_allMarkTypes & (1 << ROW_MARK_COPY))) + if ((new_allRefTypes & (1 << ROW_REF_COPY)) && + !(old_allRefTypes & (1 << ROW_REF_COPY))) { var = makeWholeRowVar(planner_rt_fetch(oldrc->rti, root), oldrc->rti, @@ -282,6 +283,24 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, newvars = lappend(newvars, var); } + if ((new_allRefTypes & (1 << ROW_REF_ROWID)) && + !(old_allRefTypes & (1 << ROW_REF_ROWID))) + { + var = makeVar(oldrc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", oldrc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(root->processed_tlist) + 1, + pstrdup(resname), + true); + root->processed_tlist = lappend(root->processed_tlist, tle); + newvars = lappend(newvars, var); + } + /* Add tableoid junk Var, unless we 
had it already */ if (!old_isParent) { @@ -441,7 +460,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, * where the hierarchy is flattened during RTE expansion.) * * PlanRowMarks still carry the top-parent's RTI, and the top-parent's - * allMarkTypes field still accumulates values from all descendents. + * allRefTypes field still accumulates values from all descendents. * * "parentrte" and "parentRTindex" are immediate parent's RTE and * RTI. "top_parentrc" is top parent's PlanRowMark. @@ -485,6 +504,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->rtekind == RTE_RELATION); /* else this is dubious */ childrte->relid = childOID; childrte->relkind = childrel->rd_rel->relkind; + childrte->reftype = table_get_row_ref_type(childrel); /* A partitioned child will need to be expanded further. */ if (childrte->relkind == RELKIND_PARTITIONED_TABLE) { @@ -574,14 +594,16 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, if (top_parentrc) { PlanRowMark *childrc = makeNode(PlanRowMark); + RowRefType refType; childrc->rti = childRTindex; childrc->prti = top_parentrc->rti; childrc->rowmarkId = top_parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ childrc->markType = select_rowmark_type(childrte, - top_parentrc->strength); - childrc->allMarkTypes = (1 << childrc->markType); + top_parentrc->strength, + &refType); + childrc->allRefTypes = (1 << refType); childrc->strength = top_parentrc->strength; childrc->waitPolicy = top_parentrc->waitPolicy; @@ -592,8 +614,8 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, */ childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in top parent's allMarkTypes */ - top_parentrc->allMarkTypes |= childrc->allMarkTypes; + /* Include child's rowmark type in top parent's allRefTypes */ + top_parentrc->allRefTypes |= 
childrc->allRefTypes; root->rowMarks = lappend(root->rowMarks, childrc); } diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 58bc222a8b9..23ef258340a 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -20,6 +20,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/heap.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" @@ -1502,6 +1503,7 @@ addRangeTableEntry(ParseState *pstate, rte->relid = RelationGetRelid(rel); rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1587,6 +1589,7 @@ addRangeTableEntryForRelation(ParseState *pstate, rte->relid = RelationGetRelid(rel); rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1656,6 +1659,7 @@ addRangeTableEntryForSubquery(ParseState *pstate, rte->rtekind = RTE_SUBQUERY; rte->subquery = subquery; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias("unnamed_subquery", NIL); numaliases = list_length(eref->colnames); @@ -1764,6 +1768,7 @@ addRangeTableEntryForFunction(ParseState *pstate, rte->functions = NIL; /* we'll fill this list below */ rte->funcordinality = rangefunc->ordinality; rte->alias = alias; + rte->reftype = ROW_REF_COPY; /* * Choose the RTE alias name. We default to using the first function's @@ -2083,6 +2088,7 @@ addRangeTableEntryForTableFunc(ParseState *pstate, rte->coltypmods = tf->coltypmods; rte->colcollations = tf->colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? 
copyObject(alias) : makeAlias(refname, NIL); numaliases = list_length(eref->colnames); @@ -2159,6 +2165,7 @@ addRangeTableEntryForValues(ParseState *pstate, rte->coltypmods = coltypmods; rte->colcollations = colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias(refname, NIL); @@ -2256,6 +2263,7 @@ addRangeTableEntryForJoin(ParseState *pstate, rte->joinrightcols = rightcols; rte->join_using_alias = join_using_alias; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias("unnamed_join", NIL); numaliases = list_length(eref->colnames); @@ -2337,6 +2345,7 @@ addRangeTableEntryForCTE(ParseState *pstate, rte->rtekind = RTE_CTE; rte->ctename = cte->ctename; rte->ctelevelsup = levelsup; + rte->reftype = ROW_REF_COPY; /* Self-reference if and only if CTE's parse analysis isn't completed */ rte->self_reference = !IsA(cte->ctequery, Query); @@ -2499,6 +2508,7 @@ addRangeTableEntryForENR(ParseState *pstate, * if they access transition tables linked to a table that is altered. */ rte->relid = enrmd->reliddesc; + rte->reftype = ROW_REF_COPY; /* * Build the list of effective column names using user-supplied aliases @@ -3268,6 +3278,9 @@ get_rte_attribute_name(RangeTblEntry *rte, AttrNumber attnum) attnum > 0 && attnum <= list_length(rte->alias->colnames)) return strVal(list_nth(rte->alias->colnames, attnum - 1)); + if (attnum == RowIdAttributeNumber) + return "rowid"; + /* * If the RTE is a relation, go to the system catalogs not the * eref->colnames list. 
This is a little slower but it will give the diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 7dd9345c617..6b1fa5315d8 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2834,7 +2834,9 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); - relopts = extractRelOptions(tup, pg_class_desc, NULL); + relopts = extractRelOptions(tup, pg_class_desc, + GetTableAmRoutineByAmOid(((Form_pg_class) GETSTRUCT(tup))->relam), + NULL); if (relopts == NULL) return NULL; diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 9cd96fd17ef..f2307c43612 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -23,6 +23,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/dependency.h" #include "catalog/pg_type.h" #include "commands/trigger.h" diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 6945d99b3d5..9abe334b563 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -247,6 +247,7 @@ RI_FKey_check(TriggerData *trigdata) TupleTableSlot *newslot; RI_QueryKey qkey; SPIPlanPtr qplan; + Relation rel = trigdata->tg_relation; riinfo = ri_FetchConstraintInfo(trigdata->tg_trigger, trigdata->tg_relation, false); @@ -264,7 +265,7 @@ RI_FKey_check(TriggerData *trigdata) * and lock on the buffer to call HeapTupleSatisfiesVisibility. Caller * should be holding pin, but not lock. 
*/ - if (!table_tuple_satisfies_snapshot(trigdata->tg_relation, newslot, SnapshotSelf)) + if (!table_tuple_satisfies_snapshot(rel, newslot, SnapshotSelf)) return PointerGetDatum(NULL); /* @@ -1263,9 +1264,6 @@ RI_FKey_fk_upd_check_required(Trigger *trigger, Relation fk_rel, { const RI_ConstraintInfo *riinfo; int ri_nullcheck; - Datum xminDatum; - TransactionId xmin; - bool isnull; /* * AfterTriggerSaveEvent() handles things such that this function is never @@ -1333,10 +1331,7 @@ RI_FKey_fk_upd_check_required(Trigger *trigger, Relation fk_rel, * this if we knew the INSERT trigger already fired, but there is no easy * way to know that.) */ - xminDatum = slot_getsysattr(oldslot, MinTransactionIdAttributeNumber, &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - if (TransactionIdIsCurrentTransactionId(xmin)) + if (table_tuple_is_current(fk_rel, oldslot)) return true; /* If all old and new key values are equal, no check is needed */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 8e08ca1c680..35e076c536b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -34,6 +34,7 @@ #include "access/multixact.h" #include "access/nbtree.h" #include "access/parallel.h" +#include "access/relation.h" #include "access/reloptions.h" #include "access/sysattr.h" #include "access/table.h" @@ -317,6 +318,7 @@ static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid, StrategyNumber numSupport); static void RelationCacheInitFileRemoveInDir(const char *tblspcpath); static void unlink_initfile(const char *initfilename, int elevel); +static void release_rd_amcache(Relation rel); /* @@ -461,8 +463,9 @@ AllocateRelationDesc(Form_pg_class relp) static void RelationParseRelOptions(Relation relation, HeapTuple tuple) { - bytea *options; - amoptions_function amoptsfn; + bytea *options; + amoptions_function amoptsfn; + const TableAmRoutine *tableam = NULL; relation->rd_options = NULL; @@ -474,9 
+477,10 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_VIEW: case RELKIND_MATVIEW: + case RELKIND_VIEW: case RELKIND_PARTITIONED_TABLE: + tableam = relation->rd_tableam; amoptsfn = NULL; break; case RELKIND_INDEX: @@ -488,11 +492,12 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) } /* - * Fetch reloptions from tuple; have to use a hardwired descriptor because - * we might not have any other for pg_class yet (consider executing this - * code for pg_class itself) - */ - options = extractRelOptions(tuple, GetPgClassDescriptor(), amoptsfn); + * Fetch reloptions from tuple; have to use a hardwired descriptor because + * we might not have any other for pg_class yet (consider executing this + * code for pg_class itself) + */ + options = extractRelOptions(tuple, GetPgClassDescriptor(), + tableam, amoptsfn); /* * Copy parsed data into CacheMemoryContext. To guard against the @@ -2230,9 +2235,7 @@ RelationReloadIndexInfo(Relation relation) RelationCloseSmgr(relation); /* Must free any AM cached data upon relcache flush */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * If it's a shared index, we might be called before backend startup has @@ -2452,8 +2455,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_options); if (relation->rd_indextuple) pfree(relation->rd_indextuple); - if (relation->rd_amcache) - pfree(relation->rd_amcache); + release_rd_amcache(relation); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); if (relation->rd_indexcxt) @@ -2515,9 +2517,7 @@ RelationClearRelation(Relation relation, bool rebuild) RelationCloseSmgr(relation); /* Free AM cached data, if any */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * Treat nailed-in system relations separately, they 
always need to be @@ -6820,3 +6820,9 @@ unlink_initfile(const char *initfilename, int elevel) initfilename))); } } + +static void +release_rd_amcache(Relation rel) +{ + table_free_rd_amcache(rel); +} diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index f60633df241..120db339150 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1100,6 +1100,36 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward, } } +/* + * Same as tuplestore_gettupleslot(), but forces tuple storage to slot. Thus, + * it can work with slot types different from minimal tuple. + */ +bool +tuplestore_force_gettupleslot(Tuplestorestate *state, bool forward, + bool copy, TupleTableSlot *slot) +{ + MinimalTuple tuple; + bool should_free; + + tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free); + + if (tuple) + { + if (copy && !should_free) + { + tuple = heap_copy_minimal_tuple(tuple); + should_free = true; + } + ExecForceStoreMinimalTuple(tuple, slot, should_free); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + /* * tuplestore_advance - exported function to adjust position without fetching * diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index 1d5bfa62ffc..4812bc4481d 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -21,6 +21,7 @@ #include "access/amapi.h" #include "access/htup.h" +#include "access/tableam.h" #include "access/tupdesc.h" #include "nodes/pg_list.h" #include "storage/lock.h" @@ -224,6 +225,7 @@ extern Datum transformRelOptions(Datum oldOptions, List *defList, bool acceptOidsOff, bool isReset); extern List *untransformRelOptions(Datum options); extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, + const TableAmRoutine *tableam, amoptions_function amoptions); extern void *build_reloptions(Datum reloptions, bool validate, relopt_kind kind, diff --git 
a/src/include/access/sysattr.h b/src/include/access/sysattr.h index 8f08682750b..d717a7cafec 100644 --- a/src/include/access/sysattr.h +++ b/src/include/access/sysattr.h @@ -24,6 +24,7 @@ #define MaxTransactionIdAttributeNumber (-4) #define MaxCommandIdAttributeNumber (-5) #define TableOidAttributeNumber (-6) -#define FirstLowInvalidHeapAttributeNumber (-7) +#define RowIdAttributeNumber (-7) +#define FirstLowInvalidHeapAttributeNumber (-8) #endif /* SYSATTR_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 78106f3a100..cd086ae12ab 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -17,10 +17,14 @@ #ifndef TABLEAM_H #define TABLEAM_H +#include "access/amapi.h" #include "access/relscan.h" #include "access/sdir.h" #include "access/xact.h" #include "executor/tuptable.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -39,6 +43,16 @@ struct TBMIterateResult; struct VacuumParams; struct ValidateIndexState; +typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, + double *totaldeadrows); + +/* in commands/analyze.c */ +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); + /* * Bitmask values for the flags argument to the scan_begin callback. */ @@ -300,6 +314,9 @@ typedef struct TableAmRoutine */ const TupleTableSlotOps *(*slot_callbacks) (Relation rel); + RowRefType (*get_row_ref_type) (Relation rel); + + void (*free_rd_amcache) (Relation rel); /* ------------------------------------------------------------------------ * Table scan callbacks. @@ -469,7 +486,7 @@ typedef struct TableAmRoutine * test, returns true, false otherwise. 
*/ bool (*tuple_fetch_row_version) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot); @@ -505,23 +522,19 @@ typedef struct TableAmRoutine */ /* see table_tuple_insert() for reference about parameters */ - void (*tuple_insert) (Relation rel, TupleTableSlot *slot, + TupleTableSlot *(*tuple_insert) (Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate); - /* see table_tuple_insert_speculative() for reference about parameters */ - void (*tuple_insert_speculative) (Relation rel, - TupleTableSlot *slot, - CommandId cid, - int options, - struct BulkInsertStateData *bistate, - uint32 specToken); - - /* see table_tuple_complete_speculative() for reference about parameters */ - void (*tuple_complete_speculative) (Relation rel, - TupleTableSlot *slot, - uint32 specToken, - bool succeeded); + TupleTableSlot *(*tuple_insert_with_arbiter) (ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot); /* see table_multi_insert() for reference about parameters */ void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots, @@ -529,7 +542,7 @@ typedef struct TableAmRoutine /* see table_tuple_delete() for reference about parameters */ TM_Result (*tuple_delete) (Relation rel, - ItemPointer tid, + Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, @@ -540,7 +553,7 @@ typedef struct TableAmRoutine /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, - ItemPointer otid, + Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, @@ -553,7 +566,7 @@ typedef struct TableAmRoutine /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot 
snapshot, TupleTableSlot *slot, CommandId cid, @@ -873,6 +886,14 @@ typedef struct TableAmRoutine struct SampleScanState *scanstate, TupleTableSlot *slot); + /* Check if tuple in the slot belongs to the current transaction */ + bool (*tuple_is_current) (Relation rel, TupleTableSlot *slot); + + void (*analyze_table) (Relation relation, + AcquireSampleRowsFunc *func, + BlockNumber *totalpages); + + bytea *(*reloptions) (char relkind, Datum reloptions, bool validate); } TableAmRoutine; @@ -1288,7 +1309,7 @@ extern bool table_index_fetch_tuple_check(Relation rel, */ static inline bool table_tuple_fetch_row_version(Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -1300,7 +1321,7 @@ table_tuple_fetch_row_version(Relation rel, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); - return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); + return rel->rd_tableam->tuple_fetch_row_version(rel, tupleid, snapshot, slot); } /* @@ -1400,45 +1421,32 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) * insertion. But note that any toasting of fields within the slot is NOT * reflected in the slots contents. */ -static inline void +static inline TupleTableSlot * table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate) { - rel->rd_tableam->tuple_insert(rel, slot, cid, options, - bistate); + return rel->rd_tableam->tuple_insert(rel, slot, cid, options, bistate); } -/* - * Perform a "speculative insertion". These can be backed out afterwards - * without aborting the whole transaction. Other sessions can wait for the - * speculative insertion to be confirmed, turning it into a regular tuple, or - * aborted, as if it never existed. Speculatively inserted tuples behave as - * "value locks" of short duration, used to implement INSERT .. ON CONFLICT. 
- * - * A transaction having performed a speculative insertion has to either abort, - * or finish the speculative insertion with - * table_tuple_complete_speculative(succeeded = ...). - */ -static inline void -table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot, - CommandId cid, int options, - struct BulkInsertStateData *bistate, - uint32 specToken) -{ - rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options, - bistate, specToken); -} - -/* - * Complete "speculative insertion" started in the same transaction. If - * succeeded is true, the tuple is fully inserted, if false, it's removed. - */ -static inline void -table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot, - uint32 specToken, bool succeeded) +static inline TupleTableSlot * +table_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) { - rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken, - succeeded); + Relation rel = resultRelInfo->ri_RelationDesc; + + return rel->rd_tableam->tuple_insert_with_arbiter(resultRelInfo, + slot, cid, options, + bistate, arbiterIndexes, + estate, + lockmode, lockedSlot, + tempSlot); } /* @@ -1500,12 +1508,12 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, +table_tuple_delete(Relation rel, Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, bool changingPart, TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_delete(rel, tid, cid, + return rel->rd_tableam->tuple_delete(rel, tupleid, cid, snapshot, crosscheck, options, tmfd, changingPart, oldSlot); @@ -1556,13 +1564,13 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * for additional info. */ static inline TM_Result -table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, +table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_update(rel, otid, slot, + return rel->rd_tableam->tuple_update(rel, tupleid, slot, cid, snapshot, crosscheck, options, tmfd, lockmode, update_indexes, @@ -1603,12 +1611,12 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * comments for struct TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, +table_tuple_lock(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) { - return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot, + return rel->rd_tableam->tuple_lock(rel, tupleid, snapshot, slot, cid, mode, wait_policy, flags, tmfd); } @@ -2074,6 +2082,11 @@ table_scan_sample_next_tuple(TableScanDesc scan, slot); } +static inline bool +table_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + return rel->rd_tableam->tuple_is_current(rel, slot); +} /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. @@ -2128,6 +2141,60 @@ extern void table_block_relation_estimate_size(Relation rel, */ extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler); +extern const TableAmRoutine *GetTableAmRoutineByAmOid(Oid amoid); extern const TableAmRoutine *GetHeapamTableAmRoutine(void); +static inline RowRefType +table_get_row_ref_type(Relation rel) +{ + if (rel->rd_tableam) + return rel->rd_tableam->get_row_ref_type(rel); + else + return ROW_REF_TID; +} + +static inline void +table_free_rd_amcache(Relation rel) +{ + if (rel->rd_tableam) + { + rel->rd_tableam->free_rd_amcache(rel); + } + else + { + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + } +} + +static inline void +table_analyze(Relation relation, AcquireSampleRowsFunc *func, + BlockNumber *totalpages) +{ + if (relation->rd_tableam->analyze_table) + { + relation->rd_tableam->analyze_table(relation, func, totalpages); + } + else + { + *func = acquire_sample_rows; + *totalpages = RelationGetNumberOfBlocks(relation); + } +} + +static inline bytea * +table_reloptions(Relation rel, char relkind, + Datum reloptions, bool validate) +{ + return rel->rd_tableam->reloptions(relkind, reloptions, validate); +} + 
+static inline bytea * +tableam_reloptions(const TableAmRoutine *tableam, char relkind, + Datum reloptions, bool validate) +{ + return tableam->reloptions(relkind, reloptions, validate); +} + #endif /* TABLEAM_H */ diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 4903b4b7bc2..15e1fbe7700 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -209,7 +209,7 @@ extern void ExecASDeleteTriggers(EState *estate, extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -231,7 +231,7 @@ extern void ExecASUpdateTriggers(EState *estate, extern bool ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 39fbd5f10a5..3a8ee4fbf05 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -376,6 +376,9 @@ extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc); extern void analyze_rel(Oid relid, RangeVar *relation, VacuumParams *params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy); +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 996c62e3055..50a2494c019 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -13,6 +13,7 @@ #define FDWAPI_H #include "access/parallel.h" +#include "access/tableam.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" @@ -148,11 +149,6 @@ typedef 
void (*ExplainForeignModify_function) (ModifyTableState *mtstate, typedef void (*ExplainDirectModify_function) (ForeignScanState *node, struct ExplainState *es); -typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, - double *totaldeadrows); - typedef bool (*AnalyzeForeignTable_function) (Relation relation, AcquireSampleRowsFunc *func, BlockNumber *totalpages); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 49419f14f0d..037ab7dd3da 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -449,6 +449,8 @@ typedef struct ResultRelInfo /* relation descriptor for result relation */ Relation ri_RelationDesc; + RowRefType ri_RowRefType; + /* # of indices existing on result relation */ int ri_NumIndices; @@ -744,6 +746,7 @@ typedef struct ExecRowMark Index prti; /* parent range table index, if child */ Index rowmarkId; /* unique identifier for resjunk columns */ RowMarkType markType; /* see enum in nodes/plannodes.h */ + RowRefType refType; LockClauseStrength strength; /* LockingClause's strength, or LCS_NONE */ LockWaitPolicy waitPolicy; /* NOWAIT and SKIP LOCKED */ bool ermActive; /* is this mark relevant for current tuple? 
*/ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 9dca3b65287..99173f541d5 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1073,6 +1073,7 @@ typedef struct RangeTblEntry int rellockmode; /* lock level that query requires on the rel */ struct TableSampleClause *tablesample; /* sampling info, or NULL */ Index perminfoindex; + RowRefType reftype; /* * Fields valid for a subquery RTE (else NULL): diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index d64fe6a328b..77130245e8f 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1352,7 +1352,7 @@ typedef enum RowMarkType * child relations will also have entries with isParent = true. The child * entries have rti == child rel's RT index and prti == top parent's RT index, * and can therefore be recognized as children by the fact that prti != rti. - * The parent's allMarkTypes field gets the OR of (1< Date: Mon, 13 Dec 2021 00:19:41 +0300 Subject: [PATCH 08/45] Hook for custom error cleanup --- src/backend/access/transam/xact.c | 2 ++ src/backend/postmaster/autovacuum.c | 1 + src/backend/postmaster/auxprocess.c | 1 + src/backend/postmaster/bgwriter.c | 1 + src/backend/postmaster/checkpointer.c | 2 ++ src/backend/postmaster/walwriter.c | 1 + src/backend/replication/walsender.c | 1 + src/backend/storage/lmgr/proc.c | 2 ++ src/backend/utils/error/elog.c | 10 +++++++++- src/include/utils/elog.h | 6 ++++++ 10 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 4a2ea4adbaf..c0a6dd44e3e 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2714,6 +2714,7 @@ AbortTransaction(void) * while cleaning up! 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); /* Clear wait information and command progress indicator */ pgstat_report_wait_end(); @@ -5076,6 +5077,7 @@ AbortSubTransaction(void) * Buffer locks, for example? I don't think so but I'm not sure. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); pgstat_progress_end_command(); diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 6b1fa5315d8..693db1b3c9f 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -538,6 +538,7 @@ AutoVacLauncherMain(int argc, char *argv[]) * transaction. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); UnlockBuffers(); /* this is probably dead code, but let's be safe: */ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index cae6feb3562..bc4c3d11359 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -178,6 +178,7 @@ static void ShutdownAuxiliaryProcess(int code, Datum arg) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); } diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index f2e4f23d9fc..7963fcd2a38 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -166,6 +166,7 @@ BackgroundWriterMain(void) * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); UnlockBuffers(); ReleaseAuxProcessResources(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index f482f6423d1..26a06c6b084 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -207,6 +207,7 @@ CheckpointerMain(void) */ pqsignal(SIGCHLD, SIG_DFL); + /* * Initialize so that first time-driven event happens at the correct time. 
*/ @@ -269,6 +270,7 @@ CheckpointerMain(void) * files. */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 266fbc23399..4e8a9573006 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -161,6 +161,7 @@ WalWriterMain(void) * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4c53de08b9b..ce4e40bf137 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -315,6 +315,7 @@ void WalSndErrorCleanup(void) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9e445bb216..6cf4cf33242 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -829,6 +829,7 @@ ProcKill(int code, Datum arg) * facility by releasing our PGPROC ... */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); @@ -940,6 +941,7 @@ AuxiliaryProcKill(int code, Datum arg) /* Release any LW locks I am holding (see notes above) */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 7112fb00069..5badf5eaedc 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -3749,7 +3749,6 @@ write_stderr(const char *fmt,...) va_end(ap); } - /* * Write a message to STDERR using only async-signal-safe functions. 
This can * be used to safely emit a message from a signal handler. @@ -3802,3 +3801,12 @@ trace_recovery(int trace_level) return trace_level; } + +CustomErrorCleanupHookType CustomErrorCleanupHook = NULL; + +void +CustomErrorCleanup(void) +{ + if (CustomErrorCleanupHook) + CustomErrorCleanupHook(); +} diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index 0292e88b4f2..5b7deaa286c 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -542,4 +542,10 @@ extern void write_stderr(const char *fmt,...) pg_attribute_printf(1, 2); */ extern void write_stderr_signal_safe(const char *fmt); +typedef void (*CustomErrorCleanupHookType) (void); + +extern CustomErrorCleanupHookType CustomErrorCleanupHook; + +extern void CustomErrorCleanup(void); + #endif /* ELOG_H */ From 32f225b008608c7ede2848e9dd71ddd0006acfad Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 01:51:03 +0300 Subject: [PATCH 09/45] Snapshot extension and hooks Snapshot have two pairing heap nodes: for data and system undos. --- src/backend/access/transam/xact.c | 11 ++++++++ src/backend/access/transam/xlog.c | 3 ++ src/backend/storage/ipc/procarray.c | 8 ++++++ src/backend/utils/time/snapmgr.c | 44 +++++++++++++++++++++++++++++ src/include/access/transam.h | 10 ++++++- src/include/access/xlog.h | 1 + src/include/storage/proc.h | 1 + src/include/storage/procarray.h | 2 ++ src/include/utils/snapmgr.h | 11 +++++++- src/include/utils/snapshot.h | 13 +++++++++ 10 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c0a6dd44e3e..8f3adde828d 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -209,6 +209,7 @@ typedef struct TransactionStateData int parallelModeLevel; /* Enter/ExitParallelMode counter */ bool chain; /* start a new block after this one */ bool topXidLogged; /* for a subxact: is top-level XID logged? 
*/ + CommitSeqNo csn; struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData; @@ -242,6 +243,7 @@ static TransactionStateData TopTransactionStateData = { .state = TRANS_DEFAULT, .blockState = TBLOCK_DEFAULT, .topXidLogged = false, + .csn = COMMITSEQNO_INPROGRESS }; /* @@ -2014,6 +2016,7 @@ StartTransaction(void) */ s->state = TRANS_START; s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->csn = COMMITSEQNO_INPROGRESS; /* Determine if statements are logged in this transaction */ xact_is_sampled = log_xact_sample_rate != 0 && @@ -2288,7 +2291,9 @@ CommitTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ + MyProc->lastCommittedCSN = s->csn; ProcArrayEndTransaction(MyProc, latestXid); + s->csn = MyProc->lastCommittedCSN; /* * This is all post-commit cleanup. Note that if an error is raised here, @@ -6269,3 +6274,9 @@ xact_redo(XLogReaderState *record) else elog(PANIC, "xact_redo: unknown op code %u", info); } + +CommitSeqNo +GetCurrentCSN(void) +{ + return TopTransactionStateData.csn; +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a19ba7167fd..04b7f1eba1e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -138,6 +138,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +CommitSeqNo startupCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -4710,6 +4711,7 @@ BootStrapXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; + pg_atomic_write_u64(&ShmemVariableCache->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL + 1); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); 
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5176,6 +5178,7 @@ StartupXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; + pg_atomic_write_u64(&ShmemVariableCache->nextCommitSeqNo, startupCommitSeqNo); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 316b4fa7197..f41656027d5 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -309,6 +309,8 @@ static GlobalVisState GlobalVisTempRels; */ static TransactionId ComputeXidHorizonsResultLastXmin; +snapshot_hook_type snapshot_hook = NULL; + #ifdef XIDCACHE_DEBUG /* counters for XidCache measurement */ @@ -752,6 +754,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->delayChkptFlags = 0; proc->recoveryConflictPending = false; + proc->lastCommittedCSN = pg_atomic_fetch_add_u64(&ShmemVariableCache->nextCommitSeqNo, 1); /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ @@ -2258,6 +2261,8 @@ GetSnapshotData(Snapshot snapshot) if (GetSnapshotDataReuse(snapshot)) { + if (snapshot_hook) + snapshot_hook(snapshot); LWLockRelease(ProcArrayLock); return snapshot; } @@ -2439,6 +2444,9 @@ GetSnapshotData(Snapshot snapshot) if (!TransactionIdIsValid(MyProc->xmin)) MyProc->xmin = TransactionXmin = xmin; + if (snapshot_hook) + snapshot_hook(snapshot); + LWLockRelease(ProcArrayLock); /* maintain state for GlobalVis* */ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 3a419e348fa..526cacb70a5 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -116,6 +116,10 @@ TransactionId RecentXmin = FirstNormalTransactionId; 
/* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; +snapshot_hook_type snapshot_register_hook = NULL; +snapshot_hook_type snapshot_deregister_hook = NULL; +reset_xmin_hook_type reset_xmin_hook = NULL; + /* * Elements of the active snapshot stack. * @@ -192,6 +196,11 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; + CommitSeqNo snapshotcsn; + uint64 undoRegularLocation; + uint64 undoRegularXmin; + uint64 undoSystemLocation; + uint64 undoSystemXmin; } SerializedSnapshotData; Size @@ -298,6 +307,8 @@ GetTransactionSnapshot(void) /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } else CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -438,6 +449,8 @@ GetNonHistoricCatalogSnapshot(Oid relid) * CatalogSnapshot pointer is already valid. 
*/ pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(CatalogSnapshot); } return CatalogSnapshot; @@ -459,6 +472,8 @@ InvalidateCatalogSnapshot(void) if (CatalogSnapshot) { pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(CatalogSnapshot); CatalogSnapshot = NULL; SnapshotResetXmin(); } @@ -593,6 +608,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } FirstSnapshotSet = true; @@ -855,7 +872,11 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner) ResourceOwnerRememberSnapshot(owner, snap); if (snap->regd_count == 1) + { pairingheap_add(&RegisteredSnapshots, &snap->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snap); + } return snap; } @@ -893,7 +914,11 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner) snapshot->regd_count--; if (snapshot->regd_count == 0) + { pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(snapshot); + } if (snapshot->regd_count == 0 && snapshot->active_count == 0) { @@ -945,6 +970,9 @@ SnapshotResetXmin(void) { Snapshot minSnapshot; + if (reset_xmin_hook) + reset_xmin_hook(); + if (ActiveSnapshot != NULL) return; @@ -1038,6 +1066,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) Assert(FirstXactSnapshot->regd_count > 0); Assert(!pairingheap_is_empty(&RegisteredSnapshots)); pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(FirstXactSnapshot); } FirstXactSnapshot = NULL; @@ -1069,6 +1099,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) 
pairingheap_remove(&RegisteredSnapshots, &esnap->snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(esnap->snapshot); } exportedSnapshots = NIL; @@ -1196,6 +1228,8 @@ ExportSnapshot(Snapshot snapshot) snapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snapshot); /* * Fill buf with a text serialization of the snapshot, plus identification @@ -2160,6 +2194,11 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; + serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; + serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; + serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; + serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; + serialized_snapshot.undoSystemLocation = snapshot->undoSystemLocationPhNode.undoLocation; /* * Ignore the SubXID array if it has overflowed, unless the snapshot was @@ -2235,6 +2274,11 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; + snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; + snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; + snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; + snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; + snapshot->undoSystemLocationPhNode.undoLocation = serialized_snapshot.undoSystemLocation; /* Copy XIDs, if present. 
*/ if (serialized_snapshot.xcnt > 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index f7bcb4a8822..ed931c770ec 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -15,7 +15,9 @@ #define TRANSAM_H #include "access/xlogdefs.h" - +#ifndef FRONTEND +#include "port/atomics.h" +#endif /* ---------------- * Special transaction ID values @@ -268,6 +270,11 @@ typedef struct VariableCacheData */ TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ +#ifndef FRONTEND + pg_atomic_uint64 nextCommitSeqNo; +#else + CommitSeqNo nextCommitSeqNo; +#endif } VariableCacheData; typedef VariableCacheData *VariableCache; @@ -310,6 +317,7 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); extern void StopGeneratingPinnedObjectIds(void); +extern CommitSeqNo GetCurrentCSN(void); #ifdef USE_ASSERT_CHECKING extern void AssertTransactionIdInAllowableRange(TransactionId xid); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 48ca8523810..5517feaefe4 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -53,6 +53,7 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern PGDLLIMPORT CommitSeqNo startupCommitSeqNo; /* Archive modes */ typedef enum ArchiveMode diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index ef74f326932..f3aa3bde389 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -293,6 +293,7 @@ struct PGPROC bool fpVXIDLock; /* are we holding a fast-path VXID lock? */ LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID * lock */ + CommitSeqNo lastCommittedCSN; /* * Support for lock groups. 
Use LockHashPartitionLockByProc on the group diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index d8cae3ce1c5..64db8a3aa8b 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -96,4 +96,6 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, TransactionId *catalog_xmin); +extern snapshot_hook_type snapshot_hook; + #endif /* PROCARRAY_H */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 980d37a1947..d05de790428 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -18,6 +18,9 @@ #include "utils/resowner.h" #include "utils/snapshot.h" +#ifndef SNAPSHOT_H +typedef void (*snapshot_hook_type) (Snapshot snapshot); +#endif /* * The structure used to map times to TransactionId values for the "snapshot @@ -120,7 +123,7 @@ extern void PushActiveSnapshotWithLevel(Snapshot snapshot, int snap_level); extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); extern void PopActiveSnapshot(void); -extern Snapshot GetActiveSnapshot(void); +extern PGDLLIMPORT Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); extern Snapshot RegisterSnapshot(Snapshot snapshot); @@ -178,4 +181,10 @@ extern void SerializeSnapshot(Snapshot snapshot, char *start_address); extern Snapshot RestoreSnapshot(char *start_address); extern void RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc); +typedef void (*reset_xmin_hook_type) (void); + +extern snapshot_hook_type snapshot_register_hook; +extern snapshot_hook_type snapshot_deregister_hook; +extern reset_xmin_hook_type reset_xmin_hook; + #endif /* SNAPMGR_H */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 583a667a40a..d4392d7dc04 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -122,6 +122,13 @@ typedef struct SnapshotData 
*Snapshot; #define InvalidSnapshot ((Snapshot) NULL) +typedef struct +{ + uint64 undoLocation; /* undo log location retained by this snapshot */ + uint64 xmin; + pairingheap_node ph_node; +} RetainUndoLocationPHNode; + /* * Struct representing all kind of possible snapshots. * @@ -214,6 +221,12 @@ typedef struct SnapshotData * transactions completed since the last GetSnapshotData(). */ uint64 snapXactCompletionCount; + + RetainUndoLocationPHNode undoRegularLocationPhNode; + RetainUndoLocationPHNode undoSystemLocationPhNode; + CommitSeqNo snapshotcsn; } SnapshotData; +typedef void (*snapshot_hook_type) (Snapshot snapshot); + #endif /* SNAPSHOT_H */ From cdc5d3f39ad62c2381d49cb916acc6d00d43bccc Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 01:57:10 +0300 Subject: [PATCH 10/45] Hooks for builtin functions and datatypes and orioledb recovery * Added SearchCatCacheInternal_hook, SearchCatCacheList_hook * Added SysCacheGetAttr_hook --- src/backend/commands/indexcmds.c | 4 ++++ src/backend/executor/execExpr.c | 3 +++ src/backend/utils/cache/catcache.c | 25 +++++++++++++++++++++++++ src/backend/utils/cache/syscache.c | 10 ++++++++-- src/backend/utils/cache/typcache.c | 14 ++++++++++++++ src/backend/utils/fmgr/fmgr.c | 4 ++-- src/include/commands/defrem.h | 3 +++ src/include/utils/catcache.h | 23 +++++++++++++++++++++++ src/include/utils/fmgrtab.h | 3 +++ src/include/utils/typcache.h | 5 +++++ 10 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 21ed483b7fa..e0e94c6b486 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -69,6 +69,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +GetDefaultOpClass_hook_type GetDefaultOpClass_hook = NULL; /* non-export function prototypes */ static bool CompareOpclassOptions(Datum *opts1, Datum *opts2, int natts); @@ -2284,6 +2285,9 @@ GetDefaultOpClass(Oid type_id, Oid am_id) /* If 
it's a domain, look at the base type instead */ type_id = getBaseType(type_id); + if (GetDefaultOpClass_hook) + return GetDefaultOpClass_hook(type_id, am_id); + tcategory = TypeCategory(type_id); /* diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index bf3a08c5f08..928566b3e40 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -48,6 +48,9 @@ #include "utils/array.h" #include "utils/builtins.h" #include "utils/datum.h" +#include "utils/json.h" +#include "utils/jsonb.h" +#include "utils/jsonpath.h" #include "utils/lsyscache.h" #include "utils/typcache.h" diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 000e81a2d96..91136f5cfbd 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -65,6 +65,10 @@ /* Cache management header --- pointer is NULL until created */ static CatCacheHeader *CacheHdr = NULL; +SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook = NULL; +SearchCatCacheList_hook_type SearchCatCacheList_hook = NULL; +GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook = NULL; + static inline HeapTuple SearchCatCacheInternal(CatCache *cache, int nkeys, Datum v1, Datum v2, @@ -1270,6 +1274,14 @@ SearchCatCacheInternal(CatCache *cache, dlist_head *bucket; CatCTup *ct; + if (SearchCatCacheInternal_hook) + { + ct = SearchCatCacheInternal_hook(cache, nkeys, v1, v2, v3, v4); + + if (ct) + return &ct->tuple; + } + /* Make sure we're in an xact, even if this ends up being a cache hit */ Assert(IsTransactionState()); @@ -1555,6 +1567,11 @@ GetCatCacheHashValue(CatCache *cache, Datum v3, Datum v4) { + if (GetCatCacheHashValue_hook) + { + return GetCatCacheHashValue_hook(cache, cache->cc_nkeys, + v1, v2, v3, v4); + } /* * one-time startup overhead for each cache */ @@ -1605,6 +1622,14 @@ SearchCatCacheList(CatCache *cache, MemoryContext oldcxt; int i; + if (SearchCatCacheList_hook) + { + cl = 
SearchCatCacheList_hook(cache, nkeys, v1, v2, v3); + + if (cl) + return cl; + } + /* * one-time startup overhead for each cache */ diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 4e4a34bde80..a5b3e437f7c 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -696,6 +696,7 @@ static int SysCacheSupportingRelOidSize; static int oid_compare(const void *a, const void *b); +SysCacheGetAttr_hook_type SysCacheGetAttr_hook = NULL; /* * InitCatalogCache - initialize the caches @@ -1080,6 +1081,7 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull) { + TupleDesc cc_tupdesc = SysCache[cacheId]->cc_tupdesc; /* * We just need to get the TupleDesc out of the cache entry, and then we * can apply heap_getattr(). Normally the cache control data is already @@ -1089,14 +1091,18 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, if (cacheId < 0 || cacheId >= SysCacheSize || !PointerIsValid(SysCache[cacheId])) elog(ERROR, "invalid cache ID: %d", cacheId); - if (!PointerIsValid(SysCache[cacheId]->cc_tupdesc)) + + if (!PointerIsValid(cc_tupdesc) && SysCacheGetAttr_hook) + cc_tupdesc = SysCacheGetAttr_hook(SysCache[cacheId]); + if (!PointerIsValid(cc_tupdesc)) { InitCatCachePhase2(SysCache[cacheId], false); Assert(PointerIsValid(SysCache[cacheId]->cc_tupdesc)); + cc_tupdesc = SysCache[cacheId]->cc_tupdesc; } return heap_getattr(tup, attributeNumber, - SysCache[cacheId]->cc_tupdesc, + cc_tupdesc, isNull); } diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index 608cd5e8e43..71619cf04d0 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -290,6 +290,8 @@ static int32 NextRecordTypmod = 0; /* number of entries used */ * as identifiers, so we start the counter at INVALID_TUPLEDESC_IDENTIFIER. 
*/ static uint64 tupledesc_id_counter = INVALID_TUPLEDESC_IDENTIFIER; +load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook = NULL; +load_enum_cache_data_hook_type load_enum_cache_data_hook = NULL; static void load_typcache_tupdesc(TypeCacheEntry *typentry); static void load_rangetype_info(TypeCacheEntry *typentry); @@ -879,6 +881,12 @@ load_typcache_tupdesc(TypeCacheEntry *typentry) { Relation rel; + if (load_typcache_tupdesc_hook) + { + load_typcache_tupdesc_hook(typentry); + return; + } + if (!OidIsValid(typentry->typrelid)) /* should not happen */ elog(ERROR, "invalid typrelid for composite type %u", typentry->type_id); @@ -2560,6 +2568,12 @@ load_enum_cache_data(TypeCacheEntry *tcache) int bm_size, start_pos; + if (load_enum_cache_data_hook) + { + load_enum_cache_data_hook(tcache); + return; + } + /* Check that this is actually an enum */ if (tcache->typtype != TYPTYPE_ENUM) ereport(ERROR, diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index 9208c31fe06..85811af84ff 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -72,7 +72,7 @@ extern Datum fmgr_security_definer(PG_FUNCTION_ARGS); * or name, but search by Oid is much faster. */ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_isbuiltin(Oid id) { uint16 index; @@ -97,7 +97,7 @@ fmgr_isbuiltin(Oid id) * the array with the same name, but they should all point to the same * routine. 
*/ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_lookupByName(const char *name) { int i; diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 478203ed4c4..e1764c06adf 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -158,4 +158,7 @@ extern int defGetTypeLength(DefElem *def); extern List *defGetStringList(DefElem *def); extern void errorConflictingDefElem(DefElem *defel, ParseState *pstate) pg_attribute_noreturn(); +typedef Oid (*GetDefaultOpClass_hook_type)(Oid type_id, Oid am_id); +extern PGDLLIMPORT GetDefaultOpClass_hook_type GetDefaultOpClass_hook; + #endif /* DEFREM_H */ diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index a32d7222a99..91880e498f7 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -232,5 +232,28 @@ extern void PrepareToInvalidateCacheTuple(Relation relation, extern void PrintCatCacheLeakWarning(HeapTuple tuple); extern void PrintCatCacheListLeakWarning(CatCList *list); +typedef CatCTup *(*SearchCatCacheInternal_hook_type)(CatCache *cache, + int nkeys, + Datum v1, Datum v2, + Datum v3, Datum v4); +extern SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook; + +typedef CatCList *(*SearchCatCacheList_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3); +extern SearchCatCacheList_hook_type SearchCatCacheList_hook; + +typedef TupleDesc (*SysCacheGetAttr_hook_type)(CatCache *SysCache); +extern SysCacheGetAttr_hook_type SysCacheGetAttr_hook; + +typedef uint32 (*GetCatCacheHashValue_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3, + Datum v4); +extern GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook; #endif /* CATCACHE_H */ diff --git a/src/include/utils/fmgrtab.h b/src/include/utils/fmgrtab.h index 838ffe3bc1c..f7e416653a6 100644 --- a/src/include/utils/fmgrtab.h +++ b/src/include/utils/fmgrtab.h @@ -46,4 +46,7 @@ extern PGDLLIMPORT const Oid 
fmgr_last_builtin_oid; /* highest function OID in #define InvalidOidBuiltinMapping PG_UINT16_MAX extern PGDLLIMPORT const uint16 fmgr_builtin_oid_index[]; +extern const FmgrBuiltin *fmgr_isbuiltin(Oid id); +extern const FmgrBuiltin *fmgr_lookupByName(const char *name); + #endif /* FMGRTAB_H */ diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index 95f3a9ee308..77d57927de0 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -206,4 +206,9 @@ extern void SharedRecordTypmodRegistryInit(SharedRecordTypmodRegistry *, extern void SharedRecordTypmodRegistryAttach(SharedRecordTypmodRegistry *); +typedef void (*load_typcache_tupdesc_hook_type)(TypeCacheEntry *typentry); +extern PGDLLIMPORT load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook; +typedef void (*load_enum_cache_data_hook_type)(TypeCacheEntry *tcache); +extern PGDLLIMPORT load_enum_cache_data_hook_type load_enum_cache_data_hook; + #endif /* TYPCACHE_H */ From c77514487d49011ab168127d2ab9eff680c39c6f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:01:16 +0300 Subject: [PATCH 11/45] Recovery and checkpointer hooks --- src/backend/access/transam/transam.c | 1 + src/backend/access/transam/xact.c | 4 ++++ src/backend/access/transam/xlog.c | 18 ++++++++++++++++++ src/backend/access/transam/xlogrecovery.c | 2 ++ src/backend/storage/buffer/bufmgr.c | 6 +++++- src/include/access/xact.h | 3 +++ src/include/access/xlog.h | 10 ++++++++++ 7 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 7629904bbf7..d118c5fd61a 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -22,6 +22,7 @@ #include "access/clog.h" #include "access/subtrans.h" #include "access/transam.h" +#include "storage/proc.h" #include "utils/snapmgr.h" /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 
8f3adde828d..dab73df4b2c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -322,6 +322,7 @@ typedef struct SubXactCallbackItem static SubXactCallbackItem *SubXact_callbacks = NULL; +xact_redo_hook_type xact_redo_hook = NULL; /* local function prototypes */ static void AssignTransactionId(TransactionState s); @@ -5965,6 +5966,9 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionId max_xid; TimestampTz commit_time; + if (xact_redo_hook) + xact_redo_hook(xid, lsn); + Assert(TransactionIdIsValid(xid)); max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 04b7f1eba1e..38f48cf287b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -146,6 +146,11 @@ bool XLOG_DEBUG = false; int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +/* Hook for plugins to get control in CheckPointGuts() */ +CheckPoint_hook_type CheckPoint_hook = NULL; +double CheckPointProgress; +after_checkpoint_cleanup_hook_type after_checkpoint_cleanup_hook = NULL; + /* * Number of WAL insertion locks to use. A higher value allows more insertions * to happen concurrently, but adds some CPU overhead to flushing the WAL, @@ -5051,6 +5056,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + bool wasInRecovery; /* * We should have an aux process resource owner to use, and we should not @@ -5667,6 +5673,8 @@ StartupXLOG(void) */ PreallocXlogFiles(EndOfLog, newTLI); + wasInRecovery = InRecovery; + /* * Okay, we're officially UP. */ @@ -5745,6 +5753,9 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + if (wasInRecovery && after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(EndOfLog, 0); + /* * All done with end-of-recovery actions. 
* @@ -6869,6 +6880,9 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + if (after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(ProcLastRecPtr, flags); + /* Real work is done; log and update stats. */ LogCheckpointEnd(false); @@ -7027,6 +7041,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); CheckPointRelationMap(); CheckPointReplicationSlots(); CheckPointSnapBuild(); @@ -8991,3 +9007,5 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void (*RedoShutdownHook) (void) = NULL; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 3c7fb913e7e..8de18a3a6ee 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1806,6 +1806,8 @@ PerformWalRecovery(void) * exit with special return code to request shutdown of * postmaster. Log messages issued from postmaster. */ + if (RedoShutdownHook != NULL) + RedoShutdownHook(); proc_exit(3); case RECOVERY_TARGET_ACTION_PAUSE: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e066a3f888f..aa82637b1d1 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2667,6 +2667,7 @@ BufferSync(int flags) BufferDesc *bufHdr = NULL; CkptTsStatus *ts_stat = (CkptTsStatus *) DatumGetPointer(binaryheap_first(ts_heap)); + double progress; buf_id = CkptBufferIds[ts_stat->index].buf_id; Assert(buf_id != -1); @@ -2721,7 +2722,10 @@ BufferSync(int flags) * * (This will check for barrier events even if it doesn't sleep.) 
*/ - CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); + progress = (double) num_processed / num_to_scan; + progress = CheckPointProgress + progress * (1 - CheckPointProgress); + + CheckpointWriteDelay(flags, progress); } /* diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 7d3b9446e62..e8200d55720 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -527,4 +527,7 @@ extern void EnterParallelMode(void); extern void ExitParallelMode(void); extern bool IsInParallelMode(void); +typedef void (*xact_redo_hook_type) (TransactionId xid, XLogRecPtr lsn); +extern xact_redo_hook_type xact_redo_hook; + #endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 5517feaefe4..3a4cb5eb0fa 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -300,4 +300,14 @@ extern SessionBackupState get_backup_status(void); /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" +typedef void (*CheckPoint_hook_type) (XLogRecPtr checkPointRedo, int flags); +extern PGDLLIMPORT CheckPoint_hook_type CheckPoint_hook; +extern double CheckPointProgress; +typedef void (*after_checkpoint_cleanup_hook_type)(XLogRecPtr checkPointRedo, + int flags); +extern PGDLLIMPORT after_checkpoint_cleanup_hook_type + after_checkpoint_cleanup_hook; + +extern void (*RedoShutdownHook) (void); + #endif /* XLOG_H */ From 66ffd8853c4e77d17b00f0920ec84ac0c6c40c31 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:07:13 +0300 Subject: [PATCH 12/45] Allow skipping logging for AccessExclusiveLock --- src/backend/storage/lmgr/lock.c | 14 ++++++++++++-- src/include/storage/lock.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index ba66e820d06..f9231b10dd3 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -797,7 +797,7 @@ 
LockAcquireExtended(const LOCKTAG *locktag, bool reportMemoryError, LOCALLOCK **locallockp) { - LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCKMETHODID lockmethodid; LockMethod lockMethodTable; LOCALLOCKTAG localtag; LOCALLOCK *locallock; @@ -809,6 +809,15 @@ LockAcquireExtended(const LOCKTAG *locktag, LWLock *partitionLock; bool found_conflict; bool log_lock = false; + bool no_log_lock = false; + + if (locktag->locktag_lockmethodid == NO_LOG_LOCKMETHOD) + { + ((LOCKTAG *)locktag)->locktag_lockmethodid = DEFAULT_LOCKMETHOD; + no_log_lock = true; + } + + lockmethodid = locktag->locktag_lockmethodid; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -923,7 +932,8 @@ LockAcquireExtended(const LOCKTAG *locktag, if (lockmode >= AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION && !RecoveryInProgress() && - XLogStandbyInfoActive()) + XLogStandbyInfoActive() && + !no_log_lock) { LogAccessExclusiveLockPrepare(); log_lock = true; diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index f67056a82b5..f4b02c12261 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -124,6 +124,7 @@ typedef uint16 LOCKMETHODID; /* These identify the known lock methods */ #define DEFAULT_LOCKMETHOD 1 #define USER_LOCKMETHOD 2 +#define NO_LOG_LOCKMETHOD 255 /* Skip logging of AccessExclusiveLock */ /* * LOCKTAG is the key information needed to look up a LOCK item in the From 104a6c337cdb9ebc4215711e26d91b9daaa057d2 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:08:21 +0300 Subject: [PATCH 13/45] Add convenience functions IsFatalError() have_backup_in_progress() SnapBuildNextPhaseAt() DoLocalLockExist() --- src/backend/access/transam/xlog.c | 13 +++++++++++++ src/backend/postmaster/postmaster.c | 6 ++++++ src/backend/replication/logical/snapbuild.c | 11 +++++++++++ src/backend/storage/lmgr/lock.c | 21 
+++++++++++++++++++++ src/include/access/xlog.h | 1 + src/include/postmaster/postmaster.h | 1 + src/include/replication/snapbuild.h | 1 + src/include/storage/lock.h | 1 + 8 files changed, 55 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 38f48cf287b..c2820f29e7c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8598,6 +8598,19 @@ get_backup_status(void) return sessionBackupState; } +/* + * Check if there is a backup in progress. + * + * We do this check without lock assuming 32-bit reads are atomic. In fact, + * the false result means that there was at least a moment of time when there + * were no backups. + */ +bool +have_backup_in_progress(void) +{ + return (XLogCtl->Insert.runningBackups > 0); +} + /* * do_pg_backup_stop * diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b42aae41fce..f6e1e65d22b 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -579,6 +579,12 @@ int postmaster_alive_fds[2] = {-1, -1}; HANDLE PostmasterHandle; #endif +bool +IsFatalError(void) +{ + return FatalError; +} + /* * Postmaster main entry point */ diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3ed2f79dd06..5b92c5542be 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -404,6 +404,17 @@ SnapBuildCurrentState(SnapBuild *builder) return builder->state; } +/* + * At which transaction id will the next phase of initial snapshot building + * happen? + */ +TransactionId +SnapBuildNextPhaseAt(SnapBuild *builder) +{ + return builder->next_phase_at; +} + + /* * Return the LSN at which the two-phase decoding was first enabled. 
*/ diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index f9231b10dd3..d08a0c21c70 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -648,6 +648,27 @@ GetLockMethodLocalHash(void) } #endif +/* + * Returns true if any LOCKMODE lock with the given locktag exists in LockMethodLocalHash. + */ +bool +DoLocalLockExist(const LOCKTAG *locktag) +{ + HASH_SEQ_STATUS scan_status; + LOCALLOCK* locallock; + + hash_seq_init(&scan_status, LockMethodLocalHash); + while ((locallock = (LOCALLOCK *) hash_seq_search(&scan_status)) != NULL) + { + if (memcmp(&locallock->tag.lock, locktag, sizeof(LOCKTAG)) == 0) + { + hash_seq_term(&scan_status); + return true; + } + } + return false; +} + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 3a4cb5eb0fa..b3b2191e733 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -285,6 +285,7 @@ extern void do_pg_backup_start(const char *backupidstr, bool fast, StringInfo tblspcmapfile); extern void do_pg_backup_stop(BackupState *state, bool waitforarchive); extern void do_pg_abort_backup(int code, Datum arg); +extern bool have_backup_in_progress(void); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 3b3889c58c0..ddc409ec240 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -50,6 +50,7 @@ extern PGDLLIMPORT int postmaster_alive_fds[2]; extern PGDLLIMPORT const char *progname; +extern bool IsFatalError(void); extern void PostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); extern void ClosePostmasterPorts(bool am_syslogger); extern void InitProcessGlobals(void); diff --git a/src/include/replication/snapbuild.h 
b/src/include/replication/snapbuild.h index f49b941b53e..071fca6d3b5 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -73,6 +73,7 @@ extern void SnapBuildClearExportedSnapshot(void); extern void SnapBuildResetExportedSnapshotState(void); extern SnapBuildState SnapBuildCurrentState(SnapBuild *builder); +extern TransactionId SnapBuildNextPhaseAt(SnapBuild *builder); extern Snapshot SnapBuildGetOrBuildSnapshot(SnapBuild *builder); extern bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index f4b02c12261..d7b095b1464 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -551,6 +551,7 @@ extern LockMethod GetLocksMethodTable(const LOCK *lock); extern LockMethod GetLockTagsMethodTable(const LOCKTAG *locktag); extern uint32 LockTagHashCode(const LOCKTAG *locktag); extern bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2); +extern bool DoLocalLockExist(const LOCKTAG *locktag); extern LockAcquireResult LockAcquire(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, From 5e463755f6a89650dee34d93d0e909405f854ba9 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:11:14 +0300 Subject: [PATCH 14/45] PERFORM_DELETION_OF_RELATION flag for object hooks --- src/backend/catalog/dependency.c | 36 +++++++++++++++++++++++++++++++- src/include/catalog/dependency.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 494738824cb..8627810dc23 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -242,6 +242,7 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, int flags) { int i; + bool *depends_on_relation; /* * Keep track of objects for event triggers, if necessary. 
@@ -269,6 +270,33 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, } } + depends_on_relation = palloc0(sizeof(bool) * targetObjects->numrefs); + + for (i = targetObjects->numrefs - 1; i >= 0; i--) + { + ObjectAddressExtra *thisextra = targetObjects->extras + i; + int j; + + if (thisextra->dependee.classId == RelationRelationId && + thisextra->dependee.objectSubId == 0) + { + depends_on_relation[i] = true; + continue; + } + + for (j = i + 1; j < targetObjects->numrefs; j++) + { + ObjectAddress *depobj = targetObjects->refs + j; + if (depobj->classId == thisextra->dependee.classId && + depobj->objectId == thisextra->dependee.objectId && + depobj->objectSubId == thisextra->dependee.objectSubId) + { + depends_on_relation[i] = depends_on_relation[j]; + break; + } + } + } + /* * Delete all the objects in the proper order, except that if told to, we * should skip the original object(s). @@ -277,13 +305,19 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, { ObjectAddress *thisobj = targetObjects->refs + i; ObjectAddressExtra *thisextra = targetObjects->extras + i; + int temp_flags = flags; if ((flags & PERFORM_DELETION_SKIP_ORIGINAL) && (thisextra->flags & DEPFLAG_ORIGINAL)) continue; - deleteOneObject(thisobj, depRel, flags); + if (depends_on_relation[i]) + temp_flags |= PERFORM_DELETION_OF_RELATION; + + deleteOneObject(thisobj, depRel, temp_flags); } + + pfree(depends_on_relation); } /* diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index ffd5e9dc82d..4cf8df01077 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -140,6 +140,8 @@ typedef enum ObjectClass #define PERFORM_DELETION_SKIP_EXTENSIONS 0x0010 /* keep extensions */ #define PERFORM_DELETION_CONCURRENT_LOCK 0x0020 /* normal drop with * concurrent lock mode */ +#define PERFORM_DELETION_OF_RELATION 0x0040 /* used for orioledb + * extension */ /* in dependency.c */ From 
b2cab2180a5af620997bfb858efb0a0764a7b0eb Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:20:32 +0300 Subject: [PATCH 15/45] Expose existing planning funcs and structs --- src/backend/catalog/index.c | 5 +---- src/backend/commands/explain.c | 14 +++----------- src/backend/commands/indexcmds.c | 8 ++------ src/backend/optimizer/path/indxpath.c | 13 +------------ src/backend/optimizer/plan/createplan.c | 16 ++++++++++------ src/include/catalog/index.h | 2 ++ src/include/commands/defrem.h | 4 ++++ src/include/commands/explain.h | 8 ++++++++ src/include/optimizer/paths.h | 12 ++++++++++++ src/include/optimizer/planmain.h | 5 +++++ 10 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 6f1910a6e0f..db68393483c 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -125,9 +125,6 @@ static void UpdateIndexRelation(Oid indexoid, Oid heapoid, bool immediate, bool isvalid, bool isready); -static void index_update_stats(Relation rel, - bool hasindex, - double reltuples); static void IndexCheckExclusion(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo); @@ -2807,7 +2804,7 @@ FormIndexDatum(IndexInfo *indexInfo, * index. When updating an index, it's important because some index AMs * expect a relcache flush to occur after REINDEX. 
*/ -static void +void index_update_stats(Relation rel, bool hasindex, double reltuples) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 6c2e5c8a4f9..b3421e6e5a8 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -64,9 +64,6 @@ static void report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es); static double elapsed_time(instr_time *starttime); static bool ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used); -static void ExplainNode(PlanState *planstate, List *ancestors, - const char *relationship, const char *plan_name, - ExplainState *es); static void show_plan_tlist(PlanState *planstate, List *ancestors, ExplainState *es); static void show_expression(Node *node, const char *qlabel, @@ -75,9 +72,6 @@ static void show_expression(Node *node, const char *qlabel, static void show_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, bool useprefix, ExplainState *es); -static void show_scan_qual(List *qual, const char *qlabel, - PlanState *planstate, List *ancestors, - ExplainState *es); static void show_upper_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es); @@ -114,8 +108,6 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); -static void show_instrumentation_count(const char *qlabel, int which, - PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static void show_eval_params(Bitmapset *bms_params, ExplainState *es); static const char *explain_get_index_name(Oid indexId); @@ -1174,7 +1166,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) * to the nesting depth of logical output groups, and therefore is controlled * by 
ExplainOpenGroup/ExplainCloseGroup. */ -static void +void ExplainNode(PlanState *planstate, List *ancestors, const char *relationship, const char *plan_name, ExplainState *es) @@ -2346,7 +2338,7 @@ show_qual(List *qual, const char *qlabel, /* * Show a qualifier expression for a scan plan node */ -static void +void show_scan_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es) @@ -3437,7 +3429,7 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) * * "which" identifies which instrumentation counter to print */ -static void +void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es) { diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index e0e94c6b486..831534ea6c6 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -88,11 +88,7 @@ static void ComputeIndexAttrs(IndexInfo *indexInfo, Oid ddl_userid, int ddl_sec_context, int *ddl_save_nestlevel); -static char *ChooseIndexName(const char *tabname, Oid namespaceId, - List *colnames, List *exclusionOpNames, - bool primary, bool isconstraint); static char *ChooseIndexNameAddition(List *colnames); -static List *ChooseIndexColumnNames(List *indexElems); static void ReindexIndex(RangeVar *indexRelation, ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -2503,7 +2499,7 @@ ChooseRelationName(const char *name1, const char *name2, * * The argument list is pretty ad-hoc :-( */ -static char * +char * ChooseIndexName(const char *tabname, Oid namespaceId, List *colnames, List *exclusionOpNames, bool primary, bool isconstraint) @@ -2592,7 +2588,7 @@ ChooseIndexNameAddition(List *colnames) * * Returns a List of plain strings (char *, not String nodes). 
*/ -static List * +List * ChooseIndexColumnNames(List *indexElems) { List *result = NIL; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 0065c8992bd..bf4968e348b 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -48,14 +48,6 @@ typedef enum ST_ANYSCAN /* either is okay */ } ScanTypeControl; -/* Data structure for collecting qual clauses that match an index */ -typedef struct -{ - bool nonempty; /* True if lists are not all empty */ - /* Lists of IndexClause nodes, one list per index column */ - List *indexclauses[INDEX_MAX_KEYS]; -} IndexClauseSet; - /* Per-path data used within choose_bitmap_and() */ typedef struct { @@ -130,9 +122,6 @@ static double adjust_rowcount_for_semijoins(PlannerInfo *root, Index outer_relid, double rowcount); static double approximate_joinrel_size(PlannerInfo *root, Relids relids); -static void match_restriction_clauses_to_index(PlannerInfo *root, - IndexOptInfo *index, - IndexClauseSet *clauseset); static void match_join_clauses_to_index(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauseset, @@ -2012,7 +2001,7 @@ approximate_joinrel_size(PlannerInfo *root, Relids relids) * Identify restriction clauses for the rel that match the index. * Matching clauses are added to *clauseset. 
*/ -static void +void match_restriction_clauses_to_index(PlannerInfo *root, IndexOptInfo *index, IndexClauseSet *clauseset) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 974c50b29f9..48f251738e2 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -164,16 +164,12 @@ static MergeJoin *create_mergejoin_plan(PlannerInfo *root, MergePath *best_path) static HashJoin *create_hashjoin_plan(PlannerInfo *root, HashPath *best_path); static Node *replace_nestloop_params(PlannerInfo *root, Node *expr); static Node *replace_nestloop_params_mutator(Node *node, PlannerInfo *root); -static void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, - List **stripped_indexquals_p, - List **fixed_indexquals_p); static List *fix_indexorderby_references(PlannerInfo *root, IndexPath *index_path); static Node *fix_indexqual_clause(PlannerInfo *root, IndexOptInfo *index, int indexcol, Node *clause, List *indexcolnos); static Node *fix_indexqual_operand(Node *node, IndexOptInfo *index, int indexcol); static List *get_switched_clauses(List *clauses, Relids outerrelids); -static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_generic_path_info(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static void label_sort_with_costsize(PlannerInfo *root, Sort *plan, @@ -4897,6 +4893,14 @@ replace_nestloop_params(PlannerInfo *root, Node *expr) return replace_nestloop_params_mutator(expr, root); } +Node * +replace_nestloop_params_compat(PlannerInfo *root, Node *expr) +{ + /* No setup needed for tree walk, so away we go */ + return replace_nestloop_params_mutator(expr, root); +} + + static Node * replace_nestloop_params_mutator(Node *node, PlannerInfo *root) { @@ -4977,7 +4981,7 @@ replace_nestloop_params_mutator(Node *node, PlannerInfo *root) * are subplans in it (we need two separate copies of the subplan tree, or * things will 
go awry). */ -static void +void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, List **stripped_indexquals_p, List **fixed_indexquals_p) { @@ -5270,7 +5274,7 @@ get_switched_clauses(List *clauses, Relids outerrelids) * instead of bare clauses. This is another reason why trying to consider * selectivity in the ordering would likely do the wrong thing. */ -static List * +List * order_qual_clauses(PlannerInfo *root, List *clauses) { typedef struct diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index c8532fb97c8..3fa15391d83 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -211,4 +211,6 @@ itemptr_decode(ItemPointer itemptr, int64 encoded) ItemPointerSet(itemptr, block, offset); } +extern void index_update_stats(Relation rel, bool hasindex, double reltuples); + #endif /* INDEX_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index e1764c06adf..b8b4dbfc2a3 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -41,6 +41,10 @@ extern char *makeObjectName(const char *name1, const char *name2, extern char *ChooseRelationName(const char *name1, const char *name2, const char *label, Oid namespaceid, bool isconstraint); +extern List *ChooseIndexColumnNames(List *indexElems); +extern char *ChooseIndexName(const char *tabname, Oid namespaceId, + List *colnames, List *exclusionOpNames, + bool primary, bool isconstraint); extern bool CheckIndexCompatible(Oid oldId, const char *accessMethodName, List *attributeList, diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 3d3e632a0cc..ae8b2b63de9 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -93,6 +93,14 @@ extern void ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ParamListInfo params, QueryEnvironment *queryEnv, const instr_time *planduration, const BufferUsage *bufusage); +extern void ExplainNode(PlanState *planstate, List 
*ancestors, + const char *relationship, const char *plan_name, + ExplainState *es); +extern void show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +extern void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); extern void ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc); extern void ExplainPrintTriggers(ExplainState *es, QueryDesc *queryDesc); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 50bc3b503a6..80abe2467c0 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -67,6 +67,14 @@ extern void generate_partitionwise_join_paths(PlannerInfo *root, extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); #endif +/* Data structure for collecting qual clauses that match an index */ +typedef struct +{ + bool nonempty; /* True if lists are not all empty */ + /* Lists of IndexClause nodes, one list per index column */ + List *indexclauses[INDEX_MAX_KEYS]; +} IndexClauseSet; + /* * indxpath.c * routines to generate index paths @@ -82,6 +90,10 @@ extern bool match_index_to_operand(Node *operand, int indexcol, IndexOptInfo *index); extern void check_index_predicates(PlannerInfo *root, RelOptInfo *rel); +extern void match_restriction_clauses_to_index(PlannerInfo *root, + IndexOptInfo *index, + IndexClauseSet *clauseset); + /* * tidpath.h * routines to generate tid paths diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 31c188176b7..8ec52018173 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -39,6 +39,11 @@ extern void preprocess_minmax_aggregates(PlannerInfo *root); * prototypes for plan/createplan.c */ extern Plan *create_plan(PlannerInfo *root, Path *best_path); +extern List *order_qual_clauses(PlannerInfo *root, List *clauses); +extern void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, + List 
**stripped_indexquals_p, + List **fixed_indexquals_p); +extern Node *replace_nestloop_params_compat(PlannerInfo *root, Node *expr); extern ForeignScan *make_foreignscan(List *qptlist, List *qpqual, Index scanrelid, List *fdw_exprs, List *fdw_private, List *fdw_scan_tlist, List *fdw_recheck_quals, From 5496ad4e7c1c83fc23f4b94caf572c3636714879 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:22:17 +0300 Subject: [PATCH 16/45] Allow locks in checkpointer --- src/backend/postmaster/checkpointer.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 26a06c6b084..7cd4552c526 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -53,11 +53,20 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/shmem.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" +#include "utils/syscache.h" + +/* + * Included for InitializeTimeouts and RegisterTimeout functions that + * needed for correct working of OrioleDB checkpoint. + * See comment for InitializeTimeouts call in CheckpointerMain for details. + */ +#include "utils/timeout.h" /*---------- @@ -207,6 +216,20 @@ CheckpointerMain(void) */ pqsignal(SIGCHLD, SIG_DFL); + /* + * To use OrioleDB checkpoint, we must initialize the data for the primary + * lock mechanism (lock.h) to work correctly. Because locks of this type are + * needed by the OrioleDB module for debug events and relation locks, but + * they are not used by the postgres checkpointer and are not initialized + * for it. 
+ */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + InitDeadLockChecking(); + RegisterTimeout(DEADLOCK_TIMEOUT, CheckDeadLockAlert); + RelationCacheInitialize(); + InitCatalogCache(); + SharedInvalBackendInit(false); + /* * Initialize so that first time-driven event happens at the correct time. From 503a751605f8b8aa7b660412f7e44f5d550ac259 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:14:57 +0300 Subject: [PATCH 17/45] Add base_init_startup_hook and HandleStartupProcInterrupts_hook --- src/backend/postmaster/startup.c | 5 +++++ src/backend/utils/init/postinit.c | 5 ++++- src/include/postmaster/postmaster.h | 4 ++++ src/include/postmaster/startup.h | 3 +++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 0e7de26bc28..ce79e4f8f43 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -79,6 +79,8 @@ static volatile sig_atomic_t startup_progress_timer_expired = false; */ int log_startup_progress_interval = 10000; /* 10 sec */ +HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook = NULL; + /* Signal handlers */ static void StartupProcTriggerHandler(SIGNAL_ARGS); static void StartupProcSigHupHandler(SIGNAL_ARGS); @@ -186,6 +188,9 @@ HandleStartupProcInterrupts(void) static uint32 postmaster_poll_count = 0; #endif + if (HandleStartupProcInterrupts_hook) + HandleStartupProcInterrupts_hook(); + /* * Process any requests or signals received recently. 
*/ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index df4d15a50fb..7b89c11feb6 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -81,7 +81,7 @@ static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); - +base_init_startup_hook_type base_init_startup_hook = NULL; /*** InitPostgres support ***/ @@ -641,6 +641,9 @@ BaseInit(void) */ InitFileAccess(); + if (base_init_startup_hook) + base_init_startup_hook(); + /* * Initialize statistics reporting. This needs to happen early to ensure * that pgstat's shutdown callback runs after the shutdown callbacks of diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index ddc409ec240..3d1da495915 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -59,6 +59,10 @@ extern int MaxLivePostmasterChildren(void); extern bool PostmasterMarkPIDForWorkerNotify(int); +typedef void (*base_init_startup_hook_type)(void); + +extern PGDLLIMPORT base_init_startup_hook_type base_init_startup_hook; + #ifdef EXEC_BACKEND extern pid_t postmaster_forkexec(int argc, char *argv[]); extern void SubPostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h index 6a2e4c4526b..95eb25f9f4d 100644 --- a/src/include/postmaster/startup.h +++ b/src/include/postmaster/startup.h @@ -23,7 +23,10 @@ ereport(LOG, errmsg(msg, secs, (usecs / 10000), __VA_ARGS__ )); \ } while(0) +typedef void (*HandleStartupProcInterrupts_hook_type)(void); + extern PGDLLIMPORT int log_startup_progress_interval; +extern PGDLLIMPORT HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook; extern void HandleStartupProcInterrupts(void); extern void StartupProcessMain(void) 
pg_attribute_noreturn(); From ff119e577b5e48f0da1a8e4d0339f8fc154d11da Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:17:36 +0300 Subject: [PATCH 18/45] Don't cancel recovery processes because of deadlocks --- src/backend/storage/lmgr/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 6cf4cf33242..905fccd673d 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1192,7 +1192,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) * If InHotStandby we set lock waits slightly later for clarity with other * code. */ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { @@ -1552,7 +1552,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) * already caused QueryCancelPending to become set, we want the cancel to * be reported as a lock timeout, not a user cancel. */ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { From 447bc97c2ef9dc47a25e006b5e40cd6dbd515572 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:42:18 +0300 Subject: [PATCH 19/45] set_plain_rel_pathlist_hook --- src/backend/optimizer/path/allpaths.c | 7 +++++-- src/include/optimizer/paths.h | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index f75e0f99cb9..c62a407f4ca 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -86,6 +86,7 @@ int min_parallel_index_scan_size; /* Hook for plugins to get control in set_rel_pathlist() */ set_rel_pathlist_hook_type set_rel_pathlist_hook = NULL; +set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook = NULL; /* Hook for plugins to replace standard_join_search() */ join_search_hook_type join_search_hook = NULL; @@ -775,8 +776,10 @@ 
set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) */ required_outer = rel->lateral_relids; - /* Consider sequential scan */ - add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); + if (!set_plain_rel_pathlist_hook || + set_plain_rel_pathlist_hook(root, rel, rte)) + /* Consider sequential scan */ + add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); /* If appropriate, consider parallel sequential scan */ if (rel->consider_parallel && required_outer == NULL) diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 80abe2467c0..6e4b570fce0 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -31,6 +31,10 @@ typedef void (*set_rel_pathlist_hook_type) (PlannerInfo *root, Index rti, RangeTblEntry *rte); extern PGDLLIMPORT set_rel_pathlist_hook_type set_rel_pathlist_hook; +typedef bool (*set_plain_rel_pathlist_hook_type)(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +extern PGDLLIMPORT set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook; /* Hook for plugins to get control in add_paths_to_joinrel() */ typedef void (*set_join_pathlist_hook_type) (PlannerInfo *root, From 8c4540c6da0f0ac2129a4baa68422028f1bb7110 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 14:17:57 +0300 Subject: [PATCH 20/45] Let locker tolerate being removed from the waiting queue without obtaining a lock. 
--- src/backend/storage/lmgr/lock.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index d08a0c21c70..245e15f0cc5 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -1165,12 +1165,35 @@ LockAcquireExtended(const LOCKTAG *locktag, */ if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) { + int i; + AbortStrongLockAcquire(); PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock); LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode); /* Should we retry ? */ LWLockRelease(partitionLock); - elog(ERROR, "LockAcquire failed"); + /* + * We've been removed from the queue without obtaining a lock. + * That's OK, we're going to return LOCKACQUIRE_NOT_AVAIL, but + * need to release a local lock first. + */ + locallock->nLocks--; + for (i = 0; i < locallock->numLockOwners; i++) + { + if (locallock->lockOwners[i].owner == owner) + { + locallock->lockOwners[i].nLocks--; + if (locallock->lockOwners[i].nLocks == 0) + { + ResourceOwnerForgetLock(owner, locallock); + locallock->lockOwners[i] = locallock->lockOwners[--locallock->numLockOwners]; + } + break; + } + } + + return LOCKACQUIRE_NOT_AVAIL; + } PROCLOCK_PRINT("LockAcquire: granted", proclock); LOCK_PRINT("LockAcquire: granted", lock, lockmode); @@ -4672,8 +4695,8 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) LWLockRelease(&proc->fpInfoLock); /* Time to wait. 
*/ - (void) LockAcquire(&tag, ShareLock, false, false); - + if (LockAcquire(&tag, ShareLock, false, false) == LOCKACQUIRE_NOT_AVAIL) + return false; LockRelease(&tag, ShareLock, false); return XactLockForVirtualXact(vxid, xid, wait); } From 62984a8262ad5654c415c0f85607f2d0edd45562 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 17 Feb 2022 07:46:49 +0300 Subject: [PATCH 21/45] Count extension wait events in pg_isolation_test_session_is_blocked() --- src/backend/utils/adt/lockfuncs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index f9b9590997b..6f7bcc4394c 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -18,8 +18,11 @@ #include "funcapi.h" #include "miscadmin.h" #include "storage/predicate_internals.h" +#include "storage/proc.h" +#include "storage/procarray.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/wait_event.h" /* @@ -614,6 +617,7 @@ pg_safe_snapshot_blocking_pids(PG_FUNCTION_ARGS) Datum pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) { + PGPROC *blocked_proc; int blocked_pid = PG_GETARG_INT32(0); ArrayType *interesting_pids_a = PG_GETARG_ARRAYTYPE_P(1); ArrayType *blocking_pids_a; @@ -674,6 +678,10 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) if (GetSafeSnapshotBlockingPids(blocked_pid, &dummy, 1) > 0) PG_RETURN_BOOL(true); + blocked_proc = BackendPidGetProc(blocked_pid); + if ((blocked_proc->wait_event_info & 0xFF000000) == PG_WAIT_EXTENSION) + PG_RETURN_BOOL(true); + PG_RETURN_BOOL(false); } From 818f0d4c71c63fb6b8a3b82d6fe96dce3a630ef0 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 24 Feb 2022 03:19:39 +0300 Subject: [PATCH 22/45] Support for custom table AM in pgbench --- src/bin/pgbench/pgbench.c | 45 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c 
index c1134eae5b5..70db51c6bec 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -220,6 +220,11 @@ double throttle_delay = 0; */ int64 latency_limit = 0; +/* + * tableam selection + */ +char *tableam = NULL; + /* * tablespace selection */ @@ -890,6 +895,7 @@ usage(void) " --partition-method=(range|hash)\n" " partition pgbench_accounts with this method (default: range)\n" " --partitions=NUM partition pgbench_accounts into NUM parts (default: 0)\n" + " --tableam=TABLEAM create tables using the specified tableam\n" " --tablespace=TABLESPACE create tables in the specified tablespace\n" " --unlogged-tables create tables as unlogged tables\n" "\nOptions to select what to run:\n" @@ -4749,14 +4755,34 @@ createPartitions(PGconn *con) appendPQExpBufferStr(&query, "maxvalue"); appendPQExpBufferChar(&query, ')'); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } } else if (partition_method == PART_HASH) + { printfPQExpBuffer(&query, "create%s table pgbench_accounts_%d\n" " partition of pgbench_accounts\n" " for values with (modulus %d, remainder %d)", unlogged_tables ? 
" unlogged" : "", p, partitions, p - 1); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + } else /* cannot get there */ Assert(0); @@ -4843,10 +4869,20 @@ initCreateTables(PGconn *con) if (partition_method != PART_NONE && strcmp(ddl->table, "pgbench_accounts") == 0) appendPQExpBuffer(&query, " partition by %s (aid)", PARTITION_METHOD[partition_method]); - else if (ddl->declare_fillfactor) + else { + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + /* fillfactor is only expected on actual tables */ - appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); + if (ddl->declare_fillfactor) + appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); } if (tablespace != NULL) @@ -6602,6 +6638,7 @@ main(int argc, char **argv) {"failures-detailed", no_argument, NULL, 13}, {"max-tries", required_argument, NULL, 14}, {"verbose-errors", no_argument, NULL, 15}, + {"tableam", required_argument, NULL, 16}, {NULL, 0, NULL, 0} }; @@ -6939,6 +6976,10 @@ main(int argc, char **argv) benchmarking_option_set = true; verbose_errors = true; break; + case 16: /* tableam */ + initialization_option_set = true; + tableam = pg_strdup(optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); From c165ad890326ec93f53c27d6878caf6aea582c5f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 2 Mar 2022 14:49:29 +0300 Subject: [PATCH 23/45] Support for outline atomics on aarch64 Outline-atomics is a gcc compilation flag that enables runtime detection of CPU support for atomic instructions. 
Performance on CPUs that do support atomic instructions is improved, while compatibility and performance on CPUs without atomic instructions is not hurt. Discussion: https://postgr.es/m/flat/099F69EE-51D3-4214-934A-1F28C0A1A7A7%40amazon.com Author: Tsahi Zidenberg --- configure | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 4 +++ 2 files changed, 97 insertions(+) diff --git a/configure b/configure index 71155f46e0d..da2c2e2df5e 100755 --- a/configure +++ b/configure @@ -6657,6 +6657,99 @@ fi if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -moutline-atomics, for CFLAGS" >&5 +$as_echo_n "checking whether ${CC} supports -moutline-atomics, for CFLAGS... " >&6; } +if ${pgac_cv_prog_CC_cflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CC} +CFLAGS="${CFLAGS} -moutline-atomics" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CC_cflags__moutline_atomics=yes +else + pgac_cv_prog_CC_cflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CC_cflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CC_cflags__moutline_atomics" = x"yes"; then + CFLAGS="${CFLAGS} -moutline-atomics" +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS" >&5 +$as_echo_n "checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS... 
" >&6; } +if ${pgac_cv_prog_CXX_cxxflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CXXFLAGS=$CXXFLAGS +pgac_save_CXX=$CXX +CXX=${CXX} +CXXFLAGS="${CXXFLAGS} -moutline-atomics" +ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + pgac_cv_prog_CXX_cxxflags__moutline_atomics=yes +else + pgac_cv_prog_CXX_cxxflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag +CXXFLAGS="$pgac_save_CXXFLAGS" +CXX="$pgac_save_CXX" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CXX_cxxflags__moutline_atomics" = x"yes"; then + CXXFLAGS="${CXXFLAGS} -moutline-atomics" +fi + + + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. 
diff --git a/configure.ac b/configure.ac index bf0cfdf2ba9..628c6f90c7d 100644 --- a/configure.ac +++ b/configure.ac @@ -579,6 +579,10 @@ if test "$GCC" = yes -a "$ICC" = no; then if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + PGAC_PROG_CC_CFLAGS_OPT([-moutline-atomics]) + PGAC_PROG_CXX_CFLAGS_OPT([-moutline-atomics]) + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. From 6f9d6e53b9d033a11871acbe97af02f8e6b48901 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 17 Feb 2022 08:43:32 +0300 Subject: [PATCH 24/45] OrioleDB specific CI --- .github/workflows/build.yml | 31 +++++++++++++++++++++++++++++++ ci/build.sh | 21 +++++++++++++++++++++ ci/check.sh | 11 +++++++++++ ci/check_output.sh | 30 ++++++++++++++++++++++++++++++ ci/prerequisites.sh | 22 ++++++++++++++++++++++ configure | 5 +++++ configure.ac | 4 ++++ meson.build | 1 + src/Makefile.global.in | 3 +++ src/bin/pg_rewind/meson.build | 6 ++++++ src/makefiles/meson.build | 1 + 11 files changed, 135 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 ci/build.sh create mode 100644 ci/check.sh create mode 100644 ci/check_output.sh create mode 100644 ci/prerequisites.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000000..c6f1bef64aa --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,31 @@ +name: build + +on: + push: + pull_request: + +jobs: + test: + runs-on: + - ubuntu-20.04 + strategy: + fail-fast: false + matrix: + compiler: [clang, gcc] + check_type: [normal, debug] + env: + LLVM_VER: 10 + COMPILER: ${{ matrix.compiler }} + CHECK_TYPE: ${{ matrix.check_type }} + steps: + - name: Checkout code into workspace directory + uses: actions/checkout@v2 + - name: Setup prerequisites + run: bash ./ci/prerequisites.sh + - 
name: Build + run: bash ./ci/build.sh + - name: Check + run: bash ./ci/check.sh + - name: Check output + run: bash ./ci/check_output.sh + if: ${{ success() || failure() }} diff --git a/ci/build.sh b/ci/build.sh new file mode 100644 index 00000000000..f541929e69c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eu + +if [ $COMPILER = "clang" ]; then + export CC=clang-$LLVM_VER +else + export CC=gcc +fi + +# configure & build +if [ $CHECK_TYPE = "debug" ]; then + CFLAGS="-O0" ./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu +else + ./configure --disable-debug --disable-cassert --enable-tap-tests --with-icu +fi + +make -sj4 +cd contrib +make -sj4 +cd .. diff --git a/ci/check.sh b/ci/check.sh new file mode 100644 index 00000000000..faa8c25e84a --- /dev/null +++ b/ci/check.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -eu + +# unsets limit for coredumps size +ulimit -c unlimited -S +# sets a coredump file pattern +mkdir -p /tmp/cores-$GITHUB_SHA-$TIMESTAMP +sudo sh -c "echo \"/tmp/cores-$GITHUB_SHA-$TIMESTAMP/%t_%p_%s.core\" > /proc/sys/kernel/core_pattern" + +make check-world -j4 diff --git a/ci/check_output.sh b/ci/check_output.sh new file mode 100644 index 00000000000..ae26cf63d68 --- /dev/null +++ b/ci/check_output.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -eu + +status=0 + +# show diff if it exists +for f in ` find . 
-name regression.diffs ` ; do + echo "========= Contents of $f" + cat $f + status=1 +done + +# check core dumps if any +cores=$(find /tmp/cores-$GITHUB_SHA-$TIMESTAMP/ -name '*.core' 2>/dev/null) + +if [ -n "$cores" ]; then + for corefile in $cores ; do + if [[ $corefile != *_3.core ]]; then + binary=$(gdb -quiet -core $corefile -batch -ex 'info auxv' | grep AT_EXECFN | perl -pe "s/^.*\"(.*)\"\$/\$1/g") + echo dumping $corefile for $binary + gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" $binary $corefile + status=1 + fi + done +fi + +rm -rf /tmp/cores-$GITHUB_SHA-$TIMESTAMP + +exit $status diff --git a/ci/prerequisites.sh b/ci/prerequisites.sh new file mode 100644 index 00000000000..b26251b711c --- /dev/null +++ b/ci/prerequisites.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu + +# print the hostname to be able to identify runner by logs +echo "HOSTNAME=`hostname`" +TIMESTAMP=$(date +%s) +echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_ENV +echo "TIMESTAMP=$TIMESTAMP" + +sudo apt-get -y install -qq wget ca-certificates + +sudo apt-get update -qq + +apt_packages="build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3 python3-dev python3-pip python3-setuptools python3-testresources" + +if [ $COMPILER = "clang" ]; then + apt_packages="$apt_packages llvm-$LLVM_VER clang-$LLVM_VER clang-tools-$LLVM_VER" +fi + +# install required packages +sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $apt_packages diff --git a/configure b/configure index da2c2e2df5e..863126591c2 100755 --- a/configure +++ b/configure @@ -628,6 +628,7 @@ ac_includes_default="\ ac_subst_vars='LTLIBOBJS vpath_build PG_SYSROOT +ORIOLEDB_PATCHSET_VERSION PG_VERSION_NUM LDFLAGS_EX_BE PROVE @@ -19479,6 +19480,10 @@ _ACEOF +# Needed to check postgresql patches git tag during orioledb extension build +ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2` + + # If we are inserting PG_SYSROOT into 
CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/configure.ac b/configure.ac index 628c6f90c7d..15db5845779 100644 --- a/configure.ac +++ b/configure.ac @@ -2418,6 +2418,10 @@ $AWK '{printf "%d%04d", $1, $2}'`"] AC_DEFINE_UNQUOTED(PG_VERSION_NUM, $PG_VERSION_NUM, [PostgreSQL version as a number]) AC_SUBST(PG_VERSION_NUM) +# Needed to check postgresql patches git tag during orioledb extension build +[ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2`] +AC_SUBST(ORIOLEDB_PATCHSET_VERSION) + # If we are inserting PG_SYSROOT into CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/meson.build b/meson.build index 56454cc3395..9655febad1d 100644 --- a/meson.build +++ b/meson.build @@ -153,6 +153,7 @@ cdata.set('PG_VERSION_NUM', pg_version_num) # PG_VERSION_STR is built later, it depends on compiler test results cdata.set_quoted('CONFIGURE_ARGS', '') +orioledb_patchset_version = '22' ############################################################### diff --git a/src/Makefile.global.in b/src/Makefile.global.in index cc4dc6de91e..ccae8c39d87 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -42,6 +42,9 @@ VERSION_NUM = @PG_VERSION_NUM@ PACKAGE_URL = @PACKAGE_URL@ +# OrioleDB patchset git tag number +ORIOLEDB_PATCHSET_VERSION = @ORIOLEDB_PATCHSET_VERSION@ + # Set top_srcdir, srcdir, and VPATH. 
ifdef PGXS top_srcdir = $(top_builddir) diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index fd22818be4d..36e9a4766f3 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -2,6 +2,7 @@ pg_rewind_sources = files( 'datapagemap.c', + 'extension.c', 'file_ops.c', 'filemap.c', 'libpq_source.c', @@ -23,6 +24,7 @@ pg_rewind = executable('pg_rewind', pg_rewind_sources, dependencies: [frontend_code, libpq, lz4, zstd], c_args: ['-DFRONTEND'], # needed for xlogreader et al + export_dynamic: true, kwargs: default_bin_args, ) bin_targets += pg_rewind @@ -48,3 +50,7 @@ tests += { } subdir('po', if_found: libintl) + +install_headers( + 'pg_rewind_ext.h' +) \ No newline at end of file diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 13045cbd6e4..16ce1650e2e 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -37,6 +37,7 @@ pgxs_kv = { 'PACKAGE_VERSION': pg_version, 'PG_MAJORVERSION': pg_version_major, 'PG_VERSION_NUM': pg_version_num, + 'ORIOLEDB_PATCHSET_VERSION': orioledb_patchset_version, 'configure_input': 'meson', 'vpath_build': 'yes', From 7507685a0b80d60760aebec830f2a90d3dd2d4b4 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 9 Apr 2023 01:57:21 +0300 Subject: [PATCH 25/45] Close indices in AttachPartitionEnsureIndexes() before DefineIndex() --- src/backend/commands/tablecmds.c | 43 +++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 98f4602f449..49f71638bbe 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -18135,6 +18135,7 @@ static void AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) { List *idxes; + List *buildIdxes = NIL; List *attachRelIdxs; Relation *attachrelIdxRels; IndexInfo **attachInfos; @@ -18142,6 +18143,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) ListCell 
*cell; MemoryContext cxt; MemoryContext oldcxt; + AttrMap *attmap; cxt = AllocSetContextCreate(CurrentMemoryContext, "AttachPartitionEnsureIndexes", @@ -18192,6 +18194,10 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) goto out; } + attmap = build_attrmap_by_name(RelationGetDescr(attachrel), + RelationGetDescr(rel), + false); + /* * For each index on the partitioned table, find a matching one in the * partition-to-be; if one is not found, create one. @@ -18201,7 +18207,6 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) Oid idx = lfirst_oid(cell); Relation idxRel = index_open(idx, AccessShareLock); IndexInfo *info; - AttrMap *attmap; bool found = false; Oid constraintOid; @@ -18217,9 +18222,6 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) /* construct an indexinfo to compare existing indexes against */ info = BuildIndexInfo(idxRel); - attmap = build_attrmap_by_name(RelationGetDescr(attachrel), - RelationGetDescr(rel), - false); constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(rel), idx); /* @@ -18280,19 +18282,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) * now. */ if (!found) - { - IndexStmt *stmt; - Oid conOid; - - stmt = generateClonedIndexStmt(NULL, - idxRel, attmap, - &conOid); - DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, - RelationGetRelid(idxRel), - conOid, - -1, - true, false, false, false, false); - } + buildIdxes = lappend_oid(buildIdxes, RelationGetRelid(idxRel)); index_close(idxRel, AccessShareLock); } @@ -18301,6 +18291,25 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) /* Clean up. 
*/ for (i = 0; i < list_length(attachRelIdxs); i++) index_close(attachrelIdxRels[i], AccessShareLock); + + foreach(cell, buildIdxes) + { + Oid idx = lfirst_oid(cell); + Relation idxRel = index_open(idx, AccessShareLock); + IndexStmt *stmt; + Oid conOid; + + stmt = generateClonedIndexStmt(NULL, + idxRel, attmap, + &conOid); + DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, + RelationGetRelid(idxRel), + conOid, + -1, + true, false, false, false, false); + index_close(idxRel, AccessShareLock); + } + MemoryContextSwitchTo(oldcxt); MemoryContextDelete(cxt); } From 58b411aebf16d54e3a344ee05f952e94bf91ebc9 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Fri, 30 Jun 2023 01:35:54 +0300 Subject: [PATCH 26/45] New BGWORKER_CLASS_SYSTEM bgworkers class They are allowed to stay during shutdown checkpointing and help checkpointer do its work. --- src/backend/postmaster/postmaster.c | 39 +++++++++++++++++++++-------- src/include/postmaster/bgworker.h | 6 +++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index f6e1e65d22b..59c7d67302e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -145,7 +145,8 @@ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ -#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ +#define BACKEND_TYPE_SYSTEM_BGWORKER 0x0010 /* system bgworker process */ +#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ /* * List of active backends (or child processes anyway; we don't actually @@ -2472,8 +2473,9 @@ processCancelRequest(Port *port, void *pkt) /* * canAcceptConnections --- check to see if database state allows connections * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, - * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. 
(Note that we don't yet - * know whether a NORMAL connection might turn into a walsender.) + * BACKEND_TYPE_AUTOVAC, BACKEND_TYPE_BGWORKER or BACKEND_TYPE_SYSTEM_BGWORKER. + * (Note that we don't yet know whether a NORMAL connection might turn into + * a walsender.) */ static CAC_state canAcceptConnections(int backend_type) @@ -2487,7 +2489,8 @@ canAcceptConnections(int backend_type) * bgworker_should_start_now() decided whether the DB state allows them. */ if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && - backend_type != BACKEND_TYPE_BGWORKER) + backend_type != BACKEND_TYPE_BGWORKER && + backend_type != BACKEND_TYPE_SYSTEM_BGWORKER) { if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ @@ -3166,6 +3169,13 @@ process_pm_child_exit(void) if (PgArchPID != 0) signal_child(PgArchPID, SIGUSR2); + /* + * Terminate system background workers since checkpoint is + * complete. + */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_SYSTEM_BGWORKER); + /* * Waken walsenders for the last time. No regular backends * should be around anymore. @@ -3567,7 +3577,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) * Background workers were already processed above; ignore them * here. */ - if (bp->bkend_type == BACKEND_TYPE_BGWORKER) + if (bp->bkend_type == BACKEND_TYPE_BGWORKER || + bp->bkend_type == BACKEND_TYPE_SYSTEM_BGWORKER) continue; if (take_action) @@ -3746,7 +3757,7 @@ PostmasterStateMachine(void) /* Signal all backend children except walsenders */ SignalSomeChildren(SIGTERM, - BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER); /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); @@ -3784,7 +3795,7 @@ PostmasterStateMachine(void) * and archiver are also disregarded, they will be terminated later * after writing the checkpoint record. 
*/ - if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER) == 0 && StartupPID == 0 && WalReceiverPID == 0 && BgWriterPID == 0 && @@ -5794,16 +5805,20 @@ do_start_bgworker(RegisteredBgWorker *rw) * specified start_time? */ static bool -bgworker_should_start_now(BgWorkerStartTime start_time) +bgworker_should_start_now(BgWorkerStartTime start_time, int flags) { switch (pmState) { case PM_NO_CHILDREN: case PM_WAIT_DEAD_END: case PM_SHUTDOWN_2: + break; + case PM_SHUTDOWN: case PM_WAIT_BACKENDS: case PM_STOP_BACKENDS: + if (flags & BGWORKER_CLASS_SYSTEM) + return true; break; case PM_RUN: @@ -5878,7 +5893,10 @@ assign_backendlist_entry(RegisteredBgWorker *rw) bn->cancel_key = MyCancelKey; bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); - bn->bkend_type = BACKEND_TYPE_BGWORKER; + if (rw->rw_worker.bgw_flags & BGWORKER_CLASS_SYSTEM) + bn->bkend_type = BACKEND_TYPE_SYSTEM_BGWORKER; + else + bn->bkend_type = BACKEND_TYPE_BGWORKER; bn->dead_end = false; bn->bgworker_notify = false; @@ -5976,7 +5994,8 @@ maybe_start_bgworkers(void) } } - if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) + if (bgworker_should_start_now(rw->rw_worker.bgw_start_time, + rw->rw_worker.bgw_flags)) { /* reset crash time before trying to start worker */ rw->rw_crashed_at = 0; diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index 845d4498e65..e5af3247632 100644 --- a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -66,6 +66,12 @@ * background workers should not use this class. */ #define BGWORKER_CLASS_PARALLEL 0x0010 + +/* + * This class of bgworkers are allowed to stay working during shutdown + * checkpointing. 
+ */ +#define BGWORKER_CLASS_SYSTEM 0x0020 /* add additional bgworker classes here */ From 71907c6bed29821eab08781c9dcc28bd6891a5da Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 7 Sep 2023 21:33:03 +0200 Subject: [PATCH 27/45] Add pg_newlocale_from_collation_hook to perform stricter collation checks --- src/backend/utils/adt/pg_locale.c | 7 ++++++- src/include/utils/pg_locale.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 99a21f20b9f..d903a746b36 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -134,6 +134,7 @@ typedef struct static HTAB *collation_cache = NULL; +pg_newlocale_from_collation_hook_type pg_newlocale_from_collation_hook = NULL; #if defined(WIN32) && defined(LC_MESSAGES) static char *IsoLocaleName(const char *); @@ -1600,6 +1601,7 @@ pg_newlocale_from_collation(Oid collid) { char *actual_versionstr; char *collversionstr; + int level = WARNING; collversionstr = TextDatumGetCString(datum); @@ -1619,8 +1621,11 @@ pg_newlocale_from_collation(Oid collid) NameStr(collform->collname)))); } + if (pg_newlocale_from_collation_hook && pg_newlocale_from_collation_hook()) + level = ERROR; + if (strcmp(actual_versionstr, collversionstr) != 0) - ereport(WARNING, + ereport(level, (errmsg("collation \"%s\" has version mismatch", NameStr(collform->collname)), errdetail("The collation in the database was created using version %s, " diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index e2a72435427..2fd19a95cbb 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -102,6 +102,8 @@ extern void make_icu_collator(const char *iculocstr, extern bool pg_locale_deterministic(pg_locale_t locale); extern pg_locale_t pg_newlocale_from_collation(Oid collid); +typedef bool (*pg_newlocale_from_collation_hook_type)(); +extern pg_newlocale_from_collation_hook_type 
pg_newlocale_from_collation_hook; extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); From 31ee5092e884f405c2bae22c34d486e84189b3cb Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 12 Jul 2023 23:40:12 +0300 Subject: [PATCH 28/45] Archive preload callback --- src/backend/postmaster/pgarch.c | 16 ++++++++++++++++ src/include/archive/archive_module.h | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 46af3495644..93ce77683a4 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -670,6 +670,22 @@ pgarch_readyXlog(char *xlog) for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); + /* + * Preload the WAL files if the relevant callback is provided. + */ + if (ArchiveCallbacks->archive_preload_file_cb) + { + for (int i = 0; i < arch_files->arch_files_size; i++) + { + char *xlog1 = arch_files->arch_files[i]; + char pathname[MAXPGPATH]; + + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog1); + ArchiveCallbacks->archive_preload_file_cb(archive_module_state, + xlog1, pathname); + } + } + /* Return the highest priority file. 
*/ arch_files->arch_files_size--; strcpy(xlog, arch_files->arch_files[arch_files->arch_files_size]); diff --git a/src/include/archive/archive_module.h b/src/include/archive/archive_module.h index 679ce5a6dbd..2921c0a05f8 100644 --- a/src/include/archive/archive_module.h +++ b/src/include/archive/archive_module.h @@ -37,13 +37,17 @@ typedef struct ArchiveModuleState */ typedef void (*ArchiveStartupCB) (ArchiveModuleState *state); typedef bool (*ArchiveCheckConfiguredCB) (ArchiveModuleState *state); -typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, const char *file, const char *path); +typedef void (*ArchivePreloadFileCB) (ArchiveModuleState *state, + const char *file, const char *path); +typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, + const char *file, const char *path); typedef void (*ArchiveShutdownCB) (ArchiveModuleState *state); typedef struct ArchiveModuleCallbacks { ArchiveStartupCB startup_cb; ArchiveCheckConfiguredCB check_configured_cb; + ArchivePreloadFileCB archive_preload_file_cb; ArchiveFileCB archive_file_cb; ArchiveShutdownCB shutdown_cb; } ArchiveModuleCallbacks; From cfa69d1bec36ccb4432e70bce0b3f822f9ec03dc Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 18 Feb 2024 06:10:50 +0200 Subject: [PATCH 29/45] Remove pthread_is_threaded_np() call To use curl during shared_preload_libraries initialization. 
--- configure | 2 +- configure.ac | 1 - meson.build | 1 - src/backend/postmaster/postmaster.c | 46 ----------------------------- src/include/pg_config.h.in | 3 -- 5 files changed, 1 insertion(+), 52 deletions(-) diff --git a/configure b/configure index 863126591c2..f66a81e682d 100755 --- a/configure +++ b/configure @@ -15800,7 +15800,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l +for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 15db5845779..e4ebb267341 100644 --- a/configure.ac +++ b/configure.ac @@ -1808,7 +1808,6 @@ AC_CHECK_FUNCS(m4_normalize([ memset_s posix_fallocate ppoll - pthread_is_threaded_np setproctitle setproctitle_fast strchrnul diff --git a/meson.build b/meson.build index 9655febad1d..17f10b9851a 100644 --- a/meson.build +++ b/meson.build @@ -2540,7 +2540,6 @@ func_checks = [ ['posix_fallocate'], ['ppoll'], ['pthread_barrier_wait', {'dependencies': [thread_dep]}], - ['pthread_is_threaded_np', {'dependencies': [thread_dep]}], ['sem_init', {'dependencies': [rt_dep, thread_dep], 'skip': sema_kind != 'unnamed_posix', 'define': false}], ['setproctitle', {'dependencies': [util_dep]}], ['setproctitle_fast'], diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 59c7d67302e..c9727bcdbbf 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -85,10 +85,6 @@ 
#include #endif -#ifdef HAVE_PTHREAD_IS_THREADED_NP -#include -#endif - #include "access/transam.h" #include "access/xlog.h" #include "access/xlogrecovery.h" @@ -1424,24 +1420,6 @@ PostmasterMain(int argc, char *argv[]) */ } -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * On macOS, libintl replaces setlocale() with a version that calls - * CFLocaleCopyCurrent() when its second argument is "" and every relevant - * environment variable is unset or empty. CFLocaleCopyCurrent() makes - * the process multithreaded. The postmaster calls sigprocmask() and - * calls fork() without an immediate exec(), both of which have undefined - * behavior in a multithreaded program. A multithreaded postmaster is the - * normal case on Windows, which offers neither fork() nor sigprocmask(). - */ - if (pthread_is_threaded_np() != 0) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("postmaster became multithreaded during startup"), - errhint("Set the LC_ALL environment variable to a valid locale."))); -#endif - /* * Remember postmaster startup time */ @@ -1859,15 +1837,6 @@ ServerLoop(void) if (StartWorkerNeeded || HaveCrashedWorker) maybe_start_bgworkers(); -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * With assertions enabled, check regularly for appearance of - * additional threads. All builds check at start and exit. - */ - Assert(pthread_is_threaded_np() == 0); -#endif - /* * Lastly, check to see if it's time to do some things that we don't * want to do every single time through the loop, because they're a @@ -5062,21 +5031,6 @@ SubPostmasterMain(int argc, char *argv[]) static void ExitPostmaster(int status) { -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * There is no known cause for a postmaster to become multithreaded after - * startup. Recheck to account for the possibility of unknown causes. - * This message uses LOG level, because an unclean shutdown at this point - * would usually not look much different from a clean shutdown. 
- */ - if (pthread_is_threaded_np() != 0) - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg_internal("postmaster became multithreaded"), - errdetail("Please report this to <%s>.", PACKAGE_BUGREPORT))); -#endif - /* should cleanup shared memory and kill all backends */ /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index cb380c96e26..4b4218a574c 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -343,9 +343,6 @@ /* Define to 1 if you have the `pthread_barrier_wait' function. */ #undef HAVE_PTHREAD_BARRIER_WAIT -/* Define to 1 if you have the `pthread_is_threaded_np' function. */ -#undef HAVE_PTHREAD_IS_THREADED_NP - /* Have PTHREAD_PRIO_INHERIT. */ #undef HAVE_PTHREAD_PRIO_INHERIT From 73f1c62e1e70221d2b57e0c5ad1be4a1a80d3c53 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 8 Dec 2023 01:37:02 +0100 Subject: [PATCH 30/45] Added option to pg_rewind to perform extension specific rewind - added option --extension for pg_rewind - extracted SimpleXLogRead from extractPageMap for generic wal iteration in pg_rewind --- doc/src/sgml/ref/pg_rewind.sgml | 10 +++ src/bin/pg_rewind/Makefile | 7 +- src/bin/pg_rewind/extension.c | 132 ++++++++++++++++++++++++++++++ src/bin/pg_rewind/filemap.c | 41 +++++++++- src/bin/pg_rewind/parsexlog.c | 36 +++++--- src/bin/pg_rewind/pg_rewind.c | 16 +++- src/bin/pg_rewind/pg_rewind.h | 10 +++ src/bin/pg_rewind/pg_rewind_ext.h | 44 ++++++++++ 8 files changed, 279 insertions(+), 17 deletions(-) create mode 100644 src/bin/pg_rewind/extension.c create mode 100644 src/bin/pg_rewind/pg_rewind_ext.h diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index 2de747ec37f..01d20462e33 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -284,6 +284,16 @@ PostgreSQL documentation + + + + + + Load shared library that performs custom rewind for postgres extension. The path may be full or relative to PKGLIBDIR. 
File extension is optional. Multiple extensions can be selected by multiple switches. + + + + diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index bed05f1609c..5ff8163b841 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -21,6 +21,7 @@ LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) OBJS = \ $(WIN32RES) \ datapagemap.o \ + extension.o \ file_ops.o \ filemap.o \ libpq_source.o \ @@ -35,19 +36,21 @@ EXTRA_CLEAN = xlogreader.c all: pg_rewind pg_rewind: $(OBJS) | submake-libpq submake-libpgport - $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LDFLAGS_EX_BE) $(LIBS) -o $@$(X) xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . install: all installdirs $(INSTALL_PROGRAM) pg_rewind$(X) '$(DESTDIR)$(bindir)/pg_rewind$(X)' + $(INSTALL_DATA) $(srcdir)/pg_rewind_ext.h '$(DESTDIR)$(includedir)' installdirs: - $(MKDIR_P) '$(DESTDIR)$(bindir)' + $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(includedir)' uninstall: rm -f '$(DESTDIR)$(bindir)/pg_rewind$(X)' + rm -f '$(DESTDIR)$(includedir)/pg_rewind_ext.h' clean distclean maintainer-clean: rm -f pg_rewind$(X) $(OBJS) xlogreader.c diff --git a/src/bin/pg_rewind/extension.c b/src/bin/pg_rewind/extension.c new file mode 100644 index 00000000000..29ec4b5a6f6 --- /dev/null +++ b/src/bin/pg_rewind/extension.c @@ -0,0 +1,132 @@ +/*------------------------------------------------------------------------- + * + * extension.c + * Functions for processing shared libraries loaded by pg_rewind. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#ifndef WIN32 +#include <dlfcn.h> + +/* + * On macOS, <dlfcn.h> insists on including <stdbool.h>. If we're not + * using stdbool, undef bool to undo the damage. 
+ */ +#ifndef PG_USE_STDBOOL +#ifdef bool +#undef bool +#endif +#endif +#endif /* !WIN32 */ + +#include <sys/stat.h> + +#include "access/xlog_internal.h" +#include "pg_rewind.h" + +/* signature for pg_rewind extension library rewind function */ +typedef void (*PG_rewind_t) (const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + +static bool +file_exists(const char *argv0, const char *name) +{ + struct stat st; + + Assert(name != NULL); + + if (stat(name, &st) == 0) + return !S_ISDIR(st.st_mode); + else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES)) + { + const char *progname; + + progname = get_progname(argv0); + pg_log_error("could not access file \"%s\": %m", name); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + return false; +} + +static char * +expand_dynamic_library_name(const char *argv0, const char *name) +{ + char *full; + char my_exec_path[MAXPGPATH]; + char pkglib_path[MAXPGPATH]; + + Assert(name); + + if (find_my_exec(argv0, my_exec_path) < 0) + pg_fatal("%s: could not locate my own executable path", argv0); + get_pkglib_path(my_exec_path, pkglib_path); + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1); + sprintf(full, "%s/%s", pkglib_path, name); + if (file_exists(argv0, full)) + return full; + pfree(full); + + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1 + + strlen(DLSUFFIX) + 1); + sprintf(full, "%s/%s%s", pkglib_path, name, DLSUFFIX); + if (file_exists(argv0, full)) + return full; + pfree(full); + + return pstrdup(name); +} + +void +process_extensions(SimpleStringList *extensions, const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug) +{ + SimpleStringListCell *cell; + + if (extensions->head == 
NULL) + return; /* nothing to do */ + + for (cell = extensions->head; cell; cell = cell->next) + { + char *filename = cell->val; + char *fullname; + void *lib_handle; + PG_rewind_t PG_rewind; + char *load_error; + + fullname = expand_dynamic_library_name(argv0, filename); + + lib_handle = dlopen(fullname, RTLD_NOW | RTLD_GLOBAL); + if (lib_handle == NULL) + { + load_error = dlerror(); + pg_fatal("could not load library \"%s\": %s", fullname, load_error); + } + + PG_rewind = dlsym(lib_handle, "_PG_rewind"); + + if (PG_rewind == NULL) + pg_fatal("could not find function \"_PG_rewind\" in \"%s\"", + fullname); + pfree(fullname); + + if (showprogress) + pg_log_info("performing rewind for '%s' extension", filename); + PG_rewind(datadir_target, datadir_source, connstr_source, startpoint, + tliIndex, endpoint, restoreCommand, argv0, debug); + + pg_log_debug("loaded library \"%s\"", filename); + } +} diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 435742d20d1..a936c3d3586 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -54,6 +54,7 @@ static uint32 hash_string_pointer(const char *s); #define FILEHASH_INITIAL_SIZE 1000 static filehash_hash *filehash; +static SimpleStringList extensions_exclude = {NULL, NULL}; static bool isRelDataFile(const char *path); static char *datasegpath(RelFileLocator rlocator, ForkNumber forknum, @@ -261,6 +262,8 @@ process_target_file(const char *path, file_type_t type, size_t size, * from the target data folder all paths which have been filtered out from * the source data folder when processing the source files. */ + if (check_file_excluded(path, false)) + return; /* * Like in process_source_file, pretend that pg_wal is always a directory. 
@@ -405,6 +408,31 @@ check_file_excluded(const char *path, bool is_source) } } + /* + * Exclude extensions directories + */ + if (extensions_exclude.head != NULL) + { + SimpleStringListCell *cell; + + for (cell = extensions_exclude.head; cell; cell = cell->next) + { + char *exclude_dir = cell->val; + + snprintf(localpath, sizeof(localpath), "%s/", exclude_dir); + if (strstr(path, localpath) == path) + { + if (is_source) + pg_log_debug("entry \"%s\" excluded from source file list", + path); + else + pg_log_debug("entry \"%s\" excluded from target file list", + path); + return true; + } + } + } + return false; } @@ -822,7 +850,6 @@ decide_file_actions(void) return filemap; } - /* * Helper function for filemap hash table. */ @@ -833,3 +860,15 @@ hash_string_pointer(const char *s) return hash_bytes(ss, strlen(s)); } + +void +extensions_exclude_add(char **exclude_dirs) +{ + int i; + + for (i = 0; exclude_dirs[i] != NULL; i++) + { + simple_string_list_append(&extensions_exclude, + pstrdup(exclude_dirs[i])); + } +} diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 27782237d05..f8202d298e4 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -38,7 +38,7 @@ static const char *RmgrNames[RM_MAX_ID + 1] = { #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \ RmgrNames[rmid] : "custom") -static void extractPageInfo(XLogReaderState *record); +static void extractPageInfo(XLogReaderState *record, void *arg); static int xlogreadfd = -1; static XLogSegNo xlogreadsegno = 0; @@ -54,17 +54,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); -/* - * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline - * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of - * the data blocks touched by the WAL records, and return them in a page map. 
- * - * 'endpoint' is the end of the last record to read. The record starting at - * 'endpoint' is the first one that is not read. - */ void -extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, - XLogRecPtr endpoint, const char *restoreCommand) +SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand, + void (*page_callback) (XLogReaderState *, void *arg), + void *arg) { XLogRecord *record; XLogReaderState *xlogreader; @@ -97,7 +91,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, LSN_FORMAT_ARGS(errptr)); } - extractPageInfo(xlogreader); + page_callback(xlogreader, arg); } while (xlogreader->EndRecPtr < endpoint); /* @@ -116,6 +110,22 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, } } +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of + * the data blocks touched by the WAL records, and return them in a page map. + * + * 'endpoint' is the end of the last record to read. The record starting at + * 'endpoint' is the first one that is not read. + */ +void +extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand) +{ + SimpleXLogRead(datadir, startpoint, tliIndex, endpoint, restoreCommand, + extractPageInfo, NULL); +} + /* * Reads one WAL record. Returns the end position of the record, without * doing anything with the record itself. @@ -365,7 +375,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, * Extract information on which blocks the current record modifies. 
*/ static void -extractPageInfo(XLogReaderState *record) +extractPageInfo(XLogReaderState *record, void *arg) { int block_id; RmgrId rmid = XLogRecGetRmid(record); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index f7f3b8227fd..d08d421bbdd 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -75,6 +75,8 @@ bool dry_run = false; bool do_sync = true; bool restore_wal = false; +static SimpleStringList extensions = {NULL, NULL}; + /* Target history */ TimeLineHistoryEntry *targetHistory; int targetNentries; @@ -107,6 +109,7 @@ usage(const char *progname) " file when running target cluster\n")); printf(_(" --debug write a lot of debug messages\n")); printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n")); + printf(_(" -e, --extension=PATH path to library performing rewind for extension\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); @@ -131,6 +134,7 @@ main(int argc, char **argv) {"no-sync", no_argument, NULL, 'N'}, {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, + {"extension", required_argument, NULL, 'e'}, {NULL, 0, NULL, 0} }; int option_index; @@ -169,7 +173,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "cD:nNPRe", long_options, &option_index)) != -1) { switch (c) { @@ -218,6 +222,10 @@ main(int argc, char **argv) config_file = pg_strdup(optarg); break; + case 'e': /* -e or --extension */ + simple_string_list_append(&extensions, optarg); + break; + default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -454,6 +462,12 @@ main(int argc, char **argv) /* Initialize the hash table to track the status of each file */ filehash_init(); + if 
(extensions.head != NULL) + process_extensions(&extensions, datadir_target, datadir_source, + connstr_source, chkptrec, lastcommontliIndex, + target_wal_endrec, restore_command, argv[0], + debug); + /* * Collect information about all files in the both data directories. */ diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index ef8bdc1fbb8..1d42a921246 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -14,7 +14,9 @@ #include "access/timeline.h" #include "common/logging.h" #include "datapagemap.h" +#include "fe_utils/simple_list.h" #include "libpq-fe.h" +#include "pg_rewind_ext.h" #include "storage/block.h" #include "storage/relfilelocator.h" @@ -53,4 +55,12 @@ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries); +/* in extension.c */ +extern void process_extensions(SimpleStringList *extensions, + const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + #endif /* PG_REWIND_H */ diff --git a/src/bin/pg_rewind/pg_rewind_ext.h b/src/bin/pg_rewind/pg_rewind_ext.h new file mode 100644 index 00000000000..3616d94f588 --- /dev/null +++ b/src/bin/pg_rewind/pg_rewind_ext.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * pg_rewind_ext.h + * + * + * Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef PG_REWIND_EXT_H +#define PG_REWIND_EXT_H + +#include "access/xlogreader.h" + +/* in parsexlog.c */ +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. + * Pass all WAL records to 'page_callback'. + * + * 'endpoint' is the end of the last record to read. 
The record starting at + * 'endpoint' is the first one that is not read. + */ +extern void SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, + void (*page_callback) (XLogReaderState *, + void *arg), + void *arg); + + +/* in filemap.c */ +/* Add NULL-terminated list of dirs that pg_rewind can skip copying */ +extern void extensions_exclude_add(char **exclude_dirs); + +/* signature for pg_rewind extension library rewind function */ +extern PGDLLEXPORT void _PG_rewind(const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, + const char *restoreCommand, + const char *argv0, bool debug); + +#endif /* PG_REWIND_EXT_H */ From dfa97f5d50b970fcc4b4ae1aca8279fbc9685bd1 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 3 May 2024 22:05:35 +0200 Subject: [PATCH 31/45] Index scan and index only scan with rowid --- src/backend/access/heap/heapam_handler.c | 3 +- src/backend/access/index/genam.c | 2 + src/backend/access/index/indexam.c | 88 +++++++++++++++++++++--- src/backend/access/table/tableam.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/nodeIndexonlyscan.c | 33 +++++++-- src/backend/utils/adt/selfuncs.c | 28 ++++++-- src/include/access/genam.h | 3 + src/include/access/relscan.h | 2 + src/include/access/tableam.h | 6 +- 10 files changed, 147 insertions(+), 22 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ea6759e8a7f..a32fc3b69fb 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -131,7 +131,7 @@ heapam_index_fetch_end(IndexFetchTableData *scan) static bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -139,6 +139,7 @@ 
heapam_index_fetch_tuple(struct IndexFetchTableData *scan, IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; bool got_heap_tuple; + ItemPointer tid = DatumGetItemPointer(tupleid); Assert(TTS_IS_BUFFERTUPLE(slot)); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 709b2641021..e0535503145 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -104,6 +104,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->orderByData = NULL; scan->xs_want_itup = false; /* may be set later */ + scan->xs_want_rowid = false; /* may be set later */ /* * During recovery we ignore killed tuples and don't bother to kill them @@ -125,6 +126,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_rowid.isnull = true; return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 715e91e25f0..b19a7b7297e 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -603,6 +603,55 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } +/* ---------------- + * index_getnext_rowid - get the next ROWID from a scan + * + * The result is the next ROWID satisfying the scan keys, + * or isnull if no more matching tuples exist. + * ---------------- + */ +NullableDatum +index_getnext_rowid(IndexScanDesc scan, ScanDirection direction) +{ + NullableDatum result; + bool found; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgettuple); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_heaptid. 
It should also set + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. + */ + found = scan->indexRelation->rd_indam->amgettuple(scan, direction); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + scan->xs_heap_continue = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + result.isnull = true; + return result; + } + /* Assert(RowidIsValid(&scan->xs_rowid)); */ + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the ROWID of the tuple we found. */ + return scan->xs_rowid; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -626,8 +675,17 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) { bool all_dead = false; bool found; + Datum tupleid; + + if (scan->xs_want_rowid) + { + Assert(!scan->xs_rowid.isnull); + tupleid = scan->xs_rowid.value; + } + else + tupleid = PointerGetDatum(&scan->xs_heaptid); - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, + found = table_index_fetch_tuple(scan->xs_heapfetch, tupleid, scan->xs_snapshot, slot, &scan->xs_heap_continue, &all_dead); @@ -669,16 +727,30 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * { if (!scan->xs_heap_continue) { - ItemPointer tid; + if (scan->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scan, direction); - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + ItemPointer tid; + /* Time to fetch the 
next TID from the index */ + tid = index_getnext_tid(scan, direction); - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } } /* diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index c8329db8f34..67d63e0a6ec 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -227,7 +227,7 @@ table_index_fetch_tuple_check(Relation rel, slot = table_slot_create(rel, NULL); scan = table_index_fetch_begin(rel); - found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, + found = table_index_fetch_tuple(scan, PointerGetDatum(tid), snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 35c4451fc06..982bae9ed42 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -111,7 +111,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); bool call_again = false; - if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, + if (!table_index_fetch_tuple(scan, PointerGetDatum(&tmptid), SnapshotSelf, slot, &call_again, NULL)) { /* diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 45d1a67a713..6ebddd36c95 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -66,7 +66,7 @@ IndexOnlyNext(IndexOnlyScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; - ItemPointer tid; + ItemPointer tid = NULL; /* * extract necessary information from index scan node @@ -118,12 +118,36 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. 
*/ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (true) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + if (scandesc->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; + + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -158,7 +182,8 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!scandesc->xs_want_rowid && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -243,7 +268,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * If we didn't access the heap, then we'll need to take a predicate * lock explicitly, as if we had. For now we do that at page level. 
*/ - if (!tuple_from_heap) + if (!tuple_from_heap && !scandesc->xs_want_rowid) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c4fcd0076ea..675c743bcc5 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6223,12 +6223,32 @@ get_actual_variable_endpoint(Relation heapRel, index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); - /* Fetch first/next tuple in specified direction */ - while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + while (true) { - BlockNumber block = ItemPointerGetBlockNumber(tid); + BlockNumber block = InvalidBlockNumber; - if (!VM_ALL_VISIBLE(heapRel, + /* Fetch first/next tuple in specified direction */ + if (index_scan->xs_want_rowid) + { + NullableDatum rowid; + rowid = index_getnext_rowid(index_scan, indexscandir); + + if (rowid.isnull) + break; + } + else + { + tid = index_getnext_tid(index_scan, indexscandir); + + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &index_scan->xs_heaptid)); + block = ItemPointerGetBlockNumber(tid); + } + + if (!index_scan->xs_want_rowid && + !VM_ALL_VISIBLE(heapRel, block, &vmbuffer)) { diff --git a/src/include/access/genam.h b/src/include/access/genam.h index b071cedd44b..40671f28a66 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -173,6 +173,9 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, ParallelIndexScanDesc pscan); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern NullableDatum index_getnext_rowid(IndexScanDesc scan, + ScanDirection direction); +extern Datum index_getnext_tupleid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection 
direction, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index d03360eac04..ea0913ce6f2 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -122,6 +122,7 @@ typedef struct IndexScanDescData struct ScanKeyData *keyData; /* array of index qualifier descriptors */ struct ScanKeyData *orderByData; /* array of ordering op descriptors */ bool xs_want_itup; /* caller requests index tuples */ + bool xs_want_rowid; /* caller requests index tuples */ bool xs_temp_snap; /* unregister snapshot at scan end? */ /* signaling to index AM about killing index tuples */ @@ -145,6 +146,7 @@ typedef struct IndexScanDescData struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ ItemPointerData xs_heaptid; /* result */ + NullableDatum xs_rowid; /* result if xs_want_rowid */ bool xs_heap_continue; /* T if must keep walking, potential * further results */ IndexFetchTableData *xs_heapfetch; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index cd086ae12ab..2610d9ca692 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -469,7 +469,7 @@ typedef struct TableAmRoutine * future searches. 
*/ bool (*index_fetch_tuple) (struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead); @@ -1262,7 +1262,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan) */ static inline bool table_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -1275,7 +1275,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); - return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, + return scan->rel->rd_tableam->index_fetch_tuple(scan, tupleid, snapshot, slot, call_again, all_dead); } From e56d0e7bad2e836e7243f4d19b7f924d0bd5ea9a Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Wed, 8 May 2024 04:09:19 +0200 Subject: [PATCH 32/45] Remove primary index am check --- src/backend/access/index/indexam.c | 3 ++- src/backend/catalog/index.c | 3 --- src/backend/parser/parse_utilcmd.c | 13 ------------- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index b19a7b7297e..70974ce9c17 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -758,7 +758,8 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * * If we don't find anything, loop around and grab the next TID from * the index. 
*/ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (!scan->xs_want_rowid) + Assert(ItemPointerIsValid(&scan->xs_heaptid)); if (index_fetch_heap(scan, slot)) return true; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index db68393483c..21021f84e52 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2678,9 +2678,6 @@ BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii) */ Assert(ii->ii_Unique); - if (index->rd_rel->relam != BTREE_AM_OID) - elog(ERROR, "unexpected non-btree speculative unique index"); - ii->ii_UniqueOps = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueProcs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueStrats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 441f599d1a0..87b962f05de 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -2320,19 +2320,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) errdetail("Cannot create a non-deferrable constraint using a deferrable index."), parser_errposition(cxt->pstate, constraint->location))); - /* - * Insist on it being a btree. That's the only kind that supports - * uniqueness at the moment anyway; but we must have an index that - * exactly matches what you'd get from plain ADD CONSTRAINT syntax, - * else dump and reload will produce a different index (breaking - * pg_upgrade in particular). 
- */ - if (index_rel->rd_rel->relam != get_index_am_oid(DEFAULT_INDEX_TYPE, false)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("index \"%s\" is not a btree", index_name), - parser_errposition(cxt->pstate, constraint->location))); - /* Must get indclass the hard way */ indclassDatum = SysCacheGetAttrNotNull(INDEXRELID, index_rel->rd_indextuple, From c3e9f05943fb2d2f8d7899692356fe8d212c490d Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Mon, 13 May 2024 20:33:54 +0200 Subject: [PATCH 33/45] Passing tupleid to insert now --- contrib/bloom/blinsert.c | 3 ++- contrib/bloom/bloom.h | 2 +- src/backend/access/brin/brin.c | 3 ++- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/gininsert.c | 3 ++- src/backend/access/gist/gist.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/index/indexam.c | 4 ++-- src/backend/access/nbtree/nbtree.c | 3 ++- src/backend/access/spgist/spginsert.c | 3 ++- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/execIndexing.c | 18 +++++++++++++++--- src/backend/executor/nodeModifyTable.c | 4 ++-- src/include/access/amapi.h | 2 +- src/include/access/brin_internal.h | 2 +- src/include/access/genam.h | 2 +- src/include/access/gin_private.h | 2 +- src/include/access/gist_private.h | 2 +- src/include/access/hash.h | 2 +- src/include/access/nbtree.h | 2 +- src/include/access/spgist.h | 2 +- .../modules/dummy_index_am/dummy_index_am.c | 2 +- 24 files changed, 47 insertions(+), 28 deletions(-) diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index b90145148d4..99aed8f9948 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -172,7 +172,7 @@ blbuildempty(Relation index) */ bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, 
IndexInfo *indexInfo) @@ -189,6 +189,7 @@ blinsert(Relation index, Datum *values, bool *isnull, BlockNumber blkno = InvalidBlockNumber; OffsetNumber nStart; GenericXLogState *state; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Bloom insert temporary context", diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index 330811ec608..15ef1b9aee2 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -189,7 +189,7 @@ extern bool blvalidate(Oid opclassoid); /* index access method interface functions */ extern bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index a257903991d..a0052239645 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -154,7 +154,7 @@ brinhandler(PG_FUNCTION_ARGS) */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -168,6 +168,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); + ItemPointer heaptid = DatumGetItemPointer(tupleid); revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 9b6a5d9091c..653a4f7d469 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -339,7 +339,7 @@ toast_save_datum(Relation rel, Datum value, /* Only index relations marked as ready can be updated */ if (toastidxs[i]->rd_index->indisready) 
index_insert(toastidxs[i], t_values, t_isnull, - &(toasttup->t_self), + ItemPointerGetDatum(&(toasttup->t_self)), toastrel, toastidxs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 56968b95acf..36815547151 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -484,7 +484,7 @@ ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -493,6 +493,7 @@ gininsert(Relation index, Datum *values, bool *isnull, MemoryContext oldCtx; MemoryContext insertCtx; int i; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GinState cache if first call in this statement */ if (ginstate == NULL) diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 8ef5fa03290..53680b30d87 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -156,7 +156,7 @@ gistbuildempty(Relation index) */ bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -164,6 +164,7 @@ gistinsert(Relation r, Datum *values, bool *isnull, GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; IndexTuple itup; MemoryContext oldCxt; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GISTSTATE cache if first call in this statement */ if (giststate == NULL) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index fc5d97f606e..c8202e9349d 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -247,7 +247,7 @@ hashbuildCallback(Relation index, */ bool hashinsert(Relation rel, 
Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -255,6 +255,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull, Datum index_values[1]; bool index_isnull[1]; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* convert data to a hash key; on failure, do not insert anything */ if (!_hash_convert_tuple(rel, diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a32fc3b69fb..8f6559c9c3e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2310,7 +2310,7 @@ heapam_index_validate_scan(Relation heapRelation, index_insert(indexRelation, values, isnull, - &rootTuple, + ItemPointerGetDatum(&rootTuple), heapRelation, indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 70974ce9c17..72ee401e875 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -218,7 +218,7 @@ bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, @@ -233,7 +233,7 @@ index_insert(Relation indexRelation, InvalidBlockNumber); return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, - heap_t_ctid, heapRelation, + tupleid, heapRelation, checkUnique, indexUnchanged, indexInfo); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6c5b5c69ce5..32ec09b1ec7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -188,13 +188,14 @@ btbuildempty(Relation index) */ bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, 
IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { bool result; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* generate an index tuple */ itup = index_form_tuple(RelationGetDescr(rel), values, isnull); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 4443f1918df..1f5c9a930d2 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -198,7 +198,7 @@ spgbuildempty(Relation index) */ bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -206,6 +206,7 @@ spginsert(Relation index, Datum *values, bool *isnull, SpGistState spgstate; MemoryContext oldCtx; MemoryContext insertCtx; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST insert temporary context", diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index 522da0ac855..9846637537c 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -170,7 +170,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, index_insert(index, /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - &(heapTuple->t_self), /* tid of heap tuple */ + ItemPointerGetDatum(&(heapTuple->t_self)), /* tid of heap tuple */ heapRelation, index->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 982bae9ed42..04e718c95cd 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -173,7 +173,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) * the row is now dead, because that is the TID the index will know * about. 
*/ - index_insert(indexRel, values, isnull, &checktid, + index_insert(indexRel, values, isnull, ItemPointerGetDatum(&checktid), trigdata->tg_relation, UNIQUE_CHECK_EXISTING, false, indexInfo); } diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 1d82b64b897..aac91283013 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -299,7 +299,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, List *arbiterIndexes, bool onlySummarizing) { - ItemPointer tupleid = &slot->tts_tid; List *result = NIL; int i; int numIndices; @@ -309,8 +308,20 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; - Assert(ItemPointerIsValid(tupleid)); + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } /* * Get information from the result relation info structure. @@ -457,6 +468,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; + ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -477,7 +489,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - tupleid, values, isnull, + raw_tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index ef62f283b6f..bbfc4d72bbd 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1755,8 +1755,8 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, /* Tuple routing starts from the root table. 
*/ context->cpUpdateReturningSlot = - ExecInsert(context, mtstate->rootResultRelInfo, slot, canSetTag, - inserted_tuple, insert_destrel); + ExecInsert(context, mtstate->rootResultRelInfo, + slot, canSetTag, inserted_tuple, insert_destrel); /* * Reset the transition state that may possibly have been written by diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 4476ff7fba1..6bb600dfe2f 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -107,7 +107,7 @@ typedef void (*ambuildempty_function) (Relation indexRelation); typedef bool (*aminsert_function) (Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_tid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h index 97ddc925b27..418b32d5515 100644 --- a/src/include/access/brin_internal.h +++ b/src/include/access/brin_internal.h @@ -92,7 +92,7 @@ extern IndexBuildResult *brinbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void brinbuildempty(Relation index); extern bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 40671f28a66..24ec61ef04b 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,7 +144,7 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 6da64928b66..7ba1d4bc999 100644 --- a/src/include/access/gin_private.h +++ 
b/src/include/access/gin_private.h @@ -114,7 +114,7 @@ extern IndexBuildResult *ginbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void ginbuildempty(Relation index); extern bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 3edc740a3f3..0cd19757208 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -401,7 +401,7 @@ typedef struct GiSTOptions /* gist.c */ extern void gistbuildempty(Relation index); extern bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 9e035270a16..14fb8e4ce1e 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -364,7 +364,7 @@ extern IndexBuildResult *hashbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void hashbuildempty(Relation index); extern bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9020abebc92..3f36ea455aa 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1128,7 +1128,7 @@ typedef struct BTOptions */ extern void btbuildempty(Relation index); extern bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git 
a/src/include/access/spgist.h b/src/include/access/spgist.h index fe31d32dbe9..e44d3561abf 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -197,7 +197,7 @@ extern IndexBuildResult *spgbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void spgbuildempty(Relation index); extern bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index c14e0abe0c6..562a578cad2 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -164,7 +164,7 @@ dibuildempty(Relation index) */ static bool diinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) From e55690582319e910a6fe74701fe144f214243202 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 17 May 2024 00:27:02 +0200 Subject: [PATCH 34/45] Methods for index update and delete Also validates compatibility of index AM with table AM at index creation --- src/backend/access/index/indexam.c | 60 ++++ src/backend/executor/execIndexing.c | 401 +++++++++++++++++++++++++ src/backend/executor/nodeModifyTable.c | 22 +- src/backend/parser/gram.y | 16 +- src/include/access/amapi.h | 23 ++ src/include/access/genam.h | 15 + src/include/executor/executor.h | 10 + src/include/nodes/parsenodes.h | 1 + 8 files changed, 542 insertions(+), 6 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 72ee401e875..0dabdabca8e 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -238,6 +238,66 @@ index_insert(Relation indexRelation, 
indexInfo); } +/* ---------------- + * index_update - update an index tuple in a relation + * ---------------- + */ +bool +index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amupdate); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amupdate(indexRelation, + new_valid, old_valid, + values, isnull, tupleid, + valuesOld, isnullOld, oldTupleid, + heapRelation, + checkUnique, + indexInfo); +} + + +/* ---------------- + * index_delete - delete an index tuple from a relation + * ---------------- + */ +bool +index_delete(Relation indexRelation, + Datum *values, bool *isnull, Datum tupleid, + Relation heapRelation, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amdelete); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amdelete(indexRelation, + values, isnull, tupleid, + heapRelation, + indexInfo); +} + /* * index_beginscan - start a scan of an index with amgettuple * diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index aac91283013..266c876c8f6 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -513,6 +513,407 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, return result; } +List * +ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, + List *arbiterIndexes, + bool onlySummarizing) +{ + List *result = NIL; + int i; + int numIndices; + RelationPtr 
relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool applyNoDupErr; + IndexUniqueCheck checkUnique; + bool satisfiesConstraint; + bool new_valid = true; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* + * Skip processing of non-summarizing indexes if we only update + * summarizing indexes + */ + if (onlySummarizing && !indexInfo->ii_Summarizing) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). 
+ */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + { + if (!indexRelation->rd_indam->ammvccaware) + continue; + new_valid = false; + } + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Check whether to apply noDupErr to this index */ + applyNoDupErr = noDupErr && + (arbiterIndexes == NIL || + list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)); + + /* + * The index AM does the actual insertion, plus uniqueness checking. + * + * For an immediate-mode unique index, we just tell the index AM to + * throw error if not unique. + * + * For a deferrable unique index, we tell the index AM to just detect + * possible non-uniqueness, and we add the index OID to the result + * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. 
+ */ + if (!indexRelation->rd_index->indisunique) + checkUnique = UNIQUE_CHECK_NO; + else if (applyNoDupErr) + checkUnique = UNIQUE_CHECK_PARTIAL; + else if (indexRelation->rd_index->indimmediate) + checkUnique = UNIQUE_CHECK_YES; + else + checkUnique = UNIQUE_CHECK_PARTIAL; + + if (indexRelation->rd_indam->ammvccaware) + { + Datum valuesOld[INDEX_MAX_KEYS]; + bool isnullOld[INDEX_MAX_KEYS]; + Datum oldTupleid; + bool old_valid = true; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + oldTupleid = slot_getsysattr(oldSlot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&oldSlot->tts_tid)); + oldTupleid = PointerGetDatum(&oldSlot->tts_tid); + } + + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = oldSlot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + old_valid = false; + } + + FormIndexDatum(indexInfo, + oldSlot, + estate, + valuesOld, + isnullOld); + + satisfiesConstraint = + index_update(indexRelation, /* index relation */ + new_valid, + old_valid, + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + valuesOld, + isnullOld, + oldTupleid, + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexInfo); /* index AM may need this */ + + } + else + { + bool indexUnchanged; + /* + * There's definitely going to be an index_insert() call for this + * index. 
If we're being called as part of an UPDATE statement, + * consider if the 'indexUnchanged' = true hint should be passed. + */ + indexUnchanged = index_unchanged_by_update(resultRelInfo, + estate, + indexInfo, + indexRelation); + + satisfiesConstraint = + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexUnchanged, /* UPDATE without logical change? */ + indexInfo); /* index AM may need this */ + } + + /* + * If the index has an associated exclusion constraint, check that. + * This is simpler than the process for uniqueness checks since we + * always insert first and then check. If the constraint is deferred, + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. + * + * An index for an exclusion constraint can't also be UNIQUE (not an + * essential property, we just don't allow it in the grammar), so no + * need to preserve the prior state of satisfiesConstraint. 
+ */ + if (indexInfo->ii_ExclusionOps != NULL) + { + bool violationOK; + CEOUC_WAIT_MODE waitMode; + ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); + + if (applyNoDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + raw_tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); + } + + if ((checkUnique == UNIQUE_CHECK_PARTIAL || + indexInfo->ii_ExclusionOps != NULL) && + !satisfiesConstraint) + { + /* + * The tuple potentially violates the uniqueness or exclusion + * constraint, so make a note of the index so that we can re-check + * it later. Speculative inserters are told if there was a + * speculative conflict, since that always requires a restart. + */ + result = lappend_oid(result, RelationGetRelid(indexRelation)); + if (indexRelation->rd_index->indimmediate && specConflict) + *specConflict = true; + } + } + + return result; +} + +void +ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. 
+ */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and delete the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + if (!indexRelation->rd_indam->ammvccaware) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-delete if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + index_delete(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + indexInfo); /* index AM may need this */ + } +} + /* ---------------------------------------------------------------- * ExecCheckIndexConstraints * diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index bbfc4d72bbd..c10311cddb4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1234,6 +1234,14 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result) *result = TM_Ok; + /* + * Open the table's indexes, if we have not done so already, so that we + * can delete index entries. + */ + if (resultRelInfo->ri_RelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, false); + /* BEFORE ROW DELETE triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_delete_before_row) @@ -1290,6 +1298,10 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, EState *estate = context->estate; TransitionCaptureState *ar_delete_trig_tcs; + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, slot, context->estate); + /* * If this delete is the result of a partition key update that moved the * tuple to a new partition, put this row into the transition OLD TABLE, @@ -2021,11 +2033,15 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* insert index entries for tuple if necessary */ if (resultRelInfo->ri_NumIndices > 0 && (updateCxt->updateIndexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, context->estate, - true, false, + { + recheckIndexes = 
ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + context->estate, + false, NULL, NIL, (updateCxt->updateIndexes == TU_Summarizing)); + } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c31b3733587..c39c6f21939 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -371,6 +371,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type OptSchemaEltList parameter_name_list %type am_type +%type opt_for_tableam %type TriggerForSpec TriggerForType %type TriggerActionTime @@ -5746,17 +5747,21 @@ row_security_cmd: /***************************************************************************** * * QUERY: - * CREATE ACCESS METHOD name HANDLER handler_name + * CREATE ACCESS METHOD name TYPE am_type + * [FOR tableam_name] + * HANDLER handler_name * *****************************************************************************/ -CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type HANDLER handler_name +CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type + opt_for_tableam HANDLER handler_name { CreateAmStmt *n = makeNode(CreateAmStmt); n->amname = $4; - n->handler_name = $8; n->amtype = $6; + n->tableam_name = $7; + n->handler_name = $9; $$ = (Node *) n; } ; @@ -5766,6 +5771,11 @@ am_type: | TABLE { $$ = AMTYPE_TABLE; } ; +opt_for_tableam: + FOR name { $$ = $2; } + | /*EMPTY*/ { $$ = NULL; } + ; + /***************************************************************************** * * QUERIES : diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 6bb600dfe2f..4bef9d7e974 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -112,6 +112,25 @@ typedef bool (*aminsert_function) (Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); +/* update this tuple */ +typedef bool (*amupdate_function) (Relation indexRelation, + bool 
new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +/* delete this tuple */ +typedef bool (*amdelete_function) (Relation indexRelation, + Datum *values, bool *isnull, + Datum tupleid, + Relation heapRelation, + struct IndexInfo *indexInfo); /* bulk delete */ typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info, @@ -246,6 +265,8 @@ typedef struct IndexAmRoutine bool amusemaintenanceworkmem; /* does AM store tuple information only at block granularity? */ bool amsummarizing; + /* is the AM MVCC-aware? */ + bool ammvccaware; /* OR of parallel vacuum flags. See vacuum.h for flags. */ uint8 amparallelvacuumoptions; /* type of data stored in index, or InvalidOid if variable */ @@ -261,6 +282,8 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + amupdate_function amupdate; + amdelete_function amdelete; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 24ec61ef04b..696c063373e 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -149,6 +149,21 @@ extern bool index_insert(Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); +extern bool index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +extern bool index_delete(Relation indexRelation, Datum *values, bool *isnull, + Datum tupleid, Relation heapRelation, + struct IndexInfo *indexInfo); extern IndexScanDesc 
index_beginscan(Relation heapRelation, Relation indexRelation, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index ac02247947e..2cc92d66f93 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -633,6 +633,16 @@ extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, bool noDupErr, bool *specConflict, List *arbiterIndexes, bool onlySummarizing); +extern List *ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, List *arbiterIndexes, + bool onlySummarizing); +extern void ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate); extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, ItemPointer conflictTid, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 99173f541d5..f51ec29fc92 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2824,6 +2824,7 @@ typedef struct CreateAmStmt char *amname; /* access method name */ List *handler_name; /* handler function name */ char amtype; /* type of access method */ + char *tableam_name; /* table AM name */ } CreateAmStmt; /* ---------------------- From 9774eb495c533bf973f9e055a9d2a75d2bd59353 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 12 Aug 2024 12:30:00 +0300 Subject: [PATCH 35/45] Hook to override index AM routine --- src/backend/access/index/amapi.c | 67 ++++++++++++++++++++++++------ src/backend/catalog/index.c | 2 +- src/backend/commands/indexcmds.c | 4 +- src/backend/commands/opclasscmds.c | 9 ++-- src/backend/executor/execAmi.c | 2 +- src/backend/utils/adt/amutils.c | 4 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/cache/relcache.c | 2 +- src/include/access/amapi.h | 9 +++- 9 files changed, 75 insertions(+), 26 deletions(-) diff --git 
a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 8b02cdbe825..f4526997474 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -16,25 +16,27 @@ #include "access/amapi.h" #include "access/htup_details.h" #include "catalog/pg_am.h" +#include "catalog/pg_class.h" +#include "catalog/pg_index.h" #include "catalog/pg_opclass.h" #include "utils/builtins.h" #include "utils/syscache.h" +IndexAMRoutineHookType IndexAMRoutineHook = NULL; -/* - * GetIndexAmRoutine - call the specified access method handler routine to get - * its IndexAmRoutine struct, which will be palloc'd in the caller's context. - * - * Note that if the amhandler function is built-in, this will not involve - * any catalog access. It's therefore safe to use this while bootstrapping - * indexes for the system catalogs. relcache.c relies on that. - */ IndexAmRoutine * -GetIndexAmRoutine(Oid amhandler) +GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) { Datum datum; IndexAmRoutine *routine; + if (IndexAMRoutineHook != NULL) + { + routine = IndexAMRoutineHook(tamoid, amhandler); + if (routine) + return routine; + } + datum = OidFunctionCall0(amhandler); routine = (IndexAmRoutine *) DatumGetPointer(datum); @@ -45,6 +47,47 @@ GetIndexAmRoutine(Oid amhandler) return routine; } + +/* + * GetIndexAmRoutine - call the specified access method handler routine to get + * its IndexAmRoutine struct, which will be palloc'd in the caller's context. + * + * Note that if the amhandler function is built-in, this will not involve + * any catalog access. It's therefore safe to use this while bootstrapping + * indexes for the system catalogs. relcache.c relies on that. 
+ */ +IndexAmRoutine * +GetIndexAmRoutine(Oid indoid, Oid amhandler) +{ + HeapTuple ht_idx; + HeapTuple ht_tblrel; + Form_pg_index idxrec; + Form_pg_class tblrelrec; + Oid indrelid; + Oid tamoid; + + if (!OidIsValid((indoid)) || indoid < FirstNormalObjectId) + return GetIndexAmRoutineWithTableAM(HEAP_TABLE_AM_OID, amhandler); + + ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indoid)); + if (!HeapTupleIsValid(ht_idx)) + elog(ERROR, "cache lookup failed for index %u", indoid); + idxrec = (Form_pg_index) GETSTRUCT(ht_idx); + Assert(indoid == idxrec->indexrelid); + indrelid = idxrec->indrelid; + + ht_tblrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indrelid)); + if (!HeapTupleIsValid(ht_tblrel)) + elog(ERROR, "cache lookup failed for relation %u", indrelid); + tblrelrec = (Form_pg_class) GETSTRUCT(ht_tblrel); + tamoid = tblrelrec->relam; + + ReleaseSysCache(ht_tblrel); + ReleaseSysCache(ht_idx); + + return GetIndexAmRoutineWithTableAM(tamoid, amhandler); +} + /* * GetIndexAmRoutineByAmId - look up the handler of the index access method * with the given OID, and get its IndexAmRoutine struct. @@ -53,7 +96,7 @@ GetIndexAmRoutine(Oid amhandler) * noerror is true, else throws error. */ IndexAmRoutine * -GetIndexAmRoutineByAmId(Oid amoid, bool noerror) +GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) { HeapTuple tuple; Form_pg_am amform; @@ -103,7 +146,7 @@ GetIndexAmRoutineByAmId(Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. 
*/ - return GetIndexAmRoutine(amhandler); + return GetIndexAmRoutine(indoid, amhandler); } @@ -129,7 +172,7 @@ amvalidate(PG_FUNCTION_ARGS) ReleaseSysCache(classtup); - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (amroutine->amvalidate == NULL) elog(ERROR, "function amvalidate is not defined for index access method %u", diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 21021f84e52..69c6689245e 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -298,7 +298,7 @@ ConstructTupleDescriptor(Relation heapRelation, int i; /* We need access to the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(accessMethodObjectId, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, accessMethodObjectId, false); /* ... and to the table's tuple descriptor */ heapTupDesc = RelationGetDescr(heapRelation); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 831534ea6c6..390d9f898b3 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -214,7 +214,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutine(oldId, accessMethodForm->amhandler); ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; @@ -838,7 +838,7 @@ DefineIndex(Oid relationId, } accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineWithTableAM(rel->rd_rel->relam, accessMethodForm->amhandler); pgstat_progress_update_param(PROGRESS_CREATEIDX_ACCESS_METHOD_OID, accessMethodId); diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 5f7ee238863..76722d506b5 100644 --- 
a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -43,6 +43,7 @@ #include "parser/parse_func.h" #include "parser/parse_oper.h" #include "parser/parse_type.h" +#include "postgres_ext.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" @@ -377,7 +378,7 @@ DefineOpClass(CreateOpClassStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -835,7 +836,7 @@ AlterOpFamily(AlterOpFamilyStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -882,7 +883,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, int maxOpNumber, int maxProcNumber, int optsProcNumber, List *items) { - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); List *operators; /* OpFamilyMember list for operators */ List *procedures; /* OpFamilyMember list for support procs */ ListCell *l; @@ -1165,7 +1166,7 @@ assignOperTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) * the family has been created but not yet populated with the required * operators.) 
*/ - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (!amroutine->amcanorderbyop) ereport(ERROR, diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 9d18ce8c6b2..286a0f8f222 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -615,7 +615,7 @@ IndexSupportsBackwardScan(Oid indexid) idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false); + amroutine = GetIndexAmRoutineByAmId(indexid, idxrelrec->relam, false); result = amroutine->amcanbackward; diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index 48852bf79e2..265fcfc86c4 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -195,7 +195,7 @@ indexam_property(FunctionCallInfo fcinfo, /* * Get AM information. If we don't have a valid AM OID, return NULL. */ - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(index_oid, amoid, true); if (routine == NULL) PG_RETURN_NULL(); @@ -455,7 +455,7 @@ pg_indexam_progress_phasename(PG_FUNCTION_ARGS) IndexAmRoutine *routine; char *name; - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(InvalidOid, amoid, true); if (routine == NULL || !routine->ambuildphasename) PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index f01cc2521c8..d38b62ee569 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1313,7 +1313,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(amrec->amhandler); + amroutine = GetIndexAmRoutine(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. 
(NOTE: we do not use the relcache diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 35e076c536b..2a16440bf09 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1404,7 +1404,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_amhandler); + tmp = GetIndexAmRoutine(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. */ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 4bef9d7e974..91045064b40 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -309,7 +309,12 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ -extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid amoid, bool noerror); +extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutine(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); + +typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); + +extern IndexAMRoutineHookType IndexAMRoutineHook; #endif /* AMAPI_H */ From f9b8cd9bb189d6f4f3edf082ec26827f29d6ce3a Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 5 Sep 2024 00:03:23 +0200 Subject: [PATCH 36/45] Always building child/root maps for relations with ROW_REF_ROWID --- src/backend/executor/execUtils.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index c06b2288583..bb65bd078cb 100644 --- a/src/backend/executor/execUtils.c +++ 
b/src/backend/executor/execUtils.c @@ -1242,9 +1242,19 @@ ExecGetChildToRootMap(ResultRelInfo *resultRelInfo) ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; if (rootRelInfo) - resultRelInfo->ri_ChildToRootMap = - convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), - RelationGetDescr(rootRelInfo->ri_RelationDesc)); + { + TupleDesc indesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + TupleDesc outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); + AttrMap *attrMap; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = build_attrmap_by_name_if_req(indesc, outdesc, false); + else + attrMap = build_attrmap_by_name(indesc, outdesc, false); + if (attrMap) + resultRelInfo->ri_ChildToRootMap = + convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); + } else /* this isn't a child result rel */ resultRelInfo->ri_ChildToRootMap = NULL; @@ -1281,8 +1291,10 @@ ExecGetRootToChildMap(ResultRelInfo *resultRelInfo, EState *estate) * to ignore by passing true for missing_ok. 
*/ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - attrMap = build_attrmap_by_name_if_req(indesc, outdesc, - !childrel->rd_rel->relispartition); + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = build_attrmap_by_name_if_req(indesc, outdesc, !childrel->rd_rel->relispartition); + else + attrMap = build_attrmap_by_name(indesc, outdesc, !childrel->rd_rel->relispartition); if (attrMap) resultRelInfo->ri_RootToChildMap = convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); From ab0b3eb14832f561f51a7102a6bc48398b332827 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Tue, 20 Aug 2024 14:09:51 +0200 Subject: [PATCH 37/45] Don't run internal btree _bt_getrootheight on non-btree in get_relation_info --- src/backend/optimizer/util/plancat.c | 3 ++- src/include/optimizer/plancat.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 1a3045479ff..f4a0fc2b6c6 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -60,6 +60,7 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +skip_tree_height_hook_type skip_tree_height_hook = NULL; static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, @@ -457,7 +458,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->tuples = rel->tuples; } - if (info->relam == BTREE_AM_OID) + if (info->relam == BTREE_AM_OID && (!skip_tree_height_hook || !skip_tree_height_hook(indexRelation))) { /* * For btrees, get tree height while we have the index diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index eb1c3ccc4bf..812927ddcf4 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -24,6 +24,9 @@ typedef void 
(*get_relation_info_hook_type) (PlannerInfo *root, RelOptInfo *rel); extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; +typedef bool (*skip_tree_height_hook_type) (Relation indexRelation); +extern PGDLLIMPORT skip_tree_height_hook_type skip_tree_height_hook; + extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); From 3833e5d936b42f4ce911dfb8a9dba9e04630485a Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 17 Sep 2024 01:58:24 +0300 Subject: [PATCH 38/45] Fix handling tupleid in logical replication --- src/backend/access/table/tableam.c | 8 ++-- src/backend/executor/execReplication.c | 54 +++++++++++++++++------- src/backend/replication/logical/worker.c | 15 +++---- src/include/access/tableam.h | 4 +- 4 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 67d63e0a6ec..3f64d70666e 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -297,7 +297,7 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). */ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, +simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *oldSlot) { TM_Result result; @@ -308,7 +308,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_delete(rel, PointerGetDatum(tid), + result = table_tuple_delete(rel, tupleid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, @@ -349,7 +349,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, * via ereport(). 
*/ void -simple_table_tuple_update(Relation rel, ItemPointer otid, +simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, TU_UpdateIndexes *update_indexes, @@ -364,7 +364,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, PointerGetDatum(otid), slot, + result = table_tuple_update(rel, tupleid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 6e2388005fb..136e761fa2f 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -125,6 +125,25 @@ build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, return skey_attoff; } +static Datum +slot_get_tupleid(Relation rel, TupleTableSlot *slot) +{ + Datum tupleid; + + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&slot->tts_tid); + } + + return tupleid; +} + /* * Search the relation 'rel' for tuple using the index. 
* @@ -209,7 +228,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), GetLatestSnapshot(), outslot, GetCurrentCommandId(false), @@ -394,7 +413,7 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), GetLatestSnapshot(), outslot, GetCurrentCommandId(false), @@ -518,7 +537,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &(searchslot->tts_tid); + Datum tupleid = slot_get_tupleid(rel, searchslot); /* For now we support only tables. */ Assert(rel->rd_rel->relkind == RELKIND_RELATION); @@ -530,7 +549,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - PointerGetDatum(tid), NULL, slot, NULL, NULL)) + tupleid, NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -552,16 +571,17 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); - if (resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_update_after_row) - oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); - simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, + simple_table_tuple_update(rel, tupleid, slot, estate->es_snapshot, &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, true, false, + recheckIndexes = 
ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + estate, + false, NULL, NIL, (update_indexes == TU_Summarizing)); @@ -588,7 +608,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &searchslot->tts_tid; + Datum tupleid = slot_get_tupleid(rel, searchslot); CheckCmdReplicaIdentity(rel, CMD_DELETE); @@ -597,19 +617,21 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - PointerGetDatum(tid), NULL, NULL, NULL, NULL); + tupleid, NULL, NULL, NULL, NULL); } if (!skip_tuple) { TupleTableSlot *oldSlot = NULL; - if (resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_delete_after_row) - oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot, oldSlot); + simple_table_tuple_delete(rel, tupleid, estate->es_snapshot, oldSlot); + + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, oldSlot, estate); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 832b1cf7642..dfd72bf8cca 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2430,9 +2430,8 @@ apply_handle_insert(StringInfo s) /* Initialize the executor state. 
*/ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Process and store remote tuple in the slot */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -2586,9 +2585,8 @@ apply_handle_update(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* * Populate updatedCols so that per-column triggers can fire, and so @@ -2766,9 +2764,8 @@ apply_handle_delete(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Build the search tuple. 
*/ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 2610d9ca692..62ffc14e8f8 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -2094,10 +2094,10 @@ table_tuple_is_current(Relation rel, TupleTableSlot *slot) */ extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); -extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, +extern void simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *oldSlot); -extern void simple_table_tuple_update(Relation rel, ItemPointer otid, +extern void simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot); From 24fb47810aab793f17ad31f01b4bbab04ef15bb5 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 27 Sep 2024 14:26:40 +0200 Subject: [PATCH 39/45] New CSN snapshot format * Add xlogptr and xmin to determine right order of transactions when decoding on replica. * Add CSN snapshot data to snapshot builder. * Record CSN to the running xids and restore it during logical decoding to the snapshot builder. * Add function to update CSN snapshot data in snapshot builder. * Update CSN snapshot LSN in snapshot building after each transaction commit. * Restore CSN snapshot data in SnapBuildBuildSnapshot(). 
--- src/backend/replication/logical/snapbuild.c | 16 ++++++++++++++++ src/backend/storage/ipc/procarray.c | 1 + src/backend/storage/ipc/standby.c | 1 + src/backend/utils/time/snapmgr.c | 10 +++++++--- src/include/replication/snapbuild.h | 2 ++ src/include/storage/standby.h | 1 + src/include/storage/standbydefs.h | 1 + src/include/utils/snapshot.h | 9 ++++++++- 8 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 5b92c5542be..3e2a275a8eb 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -207,6 +207,8 @@ struct SnapBuild */ TransactionId next_phase_at; + CSNSnapshotData csnSnapshotData; + /* * Array of transactions which could have catalog changes that committed * between xmin and xmax. @@ -562,6 +564,8 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->regd_count = 0; snapshot->snapXactCompletionCount = 0; + snapshot->csnSnapshotData = builder->csnSnapshotData; + return snapshot; } @@ -1039,6 +1043,8 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, TransactionId xmax = xid; + builder->csnSnapshotData.xlogptr = lsn; + /* * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor * will they be part of a snapshot. So we don't need to record anything. @@ -1253,6 +1259,9 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we hit fast paths in heapam_visibility.c. 
*/ builder->xmin = running->oldestRunningXid; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; /* Remove transactions we don't need to keep track off anymore */ SnapBuildPurgeOlderTxn(builder); @@ -2150,3 +2159,10 @@ CheckPointSnapBuild(void) } FreeDir(snap_dir); } + +void +SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData) +{ + builder->csnSnapshotData = *csnSnapshotData; +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index f41656027d5..a5ada9beb54 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2866,6 +2866,7 @@ GetRunningTransactionData(void) CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + CurrentRunningXacts->csn = pg_atomic_read_u64(&ShmemVariableCache->nextCommitSeqNo); Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 3bdc5f7fb6c..1e6760a7c49 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1355,6 +1355,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + xlrec.csn = CurrRunningXacts->csn; /* Header */ XLogBeginInsert(); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 526cacb70a5..4aa78a38868 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -196,7 +196,7 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; 
XLogRecPtr lsn; - CommitSeqNo snapshotcsn; + CSNSnapshotData csnSnapshotData; uint64 undoRegularLocation; uint64 undoRegularXmin; uint64 undoSystemLocation; @@ -2194,7 +2194,9 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; - serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; + serialized_snapshot.csnSnapshotData.xmin = snapshot->csnSnapshotData.xmin; + serialized_snapshot.csnSnapshotData.snapshotcsn = snapshot->csnSnapshotData.snapshotcsn; + serialized_snapshot.csnSnapshotData.xlogptr = snapshot->csnSnapshotData.xlogptr; serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; @@ -2274,7 +2276,9 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; - snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; + snapshot->csnSnapshotData.xmin = serialized_snapshot.csnSnapshotData.xmin; + snapshot->csnSnapshotData.snapshotcsn = serialized_snapshot.csnSnapshotData.snapshotcsn; + snapshot->csnSnapshotData.xlogptr = serialized_snapshot.csnSnapshotData.xlogptr; snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 071fca6d3b5..a7b793dae3c 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -91,5 +91,7 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, extern 
void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, struct xl_running_xacts *running); extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); +extern void SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData); #endif /* SNAPBUILD_H */ diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index bb7d90c7ad6..b97394b4841 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -91,6 +91,7 @@ typedef struct RunningTransactionsData TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId *xids; /* array of (sub)xids still running */ } RunningTransactionsData; diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index 188e348618a..23dddce8d84 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -52,6 +52,7 @@ typedef struct xl_running_xacts TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; } xl_running_xacts; diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index d4392d7dc04..01093a33315 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -129,6 +129,13 @@ typedef struct pairingheap_node ph_node; } RetainUndoLocationPHNode; +typedef struct CSNSnapshotData +{ + uint64 xmin; + CommitSeqNo snapshotcsn; + XLogRecPtr xlogptr; +} CSNSnapshotData; + /* * Struct representing all kind of possible snapshots. 
* @@ -224,7 +231,7 @@ typedef struct SnapshotData RetainUndoLocationPHNode undoRegularLocationPhNode; RetainUndoLocationPHNode undoSystemLocationPhNode; - CommitSeqNo snapshotcsn; + CSNSnapshotData csnSnapshotData; } SnapshotData; typedef void (*snapshot_hook_type) (Snapshot snapshot); From f5f12cedad37f7db3f842835e26a3bd3b996784c Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 3 Oct 2024 13:12:01 +0300 Subject: [PATCH 40/45] Restart archiver during PM_SHUTDOWN postmaster stage That allows S3 mode to finish WAL archiving if needed. --- src/backend/postmaster/postmaster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index c9727bcdbbf..7a9c875ee7e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -448,7 +448,7 @@ static void InitPostmasterDeathWatchHandle(void); * even during recovery. */ #define PgArchStartupAllowed() \ - (((XLogArchivingActive() && pmState == PM_RUN) || \ + (((XLogArchivingActive() && (pmState == PM_RUN || pmState == PM_SHUTDOWN)) || \ (XLogArchivingAlways() && \ (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && \ PgArchCanRestart()) From bf4510e7ca3e1972b80d471ff4fc246694058e1b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 8 Oct 2024 21:31:33 +0300 Subject: [PATCH 41/45] Add handling of CSN snapshot in some places of snapbuild.c --- src/backend/replication/logical/snapbuild.c | 8 +++++--- src/backend/utils/time/snapmgr.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3e2a275a8eb..3cc86087fd1 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -663,6 +663,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) snap->snapshot_type = SNAPSHOT_MVCC; snap->xcnt = newxcnt; snap->xip = newxip; + 
snap->csnSnapshotData = builder->csnSnapshotData; return snap; } @@ -1232,6 +1233,10 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact ReorderBufferTXN *txn; TransactionId xmin; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; + /* * If we're not consistent yet, inspect the record to see whether it * allows to get closer to being consistent. If we are consistent, dump @@ -1259,9 +1264,6 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we hit fast paths in heapam_visibility.c. */ builder->xmin = running->oldestRunningXid; - builder->csnSnapshotData.snapshotcsn = running->csn; - builder->csnSnapshotData.xmin = 0; - builder->csnSnapshotData.xlogptr = lsn; /* Remove transactions we don't need to keep track off anymore */ SnapBuildPurgeOlderTxn(builder); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 4aa78a38868..283255cdaad 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -551,6 +551,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, CurrentSnapshot->xmin = sourcesnap->xmin; CurrentSnapshot->xmax = sourcesnap->xmax; CurrentSnapshot->xcnt = sourcesnap->xcnt; + CurrentSnapshot->csnSnapshotData = sourcesnap->csnSnapshotData; Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount()); if (sourcesnap->xcnt > 0) memcpy(CurrentSnapshot->xip, sourcesnap->xip, From 7e50554f03bfaa1235c985d30cb0c2ac0e937b71 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 14 Oct 2024 16:22:14 +0300 Subject: [PATCH 42/45] Move CheckPoint_hook() call after CheckPointBuffers() That allows to process flushed buffers in CheckPoint_hook(). 
--- src/backend/access/transam/xlog.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c2820f29e7c..ee0794465b1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7041,8 +7041,6 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - if (CheckPoint_hook) - CheckPoint_hook(checkPointRedo, flags); CheckPointRelationMap(); CheckPointReplicationSlots(); CheckPointSnapBuild(); @@ -7059,6 +7057,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointPredicate(); CheckPointBuffers(flags); + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); + /* Perform all queued up fsyncs */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); From 3e6f16b6473da78fb74a44a3b41ba031346e4736 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Tue, 15 Oct 2024 22:39:08 +0400 Subject: [PATCH 43/45] Restore GetIndexAmRoutine signature for compatibility with other callers Use GetIndexAmRoutineExtended instead for all Orioledb extensibility. --- src/backend/access/index/amapi.c | 11 ++++++++--- src/backend/commands/indexcmds.c | 2 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/cache/relcache.c | 2 +- src/include/access/amapi.h | 3 ++- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index f4526997474..ed2b9fc9e68 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -47,7 +47,6 @@ GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) return routine; } - /* * GetIndexAmRoutine - call the specified access method handler routine to get * its IndexAmRoutine struct, which will be palloc'd in the caller's context. 
@@ -57,7 +56,13 @@ GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) * indexes for the system catalogs. relcache.c relies on that. */ IndexAmRoutine * -GetIndexAmRoutine(Oid indoid, Oid amhandler) +GetIndexAmRoutine(Oid amhandler) +{ + return GetIndexAmRoutineExtended(InvalidOid, amhandler); +} + +IndexAmRoutine * +GetIndexAmRoutineExtended(Oid indoid, Oid amhandler) { HeapTuple ht_idx; HeapTuple ht_tblrel; @@ -146,7 +151,7 @@ GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. */ - return GetIndexAmRoutine(indoid, amhandler); + return GetIndexAmRoutineExtended(indoid, amhandler); } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 390d9f898b3..df4fffc4e37 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -214,7 +214,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(oldId, accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineExtended(oldId, accessMethodForm->amhandler); ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index d38b62ee569..ecae9d86420 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1313,7 +1313,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(indexrelid, amrec->amhandler); + amroutine = GetIndexAmRoutineExtended(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. 
(NOTE: we do not use the relcache diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2a16440bf09..18b2ebdd59f 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1404,7 +1404,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_id, relation->rd_amhandler); + tmp = GetIndexAmRoutineExtended(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. */ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 91045064b40..bb226d85fad 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -310,7 +310,8 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutine(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineExtended(Oid indoid, Oid amhandler); extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); From 6aeb4e4858bd011725a33156e0ec9c7ed680bb6f Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Wed, 16 Oct 2024 19:23:07 +0400 Subject: [PATCH 44/45] Make index insert compatible with outside callers We split aminsert method to aminsert and aminsertextended. aminsert is a method for indexes implemented in other extensions, it accepts ItemPointer tupleid. aminsertextended is for internal Postgres indexes and Orioledb, it accepts Datum tupleid. They are not supposed to call aminsert method, so that it is set NULL for them. 
We can not rely that extensions are aware of aminsertextended, so index_insert() calls aminsert if it's not NULL preferentially. Signature of index_insert() is reverted so that it could be called by other extensions. Datum tupleid is confined inside index_insert method. --- contrib/bloom/blutils.c | 3 ++- doc/src/sgml/indexam.sgml | 1 + src/backend/access/brin/brin.c | 3 ++- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/ginutil.c | 3 ++- src/backend/access/gist/gist.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/index/indexam.c | 21 ++++++++++++++++--- src/backend/access/nbtree/nbtree.c | 3 ++- src/backend/access/spgist/spgutils.c | 3 ++- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/execIndexing.c | 20 ++++++++---------- src/include/access/amapi.h | 12 +++++++++++ src/include/access/genam.h | 2 +- .../modules/dummy_index_am/dummy_index_am.c | 3 ++- src/tools/pgindent/typedefs.list | 1 + 18 files changed, 62 insertions(+), 27 deletions(-) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index f23fbb1d9e0..d92858a3433 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -130,7 +130,8 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambuild = blbuild; amroutine->ambuildempty = blbuildempty; - amroutine->aminsert = blinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = blinsert; amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 30eda37afa8..cee79776683 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -139,6 +139,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; ambulkdelete_function ambulkdelete; 
amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index a0052239645..38469a5a554 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -116,7 +116,8 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambuild = brinbuild; amroutine->ambuildempty = brinbuildempty; - amroutine->aminsert = brininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = brininsert; amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 653a4f7d469..9b6a5d9091c 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -339,7 +339,7 @@ toast_save_datum(Relation rel, Datum value, /* Only index relations marked as ready can be updated */ if (toastidxs[i]->rd_index->indisready) index_insert(toastidxs[i], t_values, t_isnull, - ItemPointerGetDatum(&(toasttup->t_self)), + &(toasttup->t_self), toastrel, toastidxs[i]->rd_index->indisunique ? 
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 7a4cd93f301..52d9a725fc4 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -63,7 +63,8 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambuild = ginbuild; amroutine->ambuildempty = ginbuildempty; - amroutine->aminsert = gininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gininsert; amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 53680b30d87..73193f0970d 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -85,7 +85,8 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambuild = gistbuild; amroutine->ambuildempty = gistbuildempty; - amroutine->aminsert = gistinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gistinsert; amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; amroutine->amcanreturn = gistcanreturn; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index c8202e9349d..ffddf7b900c 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -82,7 +82,8 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambuild = hashbuild; amroutine->ambuildempty = hashbuildempty; - amroutine->aminsert = hashinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = hashinsert; amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 8f6559c9c3e..a32fc3b69fb 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2310,7 +2310,7 @@ heapam_index_validate_scan(Relation heapRelation, 
index_insert(indexRelation, values, isnull, - ItemPointerGetDatum(&rootTuple), + &rootTuple, heapRelation, indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0dabdabca8e..94bdec63666 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -218,24 +218,39 @@ bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - Datum tupleid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { RELATION_CHECKS; - CHECK_REL_PROCEDURE(aminsert); + + if (indexRelation->rd_indam->aminsertextended == NULL && indexRelation->rd_indam->aminsert == NULL ) + elog(ERROR, "at least one function aminsert or aminsertextended should be defined for index \"%s\"", \ + RelationGetRelationName(indexRelation)); if (!(indexRelation->rd_indam->ampredlocks)) CheckForSerializableConflictIn(indexRelation, (ItemPointer) NULL, InvalidBlockNumber); - return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, + if (indexRelation->rd_indam->aminsert) + { + /* compatibility method for extension AM's not aware of aminsertextended */ + return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, tupleid, heapRelation, checkUnique, indexUnchanged, indexInfo); + } + else + { + /* index insert method for internal AM's and Orioledb that are aware of aminsertextended */ + return indexRelation->rd_indam->aminsertextended(indexRelation, values, isnull, + ItemPointerGetDatum(tupleid), heapRelation, + checkUnique, indexUnchanged, + indexInfo); + } } /* ---------------- diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 32ec09b1ec7..44daed95baf 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -121,7 +121,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = 
btbuildempty; - amroutine->aminsert = btinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = btinsert; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; amroutine->amcanreturn = btcanreturn; diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 5fa9e230c08..127ff3922d1 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -69,7 +69,8 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambuild = spgbuild; amroutine->ambuildempty = spgbuildempty; - amroutine->aminsert = spginsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = spginsert; amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; amroutine->amcanreturn = spgcanreturn; diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index 9846637537c..522da0ac855 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -170,7 +170,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, index_insert(index, /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - ItemPointerGetDatum(&(heapTuple->t_self)), /* tid of heap tuple */ + &(heapTuple->t_self), /* tid of heap tuple */ heapRelation, index->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 04e718c95cd..982bae9ed42 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -173,7 +173,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) * the row is now dead, because that is the TID the index will know * about. 
*/ - index_insert(indexRel, values, isnull, ItemPointerGetDatum(&checktid), + index_insert(indexRel, values, isnull, &checktid, trigdata->tg_relation, UNIQUE_CHECK_EXISTING, false, indexInfo); } diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 266c876c8f6..a40aebb1ef1 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -308,19 +308,19 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - Datum tupleid; + ItemPointer tupleid; if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { bool isnull; - tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); Assert(!isnull); } else { Assert(ItemPointerIsValid(&slot->tts_tid)); - tupleid = PointerGetDatum(&slot->tts_tid); + tupleid = &slot->tts_tid; } /* @@ -468,7 +468,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; - ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -489,7 +488,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - raw_tupleid, values, isnull, + tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } @@ -532,18 +531,18 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - Datum tupleid; + ItemPointer tupleid; if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { bool isnull; - tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); Assert(!isnull); } else { Assert(ItemPointerIsValid(&slot->tts_tid)); - tupleid = 
PointerGetDatum(&slot->tts_tid); + tupleid = &slot->tts_tid; } /* @@ -712,7 +711,7 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, old_valid, values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + ItemPointerGetDatum(tupleid), /* tid of heap tuple */ valuesOld, isnullOld, oldTupleid, @@ -763,7 +762,6 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; - ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -784,7 +782,7 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - raw_tupleid, values, isnull, + tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index bb226d85fad..73320f93be7 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -105,6 +105,16 @@ typedef void (*ambuildempty_function) (Relation indexRelation); /* insert this tuple */ typedef bool (*aminsert_function) (Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer tupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* extended version of aminsert taking Datum tupleid */ +typedef bool (*aminsert_extended_function) (Relation indexRelation, Datum *values, bool *isnull, Datum tupleid, @@ -112,6 +122,7 @@ typedef bool (*aminsert_function) (Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); + /* update this tuple */ typedef bool (*amupdate_function) (Relation indexRelation, bool new_valid, @@ -282,6 +293,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; amupdate_function amupdate; amdelete_function amdelete; 
ambulkdelete_function ambulkdelete; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 696c063373e..0de79f782a5 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,7 +144,7 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - Datum tupleid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 562a578cad2..09c5d20479d 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -302,7 +302,8 @@ dihandler(PG_FUNCTION_ARGS) amroutine->ambuild = dibuild; amroutine->ambuildempty = dibuildempty; - amroutine->aminsert = diinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = diinsert; amroutine->ambulkdelete = dibulkdelete; amroutine->amvacuumcleanup = divacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 4791528e140..264bdbdee0f 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3146,6 +3146,7 @@ amgetbitmap_function amgettuple_function aminitparallelscan_function aminsert_function +aminsert_extended_function ammarkpos_function amoptions_function amparallelrescan_function From 050b9c56c6e06d8ef7bf905112b9beca10aa5a9d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 25 Oct 2024 14:35:13 +0900 Subject: [PATCH 45/45] Detect patchset version automatically with Meson too It's the same approach done in configure.ac. 
--- meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 17f10b9851a..4158f96ad41 100644 --- a/meson.build +++ b/meson.build @@ -153,7 +153,8 @@ cdata.set('PG_VERSION_NUM', pg_version_num) # PG_VERSION_STR is built later, it depends on compiler test results cdata.set_quoted('CONFIGURE_ARGS', '') -orioledb_patchset_version = '22' +git_describe_tags = run_command('git', 'describe', '--tags', check: true) +orioledb_patchset_version = git_describe_tags.stdout().strip().split('_')[1] ###############################################################