diff --git a/.abi-check/6.27.1_arenadata63/postgres.symbols.ignore b/.abi-check/6.27.1_arenadata63/postgres.symbols.ignore new file mode 100644 index 000000000000..2c3d6c6c1abe --- /dev/null +++ b/.abi-check/6.27.1_arenadata63/postgres.symbols.ignore @@ -0,0 +1,2 @@ +ConfigureNamesBool_gp +log_smgrcreate diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 65ff67428611..2a18785431b2 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -24,7 +24,7 @@ smgr_desc(StringInfo buf, XLogRecord *record) uint8 info = record->xl_info & ~XLR_INFO_MASK; char *rec = XLogRecGetData(record); - if (info == XLOG_SMGR_CREATE) + if ((info == XLOG_SMGR_CREATE) || (info == XLOG_SMGR_CREATE_PDL)) { xl_smgr_create *xlrec = (xl_smgr_create *) rec; char *path = relpathperm(xlrec->rnode, xlrec->forkNum); diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index e68c2b11639d..898f5a9fed78 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -17,6 +17,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "catalog/storage_pending_deletes.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -233,6 +234,11 @@ xlog_desc(StringInfo buf, XLogRecord *record) (uint32) xlrec.overwritten_lsn, timestamptz_to_str(xlrec.overwrite_time)); } + else if (info == XLOG_PENDING_DELETE) + { + appendStringInfo(buf, "orphaned relfilenodes to delete: %zu", + ((PendingRelXactDeleteArray *)rec)->count); + } else appendStringInfoString(buf, "UNKNOWN"); } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 8e6f3a4daf20..c99a8ae10f98 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -56,6 +56,7 @@ #include "access/xlogutils.h" #include "catalog/pg_type.h" #include "catalog/storage.h" +#include 
"catalog/storage_pending_deletes_redo.h" #include "catalog/storage_tablespace.h" #include "catalog/storage_database.h" #include "funcapi.h" @@ -2459,3 +2460,85 @@ getTwoPhaseOldestPreparedTransactionXLogRecPtr(prepared_transaction_agg_state *p return oldest; } /* end getTwoPhaseOldestPreparedTransactionXLogRecPtr */ + +bool +RemovePendingDeletesForPreparedTransactions() +{ + HASH_SEQ_STATUS scan_status; + prpt_map *entry; + XLogReaderState *xlogreader; + volatile bool result = true; + XLogRecord *xlogrec = NULL; + MemoryContext oldcontext = CurrentMemoryContext; + + if (NULL == crashRecoverPostCheckpointPreparedTransactions_map_ht) + return result; + + xlogreader = XLogReaderAllocate(&read_local_xlog_page, NULL); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating an XLog reading processor."))); + + hash_seq_init(&scan_status, + crashRecoverPostCheckpointPreparedTransactions_map_ht); + while ((entry = (prpt_map *) hash_seq_search(&scan_status)) != NULL) + { + char *errormsg = NULL; + TwoPhaseFileHeader *hdr; + + if (entry->xlogrecptr == InvalidXLogRecPtr) + continue; + + int savedInterruptHoldoffCount = InterruptHoldoffCount; + PG_TRY(); + { + xlogrec = XLogReadRecord(xlogreader, entry->xlogrecptr, &errormsg); + } + PG_CATCH(); + { + MemoryContextSwitchTo(oldcontext); + InterruptHoldoffCount = savedInterruptHoldoffCount; + FlushErrorState(); + result = false; + } + PG_END_TRY(); + + if (!result) + { + elog(LOG, "Failed to read WAL record %X/%X for XID %u in %s", + (uint32) (entry->xlogrecptr >> 32), + (uint32) entry->xlogrecptr, + entry->xid, + __func__); + break; + } + + if (NULL == xlogrec) + { + if (errormsg) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("xlog record is invalid"), + errdetail("%s", errormsg))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("xlog record is invalid"))); + } + + hdr = (TwoPhaseFileHeader *) 
XLogRecGetData(xlogrec); + + TransactionId *subxids = (hdr->nsubxacts > 0) ? + (TransactionId *) + ((char *) hdr + MAXALIGN(sizeof(TwoPhaseFileHeader))) : + NULL; + + PdlRedoRemoveTree(hdr->xid, subxids, hdr->nsubxacts); + } + + XLogReaderFree(xlogreader); + + return result; +} /* end RemovePendingDeletesForPreparedTransactions */ diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 618ce3f536ae..c9dad916f323 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -32,6 +32,7 @@ #include "catalog/namespace.h" #include "catalog/oid_dispatch.h" #include "catalog/storage.h" +#include "catalog/storage_pending_deletes_redo.h" #include "catalog/storage_tablespace.h" #include "catalog/storage_database.h" #include "commands/async.h" @@ -6223,6 +6224,8 @@ xact_redo_commit_internal(TransactionId xid, XLogRecPtr lsn, DoTablespaceDeletionForRedoXlog(tablespace_oid_to_delete); + PdlRedoRemoveTree(xid, sub_xids, nsubxacts); + /* * We issue an XLogFlush() for the same reason we emit ForceSyncCommit() * in normal operation. 
For example, in CREATE DATABASE, we copy all files @@ -6380,6 +6383,8 @@ xact_redo_distributed_commit(xl_xact_commit *xlrec, TransactionId xid) DropRelationFiles(xlrec->xnodes, xlrec->nrels, true); DropDatabaseDirectories(deldbs, xlrec->ndeldbs, true); DoTablespaceDeletionForRedoXlog(xlrec->tablespace_oid_to_delete_on_commit); + + PdlRedoRemoveTree(xid, sub_xids, xlrec->nsubxacts); } /* @@ -6455,6 +6460,8 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) DropRelationFiles(xlrec->xnodes, xlrec->nrels, true); DropDatabaseDirectories(deldbs, xlrec->ndeldbs, true); DoTablespaceDeletionForRedoXlog(xlrec->tablespace_oid_to_delete_on_abort); + + PdlRedoRemoveTree(xid, sub_xids, xlrec->nsubxacts); } static void diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 31a97a196adb..a5ec3a9e31c3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -36,6 +36,7 @@ #include "catalog/catversion.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" +#include "catalog/storage_pending_deletes_redo.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" @@ -7451,6 +7452,40 @@ StartupXLOG(void) TimeLineID newTLI = ThisTimeLineID; TimeLineID prevTLI = ThisTimeLineID; + if ((info == XLOG_CHECKPOINT_SHUTDOWN) || + (info == XLOG_END_OF_RECOVERY)) + { + /* + * At this point we may encounter a situation, when some + * prepared transaction is yet not committed/aborted, + * but the respective WAL segment file is already + * recycled. It may happen is some corner cases, like: + * 1. Primary successfully performs Prepare for a + * transaction; + * 2. Primary stops responding and Mirror is promoted; + * 3. New Primary (ex Mirror) commits the transaction; + * 4. New Primary (ex Mirror) recycles WAL segment with + * the Prepare record (because both Primary and Mirror + * has done the Prepare); + * 5. 
Ex Primary is recovered as new Mirror, it has the + * the transaction in the list of prepared transactions, + * but doesn't have the WAL segment. And the new Mirror + * should soon see the commit REDO record from the new + * Primary (and remove the transaction from the list of + * prepared transactions). + * + * In such a case + * RemovePendingDeletesForPreparedTransactions() will + * return FALSE. And we postpone the removal of orphaned + * files until all such prepared transactions without + * WAL segment files are wiped out from the list of + * prepared transactions. + */ + if (RemovePendingDeletesForPreparedTransactions()) + /* Clean up orphaned files */ + PdlRedoDropFiles(); + } + if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; @@ -7969,6 +8004,21 @@ StartupXLOG(void) UtilityModeCloseDtmRedoFile(); + /* + * By this moment, there shouldn't be any prepared transaction with + * missing respective WAL segment file, meaning + * RemovePendingDeletesForPreparedTransactions() should return TRUE. + * If not, most likely the respective WAL segment file is recycled + * illegally, and we do not perform orphaned files removal (as we might + * remove smth that is already committed). Instead, we emit a warning. + */ + if (RemovePendingDeletesForPreparedTransactions()) + /* Clean up orphaned files */ + PdlRedoDropFiles(); + else + ereport(WARNING, (errmsg( + "Couldn't drop orphaned files"))); + /* * And finally, execute the recovery_end_command, if any. 
*/ @@ -9316,6 +9366,9 @@ CreateCheckPoint(int flags) */ getDtxCheckPointInfo(&dtxCheckPointInfo, &dtxCheckPointInfoSize); + if (!shutdown) + PdlXLogInsert(); + CheckPointGuts(checkPoint.redo, flags); /* @@ -10782,6 +10835,10 @@ xlog_redo(XLogRecPtr beginLoc __attribute__((unused)), XLogRecPtr lsn __attribut /* Keep track of full_page_writes */ lastFullPageWrites = fpw; } + else if (info == XLOG_PENDING_DELETE) + { + PdlRedoXLogRecord(record); + } } /* diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index a766ce6a0ca6..2e6cdc3b4e1d 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -23,7 +23,8 @@ OBJS += pg_exttable.o pg_extprotocol.o \ pg_attribute_encoding.o pg_compression.o aovisimap.o \ pg_appendonly.o \ oid_dispatch.o aocatalog.o storage_tablespace.o storage_database.o \ - storage_tablespace_twophase.o storage_tablespace_xact.o + storage_tablespace_twophase.o storage_tablespace_xact.o \ + storage_pending_deletes_redo.o storage_pending_deletes.o BKIFILES = postgres.bki postgres.description postgres.shdescription diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 773d3bc32d9f..88a96cf04c5e 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -1895,7 +1895,9 @@ heap_create_init_fork(Relation rel) { RelationOpenSmgr(rel); smgrcreate(rel->rd_smgr, INIT_FORKNUM, false); - log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM); + log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, + INIT_FORKNUM, + rel->rd_rel->relstorage); smgrimmedsync(rel->rd_smgr, INIT_FORKNUM); } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 20c2da39157c..bf03d1829208 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -19,11 +19,13 @@ #include "postgres.h" +#include "access/transam.h" #include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/storage.h" 
+#include "catalog/storage_pending_deletes_redo.h" #include "catalog/storage_xlog.h" #include "common/relpath.h" #include "commands/dbcommands.h" @@ -57,11 +59,21 @@ typedef struct PendingRelDelete RelFileNodePendingDelete relnode; /* relation that may need to be deleted */ bool atCommit; /* T=delete at commit; F=delete at abort */ int nestLevel; /* xact nesting level of request */ + dsa_pointer shmemPtr; /* ptr to shared pending delete list node */ struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ +static void +PendingRelDeleteFree(PendingRelDelete *pending) +{ + Assert(pending != NULL); + if (DsaPointerIsValid(pending->shmemPtr)) + PdlShmemRemove(pending->shmemPtr); + pfree(pending); +} + /* * RelationCreateStorage * Create physical storage for a relation. @@ -80,6 +92,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, char relstorage) SMgrRelation srel; BackendId backend; bool needs_wal; + TransactionId xid = InvalidTransactionId; switch (relpersistence) { @@ -104,7 +117,14 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, char relstorage) smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) - log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM); + { + /* + * Call GetCurrentTransactionId before log_smgrcreate, because + * XLOG_SMGR_CREATE_PDL WAL record should be always linked to XID + */ + xid = GetCurrentTransactionId(); + log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM, relstorage); + } /* Add the relation to the list of stuff to delete at abort */ pending = (PendingRelDelete *) @@ -115,30 +135,34 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, char relstorage) pending->atCommit = false; /* delete if abort */ pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; + pending->shmemPtr = PdlShmemAdd(&pending->relnode, xid); pendingDeletes = pending; } /* - * Perform XLogInsert of a 
XLOG_SMGR_CREATE record to WAL. + * Perform XLogInsert of a XLOG_SMGR_CREATE_PDL record to WAL. */ void -log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum) +log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum, char relstorage) { - xl_smgr_create xlrec; + xl_smgr_create_pdl xlrec; XLogRecData rdata; /* * Make an XLOG entry reporting the file creation. */ - xlrec.rnode = *rnode; - xlrec.forkNum = forkNum; + xlrec.createrec.rnode = *rnode; + xlrec.createrec.forkNum = forkNum; + xlrec.relstorage = relstorage; rdata.data = (char *) &xlrec; rdata.len = sizeof(xlrec); rdata.buffer = InvalidBuffer; rdata.next = NULL; - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata); + XLogRecPtr recptr = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE_PDL, &rdata); + + XLogFlush(recptr); } /* @@ -159,6 +183,7 @@ RelationDropStorage(Relation rel) pending->atCommit = true; /* delete if commit */ pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; + pending->shmemPtr = InvalidDsaPointer; pendingDeletes = pending; /* @@ -210,7 +235,7 @@ RelationPreserveStorage(RelFileNode rnode, bool atCommit) prev->next = next; else pendingDeletes = next; - pfree(pending); + PendingRelDeleteFree(pending); /* prev does not change */ } else @@ -366,7 +391,7 @@ smgrDoPendingDeletes(bool isCommit) srels[nrels++] = srel; } /* must explicitly free the list entry */ - pfree(pending); + PendingRelDeleteFree(pending); /* prev does not change */ } } @@ -467,7 +492,7 @@ PostPrepare_smgr(void) next = pending->next; pendingDeletes = next; /* must explicitly free the list entry */ - pfree(pending); + PendingRelDeleteFree(pending); } } @@ -518,6 +543,30 @@ smgr_redo(XLogRecPtr beginLoc, XLogRecPtr lsn, XLogRecord *record) reln = smgropen(xlrec->rnode, InvalidBackendId); smgrcreate(reln, xlrec->forkNum, true); } + else if (info == XLOG_SMGR_CREATE_PDL) + { + xl_smgr_create_pdl *xlrec = + (xl_smgr_create_pdl *) XLogRecGetData(record); + PendingRelXactDelete pd = + { + .relnode = + { + .node 
= xlrec->createrec.rnode, + /* + * Temp relations are not logged in WAL, so it is always false + * here. + */ + .isTempRelation = false, + .relstorage = xlrec->relstorage + }, + .xid = record->xl_xid + }; + + SMgrRelation reln = smgropen(xlrec->createrec.rnode, InvalidBackendId); + smgrcreate(reln, xlrec->createrec.forkNum, true); + + PdlRedoAdd(&pd); + } else if (info == XLOG_SMGR_TRUNCATE) { xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record); diff --git a/src/backend/catalog/storage_pending_deletes.c b/src/backend/catalog/storage_pending_deletes.c new file mode 100644 index 000000000000..bf1d4573f77f --- /dev/null +++ b/src/backend/catalog/storage_pending_deletes.c @@ -0,0 +1,303 @@ +/*------------------------------------------------------------------------- + * + * storage_pending_deletes.c + * code to support collecting of pending deletes from backends + * + * Copyright (c) 2025 Greengage Community + * + * src/backend/catalog/storage_pending_deletes.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/storage_pending_deletes.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/dsa.h" +#include "utils/guc.h" + +typedef struct PendingDeleteListNode +{ + PendingRelXactDelete xrelnode; + dsa_pointer next; + dsa_pointer prev; +} PendingDeleteListNode; + +typedef struct PendingDeletesList +{ + LWLock *lock; /* protects the list */ + dsa_pointer head; /* ptr to PendingDeleteListNode list head */ +} PendingDeletesList; + +typedef struct BackendsPendingDeletesArray +{ + PendingDeletesList *array; + char dsa_mem[FLEXIBLE_ARRAY_MEMBER]; +} BackendsPendingDeletesArray; + +static BackendsPendingDeletesArray *BackendsPendingDeletes = NULL; + +static inline bool +is_tracking_enabled() +{ + return !IsBootstrapProcessingMode() && + gp_track_pending_delete && + dynamic_shared_memory_type != DSM_IMPL_NONE; +} + 
+/* Memory required for the BackendsPendingDeletesArray structure */ +static inline Size +PdlStructSize(void) +{ + return add_size(offsetof(BackendsPendingDeletesArray, dsa_mem), + dsa_minimum_size()); +} + +/* Memory required for array of PendingDeletesList-s */ +static inline Size +PdlListArraySize(void) +{ + return mul_size(sizeof(PendingDeletesList), MaxBackends); +} + +/* + * Calculate shmem size for pending deletes. + * BackendsPendingDeletesArray.dsa_mem should fit DSA. + */ +Size +PdlShmemSize(void) +{ + if (!gp_track_pending_delete) + return 0; + + return add_size(PdlStructSize(), PdlListArraySize()); +} + +/* Initialize shared memory pending delete lists for all backends */ +void +PdlShmemInit(void) +{ + if (!is_tracking_enabled()) + return; + + bool found; + + BackendsPendingDeletes = (BackendsPendingDeletesArray *) + ShmemInitStruct("Pending deletes array", PdlStructSize(), &found); + if (found) + return; + + BackendsPendingDeletes->array = (PendingDeletesList *) + ShmemAlloc(PdlListArraySize()); + if (BackendsPendingDeletes->array == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("Not enough memory to create pending deletes lists."))); + + for (int i = 0; i < MaxBackends; i++) + BackendsPendingDeletes->array[i] = (PendingDeletesList) + { + .head = InvalidDsaPointer, + .lock = LWLockAssign() + }; + + dsa_area *dsa = dsa_create_in_place( + BackendsPendingDeletes->dsa_mem, dsa_minimum_size(), + LWLockNewTrancheId(), "storage_pending_deletes", NULL); + + on_shmem_exit(dsa_on_shmem_exit_release_in_place, + (Datum) BackendsPendingDeletes->dsa_mem); + dsa_detach(dsa); +} + +/* + * Cleanup pending deletes list. 
+ * When the function is called, the list should be empty + */ +static void +pdl_beshutdown_hook(int code, Datum arg) +{ + dsa_release_in_place(BackendsPendingDeletes->dsa_mem); + + if (MyBackendId == InvalidBackendId) + return; + + PendingDeletesList *list = &BackendsPendingDeletes->array[MyBackendId]; + + if (!DsaPointerIsValid(list->head)) + return; + + /* Assert on debug build and warning on release */ + Assert(false); + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Pending deletes list is not empty. " + "MyBackend: %d, MyProcPid: %d", MyBackendId, MyProcPid))); + list->head = InvalidDsaPointer; +} + +/* Attach DSA once per process. */ +static dsa_area * +PdlAttachDsa(void) +{ + static dsa_area *dsa = NULL; /* ptr to DSA area attached by + * current process */ + + if (dsa) + return dsa; + + /* + * Keep the DSA area ptr in TopMemoryContext to avoid excessive + * attach/detach at every add/remove + */ + MemoryContext oldcxt = MemoryContextSwitchTo(TopMemoryContext); + + dsa = dsa_attach_in_place(BackendsPendingDeletes->dsa_mem, NULL); + MemoryContextSwitchTo(oldcxt); + + /* pin mappings, so they can survive res owner life end */ + dsa_pin_mapping(dsa); + + on_shmem_exit(pdl_beshutdown_hook, 0); + + return dsa; +} + +/* + * Add pending delete node to the list of current backend. + * Return DSA ptr of a created node. This ptr can be passed to PdlShmemRemove. + */ +dsa_pointer +PdlShmemAdd(const RelFileNodePendingDelete * relnode, TransactionId xid) +{ + if (!is_tracking_enabled() || xid == InvalidTransactionId || + MyBackendId == InvalidBackendId) + return InvalidDsaPointer; + + PendingDeleteListNode *node; + dsa_area *dsa = PdlAttachDsa(); + const dsa_pointer node_dsa = dsa_allocate(dsa, sizeof(*node)); + + if (!DsaPointerIsValid(node_dsa)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("Not enough memory to add pending delete node. 
" + "MyBackend: %d, MyProcPid: %d", MyBackendId, MyProcPid))); + + node = dsa_get_address(dsa, node_dsa); + *node = (PendingDeleteListNode) + { + .xrelnode = + { + .relnode = *relnode, + .xid = xid + }, + .prev = InvalidDsaPointer + }; + + PendingDeletesList *list = &BackendsPendingDeletes->array[MyBackendId]; + + LWLockAcquire(list->lock, LW_EXCLUSIVE); + node->next = list->head; + if (DsaPointerIsValid(node->next)) + { + PendingDeleteListNode *next_node = (PendingDeleteListNode *) + dsa_get_address(dsa, node->next); + + next_node->prev = node_dsa; + } + list->head = node_dsa; + LWLockRelease(list->lock); + + return node_dsa; +} + +/* + * Remove pending delete node from the list of current backend. + * node_ptr is a ptr to already added node (see PdlShmemAdd) + */ +void +PdlShmemRemove(dsa_pointer node_ptr) +{ + if (!is_tracking_enabled() || MyBackendId == InvalidBackendId) + return; + + Assert(DsaPointerIsValid(node_ptr)); + + dsa_area *dsa = PdlAttachDsa(); + PendingDeletesList *list = &BackendsPendingDeletes->array[MyBackendId]; + const PendingDeleteListNode *node = dsa_get_address(dsa, node_ptr); + + LWLockAcquire(list->lock, LW_EXCLUSIVE); + if (DsaPointerIsValid(node->next)) + { + PendingDeleteListNode *next_node = dsa_get_address(dsa, node->next); + + next_node->prev = node->prev; + } + + if (DsaPointerIsValid(node->prev)) + { + PendingDeleteListNode *prev_node = dsa_get_address(dsa, node->prev); + + prev_node->next = node->next; + } + else + list->head = node->next; + + LWLockRelease(list->lock); + + dsa_free(dsa, node_ptr); +} + +/* + * Collect info about pending deletes from all backends and return + * the accumulated result. Return NULL if there are no nodes in the lists. + * Note: the returned result is palloc'ed. Caller is responsible for + * freeing it. 
+ */ +PendingRelXactDeleteArray * +PdlXLogShmemDump(void) +{ + dsa_area *dsa = PdlAttachDsa(); + PendingRelXactDeleteArray *ret = NULL; + Size size = offsetof(PendingRelXactDeleteArray, array); + Size step = sizeof(*ret->array) * 32; + + for (int i = 0; i < MaxBackends; i++) + { + PendingDeletesList *list = &BackendsPendingDeletes->array[i]; + + LWLockAcquire(list->lock, LW_SHARED); + + for (dsa_pointer pdl_node_dsa = list->head; + DsaPointerIsValid(pdl_node_dsa);) + { + const PendingDeleteListNode *pdl_node = dsa_get_address(dsa, + pdl_node_dsa); + + if (ret == NULL) + { + size += step; + ret = palloc(size); + ret->count = 0; + } + else if (PdlDumpSize(ret->count + 1) > size) + { + step *= 2; + size += step; + ret = repalloc(ret, size); + } + + ret->array[ret->count++] = pdl_node->xrelnode; + pdl_node_dsa = pdl_node->next; + } + + LWLockRelease(list->lock); + } + + return ret; +} diff --git a/src/backend/catalog/storage_pending_deletes_redo.c b/src/backend/catalog/storage_pending_deletes_redo.c new file mode 100644 index 000000000000..59d7505451d2 --- /dev/null +++ b/src/backend/catalog/storage_pending_deletes_redo.c @@ -0,0 +1,345 @@ +/*------------------------------------------------------------------------- + * + * storage_pending_deletes_redo.c + * code to support processing of pending deletes (orphaned files) in WAL + * + * Copyright (c) 2025 Greengage Community + * + * src/backend/catalog/storage_pending_deletes_redo.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/transam.h" +#include "catalog/storage_pending_deletes_redo.h" +#include "miscadmin.h" +#include "storage/md.h" +#include "utils/elog.h" +#include "utils/guc.h" +#include "utils/hsearch.h" + +/* + * HTAB entry for pending deletes for the given xid. 
+ */ +typedef struct PendingDeleteHtabNode +{ + TransactionId xid; + List *relnode_list; /* list of RelFileNodePendingDelete */ +} PendingDeleteHtabNode; + +/* + * Hash table for pending deletes relfilenodes for a given xid. + */ +static HTAB *pendingDeletesRedo = NULL; + +static bool +PdlTrackingDisabled() +{ + return IsBootstrapProcessingMode() || !gp_track_pending_delete; +} + +/* + * This function inserts XLOG_PENDING_DELETE record into WAL. + */ +void +PdlXLogInsert() +{ + if (PdlTrackingDisabled()) + return; + + PendingRelXactDeleteArray *arr = PdlXLogShmemDump(); + + if (arr != NULL) + { + XLogRecPtr rec; + XLogRecData rdata = + { + .buffer = InvalidBuffer, + .data = (char *) arr, + .len = PdlDumpSize(arr->count), + .next = NULL, + .buffer_std = false + }; + + rec = XLogInsert(RM_XLOG_ID, XLOG_PENDING_DELETE, &rdata); + + XLogFlush(rec); + + elog(DEBUG1, "Pending delete XLog record inserted"); + + pfree(arr); + } +} + +/* + * This function adds pending delete node to a pendingDeletesRedo hash-table + * during WAL redo processing. + */ +void +PdlRedoAdd(PendingRelXactDelete * pd) +{ + Assert(pd); + + if (PdlTrackingDisabled() || (pd->xid == InvalidTransactionId)) + return; + + if (NULL == pendingDeletesRedo) + { + HASHCTL info = + { + .keysize = sizeof(TransactionId), + .entrysize = sizeof(PendingDeleteHtabNode) + }; + + pendingDeletesRedo = hash_create("pendingDeletesRedo hash", + 32, + &info, + HASH_ELEM); + } + + bool found = false; + + PendingDeleteHtabNode *entry = (PendingDeleteHtabNode *) + hash_search(pendingDeletesRedo, &pd->xid, HASH_ENTER, &found); + + if (!found) + { + entry->xid = pd->xid; + entry->relnode_list = NIL; + } + + RelFileNodePendingDelete *data = (RelFileNodePendingDelete *) + palloc(sizeof(*data)); + + *data = pd->relnode; + entry->relnode_list = lappend(entry->relnode_list, data); +} + +/* + * This function replays XLOG_PENDING_DELETE xlog record. 
+ */ +void +PdlRedoXLogRecord(XLogRecord *record) +{ + Assert(record); + + if (PdlTrackingDisabled()) + return; + + PendingRelXactDeleteArray *arr = (PendingRelXactDeleteArray *) + XLogRecGetData(record); + + TransactionId oldest_xid = ShmemVariableCache->oldestXid; + + Assert(arr->count); + + for (int i = 0; i < arr->count; i++) + { + PendingRelXactDelete *pd = &(arr->array[i]); + + /* + * This function should check transaction status before adding + * relfilenode to a pendingDeletesRedo hash table. Concurrent xlog + * inserts (concurrent to a checkpointing process) of commit or abort + * xlog records may out out-date pending deletes list. We don't want + * to use aggressive locking of shared structures in order to avoid + * performance drawbacks of concurrent commits or aborts. So the + * strategy is to double-check relfilenodes with it's transaction + * status. If it's TRANSACTION_STATUS_IN_PROGRESS, then it's + * permitted to delete files (it's orphaned), if it's in some other + * status - don't touch it. Also we should check transaction xid + * doesn't cross "freeze horizon" and compare it with current + * oldestXid value. Motivation of this check is that clog might get + * truncated after REDO point and before replaying XLOG_PENDING_DELETE + * record (though that looks like unlikely will happen in real-world, + * but still needs to be considered as possible scenario). So in that + * case we can't rely on xid status of that frozen transactions. + * Second point is that there is no way that clog would be truncated + * when transaction is in progress, so it's either been committed or + * aborted before that. 
+ */ + + if (TransactionIdPrecedes(pd->xid, oldest_xid)) + ereport(LOG, (errmsg( + "Prevented adding node for XLOG_PENDING_DELETE " + "record for xid: %u, oldestXid: %u", + pd->xid, oldest_xid))); + else + { + XLogRecPtr result; + XidStatus status = TransactionIdGetStatus(pd->xid, &result); + + if (status == TRANSACTION_STATUS_IN_PROGRESS) + PdlRedoAdd(pd); + else + ereport(LOG, (errmsg( + "Prevented adding node for XLOG_PENDING_DELETE " + "record for xid: %u, status: %d", + pd->xid, status))); + } + } +} + +static void +PdlRedoRemove(TransactionId xid) +{ + if ((xid == InvalidTransactionId) || + (NULL == pendingDeletesRedo)) + return; + + PendingDeleteHtabNode *entry = (PendingDeleteHtabNode *) + hash_search(pendingDeletesRedo, &xid, HASH_REMOVE, NULL); + + if (entry) + list_free_deep(entry->relnode_list); +} + +/* + * This function removes pending delete nodes from redo hash-table + * (pendingDeleteRedo) for a given transaction identified by it's xid and + * sub-transactions (if there are). + */ +void +PdlRedoRemoveTree(TransactionId xid, + TransactionId *sub_xids, int nsubxacts) +{ + if (PdlTrackingDisabled()) + return; + + for (int i = 0; i < nsubxacts; i++) + PdlRedoRemove(sub_xids[i]); + + PdlRedoRemove(xid); +} + +/* + * This function serializes the contents of hash table entry into a structure + * suitable to pass into DropRelationFiles() functions. 
+ */ +static RelFileNodePendingDelete * +PdlRedoPrepareArrayForDrop(PendingDeleteHtabNode *hnode, int *ndelrels) +{ + ListCell *cell; + + foreach(cell, hnode->relnode_list) + { + RelFileNodePendingDelete *pending_delete_node = + (RelFileNodePendingDelete *) lfirst(cell); + ListCell *i_cell = lnext(cell); + ListCell *i_cell_prev = cell; + + while (i_cell) + { + ListCell *i_cell_next = lnext(i_cell); + RelFileNodePendingDelete *i_relnode = + (RelFileNodePendingDelete *) lfirst(i_cell); + + if (RelFileNodeEquals(pending_delete_node->node, i_relnode->node)) + { + elog(DEBUG1, + "Duplicate pending delete node found: " + "(rel: (%u: %u: %u); xid: %u)", + pending_delete_node->node.spcNode, + pending_delete_node->node.dbNode, + pending_delete_node->node.relNode, + hnode->xid); + + hnode->relnode_list = + list_delete_cell(hnode->relnode_list, i_cell, i_cell_prev); + pfree(i_relnode); + } + else + i_cell_prev = i_cell; + + i_cell = i_cell_next; + } + } + + *ndelrels = list_length(hnode->relnode_list); + + if (*ndelrels <= 0) + { + ereport(WARNING, (errmsg("Empty list for xid: %u", hnode->xid))); + return NULL; + } + + RelFileNodePendingDelete *delrels = (RelFileNodePendingDelete *) + palloc((*ndelrels) * sizeof(*delrels)); + + int i = 0; + + foreach_with_count(cell, hnode->relnode_list, i) + { + RelFileNodePendingDelete *pending_delete_node = + (RelFileNodePendingDelete *) lfirst(cell); + + ereport(LOG, (errmsg( + "Prepare to drop node (%u: %u: %u) for xid: %u", + pending_delete_node->node.spcNode, + pending_delete_node->node.dbNode, + pending_delete_node->node.relNode, + hnode->xid))); + + delrels[i] = *pending_delete_node; + } + + return delrels; +} + +/* + * This function deletes files for pending delete nodes. 
+ *
+ * Does nothing when tracking is disabled or when no pending deletes were
+ * collected.  Otherwise every xid entry of pendingDeletesRedo is visited:
+ *
+ *	- entries whose xid precedes ShmemVariableCache->oldestXid are skipped
+ *	  with a WARNING;
+ *	- entries whose transaction status is anything but "in progress" are
+ *	  skipped with a WARNING;
+ *	- for all other entries the relation files are removed through
+ *	  DropRelationFiles() with isRedo = true.
+ *
+ * The relnode list of every visited entry is freed, and the hash table
+ * itself is destroyed at the end.
+ */
+void
+PdlRedoDropFiles()
+{
+	/* Tracking is off, or nothing has been collected: nothing to do. */
+	if (PdlTrackingDisabled() ||
+		(NULL == pendingDeletesRedo) ||
+		(hash_get_num_entries(pendingDeletesRedo) == 0))
+		return;
+
+	TransactionId oldest_xid = ShmemVariableCache->oldestXid;
+	HASH_SEQ_STATUS scan_status = {0};
+	PendingDeleteHtabNode *node;
+
+	hash_seq_init(&scan_status, pendingDeletesRedo);
+	while ((node = (PendingDeleteHtabNode *) hash_seq_search(&scan_status)) != NULL)
+	{
+		if (TransactionIdPrecedes(node->xid, oldest_xid))
+			ereport(WARNING, (errmsg(
+									 "Prevented drop files for xid: %u, oldestXid: %u",
+									 node->xid, oldest_xid)));
+		else
+		{
+			XLogRecPtr result;
+			XidStatus status = TransactionIdGetStatus(node->xid, &result);
+
+			if (status != TRANSACTION_STATUS_IN_PROGRESS)
+				ereport(WARNING, (errmsg(
+										 "Prevented drop files for xid: %u, status: %d",
+										 node->xid, status)));
+			else
+			{
+				int ndelrels = 0;
+				RelFileNodePendingDelete *delrels =
+				PdlRedoPrepareArrayForDrop(node, &ndelrels);
+
+				/*
+				 * PdlRedoPrepareArrayForDrop() returns NULL (after a
+				 * WARNING) when the entry has no relnodes.  pfree() must
+				 * not be given a NULL pointer, so skip the drop, the log
+				 * line and the free in that case.
+				 */
+				if (delrels != NULL)
+				{
+					DropRelationFiles(delrels, ndelrels, true);
+
+					/* xid is unsigned: print it with %u like the messages above */
+					ereport(LOG, (errmsg(
+										 "Pending delete rels were dropped (count: %d; xid: %u).",
+										 ndelrels,
+										 node->xid)));
+
+					pfree(delrels);
+				}
+			}
+		}
+
+		/* The entry is done either way; release its list and the nodes in it. */
+		list_free_deep(node->relnode_list);
+		node->relnode_list = NIL;
+	}
+
+	hash_destroy(pendingDeletesRedo);
+	pendingDeletesRedo = NULL;
+}
diff --git a/src/backend/catalog/test/Makefile b/src/backend/catalog/test/Makefile
index 656951a3f0fa..50e08514554a 100644
--- a/src/backend/catalog/test/Makefile
+++ b/src/backend/catalog/test/Makefile
@@ -5,16 +5,22 @@ subdir = src/backend/catalog include $(top_builddir)/src/Makefile.global include $(top_srcdir)/src/Makefile.mock - TARGETS += storage_tablespace +TARGETS += storage_pending_deletes +TARGETS += storage_pending_deletes_redo +include $(top_builddir)/src/backend/mock.mk storage_tablespace.t: $(top_srcdir)/src/backend/catalog/storage_tablespace.o make -C $(top_srcdir)/src/backend/catalog/ && \ $(CC) $(CFLAGS) $(LDFLAGS) $(CMOCKERY_OBJS) $(CPPFLAGS) \ $(top_srcdir)/src/backend/catalog/storage_tablespace.o \
storage_tablespace_test.c \ - -o storage_tablespace_test.o && ./storage_tablespace_test.o + -o storage_tablespace.t +storage_pending_deletes.t: \ + $(top_builddir)/src/backend/catalog/storage_pending_deletes.o -check: storage_tablespace.t \ No newline at end of file +storage_pending_deletes_redo.t: \ + $(top_builddir)/src/backend/catalog/storage_pending_deletes_redo.o \ + $(top_builddir)/src/backend/catalog/storage_pending_deletes.o
diff --git a/src/backend/catalog/test/storage_pending_deletes_redo_test.c b/src/backend/catalog/test/storage_pending_deletes_redo_test.c
new file mode 100644
index 000000000000..deb916b1e3f5
--- /dev/null
+++ b/src/backend/catalog/test/storage_pending_deletes_redo_test.c
@@ -0,0 +1,1103 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes_redo_test.c
+ *	  code to test functionality from storage_pending_deletes_redo.c
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ * src/backend/catalog/test/storage_pending_deletes_redo_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include "cmockery.h"
+
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/transam.h"
+#include "catalog/storage_pending_deletes_redo.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+#define TEST_TABLESPACE_OID1 11111
+#define TEST_TABLESPACE_OID2 11112
+
+#define TEST_DB_OID1 11121
+#define TEST_DB_OID2 11122
+
+#define TEST_REL_OID1 11211
+#define TEST_REL_OID2 11212
+
+#define TEST_XID 10
+
+#define TEST_XLOG_REC_PTR 100
+
+void
+__wrap_DropRelationFiles(RelFileNodePendingDelete *delrels,
+						 int ndelrels,
+						 bool isRedo);
+
+XidStatus
+__wrap_TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
+
+PendingRelXactDeleteArray *
+__wrap_PdlXLogShmemDump(void);
+
+XLogRecPtr
+__wrap_XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
+
+void
+			__wrap_XLogFlush(XLogRecPtr record);
+
+/* id of test, which
is currently being executed */
+static int test_number = 0;
+
+/*
+ * counter to accumulate how many times DropRelationFiles() was called during
+ * test
+ */
+static int DropRelationFiles_call_count = 0;
+
+/* counter to accumulate how many times XLogInsert() was called during test */
+static int XLogInsert_call_count = 0;
+
+/*
+ * counter to accumulate how many times PdlXLogShmemDump() was called during
+ * test
+ */
+static int PdlXLogShmemDump_call_count = 0;
+
+/*
+ * array of relnodes expected by test, in case there are more than 1-2 nodes
+ * involved.  The DropRelationFiles() mock crosses entries off by setting
+ * their relNode to InvalidOid.
+ * NOTE(review): "NOTES" in the macro name presumably means "NODES".
+ */
+#define TEST_EXPECTED_NOTES_COUNT 20
+static RelFileNode test_expected_relnodes[TEST_EXPECTED_NOTES_COUNT];
+
+/*
+ * List with transaction IDs that will report a complete (committed) status
+ * from the TransactionIdGetStatus() mock.  Every other xid is reported as
+ * in progress.
+ */
+static List *ls_transactions_comlpete = NIL;
+
+static void
+setup(int test)
+{
+	static VariableCacheData test_cache = {0};	/* stand-in target for ShmemVariableCache */
+
+	ShmemVariableCache = &test_cache;
+
+	DropRelationFiles_call_count = 0;	/* reset all mock call counters */
+	XLogInsert_call_count = 0;
+	PdlXLogShmemDump_call_count = 0;
+
+	test_number = test;	/* lets the mocks pick the expectations of this test */
+}
+
+void
+__wrap_DropRelationFiles(RelFileNodePendingDelete *delrels,
+						 int ndelrels,
+						 bool isRedo)
+{
+	DropRelationFiles_call_count++;
+	switch (test_number)	/* expectations depend on the running test */
+	{
+		case 1:
+		case 8:
+		case 9:
+		case 13:
+		case 18:
+		case 19:
+			{
+				assert_int_equal(ndelrels, 1);
+				assert_true(isRedo);
+				RelFileNodePendingDelete *pd = &(delrels[0]);
+
+				assert_false(pd->isTempRelation);
+				assert_int_equal(pd->node.spcNode, TEST_TABLESPACE_OID1);
+				assert_int_equal(pd->node.dbNode, TEST_DB_OID1);
+				assert_int_equal(pd->node.relNode, TEST_REL_OID1);
+				break;
+			}
+		case 3:
+			{
+				static RelFileNode test_3_expected_results[] =
+				{
+					[0] =
+					{
+						.spcNode = TEST_TABLESPACE_OID1,
+						.dbNode = TEST_DB_OID1,
+						.relNode = TEST_REL_OID1,
+					},
+					[1] =
+					{
+						.spcNode = TEST_TABLESPACE_OID2,
+						.dbNode = TEST_DB_OID2,
+						.relNode = TEST_REL_OID2,
+					}
+				};
+
+				assert_int_equal(ndelrels, 1);
+				assert_true(isRedo);
+				RelFileNodePendingDelete *pd = &(delrels[0]);
+
+				/*
+				 * We can't guarantee that the order of relnodes dropping will
+				 * be the same as the order of adding the pending delete
+				 * nodes. So we just need to ensure that we got all the
+				 * expected relnodes (and only them). We check it by excluding
+				 * values from the array of expected relnodes by replacing
+				 * them with InvalidOid. And we will check that all values are
+				 * excluded as the last step.
+				 */
+				for (int i = 0; i < ARRAY_SIZE(test_3_expected_results); i++)
+				{
+					if (RelFileNodeEquals(test_3_expected_results[i], pd->node))
+					{
+						test_3_expected_results[i].spcNode = InvalidOid;
+						test_3_expected_results[i].dbNode = InvalidOid;
+						test_3_expected_results[i].relNode = InvalidOid;
+					}
+				}
+
+				if (DropRelationFiles_call_count == 2)	/* second (last) call expected in test 3 */
+				{
+					for (int i = 0; i < ARRAY_SIZE(test_3_expected_results); i++)
+					{
+						assert_int_equal(test_3_expected_results[i].spcNode,
+										 InvalidOid);
+						assert_int_equal(test_3_expected_results[i].dbNode,
+										 InvalidOid);
+						assert_int_equal(test_3_expected_results[i].relNode,
+										 InvalidOid);
+					}
+				}
+
+				break;
+			}
+		case 4:
+			{
+				assert_int_equal(ndelrels, 2);
+				assert_true(isRedo);
+
+				RelFileNodePendingDelete *pd;
+
+				pd = &(delrels[0]);
+				assert_false(pd->isTempRelation);
+				assert_int_equal(pd->node.spcNode, TEST_TABLESPACE_OID1);
+				assert_int_equal(pd->node.dbNode, TEST_DB_OID1);
+				assert_int_equal(pd->node.relNode, TEST_REL_OID1);
+
+				pd = &(delrels[1]);
+				assert_false(pd->isTempRelation);
+				assert_int_equal(pd->node.spcNode, TEST_TABLESPACE_OID2);
+				assert_int_equal(pd->node.dbNode, TEST_DB_OID2);
+				assert_int_equal(pd->node.relNode, TEST_REL_OID2);
+
+				break;
+			}
+		case 5:
+		case 11:
+		case 12:
+		case 14:
+			{
+				assert_int_equal(ndelrels, 1);
+				assert_true(isRedo);
+				RelFileNodePendingDelete *pd = &(delrels[0]);
+
+				assert_false(pd->isTempRelation);
+
+				/*
+				 * We can't guarantee that the order of relnodes dropping will
+				 * be the same as the order of adding the pending delete
+				 * nodes. So we just need to ensure that we got all the
+				 * expected relnodes (and only them). We check it by excluding
+				 * values from the array of expected relnodes by replacing
+				 * them with InvalidOid. And we will check that all values are
+				 * excluded at the end of the test.
+				 */
+				for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+				{
+					assert_true(pd->node.relNode != InvalidOid);
+					if (RelFileNodeEquals(test_expected_relnodes[i], pd->node))
+					{
+						test_expected_relnodes[i].relNode = InvalidOid;
+						return;	/* found and crossed off */
+					}
+				}
+
+				/*
+				 * If we are here, then we didn't find the relnode in the
+				 * expected data, and it is a problem, so fail.
+				 */
+				assert_true(false);
+				break;
+			}
+		default:
+			{
+				/* we shouldn't even get here */
+				assert_true(false);
+				break;
+			}
+	}
+}
+
+XidStatus
+__wrap_TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
+{
+	ListCell *cell;
+
+	foreach(cell, ls_transactions_comlpete)
+	{
+		TransactionId xid_complete = (TransactionId) lfirst_int(cell);
+
+		if (xid == xid_complete)
+			return TRANSACTION_STATUS_COMMITTED;
+	}
+	return TRANSACTION_STATUS_IN_PROGRESS;	/* default for xids not in the list; *lsn is never written */
+}
+
+PendingRelXactDeleteArray *
+__wrap_PdlXLogShmemDump(void)
+{
+	PdlXLogShmemDump_call_count++;
+	if (test_number == 16)
+		return NULL;	/* test_16 exercises the NULL-dump path */
+
+	/* return something valid: a one-element dump for TEST_XID */
+	int node_count = 1;
+
+	char *buffer = palloc(PdlDumpSize(node_count));
+
+	PendingRelXactDeleteArray *pending_deletes =
+	(PendingRelXactDeleteArray *) buffer;
+
+	pending_deletes->count = node_count;
+
+	PendingRelXactDelete *pd = &(pending_deletes->array[0]);
+
+	pd->xid = TEST_XID;
+	pd->relnode.isTempRelation = false;
+	pd->relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pd->relnode.node.dbNode = TEST_DB_OID1;
+	pd->relnode.node.relNode = TEST_REL_OID1;
+
+	return pending_deletes;
+}
+
+XLogRecPtr
+__wrap_XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
+{
+	assert_int_equal(test_number, 17);	/* currently we should get here only
+										 * in test_17 */
+	XLogInsert_call_count++;
+
+	assert_int_equal(rmid, RM_XLOG_ID);
+	assert_int_equal(info, XLOG_PENDING_DELETE);
+	assert_int_equal(rdata->buffer, InvalidBuffer);
+	assert_false(rdata->buffer_std);
+	assert_true(rdata->next == NULL);
+	assert_true(rdata->len == (sizeof(Size) + sizeof(PendingRelXactDelete)));	/* count field plus one entry */
+
+	PendingRelXactDeleteArray *pending_deletes =
+	(PendingRelXactDeleteArray *) rdata->data;
+
+	assert_int_equal(pending_deletes->count, 1);
+
+	PendingRelXactDelete *pd = &(pending_deletes->array[0]);
+
+	assert_int_equal(pd->xid, TEST_XID);	/* same values the PdlXLogShmemDump() mock produced */
+	assert_false(pd->relnode.isTempRelation);
+	assert_int_equal(pd->relnode.node.spcNode, TEST_TABLESPACE_OID1);
+	assert_int_equal(pd->relnode.node.dbNode, TEST_DB_OID1);
+	assert_int_equal(pd->relnode.node.relNode, TEST_REL_OID1);
+
+	return TEST_XLOG_REC_PTR;
+}
+
+void
+__wrap_XLogFlush(XLogRecPtr record)
+{
+	assert_int_equal(test_number, 17);	/* currently we should get here only
+										 * in test_17 */
+	assert_int_equal(record, TEST_XLOG_REC_PTR);	/* the pointer returned by the XLogInsert() mock */
+}
+
+/*
+ * Tests
+ */
+
+/*
+ * Scenario:
+ *		add single pending delete node
+ *		and then drop files.
+ * Expected: DropRelationFiles() is called exactly once.
+ */
+static void
+test_1(void **state)
+{
+	setup(1);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+}
+
+/*
+ * Scenario:
+ * add single pending delete node
+ * and datfrozenxid is above the node's xid
+ * and then drop files.
+ */
+static void
+test_2(void **state)
+{
+	setup(2);
+	ShmemVariableCache->oldestXid = (TransactionId) 2;	/* newer than the node's xid below */
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);	/* nothing may be dropped */
+}
+
+/*
+ * Scenario:
+ *		add 2 pending delete nodes with different xids and different relnodes
+ *		and then drop files.
+ * Expected: two DropRelationFiles() calls, one per xid.
+ */
+static void
+test_3(void **state)
+{
+	setup(3);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	pd.xid = (TransactionId) 2;	/* second node: other xid, other relnode */
+	pd.relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	pd.relnode.node.dbNode = TEST_DB_OID2;
+	pd.relnode.node.relNode = TEST_REL_OID2;
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 2);
+}
+
+/*
+ * Scenario:
+ * add 2 pending delete nodes with same xid and different relnodes
+ * and then drop files.
+ */
+static void
+test_4(void **state)
+{
+	setup(4);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	pd.relnode.node.spcNode = TEST_TABLESPACE_OID2;	/* same xid, second relnode */
+	pd.relnode.node.dbNode = TEST_DB_OID2;
+	pd.relnode.node.relNode = TEST_REL_OID2;
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);	/* one call; the mock checks it carries 2 relnodes */
+}
+
+/*
+ * Scenario:
+ *		add many pending delete nodes with different xids and different relnodes
+ *		and some xids precede datfrozenxid
+ *		and some transactions are not in progress
+ *		and then drop files.
+ * Expected: one DropRelationFiles() call for every xid that neither precedes
+ * oldestXid nor is reported complete by the TransactionIdGetStatus() mock.
+ */
+static void
+test_5(void **state)
+{
+	setup(5);
+	ShmemVariableCache->oldestXid = (TransactionId) 5;
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		PendingRelXactDelete pd =
+		{
+			.xid = (TransactionId) i,
+			.relnode.isTempRelation = false,
+			.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+			.relnode.node.dbNode = TEST_DB_OID1,
+			.relnode.node.relNode = TEST_REL_OID1 + i
+		};
+
+		PdlRedoAdd(&pd);
+
+		/* and fill data which is expected... */
+		if (TransactionIdPrecedes(pd.xid, ShmemVariableCache->oldestXid))
+			test_expected_relnodes[i].relNode = (Oid) -1;	/* marker: must never be dropped */
+		else
+			test_expected_relnodes[i] = pd.relnode.node;
+	}
+
+	/* mark some transactions as complete, let's say XIDs: 10, 12, 15 */
+	TransactionId complete_xids[] = {10, 12, 15};
+	int complete_xids_count = ARRAY_SIZE(complete_xids);
+
+	for (int i = 0; i < complete_xids_count; i++)
+	{
+		ls_transactions_comlpete = lappend_int(ls_transactions_comlpete,
+											   complete_xids[i]);
+	}
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count,
+					 TEST_EXPECTED_NOTES_COUNT - ShmemVariableCache->oldestXid -
+					 complete_xids_count);
+
+	/* Check that data for complete xids is not touched by PdlRedoDropFiles */
+	for (int i = 0; i < complete_xids_count; i++)
+	{
+		ls_transactions_comlpete = lappend_int(ls_transactions_comlpete,
+											   complete_xids[i]);	/* NOTE(review): same xids appended a second time; looks redundant - confirm */
+		assert_int_equal(test_expected_relnodes[complete_xids[i]].relNode,
+						 TEST_REL_OID1 + complete_xids[i]);
+		/* Replace it with InvalidOid to simplify further check */
+		test_expected_relnodes[complete_xids[i]].relNode = InvalidOid;
+	}
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		/*
+		 * Check that data for xids preceding datfrozenxid is not touched by
+		 * PdlRedoDropFiles, while all other entries are replaced with
+		 * InvalidOid.
+		 */
+		if (TransactionIdPrecedes((TransactionId) i, ShmemVariableCache->oldestXid))
+			assert_int_equal(test_expected_relnodes[i].relNode, (Oid) -1);
+		else
+			assert_int_equal(test_expected_relnodes[i].relNode, InvalidOid);
+	}
+
+	list_free(ls_transactions_comlpete);	/* leave the shared list empty for later tests */
+	ls_transactions_comlpete = NIL;
+}
+
+/*
+ * Scenario:
+ * add single pending delete node
+ * and transaction status of the node is not in progress
+ * and then drop files.
+ */
+static void
+test_6(void **state)
+{
+	setup(6);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	ls_transactions_comlpete = lappend_int(ls_transactions_comlpete, pd.xid);	/* the mock now reports this xid as committed */
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);	/* nothing may be dropped */
+
+	list_free(ls_transactions_comlpete);	/* leave the shared list empty for later tests */
+	ls_transactions_comlpete = NIL;
+}
+
+/*
+ * Scenario:
+ *		add single pending delete node
+ *		and remove pending deletes for that node's xid
+ *		and then drop files.
+ * Expected: DropRelationFiles() is never called.
+ */
+static void
+test_7(void **state)
+{
+	setup(7);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoRemoveTree(pd.xid, NULL, 0);	/* no subtransactions passed */
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+}
+
+/*
+ * Scenario:
+ *		add single pending delete node
+ *		and remove pending deletes for different xid
+ *		and then drop files.
+ * Expected: the node is still dropped (one DropRelationFiles() call).
+ */
+static void
+test_8(void **state)
+{
+	setup(8);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoRemoveTree(pd.xid + 1, NULL, 0);	/* an xid that was never added */
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+}
+
+/*
+ * Scenario:
+ * add single pending delete node
+ * and remove pending deletes for invalid xid
+ * and then drop files.
+ */
+static void
+test_9(void **state)
+{
+	setup(9);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoRemoveTree(InvalidTransactionId, NULL, 0);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);	/* the added node is still dropped */
+}
+
+/*
+ * Scenario:
+ *		add several pending delete nodes with the same xid
+ *		and remove pending deletes for that xid
+ *		and then drop files.
+ * Expected: DropRelationFiles() is never called.
+ */
+static void
+test_10(void **state)
+{
+	setup(10);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	PdlRedoAdd(&pd);
+
+	pd.relnode.node.relNode = TEST_REL_OID2;	/* second relnode, same xid */
+
+	PdlRedoAdd(&pd);
+
+	pd.relnode.node.dbNode = TEST_DB_OID2;	/* third relnode, same xid */
+	pd.relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoAdd(&pd);
+
+	PdlRedoRemoveTree(pd.xid, NULL, 0);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+}
+
+/*
+ * Scenario:
+ * add several pending delete nodes with the different xids
+ * and remove pending deletes for one of the xids
+ * and then drop files.
+ */
+static void
+test_11(void **state)
+{
+	setup(11);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd = {0};	/* NOTE(review): shadowed by the loop-local pd below */
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		PendingRelXactDelete pd =
+		{
+			/* add oldest xid here just to ensure that all nodes will be added */
+			.xid = ShmemVariableCache->oldestXid + (TransactionId) i,
+			.relnode.isTempRelation = false,
+			.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+			.relnode.node.dbNode = TEST_DB_OID1,
+			.relnode.node.relNode = TEST_REL_OID1 + i
+		};
+
+		PdlRedoAdd(&pd);
+
+		/* and fill data which is expected... */
+		test_expected_relnodes[i] = pd.relnode.node;
+	}
+
+	PdlRedoAdd(&pd);	/* NOTE(review): passes the zeroed outer pd (xid 0); looks unintended - confirm */
+
+	TransactionId xid_to_remove = 5;
+
+	PdlRedoRemoveTree(xid_to_remove, NULL, 0);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, TEST_EXPECTED_NOTES_COUNT - 1);	/* every xid except the removed one */
+
+	/*
+	 * Check that data for removed xids is not touched by PdlRedoDropFiles and
+	 * replace it with InvalidOid to simplify further check.
+	 */
+	int idx = xid_to_remove - ShmemVariableCache->oldestXid;	/* array slot that was filled for that xid */
+
+	assert_int_equal(test_expected_relnodes[idx].relNode, TEST_REL_OID1 + idx);
+	test_expected_relnodes[idx].relNode = InvalidOid;
+
+	/*
+	 * Check that all other are replaced with InvalidOid.
+	 */
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		assert_int_equal(test_expected_relnodes[i].relNode, InvalidOid);
+	}
+}
+
+/*
+ * Scenario:
+ * add several pending delete nodes with the different xids
+ * and remove pending deletes for one of the xids + some sub_xids
+ * and then drop files.
+ */
+static void
+test_12(void **state)
+{
+	setup(12);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		PendingRelXactDelete pd =
+		{
+			/* add oldest xid here just to ensure that all nodes will be added */
+			.xid = ShmemVariableCache->oldestXid + (TransactionId) i,
+			.relnode.isTempRelation = false,
+			.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+			.relnode.node.dbNode = TEST_DB_OID1,
+			.relnode.node.relNode = TEST_REL_OID1 + i
+		};
+
+		PdlRedoAdd(&pd);
+
+		/* and fill data which is expected... */
+		test_expected_relnodes[i] = pd.relnode.node;
+	}
+
+	TransactionId xid_to_remove = 5;
+	TransactionId sub_xids_to_remove[] = {10, 11, 12, 15};
+	int nsubxacts = ARRAY_SIZE(sub_xids_to_remove);
+
+	PdlRedoRemoveTree(xid_to_remove, sub_xids_to_remove, nsubxacts);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, TEST_EXPECTED_NOTES_COUNT - 5);	/* 5 = the top xid plus its 4 sub-xids */
+
+	/*
+	 * Check that data for removed xids is not touched by PdlRedoDropFiles and
+	 * replace it with InvalidOid to simplify further check...
+	 */
+	int idx = xid_to_remove - ShmemVariableCache->oldestXid;	/* array slot that was filled for that xid */
+
+	assert_int_equal(test_expected_relnodes[idx].relNode, TEST_REL_OID1 + idx);
+	test_expected_relnodes[idx].relNode = InvalidOid;
+	/* ...including all subtransactions. */
+	for (int j = 0; j < nsubxacts; j++)
+	{
+		idx = sub_xids_to_remove[j] - ShmemVariableCache->oldestXid;
+		assert_int_equal(test_expected_relnodes[idx].relNode, TEST_REL_OID1 + idx);
+		test_expected_relnodes[idx].relNode = InvalidOid;
+	}
+
+	/*
+	 * Check that now all expected nodes are replaced with InvalidOid.
+	 */
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		assert_int_equal(test_expected_relnodes[i].relNode, InvalidOid);
+	}
+}
+
+static XLogRecord *
+test_create_xlog_record(int pending_deletes_count)
+{
+	Size buffer_size = SizeOfXLogRecord + sizeof(Size) +
+	sizeof(PendingRelXactDelete) * pending_deletes_count;	/* header + count field + entries */
+
+	return (XLogRecord *)palloc0(buffer_size);	/* zero-filled; the caller fills the payload and frees it */
+}
+
+/*
+ * Scenario:
+ *		process PENDING_DELETE wal record with 1 pending delete node
+ *		and then drop files.
+ * Expected: DropRelationFiles() is called exactly once.
+ */
+static void
+test_13(void **state)
+{
+	setup(13);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	int pending_deletes_count = 1;
+
+	XLogRecord *record = test_create_xlog_record(pending_deletes_count);
+
+	PendingRelXactDeleteArray *pending_deletes =
+	(PendingRelXactDeleteArray *) ((char *) record + SizeOfXLogRecord);	/* payload starts right after the header */
+
+	pending_deletes->count = pending_deletes_count;
+
+	PendingRelXactDelete *pd = &(pending_deletes->array[0]);
+
+	pd->xid = (TransactionId) 1;
+	pd->relnode.isTempRelation = false;
+	pd->relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pd->relnode.node.dbNode = TEST_DB_OID1;
+	pd->relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoXLogRecord(record);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+
+	pfree(record);
+}
+
+
+/*
+ * Scenario:
+ * process PENDING_DELETE wal record with several pending delete nodes
+ * and datfrozenxid is above the some node's xid
+ * and some transactions are not in progress
+ * and then drop files.
+ */
+static void
+test_14(void **state)
+{
+	setup(14);
+	ShmemVariableCache->oldestXid = (TransactionId) 2;
+
+	int pending_deletes_count = 5;
+
+	XLogRecord *record = test_create_xlog_record(pending_deletes_count);
+
+	PendingRelXactDeleteArray *pending_deletes =
+	(PendingRelXactDeleteArray *) ((char *) record + SizeOfXLogRecord);	/* payload starts right after the header */
+
+	pending_deletes->count = pending_deletes_count;
+
+	memset(test_expected_relnodes, 0, sizeof(test_expected_relnodes));	/* slots not filled below must pass the final InvalidOid check */
+
+	for (int i = 0; i < pending_deletes_count; i++)
+	{
+		PendingRelXactDelete *pd = &(pending_deletes->array[i]);
+
+		pd->xid = (TransactionId) (i + 1);	/* xids 1..5 */
+		pd->relnode.isTempRelation = false;
+		pd->relnode.node.spcNode = TEST_TABLESPACE_OID1;
+		pd->relnode.node.dbNode = TEST_DB_OID1;
+		pd->relnode.node.relNode = TEST_REL_OID1 + i;
+
+		test_expected_relnodes[i] = pd->relnode.node;
+	}
+
+	/* mark some transaction as complete, let's say XID: 3 */
+	ls_transactions_comlpete = lappend_int(ls_transactions_comlpete,
+										   (TransactionId) 3);
+
+	PdlRedoXLogRecord(record);
+
+	PdlRedoDropFiles();
+
+	/*
+	 * The xids that should have been skipped due to datfrozenxid or
+	 * transaction status. Their entries in the expected nodes should be
+	 * untouched. Check it and replace it with InvalidOid to simplify further
+	 * check...
+	 */
+	TransactionId skipped_xids[] = {1, 3};
+
+	for (int i = 0; i < ARRAY_SIZE(skipped_xids); i++)
+	{
+		int idx = skipped_xids[i] - 1;	/* xid n was stored in slot n - 1 */
+
+		assert_int_equal(test_expected_relnodes[idx].relNode,
+						 TEST_REL_OID1 + idx);
+		test_expected_relnodes[idx].relNode = InvalidOid;
+	}
+
+	assert_int_equal(DropRelationFiles_call_count, 3);	/* 5 nodes minus the 2 skipped xids */
+
+	/*
+	 * Check that now all expected nodes are replaced with InvalidOid.
+	 */
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		assert_int_equal(test_expected_relnodes[i].relNode, InvalidOid);
+	}
+
+	pfree(record);
+
+	list_free(ls_transactions_comlpete);	/* leave the shared list empty for later tests */
+	ls_transactions_comlpete = NIL;
+}
+
+
+/*
+ * Scenario:
+ *		check PdlXlogInsert() if PdlXLogShmemDump returned NULL.
+ * Expected: the dump is requested once and XLogInsert() is never called.
+ */
+static void
+test_16(void **state)
+{
+	setup(16);
+
+	PdlXLogInsert();
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 1);
+	assert_int_equal(XLogInsert_call_count, 0);
+}
+
+/*
+ * Scenario:
+ *		check PdlXlogInsert() if PdlXLogShmemDump provided valid nodes.
+ * Expected: the dump is requested once and XLogInsert() is called once; the
+ * XLogInsert() and XLogFlush() mocks verify the record contents.
+ */
+static void
+test_17(void **state)
+{
+	setup(17);
+
+	PdlXLogInsert();
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 1);
+	assert_int_equal(XLogInsert_call_count, 1);
+}
+
+/*
+ * Scenario:
+ *		guc is disabled
+ * Each Pdl* entry point is called once with gp_track_pending_delete set to
+ * false; the assertions expect every such call to have no effect.
+ */
+static void
+test_18(void **state)
+{
+	setup(18);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	gp_track_pending_delete = false;
+	PdlRedoAdd(&pd);
+	gp_track_pending_delete = true;
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);	/* the disabled add stored nothing */
+
+	PdlRedoAdd(&pd);
+
+	gp_track_pending_delete = false;
+	PdlRedoDropFiles();
+	gp_track_pending_delete = true;
+
+	assert_int_equal(DropRelationFiles_call_count, 0);	/* the disabled drop did nothing */
+
+	gp_track_pending_delete = false;
+	PdlRedoRemoveTree(pd.xid, NULL, 0);
+	gp_track_pending_delete = true;
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);	/* the disabled remove left the node in place */
+
+	gp_track_pending_delete = false;
+	PdlXLogInsert();
+	gp_track_pending_delete = true;
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 0);
+	assert_int_equal(XLogInsert_call_count, 0);
+}
+
+/*
+ * Scenario:
+ *		IsBootstrapProcessingMode is true
+ * Same sequence as test_18, but each call is disabled by switching Mode to
+ * BootstrapProcessing instead of turning the guc off.
+ */
+static void
+test_19(void **state)
+{
+	setup(19);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	Mode = BootstrapProcessing;
+	PdlRedoAdd(&pd);
+	Mode = NormalProcessing;
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);	/* the bootstrap-mode add stored nothing */
+
+	PdlRedoAdd(&pd);
+
+	Mode = BootstrapProcessing;
+	PdlRedoDropFiles();
+	Mode = NormalProcessing;
+
+	assert_int_equal(DropRelationFiles_call_count, 0);	/* the bootstrap-mode drop did nothing */
+
+	Mode = BootstrapProcessing;
+	PdlRedoRemoveTree(pd.xid, NULL, 0);
+	Mode = NormalProcessing;
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);	/* the bootstrap-mode remove left the node in place */
+
+	Mode = BootstrapProcessing;
+	PdlXLogInsert();
+	Mode = NormalProcessing;
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 0);
+	assert_int_equal(XLogInsert_call_count, 0);
+}
+
+int
+main(int argc, char *argv[])
+{
+	cmockery_parse_arguments(argc, argv);
+
+	const UnitTest tests[] = {	/* note: there is no test_15 */
+		unit_test(test_1),
+		unit_test(test_2),
+		unit_test(test_3),
+		unit_test(test_4),
+		unit_test(test_5),
+		unit_test(test_6),
+		unit_test(test_7),
+		unit_test(test_8),
+		unit_test(test_9),
+		unit_test(test_10),
+		unit_test(test_11),
+		unit_test(test_12),
+		unit_test(test_13),
+		unit_test(test_14),
+		unit_test(test_16),
+		unit_test(test_17),
+		unit_test(test_18),
+		unit_test(test_19)
+	};
+
+	MemoryContextInit();	/* the tests and mocks call palloc()/pfree() */
+
+	return run_tests(tests);
+}
diff --git a/src/backend/catalog/test/storage_pending_deletes_test.c b/src/backend/catalog/test/storage_pending_deletes_test.c
new file mode 100644
index 000000000000..e65d0dfe023d
--- /dev/null
+++ b/src/backend/catalog/test/storage_pending_deletes_test.c
@@ -0,0 +1,567 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes_test.c
+ *	  code to test functionality from storage_pending_deletes.c
+ *
+ * Copyright (c) 2025 Greengage
Community + * + * src/backend/catalog/test/storage_pending_deletes_test.c + * + *------------------------------------------------------------------------- + */ +#include +#include +#include +#include "cmockery.h" + +#include "catalog/storage_pending_deletes.h" +#include "storage/pg_shmem.h" +#include "storage/proc.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +enum +{ + TEST_TABLESPACE_OID1 = 11111, + TEST_TABLESPACE_OID2 = 11112, + + TEST_DB_OID1 = 11121, + TEST_DB_OID2 = 11122, + + TEST_REL_OID1 = 11211, + TEST_REL_OID2 = 11212, + + TEST_XID1 = 10, + TEST_XID2 = TEST_XID1 + 1, + TEST_XID3 = TEST_XID1 + 3, + TEST_XID4 = TEST_XID1 + 8, +}; + +/* Don't try to read a non-existent postmaster.pid file */ +void __wrap_AddToDataDirLockFile(int target_line, const char *str); +void +__wrap_AddToDataDirLockFile(int target_line, const char *str) +{ +} + + +/* Function to sort array of PendingRelXactDelete using qsort */ +static int +cmp_pdl(const void *p1, const void *p2) +{ + return memcmp(p1, p2, sizeof(PendingRelXactDelete)); +} + +/* Check if PdlXLogShmemDump returns expected array */ +static void +check_array(PendingRelXactDeleteArray *arr, + PendingRelXactDelete *expected, Size expectedCnt) +{ + assert_true(arr != NULL); + + assert_int_equal(arr->count, expectedCnt); + + /* Order doesn't matter */ + qsort (expected, expectedCnt, sizeof(*expected), cmp_pdl); + qsort (arr->array, expectedCnt, sizeof(*expected), cmp_pdl); + assert_memory_equal(arr->array, expected, expectedCnt*sizeof(*expected)); +} + +/* Remove nodes received in the p array from backends lists */ +static void +clean_lists(dsa_pointer *p, Size pCnt) +{ + for (int i = 0; i < pCnt; i++) + PdlShmemRemove(p[i]); + + /* Check whether cleanup is ok */ + assert_true(PdlXLogShmemDump() == NULL); +} + +/* Call PdlXLogShmemDump(), check its result and clean up */ +static void +check_dump(PendingRelXactDelete *expected, Size expectedCnt) +{ + PendingRelXactDeleteArray *arr = PdlXLogShmemDump(); + + 
check_array(arr, expected, expectedCnt); + pfree(arr); +} + + +/* Dump without additions */ +static void +test_empty(void **state) +{ + assert_true(PdlXLogShmemDump() == NULL); +} + +/* Add single pending delete node */ +static void +test_1(void **state) +{ + const RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + }, + .relstorage = RELSTORAGE_HEAP + }; + + dsa_pointer p = PdlShmemAdd(&relnode, TEST_XID1); + + PendingRelXactDelete expected = + { + .relnode = relnode, + .xid = TEST_XID1 + }; + + check_dump(&expected, 1); + clean_lists(&p, 1); +} + +/* Add nodes, remove the first one, add a node */ +static void +test_remove_fisrt(void **state) +{ + RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + dsa_pointer p_first = PdlShmemAdd(&relnode, TEST_XID1); + + dsa_pointer p[4]; + + relnode.node.spcNode = TEST_TABLESPACE_OID2; + p[0] = PdlShmemAdd(&relnode, TEST_XID2); + + relnode.node.dbNode = TEST_DB_OID2; + p[1] = PdlShmemAdd(&relnode, TEST_XID3); + + relnode.node.relNode = TEST_REL_OID2; + p[2] = PdlShmemAdd(&relnode, TEST_XID1); + + PdlShmemRemove(p_first); + + relnode.node.spcNode = TEST_TABLESPACE_OID1; + p[3] = PdlShmemAdd(&relnode, TEST_XID1); + + PendingRelXactDelete expected[] = + { + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID2 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID1}}, + .xid = TEST_XID3 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID2}}, + .xid = TEST_XID1 + }, + { + .relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID2, TEST_REL_OID2}}, + .xid = TEST_XID1 + }, + }; + + check_dump(expected, ARRAY_SIZE(expected)); + clean_lists(p, ARRAY_SIZE(p)); +} + +/* Add nodes, remove a node from the middle, add a node */ +static void +test_remove_middle(void **state) +{ + 
RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + dsa_pointer p[4]; + + p[0] = PdlShmemAdd(&relnode, TEST_XID1); + + relnode.node.spcNode = TEST_TABLESPACE_OID2; + p[1] = PdlShmemAdd(&relnode, TEST_XID2); + + relnode.node.dbNode = TEST_DB_OID2; + dsa_pointer p_middle = PdlShmemAdd(&relnode, TEST_XID1); + + relnode.node.relNode = TEST_REL_OID2; + p[2] = PdlShmemAdd(&relnode, TEST_XID3); + + PdlShmemRemove(p_middle); + + relnode.node.spcNode = TEST_TABLESPACE_OID1; + p[3] = PdlShmemAdd(&relnode, TEST_XID1); + + PendingRelXactDelete expected[] = + { + { + .relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID1 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID2 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID2}}, + .xid = TEST_XID3 + }, + { + .relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID2, TEST_REL_OID2}}, + .xid = TEST_XID1 + }, + }; + + check_dump(expected, ARRAY_SIZE(expected)); + clean_lists(p, ARRAY_SIZE(p)); +} + +/* Add nodes, remove the last one, add a node */ +static void +test_remove_last(void **state) +{ + RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + dsa_pointer p[4]; + + p[0] = PdlShmemAdd(&relnode, TEST_XID1); + + relnode.node.spcNode = TEST_TABLESPACE_OID2; + p[1] = PdlShmemAdd(&relnode, TEST_XID2); + + relnode.node.dbNode = TEST_DB_OID2; + p[2] = PdlShmemAdd(&relnode, TEST_XID3); + + relnode.node.relNode = TEST_REL_OID2; + dsa_pointer p_last = PdlShmemAdd(&relnode, TEST_XID1); + + PdlShmemRemove(p_last); + + relnode.node.dbNode = TEST_DB_OID1; + p[3] = PdlShmemAdd(&relnode, TEST_XID1); + + PendingRelXactDelete expected[] = + { + { + .relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID1 + }, + { + .relnode = 
{{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID2 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID1}}, + .xid = TEST_XID3 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID2}}, + .xid = TEST_XID1 + }, + }; + + check_dump(expected, ARRAY_SIZE(expected)); + clean_lists(p, ARRAY_SIZE(p)); +} + +/* Add node with invalid transaction id */ +static void +test_invalid_xid(void **state) +{ + const RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + assert_false(DsaPointerIsValid( + PdlShmemAdd(&relnode, InvalidTransactionId))); + assert_true(PdlXLogShmemDump() == NULL); +} + +/* Add node when MyBackendId is invalid */ +static void +test_invalid_backend(void **state) +{ + const RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + BackendId old = MyBackendId; + + MyBackendId = InvalidBackendId; + + assert_false(DsaPointerIsValid(PdlShmemAdd(&relnode, TEST_XID1))); + assert_true(PdlXLogShmemDump() == NULL); + + /* Clean up */ + MyBackendId = old; +} + +/* Add node when Mode == BootstrapProcessing */ +static void +test_invalid_mode(void **state) +{ + const RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + ProcessingMode old = Mode; + + Mode = BootstrapProcessing; + + assert_false(DsaPointerIsValid(PdlShmemAdd(&relnode, TEST_XID1))); + assert_true(PdlXLogShmemDump() == NULL); + + /* Clean up */ + Mode = old; +} + +/* Add node when tracking is disabled */ +static void +test_tracking_disabled(void **state) +{ + const RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + bool old = gp_track_pending_delete; + + 
gp_track_pending_delete = false; + + assert_false(DsaPointerIsValid(PdlShmemAdd(&relnode, TEST_XID1))); + assert_true(PdlXLogShmemDump() == NULL); + + /* Clean up */ + gp_track_pending_delete = old; +} + +/* Add node when dynamic_shared_memory_type == DSM_IMPL_NONE */ +static void +test_shmem_type(void **state) +{ + const RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + int old = dynamic_shared_memory_type; + + dynamic_shared_memory_type = DSM_IMPL_NONE; + + assert_false(DsaPointerIsValid(PdlShmemAdd(&relnode, TEST_XID1))); + assert_true(PdlXLogShmemDump() == NULL); + + /* Clean up */ + dynamic_shared_memory_type = old; +} + +/* Add nodes for two backends */ +static void +test_2_backends(void **state) +{ + RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + dsa_pointer p[5]; + + p[0] = PdlShmemAdd(&relnode, TEST_XID1); + + relnode.node.spcNode = TEST_TABLESPACE_OID2; + p[1] = PdlShmemAdd(&relnode, TEST_XID2); + + relnode.node.dbNode = TEST_DB_OID2; + p[2] = PdlShmemAdd(&relnode, TEST_XID1); + + BackendId old = MyBackendId; + + MyBackendId = 3; + + relnode.node.relNode = TEST_REL_OID2; + p[3] = PdlShmemAdd(&relnode, TEST_XID3); + + relnode.node.spcNode = TEST_TABLESPACE_OID1; + p[4] = PdlShmemAdd(&relnode, TEST_XID4); + + PendingRelXactDelete expected[] = + { + { + .relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID1 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}}, + .xid = TEST_XID2 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID1}}, + .xid = TEST_XID1 + }, + { + .relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID2}}, + .xid = TEST_XID3 + }, + { + .relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID2, TEST_REL_OID2}}, + .xid = TEST_XID4 + }, + }; + + 
PendingRelXactDeleteArray *arr = PdlXLogShmemDump(); + + /* + * Clean up. + * Elements which were added for backend 3 should be removed + * when MyBackendId is 3. Other elements are removed in clean_lists + * after restoring MyBackendId. + */ + PdlShmemRemove(p[3]); + PdlShmemRemove(p[4]); + + MyBackendId = old; + + check_array(arr, expected, ARRAY_SIZE(expected)); + pfree(arr); + + clean_lists(p, 3); +} + +/* Add nodes to use repalloc twice in PdlXLogShmemDump() */ +static void +test_repalloc(void **state) +{ + RelFileNodePendingDelete relnode = + { + .node = + { + .spcNode = TEST_TABLESPACE_OID1, + .dbNode = TEST_DB_OID1, + .relNode = TEST_REL_OID1 + } + }; + + dsa_pointer p[100]; /* 100 > 32 + 64 */ + PendingRelXactDelete expected[ARRAY_SIZE(p)]; + + for(int i = 0; i < ARRAY_SIZE(p); i++) + { + relnode.node.spcNode += i; + relnode.node.dbNode += i; + relnode.node.relNode += i; + + p[i] = PdlShmemAdd(&relnode, TEST_XID1 + i); + + expected[i].relnode = relnode; + expected[i].xid = TEST_XID1 + i; + } + + check_dump(expected, ARRAY_SIZE(expected)); + clean_lists(p, ARRAY_SIZE(p)); +} + +int +main(int argc, char *argv[]) +{ + cmockery_parse_arguments(argc, argv); + + const UnitTest tests[] = { + unit_test(test_empty), + unit_test(test_1), + unit_test(test_remove_fisrt), + unit_test(test_remove_middle), + unit_test(test_remove_last), + unit_test(test_invalid_xid), + unit_test(test_invalid_backend), + unit_test(test_invalid_mode), + unit_test(test_tracking_disabled), + unit_test(test_shmem_type), + unit_test(test_2_backends), + unit_test(test_repalloc) + }; + + MemoryContextInit(); + + gp_track_pending_delete = true; + dynamic_shared_memory_type = DSM_IMPL_POSIX; + DataDir = "."; + MaxBackends = 5; + + PGShmemHeader *shim = NULL; + + InitShmemAccess(PGSharedMemoryCreate(300000, 6000, &shim)); + InitShmemAllocation(); + CreateLWLocks(); + InitShmemIndex(); + dsm_postmaster_startup(shim); + + PdlShmemInit(); + + IsUnderPostmaster = true; + MyBackendId = 1; + + PGPROC 
proc = {.backendId = MyBackendId}; + + MyProc = &proc; + return run_tests(tests); +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index f33ebdbf7511..947f90980889 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -13196,7 +13196,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT || (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && forkNum == INIT_FORKNUM)) - log_smgrcreate(&newrnode, forkNum); + log_smgrcreate(&newrnode, forkNum, rel->rd_rel->relstorage); copy_relation_data(rel->rd_smgr, dstrel, forkNum, rel->rd_rel->relpersistence); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 1c0bd8aea7dc..e5e6d2aedb71 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -195,6 +195,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI: case XLOG_OVERWRITE_CONTRECORD: + case XLOG_PENDING_DELETE: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c index 1cecbb205a7f..95726f21f267 100644 --- a/src/backend/storage/ipc/dsm.c +++ b/src/backend/storage/ipc/dsm.c @@ -82,6 +82,7 @@ typedef struct dsm_control_item { dsm_handle handle; uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */ + bool pinned; } dsm_control_item; /* Layout of the dynamic shared memory control segment. */ @@ -467,8 +468,8 @@ dsm_create(Size size) uint32 i; uint32 nitems; - /* Unsafe in postmaster (and pointless in a stand-alone backend). */ - Assert(IsUnderPostmaster); + /* Unsafe in postmaster. 
*/ + Assert(!IsPostmasterEnvironment || IsUnderPostmaster); if (!dsm_init_done) dsm_backend_startup(); @@ -497,6 +498,7 @@ dsm_create(Size size) dsm_control->item[i].handle = seg->handle; /* refcnt of 1 triggers destruction, so start at 2 */ dsm_control->item[i].refcnt = 2; + dsm_control->item[i].pinned = false; seg->control_slot = i; LWLockRelease(DynamicSharedMemoryControlLock); return seg; @@ -522,6 +524,7 @@ dsm_create(Size size) dsm_control->item[nitems].handle = seg->handle; /* refcnt of 1 triggers destruction, so start at 2 */ dsm_control->item[nitems].refcnt = 2; + dsm_control->item[nitems].pinned = false; seg->control_slot = nitems; dsm_control->nitems++; LWLockRelease(DynamicSharedMemoryControlLock); @@ -765,6 +768,9 @@ dsm_detach(dsm_segment *seg) /* If new reference count is 1, try to destroy the segment. */ if (refcnt == 1) { + /* A pinned segment should never reach 1. */ + Assert(!dsm_control->item[control_slot].pinned); + /* * If we fail to destroy the segment here, or are killed before we * finish doing so, the reference count will remain at 1, which @@ -817,11 +823,11 @@ dsm_pin_mapping(dsm_segment *seg) } /* - * Keep a dynamic shared memory segment until postmaster shutdown. + * Keep a dynamic shared memory segment until postmaster shutdown, or until + * dsm_unpin_segment is called. * - * This function should not be called more than once per segment; - * on Windows, doing so will create unnecessary handles which will - * consume system resources to no benefit. + * This function should not be called more than once per segment, unless the + * segment is explicitly unpinned with dsm_unpin_segment in between calls. * * Note that this function does not arrange for the current process to * keep the segment mapped indefinitely; if that behavior is desired, @@ -834,13 +840,98 @@ dsm_pin_segment(dsm_segment *seg) /* * Bump reference count for this segment in shared memory. 
This will * ensure that even if there is no session which is attached to this - * segment, it will remain until postmaster shutdown. + * segment, it will remain until postmaster shutdown or an explicit call + * to unpin. */ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (dsm_control->item[seg->control_slot].pinned) + elog(ERROR, "cannot pin a segment that is already pinned"); + dsm_impl_pin_segment(seg->handle, seg->impl_private); + dsm_control->item[seg->control_slot].pinned = true; dsm_control->item[seg->control_slot].refcnt++; LWLockRelease(DynamicSharedMemoryControlLock); +} - dsm_impl_pin_segment(seg->handle, seg->impl_private); +/* + * Unpin a dynamic shared memory segment that was previously pinned with + * dsm_pin_segment. This function should not be called unless dsm_pin_segment + * was previously called for this segment. + * + * The argument is a dsm_handle rather than a dsm_segment in case you want + * to unpin a segment to which you haven't attached. This turns out to be + * useful if, for example, a reference to one shared memory segment is stored + * within another shared memory segment. You might want to unpin the + * referenced segment before destroying the referencing segment. + */ +void +dsm_unpin_segment(dsm_handle handle) +{ + uint32 control_slot = INVALID_CONTROL_SLOT; + bool destroy = false; + uint32 i; + + /* Find the control slot for the given handle. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + for (i = 0; i < dsm_control->nitems; ++i) + { + /* Skip unused slots. */ + if (dsm_control->item[i].refcnt == 0) + continue; + + /* If we've found our handle, we can stop searching. */ + if (dsm_control->item[i].handle == handle) + { + control_slot = i; + break; + } + } + + /* + * We should definitely have found the slot, and it should not already be + * in the process of going away, because this function should only be + * called on a segment which is pinned. 
+ */ + if (control_slot == INVALID_CONTROL_SLOT) + elog(ERROR, "cannot unpin unknown segment handle"); + if (!dsm_control->item[control_slot].pinned) + elog(ERROR, "cannot unpin a segment that is not pinned"); + Assert(dsm_control->item[control_slot].refcnt > 1); + + /* Note that 1 means no references (0 means unused slot). */ + if (--dsm_control->item[control_slot].refcnt == 1) + destroy = true; + dsm_control->item[control_slot].pinned = false; + + /* Now we can release the lock. */ + LWLockRelease(DynamicSharedMemoryControlLock); + + /* Clean up resources if that was the last reference. */ + if (destroy) + { + void *junk_impl_private = NULL; + void *junk_mapped_address = NULL; + Size junk_mapped_size = 0; + + /* + * For an explanation of how error handling works in this case, see + * comments in dsm_detach. Note that if we reach this point, the + * current process certainly does not have the segment mapped, because + * if it did, the reference count would have still been greater than 1 + * even after releasing the reference count held by the pin. The fact + * that there can't be a dsm_segment for this handle makes it OK to + * pass the mapped size, mapped address, and private data as NULL + * here. 
+ */ + if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } } /* diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c index ebe84618ad25..bf949fbd91da 100644 --- a/src/backend/storage/ipc/dsm_impl.c +++ b/src/backend/storage/ipc/dsm_impl.c @@ -1055,8 +1055,8 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, #endif /* - * Implementation-specific actions that must be performed when a segment - * is to be preserved until postmaster shutdown. + * Implementation-specific actions that must be performed when a segment is to + * be preserved even when no backend has it attached. * * Except on Windows, we don't need to do anything at all. 
But since Windows * cleans up segments automatically when no references remain, we duplicate diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3c664ccf93eb..d2f9ae05d84a 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -40,6 +40,7 @@ #include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/ipc.h" +#include "catalog/storage_pending_deletes.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/predicate.h" @@ -216,6 +217,9 @@ CreateSharedMemoryAndSemaphores(int port) /* size of parallel cursor count */ size = add_size(size, ParallelCursorCountSize()); + /* size of pending deletes */ + size = add_size(size, PdlShmemSize()); + elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); /* @@ -388,6 +392,8 @@ CreateSharedMemoryAndSemaphores(int port) if (Gp_role == GP_ROLE_DISPATCH) ParallelCursorCountInit(); + PdlShmemInit(); + /* * Now give loadable modules a chance to set up their shmem allocations */ diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 56e5b85b8f16..b7aab2caa047 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -279,6 +279,9 @@ NumLWLocks(void) lock_addin_request_allowed = false; numLocks += Max(lock_addin_request, NUM_USER_DEFINED_LWLOCKS); + /* storage_pending_deletes.c needs one for each backend */ + numLocks += MaxBackends; + return numLocks; } diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index 388d63881998..8d6b38368bf0 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -480,6 +480,8 @@ bool gp_log_endpoints = false; /* optional reject to parse ambigous 5-digits date in YYYMMDD format */ bool gp_allow_date_field_width_5digits = false; +bool gp_track_pending_delete = true; + /* GUC to set interval for streaming archival status */ int wal_sender_archiving_status_interval; @@ -3419,6 +3421,19 @@ 
struct config_bool ConfigureNamesBool_gp[] = NULL, NULL, NULL }, + { + {"gp_track_pending_delete", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Enable extended pending deletion tracking to avoid " + "accumulation of orphaned files."), + gettext_noop("Disabling this turns off storing relation nodes in " + "shmem, dumping them to WAL and removing of files " + "during recovery.") + }, + &gp_track_pending_delete, + true, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile index 2d24fa0124cf..ee473f313d45 100644 --- a/src/backend/utils/mmgr/Makefile +++ b/src/backend/utils/mmgr/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/utils/mmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = aset.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o +OBJS = aset.o dsa.o freepage.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o # In PostgreSQL, this is under src/common. It has been backported, but because # we haven't merged the changes that introduced the src/common directory, it diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c new file mode 100644 index 000000000000..670b12f792d2 --- /dev/null +++ b/src/backend/utils/mmgr/dsa.c @@ -0,0 +1,2214 @@ +/*------------------------------------------------------------------------- + * + * dsa.c + * Dynamic shared memory areas. + * + * This module provides dynamic shared memory areas which are built on top of + * DSM segments. While dsm.c allows segments of shared memory to be + * created and shared between backends, it isn't designed to deal with small + * objects.
A DSA area is a shared memory heap usually backed by one or more + * DSM segments which can allocate memory using dsa_allocate() and dsa_free(). + * Alternatively, it can be created in pre-existing shared memory, including a + * DSM segment, and then create extra DSM segments as required. Unlike the + * regular system heap, it deals in pseudo-pointers which must be converted to + * backend-local pointers before they are dereferenced. These pseudo-pointers + * can however be shared with other backends, and can be used to construct + * shared data structures. + * + * Each DSA area manages a set of DSM segments, adding new segments as + * required and detaching them when they are no longer needed. Each segment + * contains a number of 4KB pages, a free page manager for tracking + * consecutive runs of free pages, and a page map for tracking the source of + * objects allocated on each page. Allocation requests above 8KB are handled + * by choosing a segment and finding consecutive free pages in its free page + * manager. Allocation requests for smaller sizes are handled using pools of + * objects of a selection of sizes. Each pool consists of a number of 16 page + * (64KB) superblocks allocated in the same way as large objects. Allocation + * of large objects and new superblocks is serialized by a single LWLock, but + * allocation of small objects from pre-existing superblocks uses one LWLock + * per pool. Currently there is one pool, and therefore one lock, per size + * class. Per-core pools to increase concurrency and strategies for reducing + * the resulting fragmentation are areas for future research. Each superblock + * is managed with a 'span', which tracks the superblock's freelist. Free + * requests are handled by looking in the page map to find which span an + * address was allocated from, so that small objects can be returned to the + * appropriate free list, and large object pages can be returned directly to + * the free page map. 
When allocating, simple heuristics for selecting + * segments and superblocks try to encourage occupied memory to be + * concentrated, increasing the likelihood that whole superblocks can become + * empty and be returned to the free page manager, and whole segments can + * become empty and be returned to the operating system. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/dsa.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/dsa.h" +#include "utils/freepage.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + +/* + * The size of the initial DSM segment that backs a dsa_area created by + * dsa_create. After creating some number of segments of this size we'll + * double this size, and so on. Larger segments may be created if necessary + * to satisfy large requests. + */ +#define DSA_INITIAL_SEGMENT_SIZE ((Size) (1 * 1024 * 1024)) + +/* + * How many segments to create before we double the segment size. If this is + * low, then there is likely to be a lot of wasted space in the largest + * segment. If it is high, then we risk running out of segment slots (see + * dsm.c's limits on total number of segments), or limiting the total size + * an area can manage when using small pointers. + */ +#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4 + +/* + * The number of bits used to represent the offset part of a dsa_pointer. + * This controls the maximum size of a segment, the maximum possible + * allocation size and also the maximum number of segments per area. 
+ */ +#if SIZEOF_DSA_POINTER == 4 +#define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */ +#else +#define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */ +#endif + +/* + * The maximum number of DSM segments that an area can own, determined by + * the number of bits remaining (but capped at 1024). + */ +#define DSA_MAX_SEGMENTS \ + Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH))) + +/* The bitmask for extracting the offset from a dsa_pointer. */ +#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1) + +/* The maximum size of a DSM segment. */ +#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH) + +/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */ +#define DSA_PAGES_PER_SUPERBLOCK 16 + +/* + * A magic number used as a sanity check for following DSM segments belonging + * to a DSA area (this number will be XORed with the area handle and + * the segment index). + */ +#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608 + +/* Build a dsa_pointer given a segment number and offset. */ +#define DSA_MAKE_POINTER(segment_number, offset) \ + (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset)) + +/* Extract the segment number from a dsa_pointer. */ +#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH) + +/* Extract the offset from a dsa_pointer. */ +#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK) + +/* The type used for segment indexes (zero based). */ +typedef Size dsa_segment_index; + +/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */ +#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0) + +/* + * How many bins of segments do we have? The bins are used to categorize + * segments by their largest contiguous run of free pages. + */ +#define DSA_NUM_SEGMENT_BINS 16 + +/* + * What is the lowest bin that holds segments that *might* have n contiguous + * free pages?
There is no point in looking in segments in lower bins; they + * definitely can't service a request for n free pages. + */ +#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1) + +/* Macros for access to locks. */ +#define DSA_AREA_LOCK(area) (&area->control->lock) +#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock) + +/* + * The header for an individual segment. This lives at the start of each DSM + * segment owned by a DSA area including the first segment (where it appears + * as part of the dsa_area_control struct). + */ +typedef struct +{ + /* Sanity check magic value. */ + uint32 magic; + /* Total number of pages in this segment (excluding metadata area). */ + Size usable_pages; + /* Total size of this segment in bytes. */ + Size size; + + /* + * Index of the segment that precedes this one in the same segment bin, or + * DSA_SEGMENT_INDEX_NONE if this is the first one. + */ + dsa_segment_index prev; + + /* + * Index of the segment that follows this one in the same segment bin, or + * DSA_SEGMENT_INDEX_NONE if this is the last one. + */ + dsa_segment_index next; + /* The index of the bin that contains this segment. */ + Size bin; + + /* + * A flag raised to indicate that this segment is being returned to the + * operating system and has been unpinned. + */ + bool freed; +} dsa_segment_header; + +/* + * Metadata for one superblock. + * + * For most blocks, span objects are stored out-of-line; that is, the span + * object is not stored within the block itself. But, as an exception, for a + * "span of spans", the span object is stored "inline". The allocation is + * always exactly one page, and the dsa_area_span object is located at + * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS, + * and the remaining fields are used just as they would be in an ordinary + * block. 
We can't allocate spans out of ordinary superblocks because + * creating an ordinary superblock requires us to be able to allocate a span + * *first*. Doing it this way avoids that circularity. + */ +typedef struct +{ + dsa_pointer pool; /* Containing pool. */ + dsa_pointer prevspan; /* Previous span. */ + dsa_pointer nextspan; /* Next span. */ + dsa_pointer start; /* Starting address. */ + Size npages; /* Length of span in pages. */ + uint16 size_class; /* Size class. */ + uint16 ninitialized; /* Maximum number of objects ever allocated. */ + uint16 nallocatable; /* Number of objects currently allocatable. */ + uint16 firstfree; /* First object on free list. */ + uint16 nmax; /* Maximum number of objects ever possible. */ + uint16 fclass; /* Current fullness class. */ +} dsa_area_span; + +/* + * Given a pointer to an object in a span, access the index of the next free + * object in the same span (ie in the span's freelist) as an L-value. + */ +#define NextFreeObjectIndex(object) (* (uint16 *) (object)) + +/* + * Small allocations are handled by dividing a single block of memory into + * many small objects of equal size. The possible allocation sizes are + * defined by the following array. Larger size classes are spaced more widely + * than smaller size classes. We fudge the spacing for size classes >1kB to + * avoid space wastage: based on the knowledge that we plan to allocate 64kB + * blocks, we bump the maximum object size up to the largest multiple of + * 8 bytes that still lets us fit the same number of objects into one block. + * + * NB: Because of this fudging, if we were ever to use differently-sized blocks + * for small allocations, these size classes would need to be reworked to be + * optimal for the new size. + * + * NB: The optimal spacing for size classes, as well as the size of the blocks + * out of which small objects are allocated, is not a question that has one + * right answer. 
Some allocators (such as tcmalloc) use more closely-spaced + * size classes than we do here, while others (like aset.c) use more + * widely-spaced classes. Spacing the classes more closely avoids wasting + * memory within individual chunks, but also means a larger number of + * potentially-unfilled blocks. + */ +static const uint16 dsa_size_classes[] = { + sizeof(dsa_area_span), 0, /* special size classes */ + 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */ + 80, 96, 112, 128, /* 4 classes separated by 16 bytes */ + 160, 192, 224, 256, /* 4 classes separated by 32 bytes */ + 320, 384, 448, 512, /* 4 classes separated by 64 bytes */ + 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */ + 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */ + 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */ + 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */ +}; +#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes) + +/* Special size classes. */ +#define DSA_SCLASS_BLOCK_OF_SPANS 0 +#define DSA_SCLASS_SPAN_LARGE 1 + +/* + * The following lookup table is used to map the size of small objects + * (less than 1kB) onto the corresponding size class. To use this table, + * round the size of the object up to the next multiple of 8 bytes, and then + * index into this array. + */ +static char dsa_size_class_map[] = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, + 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 +}; +#define DSA_SIZE_CLASS_MAP_QUANTUM 8 + +/* + * Superblocks are binned by how full they are. 
Generally, each fullness + * class corresponds to one quartile, but the block being used for + * allocations is always at the head of the list for fullness class 1, + * regardless of how full it really is. + */ +#define DSA_FULLNESS_CLASSES 4 + +/* + * Maximum length of a DSA name. + */ +#define DSA_MAXLEN 64 + +/* + * A dsa_area_pool represents a set of objects of a given size class. + * + * Perhaps there should be multiple pools for the same size class for + * contention avoidance, but for now there is just one! + */ +typedef struct +{ + /* A lock protecting access to this pool. */ + LWLock lock; + /* A set of linked lists of spans, arranged by fullness. */ + dsa_pointer spans[DSA_FULLNESS_CLASSES]; + /* Should we pad this out to a cacheline boundary? */ +} dsa_area_pool; + +/* + * The control block for an area. This lives in shared memory, at the start of + * the first DSM segment controlled by this area. + */ +typedef struct +{ + /* The segment header for the first segment. */ + dsa_segment_header segment_header; + /* The handle for this area. */ + dsa_handle handle; + /* The handles of the segments owned by this area. */ + dsm_handle segment_handles[DSA_MAX_SEGMENTS]; + /* Lists of segments, binned by maximum contiguous run of free pages. */ + dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS]; + /* The object pools for each size class. */ + dsa_area_pool pools[DSA_NUM_SIZE_CLASSES]; + /* The total size of all active segments. */ + Size total_segment_size; + /* The maximum total size of backing storage we are allowed. */ + Size max_total_segment_size; + /* Highest used segment index in the history of this area. */ + dsa_segment_index high_segment_index; + /* The reference count for this area. */ + int refcnt; + /* A flag indicating that this area has been pinned. */ + bool pinned; + /* The number of times that segments have been freed. */ + Size freed_segment_counter; + /* The LWLock tranche ID. 
*/ + int lwlock_tranche_id; + char lwlock_tranche_name[DSA_MAXLEN]; + /* The general lock (protects everything except object pools). */ + LWLock lock; +} dsa_area_control; + +/* Given a pointer to a pool, find a dsa_pointer. */ +#define DsaAreaPoolToDsaPointer(area, p) \ + DSA_MAKE_POINTER(0, (char *) p - (char *) area->control) + +/* + * A dsa_segment_map is stored within the backend-private memory of each + * individual backend. It holds the base address of the segment within that + * backend, plus the addresses of key objects within the segment. Those + * could instead be derived from the base address but it's handy to have them + * around. + */ +typedef struct +{ + dsm_segment *segment; /* DSM segment */ + char *mapped_address; /* Address at which segment is mapped */ + dsa_segment_header *header; /* Header (same as mapped_address) */ + FreePageManager *fpm; /* Free page manager within segment. */ + dsa_pointer *pagemap; /* Page map within segment. */ +} dsa_segment_map; + +/* + * Per-backend state for a storage area. Backends obtain one of these by + * creating an area or attaching to an existing one using a handle. Each + * process that needs to use an area uses its own object to track where the + * segments are mapped. + */ +struct dsa_area +{ + /* Pointer to the control object in shared memory. */ + dsa_area_control *control; + + /* The lock tranche for this process. */ + LWLockTranche lwlock_tranche; + + /* + * All the mappings are owned by this. The dsa_area itself is not + * directly tracked by the ResourceOwner, but the effect is the same. NULL + * if the attachment has session lifespan, i.e. if dsa_pin_mapping() has + * been called. + */ + ResourceOwner resowner; + + /* + * This backend's array of segment maps, ordered by segment index + * corresponding to control->segment_handles. Some of the area's segments + * may not be mapped in this backend yet, and some slots may have been + * freed and need to be detached; these operations happen on demand.
+ */ + dsa_segment_map segment_maps[DSA_MAX_SEGMENTS]; + + /* The highest segment index this backend has ever mapped. */ + dsa_segment_index high_segment_index; + + /* The last observed freed_segment_counter. */ + Size freed_segment_counter; +}; + +#define DSA_SPAN_NOTHING_FREE ((uint16) -1) +#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE) + +/* Given a pointer to a segment_map, obtain a segment index number. */ +#define get_segment_index(area, segment_map_ptr) \ + (segment_map_ptr - &area->segment_maps[0]) + +static void init_span(dsa_area *area, dsa_pointer span_pointer, + dsa_area_pool *pool, dsa_pointer start, Size npages, + uint16 size_class); +static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool, + int fromclass, int toclass); +static inline dsa_pointer alloc_object(dsa_area *area, int size_class); +static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, + int size_class); +static dsa_segment_map *get_segment_by_index(dsa_area *area, + dsa_segment_index index); +static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer); +static void unlink_span(dsa_area *area, dsa_area_span *span); +static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, + dsa_pointer span_pointer, int fclass); +static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map); +static dsa_segment_map *get_best_segment(dsa_area *area, Size npages); +static dsa_segment_map *make_new_segment(dsa_area *area, Size requested_pages); +static dsa_area *create_internal(void *place, size_t size, + int tranche_id, const char *tranche_name, + dsm_handle control_handle, + dsm_segment *control_segment); +static dsa_area *attach_internal(void *place, dsm_segment *segment, + dsa_handle handle); +static void check_for_freed_segments(dsa_area *area); + +/* + * Create a new shared area in a new DSM segment. Further DSM segments will + * be allocated as required to extend the available space. 
+ * + * We can't allocate a LWLock tranche_id within this function, because tranche + * IDs are a scarce resource; there are only 64k available, using low numbers + * when possible matters, and we have no provision for recycling them. So, + * we require the caller to provide one. The caller must also provide the + * tranche name, so that we can distinguish LWLocks belonging to different + * DSAs. + */ +dsa_area * +dsa_create(int tranche_id, const char *tranche_name) +{ + dsm_segment *segment; + dsa_area *area; + + /* + * Create the DSM segment that will hold the shared control object and the + * first segment of usable space. + */ + segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE); + + /* + * All segments backing this area are pinned, so that DSA can explicitly + * control their lifetime (otherwise a newly created segment belonging to + * this area might be freed when the only backend that happens to have it + * mapped in ends, corrupting the area). + */ + dsm_pin_segment(segment); + + /* Create a new DSA area with the control objet in this segment. */ + area = create_internal(dsm_segment_address(segment), + DSA_INITIAL_SEGMENT_SIZE, + tranche_id, tranche_name, + dsm_segment_handle(segment), segment); + + /* Clean up when the control segment detaches. */ + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(dsm_segment_address(segment))); + + return area; +} + +/* + * Create a new shared area in an existing shared memory space, which may be + * either DSM or Postmaster-initialized memory. DSM segments will be + * allocated as required to extend the available space, though that can be + * prevented with dsa_set_size_limit(area, size) using the same size provided + * to dsa_create_in_place. + * + * Areas created in-place must eventually be released by the backend that + * created them and all backends that attach to them. 
This can be done + * explicitly with dsa_release_in_place, or, in the special case that 'place' + * happens to be in a pre-existing DSM segment, by passing in a pointer to the + * segment so that a detach hook can be registered with the containing DSM + * segment. + * + * See dsa_create() for a note about the tranche arguments. + */ +dsa_area * +dsa_create_in_place(void *place, size_t size, + int tranche_id, const char *tranche_name, + dsm_segment *segment) +{ + dsa_area *area; + + area = create_internal(place, size, tranche_id, tranche_name, + DSM_HANDLE_INVALID, NULL); + + /* + * Clean up when the control segment detaches, if a containing DSM segment + * was provided. + */ + if (segment != NULL) + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(place)); + + return area; +} + +/* + * Obtain a handle that can be passed to other processes so that they can + * attach to the given area. Cannot be called for areas created with + * dsa_create_in_place. + */ +dsa_handle +dsa_get_handle(dsa_area *area) +{ + Assert(area->control->handle != DSM_HANDLE_INVALID); + return area->control->handle; +} + +/* + * Attach to an area given a handle generated (possibly in another process) by + * dsa_get_area_handle. The area must have been created with dsa_create (not + * dsa_create_in_place). + */ +dsa_area * +dsa_attach(dsa_handle handle) +{ + dsm_segment *segment; + dsa_area *area; + + /* + * An area handle is really a DSM segment handle for the first segment, so + * we go ahead and attach to that. + */ + segment = dsm_attach(handle); + if (segment == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to dsa_handle"))); + + area = attach_internal(dsm_segment_address(segment), segment, handle); + + /* Clean up when the control segment detaches. 
*/ + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(dsm_segment_address(segment))); + + return area; +} + +/* + * Attach to an area that was created with dsa_create_in_place. The caller + * must somehow know the location in memory that was used when the area was + * created, though it may be mapped at a different virtual address in this + * process. + * + * See dsa_create_in_place for note about releasing in-place areas, and the + * optional 'segment' argument which can be provided to allow automatic + * release if the containing memory happens to be a DSM segment. + */ +dsa_area * +dsa_attach_in_place(void *place, dsm_segment *segment) +{ + dsa_area *area; + + area = attach_internal(place, NULL, DSM_HANDLE_INVALID); + + /* + * Clean up when the control segment detaches, if a containing DSM segment + * was provided. + */ + if (segment != NULL) + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(place)); + + return area; +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. The 'segment' argument is ignored but provides an + * interface suitable for on_dsm_detach, for the convenience of users who want + * to create a DSA segment inside an existing DSM segment and have it + * automatically released when the containing DSM segment is detached. + * 'place' should be the address of the place where the area was created. + * + * This callback is automatically registered for the DSM segment containing + * the control object of in-place areas when a segment is provided to + * dsa_create_in_place or dsa_attach_in_place, and also for all areas created + * with dsa_create. + */ +void +dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place) +{ + dsa_release_in_place(DatumGetPointer(place)); +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. 
The 'code' argument is ignored but provides an + * interface suitable for on_shmem_exit or before_shmem_exit, for the + * convenience of users who want to create a DSA segment inside shared memory + * other than a DSM segment and have it automatically release at backend exit. + * 'place' should be the address of the place where the area was created. + */ +void +dsa_on_shmem_exit_release_in_place(int code, Datum place) +{ + dsa_release_in_place(DatumGetPointer(place)); +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX' + * callbacks so that this is managed automatically, because failure to release + * an area created in-place leaks its segments permanently. + * + * This is also called automatically for areas produced by dsa_create or + * dsa_attach as an implementation detail. + */ +void +dsa_release_in_place(void *place) +{ + dsa_area_control *control = (dsa_area_control *) place; + int i; + + LWLockAcquire(&control->lock, LW_EXCLUSIVE); + Assert(control->segment_header.magic == + (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0)); + Assert(control->refcnt > 0); + if (--control->refcnt == 0) + { + for (i = 0; i <= control->high_segment_index; ++i) + { + dsm_handle handle; + + handle = control->segment_handles[i]; + if (handle != DSM_HANDLE_INVALID) + dsm_unpin_segment(handle); + } + } + LWLockRelease(&control->lock); +} + +/* + * Keep a DSA area attached until end of session or explicit detach. + * + * By default, areas are owned by the current resource owner, which means they + * are detached automatically when that scope ends. + */ +void +dsa_pin_mapping(dsa_area *area) +{ + int i; + + if (area->resowner != NULL) + { + area->resowner = NULL; + + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_pin_mapping(area->segment_maps[i].segment); + } +} + +/* + * Allocate memory in this storage area. 
The return value is a dsa_pointer
 * that can be passed to other processes, and converted to a local pointer
 * with dsa_get_address.  If no memory is available, returns
 * InvalidDsaPointer.
 */
dsa_pointer
dsa_allocate(dsa_area *area, Size size)
{
    uint16      size_class;

    Assert(size > 0);

    /*
     * Requests larger than the biggest size class bypass the pools: a run of
     * pages is taken straight from the free page manager.  A span object
     * still describes the allocation, but it is a special kind of span that
     * covers the whole run and hands all pages back when dsa_free is called.
     */
    if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
    {
        Size        page_count = fpm_size_to_pages(size);
        Size        run_start;
        dsa_pointer span_pointer;
        dsa_pointer start_pointer;
        dsa_segment_map *segment_map;
        dsa_area_pool *large_pool =
            &area->control->pools[DSA_SCLASS_SPAN_LARGE];

        /* The span descriptor itself comes from the block-of-spans pool. */
        span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
        if (!DsaPointerIsValid(span_pointer))
            return InvalidDsaPointer;

        LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);

        /* Pick an existing segment with room, or else create a new one. */
        segment_map = get_best_segment(area, page_count);
        if (segment_map == NULL)
        {
            segment_map = make_new_segment(area, page_count);
            if (segment_map == NULL)
            {
                /* No more segments can be made: give the span back. */
                LWLockRelease(DSA_AREA_LOCK(area));
                dsa_free(area, span_pointer);
                return InvalidDsaPointer;
            }
        }

        /*
         * get_best_segment and make_new_segment are expected to return only
         * segments that really hold enough contiguous free pages, so this
         * request should always succeed.  A failure means our backend-private
         * state is out of whack, hence FATAL.
         */
        if (!FreePageManagerGet(segment_map->fpm, page_count, &run_start))
            elog(FATAL,
                 "dsa_allocate could not find %zu free pages", page_count);
        LWLockRelease(DSA_AREA_LOCK(area));

        start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
                                         run_start * FPM_PAGE_SIZE);

        /* Set up the span and point the page map at it. */
        LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
                      LW_EXCLUSIVE);
        init_span(area, span_pointer, large_pool, start_pointer, page_count,
                  DSA_SCLASS_SPAN_LARGE);
        segment_map->pagemap[run_start] = span_pointer;
        LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));

        return start_pointer;
    }

    /* Translate the request size into a size class. */
    if (size >= lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
    {
        /* Beyond the lookup table: binary search over the class sizes. */
        uint16      lo = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
        uint16      hi = lengthof(dsa_size_classes) - 1;

        while (lo < hi)
        {
            uint16      mid = (lo + hi) / 2;
            uint16      class_size = dsa_size_classes[mid];

            if (class_size < size)
                lo = mid + 1;
            else
                hi = mid;
        }

        size_class = lo;
    }
    else
    {
        /* Small sizes are resolved with a direct table lookup. */
        int         mapidx;

        mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
                  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
        size_class = dsa_size_class_map[mapidx];
    }
    Assert(size <= dsa_size_classes[size_class]);
    Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);

    /*
     * Take an object from the matching pool; the result is InvalidDsaPointer
     * when no space is available.
     */
    return alloc_object(area, size_class);
}

/*
 * Free memory obtained with dsa_allocate.
+ */ +void +dsa_free(dsa_area *area, dsa_pointer dp) +{ + dsa_segment_map *segment_map; + int pageno; + dsa_pointer span_pointer; + dsa_area_span *span; + char *superblock; + char *object; + Size size; + int size_class; + + /* Make sure we don't have a stale segment in the slot 'dp' refers to. */ + check_for_freed_segments(area); + + /* Locate the object, span and pool. */ + segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp)); + pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE; + span_pointer = segment_map->pagemap[pageno]; + span = dsa_get_address(area, span_pointer); + superblock = dsa_get_address(area, span->start); + object = dsa_get_address(area, dp); + size_class = span->size_class; + size = dsa_size_classes[size_class]; + + /* + * Special case for large objects that live in a special span: we return + * those pages directly to the free page manager and free the span. + */ + if (span->size_class == DSA_SCLASS_SPAN_LARGE) + { + +#ifdef CLOBBER_FREED_MEMORY + memset(object, 0x7f, span->npages * FPM_PAGE_SIZE); +#endif + + /* Give pages back to free page manager. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + FreePageManagerPut(segment_map->fpm, + DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, + span->npages); + LWLockRelease(DSA_AREA_LOCK(area)); + /* Unlink span. */ + LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), + LW_EXCLUSIVE); + unlink_span(area, span); + LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); + /* Free the span object so it can be reused. */ + dsa_free(area, span_pointer); + return; + } + +#ifdef CLOBBER_FREED_MEMORY + memset(object, 0x7f, size); +#endif + + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + + /* Put the object on the span's freelist. 
*/ + Assert(object >= superblock); + Assert(object < superblock + DSA_SUPERBLOCK_SIZE); + Assert((object - superblock) % size == 0); + NextFreeObjectIndex(object) = span->firstfree; + span->firstfree = (object - superblock) / size; + ++span->nallocatable; + + /* + * See if the span needs to moved to a different fullness class, or be + * freed so its pages can be given back to the segment. + */ + if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1) + { + /* + * The block was completely full and is located in the + * highest-numbered fullness class, which is never scanned for free + * chunks. We must move it to the next-lower fullness class. + */ + unlink_span(area, span); + add_span_to_fullness_class(area, span, span_pointer, + DSA_FULLNESS_CLASSES - 2); + + /* + * If this is the only span, and there is no active span, then we + * should probably move this span to fullness class 1. (Otherwise if + * you allocate exactly all the objects in the only span, it moves to + * class 3, then you free them all, it moves to 2, and then is given + * back, leaving no active span). + */ + } + else if (span->nallocatable == span->nmax && + (span->fclass != 1 || span->prevspan != InvalidDsaPointer)) + { + /* + * This entire block is free, and it's not the active block for this + * size class. Return the memory to the free page manager. We don't + * do this for the active block to prevent hysteresis: if we + * repeatedly allocate and free the only chunk in the active block, it + * will be very inefficient if we deallocate and reallocate the block + * every time. + */ + destroy_superblock(area, span_pointer); + } + + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); +} + +/* + * Obtain a backend-local address for a dsa_pointer. 'dp' must point to + * memory allocated by the given area (possibly in another process) that + * hasn't yet been freed. 
This may cause a segment to be mapped into the + * current process if required, and may cause freed segments to be unmapped. + */ +void * +dsa_get_address(dsa_area *area, dsa_pointer dp) +{ + dsa_segment_index index; + Size offset; + + /* Convert InvalidDsaPointer to NULL. */ + if (!DsaPointerIsValid(dp)) + return NULL; + + /* Process any requests to detach from freed segments. */ + check_for_freed_segments(area); + + /* Break the dsa_pointer into its components. */ + index = DSA_EXTRACT_SEGMENT_NUMBER(dp); + offset = DSA_EXTRACT_OFFSET(dp); + Assert(index < DSA_MAX_SEGMENTS); + + /* Check if we need to cause this segment to be mapped in. */ + if (unlikely(area->segment_maps[index].mapped_address == NULL)) + { + /* Call for effect (we don't need the result). */ + get_segment_by_index(area, index); + } + + return area->segment_maps[index].mapped_address + offset; +} + +/* + * Pin this area, so that it will continue to exist even if all backends + * detach from it. In that case, the area can still be reattached to if a + * handle has been recorded somewhere. + */ +void +dsa_pin(dsa_area *area) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + if (area->control->pinned) + { + LWLockRelease(DSA_AREA_LOCK(area)); + elog(ERROR, "dsa_area already pinned"); + } + area->control->pinned = true; + ++area->control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Undo the effects of dsa_pin, so that the given area can be freed when no + * backends are attached to it. May be called only if dsa_pin has been + * called. + */ +void +dsa_unpin(dsa_area *area) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + Assert(area->control->refcnt > 1); + if (!area->control->pinned) + { + LWLockRelease(DSA_AREA_LOCK(area)); + elog(ERROR, "dsa_area not pinned"); + } + area->control->pinned = false; + --area->control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Set the total size limit for this area. 
This limit is checked whenever new + * segments need to be allocated from the operating system. If the new size + * limit is already exceeded, this has no immediate effect. + * + * Note that the total virtual memory usage may be temporarily larger than + * this limit when segments have been freed, but not yet detached by all + * backends that have attached to them. + */ +void +dsa_set_size_limit(dsa_area *area, Size limit) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + area->control->max_total_segment_size = limit; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Aggressively free all spare memory in the hope of returning DSM segments to + * the operating system. + */ +void +dsa_trim(dsa_area *area) +{ + int size_class; + + /* + * Trim in reverse pool order so we get to the spans-of-spans last, just + * in case any become entirely free while processing all the other pools. + */ + for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class) + { + dsa_area_pool *pool = &area->control->pools[size_class]; + dsa_pointer span_pointer; + + if (size_class == DSA_SCLASS_SPAN_LARGE) + { + /* Large object frees give back segments aggressively already. */ + continue; + } + + /* + * Search fullness class 1 only. That is where we expect to find an + * entirely empty superblock (entirely empty superblocks in other + * fullness classes are returned to the free page map by dsa_free). + */ + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + span_pointer = pool->spans[1]; + while (DsaPointerIsValid(span_pointer)) + { + dsa_area_span *span = dsa_get_address(area, span_pointer); + dsa_pointer next = span->nextspan; + + if (span->nallocatable == span->nmax) + destroy_superblock(area, span_pointer); + + span_pointer = next; + } + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); + } +} + +/* + * Print out debugging information about the internal state of the shared + * memory area. 
+ */ +void +dsa_dump(dsa_area *area) +{ + Size i, + j; + + /* + * Note: This gives an inconsistent snapshot as it acquires and releases + * individual locks as it goes... + */ + + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + fprintf(stderr, "dsa_area handle %x:\n", area->control->handle); + fprintf(stderr, " max_total_segment_size: %zu\n", + area->control->max_total_segment_size); + fprintf(stderr, " total_segment_size: %zu\n", + area->control->total_segment_size); + fprintf(stderr, " refcnt: %d\n", area->control->refcnt); + fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f'); + fprintf(stderr, " segment bins:\n"); + for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) + { + if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_index segment_index; + + fprintf(stderr, + " segment bin %zu (at least %d contiguous pages free):\n", + i, 1 << (i - 1)); + segment_index = area->control->segment_bins[i]; + while (segment_index != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *segment_map; + + segment_map = + get_segment_by_index(area, segment_index); + + fprintf(stderr, + " segment index %zu, usable_pages = %zu, " + "contiguous_pages = %zu, mapped at %p\n", + segment_index, + segment_map->header->usable_pages, + fpm_largest(segment_map->fpm), + segment_map->mapped_address); + segment_index = segment_map->header->next; + } + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + + fprintf(stderr, " pools:\n"); + for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) + { + bool found = false; + + LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE); + for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) + if (DsaPointerIsValid(area->control->pools[i].spans[j])) + found = true; + if (found) + { + if (i == DSA_SCLASS_BLOCK_OF_SPANS) + fprintf(stderr, " pool for blocks of span objects:\n"); + else if (i == DSA_SCLASS_SPAN_LARGE) + fprintf(stderr, " pool for large object spans:\n"); + else + fprintf(stderr, + " pool for size class %zu (object size %hu bytes):\n", + i, 
dsa_size_classes[i]); + for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) + { + if (!DsaPointerIsValid(area->control->pools[i].spans[j])) + fprintf(stderr, " fullness class %zu is empty\n", j); + else + { + dsa_pointer span_pointer = area->control->pools[i].spans[j]; + + fprintf(stderr, " fullness class %zu:\n", j); + while (DsaPointerIsValid(span_pointer)) + { + dsa_area_span *span; + + span = dsa_get_address(area, span_pointer); + fprintf(stderr, + " span descriptor at %016lx, " + "superblock at %016lx, pages = %zu, " + "objects free = %hu/%hu\n", + span_pointer, span->start, span->npages, + span->nallocatable, span->nmax); + span_pointer = span->nextspan; + } + } + } + } + LWLockRelease(DSA_SCLASS_LOCK(area, i)); + } +} + +/* + * Return the smallest size that you can successfully provide to + * dsa_create_in_place. + */ +Size +dsa_minimum_size(void) +{ + Size size; + int pages = 0; + + size = MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager)); + + /* Figure out how many pages we need, including the page map... */ + while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages) + { + ++pages; + size += sizeof(dsa_pointer); + } + + return pages * FPM_PAGE_SIZE; +} + +/* + * Workhorse function for dsa_create and dsa_create_in_place. + */ +static dsa_area * +create_internal(void *place, size_t size, + int tranche_id, const char *tranche_name, + dsm_handle control_handle, + dsm_segment *control_segment) +{ + dsa_area_control *control; + dsa_area *area; + dsa_segment_map *segment_map; + Size usable_pages; + Size total_pages; + Size metadata_bytes; + int i; + + /* Sanity check on the space we have to work in. 
*/ + if (size < dsa_minimum_size()) + elog(ERROR, "dsa_area space must be at least %zu, but %zu provided", + dsa_minimum_size(), size); + + /* Now figure out how much space is usuable */ + total_pages = size / FPM_PAGE_SIZE; + metadata_bytes = + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager)) + + total_pages * sizeof(dsa_pointer); + /* Add padding up to next page boundary. */ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + Assert(metadata_bytes <= size); + usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE; + + /* + * Initialize the dsa_area_control object located at the start of the + * space. + */ + control = (dsa_area_control *) place; + control->segment_header.magic = + DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0; + control->segment_header.next = DSA_SEGMENT_INDEX_NONE; + control->segment_header.prev = DSA_SEGMENT_INDEX_NONE; + control->segment_header.usable_pages = usable_pages; + control->segment_header.freed = false; + control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE; + control->handle = control_handle; + control->max_total_segment_size = SIZE_MAX; + control->total_segment_size = size; + memset(&control->segment_handles[0], 0, + sizeof(dsm_handle) * DSA_MAX_SEGMENTS); + control->segment_handles[0] = control_handle; + for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) + control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE; + control->high_segment_index = 0; + control->refcnt = 1; + control->freed_segment_counter = 0; + control->lwlock_tranche_id = tranche_id; + strlcpy(control->lwlock_tranche_name, tranche_name, DSA_MAXLEN); + + /* + * Create the dsa_area object that this backend will use to access the + * area. Other backends will need to obtain their own dsa_area object by + * attaching. 
+ */ + area = palloc(sizeof(dsa_area)); + area->control = control; + area->resowner = CurrentResourceOwner; + memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); + area->high_segment_index = 0; + area->lwlock_tranche.array_base = &area->control->pools[0]; + area->lwlock_tranche.array_stride = sizeof(dsa_area_pool); + area->lwlock_tranche.name = control->lwlock_tranche_name; + LWLockRegisterTranche(control->lwlock_tranche_id, &area->lwlock_tranche); + LWLockInitialize(&control->lock, control->lwlock_tranche_id); + for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) + LWLockInitialize(DSA_SCLASS_LOCK(area, i), + control->lwlock_tranche_id); + + /* Set up the segment map for this process's mapping. */ + segment_map = &area->segment_maps[0]; + segment_map->segment = control_segment; + segment_map->mapped_address = place; + segment_map->header = (dsa_segment_header *) place; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_area_control))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Set up the free page map. */ + FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); + /* There can be 0 usable pages if size is dsa_minimum_size(). */ + + if (usable_pages > 0) + FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, + usable_pages); + + /* Put this segment into the appropriate bin. */ + control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0; + segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); + + return area; +} + +/* + * Workhorse function for dsa_attach and dsa_attach_in_place. 
+ */ +static dsa_area * +attach_internal(void *place, dsm_segment *segment, dsa_handle handle) +{ + dsa_area_control *control; + dsa_area *area; + dsa_segment_map *segment_map; + + control = (dsa_area_control *) place; + Assert(control->handle == handle); + Assert(control->segment_handles[0] == handle); + Assert(control->segment_header.magic == + (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0)); + + /* Build the backend-local area object. */ + area = palloc(sizeof(dsa_area)); + area->control = control; + area->resowner = CurrentResourceOwner; + memset(&area->segment_maps[0], 0, + sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); + area->high_segment_index = 0; + area->lwlock_tranche.array_base = &area->control->pools[0]; + area->lwlock_tranche.array_stride = sizeof(dsa_area_pool); + area->lwlock_tranche.name = control->lwlock_tranche_name; + LWLockRegisterTranche(control->lwlock_tranche_id, &area->lwlock_tranche); + + /* Set up the segment map for this process's mapping. */ + segment_map = &area->segment_maps[0]; + segment_map->segment = segment; /* NULL for in-place */ + segment_map->mapped_address = place; + segment_map->header = (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Bump the reference count. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + ++control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); + + return area; +} + +/* + * Add a new span to fullness class 1 of the indicated pool. 
+ */
+static void
+init_span(dsa_area *area,
+		  dsa_pointer span_pointer,
+		  dsa_area_pool *pool, dsa_pointer start, Size npages,
+		  uint16 size_class)
+{
+	dsa_area_span *span = dsa_get_address(area, span_pointer);
+	Size		obsize = dsa_size_classes[size_class];
+
+	/*
+	 * The per-pool lock must be held because we manipulate the span list for
+	 * this pool.
+	 */
+	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+	/* Push this span onto the front of the span list for fullness class 1. */
+	if (DsaPointerIsValid(pool->spans[1]))
+	{
+		dsa_area_span *head = (dsa_area_span *)
+		dsa_get_address(area, pool->spans[1]);
+
+		head->prevspan = span_pointer;	/* old head now points back at us */
+	}
+	span->pool = DsaAreaPoolToDsaPointer(area, pool);
+	span->nextspan = pool->spans[1];
+	span->prevspan = InvalidDsaPointer;
+	pool->spans[1] = span_pointer;
+
+	span->start = start;
+	span->npages = npages;
+	span->size_class = size_class;
+	span->ninitialized = 0;
+	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+	{
+		/*
+		 * A block-of-spans contains its own descriptor, so mark one object as
+		 * initialized and reduce the count of allocatable objects by one.
+		 * Doing this here has the side effect of also reducing nmax by one,
+		 * which is important to make sure we free this object at the correct
+		 * time.
+		 */
+		span->ninitialized = 1;
+		span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
+	}
+	else if (size_class != DSA_SCLASS_SPAN_LARGE)
+		span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
+	span->firstfree = DSA_SPAN_NOTHING_FREE;	/* free list starts out empty */
+	span->nmax = span->nallocatable;	/* NOTE(review): nallocatable is not set above for SPAN_LARGE */
+	span->fclass = 1;
+}
+
+/*
+ * Transfer the first span in one fullness class to the head of another
+ * fullness class.  Returns false, changing nothing, if 'fromclass' is empty.
+ */
+static bool
+transfer_first_span(dsa_area *area,
+					dsa_area_pool *pool, int fromclass, int toclass)
+{
+	dsa_pointer span_pointer;
+	dsa_area_span *span;
+	dsa_area_span *nextspan;
+
+	/* Can't do it if source list is empty. */
+	span_pointer = pool->spans[fromclass];
+	if (!DsaPointerIsValid(span_pointer))
+		return false;
+
+	/* Remove span from head of source list; its successor becomes the head. */
+	span = dsa_get_address(area, span_pointer);
+	pool->spans[fromclass] = span->nextspan;
+	if (DsaPointerIsValid(span->nextspan))
+	{
+		nextspan = (dsa_area_span *)
+			dsa_get_address(area, span->nextspan);
+		nextspan->prevspan = InvalidDsaPointer;
+	}
+
+	/* Add span to head of target list, back-linking the previous head. */
+	span->nextspan = pool->spans[toclass];
+	pool->spans[toclass] = span_pointer;
+	if (DsaPointerIsValid(span->nextspan))
+	{
+		nextspan = (dsa_area_span *)
+			dsa_get_address(area, span->nextspan);
+		nextspan->prevspan = span_pointer;
+	}
+	span->fclass = toclass;		/* keep the span's own record of its list */
+
+	return true;
+}
+
+/*
+ * Allocate one object of the given size class; InvalidDsaPointer on failure.
+ */
+static inline dsa_pointer
+alloc_object(dsa_area *area, int size_class)
+{
+	dsa_area_pool *pool = &area->control->pools[size_class];
+	dsa_area_span *span;
+	dsa_pointer block;
+	dsa_pointer result;
+	char	   *object;
+	Size		size;
+
+	/*
+	 * Even though ensure_active_superblock can in turn call alloc_object if
+	 * it needs to allocate a new span, that's always from a different pool,
+	 * and the order of lock acquisition is always the same, so it's OK that
+	 * we hold this lock for the duration of this function.
+	 */
+	Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+	LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+	/*
+	 * If there's no active superblock, we must successfully obtain one or
+	 * fail the request.
+	 */
+	if (!DsaPointerIsValid(pool->spans[1]) &&
+		!ensure_active_superblock(area, pool, size_class))
+	{
+		result = InvalidDsaPointer;	/* no superblock could be obtained */
+	}
+	else
+	{
+		/*
+		 * There should be a block in fullness class 1 at this point, and it
+		 * should never be completely full. Thus we can either pop an object
+		 * from the free list or, failing that, initialize a new object.
+		 */
+		Assert(DsaPointerIsValid(pool->spans[1]));
+		span = (dsa_area_span *)
+			dsa_get_address(area, pool->spans[1]);
+		Assert(span->nallocatable > 0);
+		block = span->start;
+		Assert(size_class < DSA_NUM_SIZE_CLASSES);
+		size = dsa_size_classes[size_class];
+		if (span->firstfree != DSA_SPAN_NOTHING_FREE)
+		{
+			result = block + span->firstfree * size;	/* reuse a freed slot */
+			object = dsa_get_address(area, result);
+			span->firstfree = NextFreeObjectIndex(object);	/* pop it */
+		}
+		else
+		{
+			result = block + span->ninitialized * size;	/* next unused slot */
+			++span->ninitialized;
+		}
+		--span->nallocatable;
+
+		/* If it's now full, move it to the highest-numbered fullness class. */
+		if (span->nallocatable == 0)
+			transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
+	}
+
+	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+	LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+
+	return result;
+}
+
+/*
+ * Ensure an active (i.e. fullness class 1) superblock, unless all existing
+ * superblocks are completely full and no more can be allocated.
+ *
+ * Fullness classes K of 0..N are loosely intended to represent blocks whose
+ * utilization percentage is at least K/N, but we only enforce this rigorously
+ * for the highest-numbered fullness class, which always contains exactly
+ * those blocks that are completely full. It's otherwise acceptable for a
+ * block to be in a higher-numbered fullness class than the one to which it
+ * logically belongs. In addition, the active block, which is always the
+ * first block in fullness class 1, is permitted to have a higher allocation
+ * percentage than would normally be allowable for that fullness class; we
+ * don't move it until it's completely full, and then it goes to the
+ * highest-numbered fullness class.
+ * + * It might seem odd that the active block is the head of fullness class 1 + * rather than fullness class 0, but experience with other allocators has + * shown that it's usually better to allocate from a block that's moderately + * full rather than one that's nearly empty. Insofar as is reasonably + * possible, we want to avoid performing new allocations in a block that would + * otherwise become empty soon. + */ +static bool +ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, + int size_class) +{ + dsa_pointer span_pointer; + dsa_pointer start_pointer; + Size obsize = dsa_size_classes[size_class]; + Size nmax; + int fclass; + Size npages = 1; + Size first_page; + Size i; + dsa_segment_map *segment_map; + + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + + /* + * Compute the number of objects that will fit in a block of this size + * class. Span-of-spans blocks are just a single page, and the first + * object isn't available for use because it describes the block-of-spans + * itself. + */ + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + nmax = FPM_PAGE_SIZE / obsize - 1; + else + nmax = DSA_SUPERBLOCK_SIZE / obsize; + + /* + * If fullness class 1 is empty, try to find a span to put in it by + * scanning higher-numbered fullness classes (excluding the last one, + * whose blocks are certain to all be completely full). + */ + for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) + { + span_pointer = pool->spans[fclass]; + + while (DsaPointerIsValid(span_pointer)) + { + int tfclass; + dsa_area_span *span; + dsa_area_span *nextspan; + dsa_area_span *prevspan; + dsa_pointer next_span_pointer; + + span = (dsa_area_span *) + dsa_get_address(area, span_pointer); + next_span_pointer = span->nextspan; + + /* Figure out what fullness class should contain this span. */ + tfclass = (nmax - span->nallocatable) + * (DSA_FULLNESS_CLASSES - 1) / nmax; + + /* Look up next span. 
*/ + if (DsaPointerIsValid(span->nextspan)) + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + else + nextspan = NULL; + + /* + * If utilization has dropped enough that this now belongs in some + * other fullness class, move it there. + */ + if (tfclass < fclass) + { + /* Remove from the current fullness class list. */ + if (pool->spans[fclass] == span_pointer) + { + /* It was the head; remove it. */ + Assert(!DsaPointerIsValid(span->prevspan)); + pool->spans[fclass] = span->nextspan; + if (nextspan != NULL) + nextspan->prevspan = InvalidDsaPointer; + } + else + { + /* It was not the head. */ + Assert(DsaPointerIsValid(span->prevspan)); + prevspan = (dsa_area_span *) + dsa_get_address(area, span->prevspan); + prevspan->nextspan = span->nextspan; + } + if (nextspan != NULL) + nextspan->prevspan = span->prevspan; + + /* Push onto the head of the new fullness class list. */ + span->nextspan = pool->spans[tfclass]; + pool->spans[tfclass] = span_pointer; + span->prevspan = InvalidDsaPointer; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = span_pointer; + } + span->fclass = tfclass; + } + + /* Advance to next span on list. */ + span_pointer = next_span_pointer; + } + + /* Stop now if we found a suitable block. */ + if (DsaPointerIsValid(pool->spans[1])) + return true; + } + + /* + * If there are no blocks that properly belong in fullness class 1, pick + * one from some other fullness class and move it there anyway, so that we + * have an allocation target. Our last choice is to transfer a block + * that's almost empty (and might become completely empty soon if left + * alone), but even that is better than failing, which is what we must do + * if there are no blocks at all with freespace. 
+ */ + Assert(!DsaPointerIsValid(pool->spans[1])); + for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) + if (transfer_first_span(area, pool, fclass, 1)) + return true; + if (!DsaPointerIsValid(pool->spans[1]) && + transfer_first_span(area, pool, 0, 1)) + return true; + + /* + * We failed to find an existing span with free objects, so we need to + * allocate a new superblock and construct a new span to manage it. + * + * First, get a dsa_area_span object to describe the new superblock block + * ... unless this allocation is for a dsa_area_span object, in which case + * that's surely not going to work. We handle that case by storing the + * span describing a block-of-spans inline. + */ + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + { + span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); + if (!DsaPointerIsValid(span_pointer)) + return false; + npages = DSA_PAGES_PER_SUPERBLOCK; + } + + /* Find or create a segment and allocate the superblock. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + segment_map = get_best_segment(area, npages); + if (segment_map == NULL) + { + segment_map = make_new_segment(area, npages); + if (segment_map == NULL) + { + LWLockRelease(DSA_AREA_LOCK(area)); + return false; + } + } + if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) + { + LWLockRelease(DSA_AREA_LOCK(area)); + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + dsa_free(area, span_pointer); + return false; + } + LWLockRelease(DSA_AREA_LOCK(area)); + + /* Compute the start of the superblock. */ + start_pointer = + DSA_MAKE_POINTER(get_segment_index(area, segment_map), + first_page * FPM_PAGE_SIZE); + + /* + * If this is a block-of-spans, carve the descriptor right out of the + * allocated space. + */ + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + { + /* + * We have a pointer into the segment. We need to build a dsa_pointer + * from the segment index and offset into the segment. 
+ */ + span_pointer = start_pointer; + } + + /* Initialize span and pagemap. */ + init_span(area, span_pointer, pool, start_pointer, npages, size_class); + for (i = 0; i < npages; ++i) + segment_map->pagemap[first_page + i] = span_pointer; + + return true; +} + +/* + * Return the segment map corresponding to a given segment index, mapping the + * segment in if necessary. For internal segment book-keeping, this is called + * with the area lock held. It is also called by dsa_free and dsa_get_address + * without any locking, relying on the fact they have a known live segment + * index and they always call check_for_freed_segments to ensures that any + * freed segment occupying the same slot is detached first. + */ +static dsa_segment_map * +get_segment_by_index(dsa_area *area, dsa_segment_index index) +{ + if (unlikely(area->segment_maps[index].mapped_address == NULL)) + { + dsm_handle handle; + dsm_segment *segment; + dsa_segment_map *segment_map; + ResourceOwner oldowner; + + /* + * If we are reached by dsa_free or dsa_get_address, there must be at + * least one object allocated in the referenced segment. Otherwise, + * their caller has a double-free or access-after-free bug, which we + * have no hope of detecting. So we know it's safe to access this + * array slot without holding a lock; it won't change underneath us. + * Furthermore, we know that we can see the latest contents of the + * slot, as explained in check_for_freed_segments, which those + * functions call before arriving here. + */ + handle = area->control->segment_handles[index]; + + /* It's an erro to try to access an unused slot. 
*/ + if (handle == DSM_HANDLE_INVALID) + elog(ERROR, + "dsa_area could not attach to a segment that has been freed"); + + oldowner = CurrentResourceOwner; + CurrentResourceOwner = area->resowner; + segment = dsm_attach(handle); + CurrentResourceOwner = oldowner; + if (segment == NULL) + elog(ERROR, "dsa_area could not attach to segment"); + segment_map = &area->segment_maps[index]; + segment_map->segment = segment; + segment_map->mapped_address = dsm_segment_address(segment); + segment_map->header = + (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Remember the highest index this backend has ever mapped. */ + if (area->high_segment_index < index) + area->high_segment_index = index; + + Assert(segment_map->header->magic == + (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index)); + } + + return &area->segment_maps[index]; +} + +/* + * Return a superblock to the free page manager. If the underlying segment + * has become entirely free, then return it to the operating system. + * + * The appropriate pool lock must be held. + */ +static void +destroy_superblock(dsa_area *area, dsa_pointer span_pointer) +{ + dsa_area_span *span = dsa_get_address(area, span_pointer); + int size_class = span->size_class; + dsa_segment_map *segment_map; + + segment_map = + get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); + + /* Remove it from its fullness class list. */ + unlink_span(area, span); + + /* + * Note: Here we acquire the area lock while we already hold a per-pool + * lock. We never hold the area lock and then take a pool lock, or we + * could deadlock. 
+ */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + FreePageManagerPut(segment_map->fpm, + DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, + span->npages); + /* Check if the segment is now entirely free. */ + if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages) + { + dsa_segment_index index = get_segment_index(area, segment_map); + + /* If it's not the segment with extra control data, free it. */ + if (index != 0) + { + /* + * Give it back to the OS, and allow other backends to detect that + * they need to detach. + */ + unlink_segment(area, segment_map); + segment_map->header->freed = true; + Assert(area->control->total_segment_size >= + segment_map->header->size); + area->control->total_segment_size -= + segment_map->header->size; + dsm_unpin_segment(dsm_segment_handle(segment_map->segment)); + dsm_detach(segment_map->segment); + area->control->segment_handles[index] = DSM_HANDLE_INVALID; + ++area->control->freed_segment_counter; + segment_map->segment = NULL; + segment_map->header = NULL; + segment_map->mapped_address = NULL; + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + + /* + * Span-of-spans blocks store the span which describes them within the + * block itself, so freeing the storage implicitly frees the descriptor + * also. If this is a block of any other type, we need to separately free + * the span object also. This recursive call to dsa_free will acquire the + * span pool's lock. We can't deadlock because the acquisition order is + * always some other pool and then the span pool. 
+ */ + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + dsa_free(area, span_pointer); +} + +static void +unlink_span(dsa_area *area, dsa_area_span *span) +{ + if (DsaPointerIsValid(span->nextspan)) + { + dsa_area_span *next = dsa_get_address(area, span->nextspan); + + next->prevspan = span->prevspan; + } + if (DsaPointerIsValid(span->prevspan)) + { + dsa_area_span *prev = dsa_get_address(area, span->prevspan); + + prev->nextspan = span->nextspan; + } + else + { + dsa_area_pool *pool = dsa_get_address(area, span->pool); + + pool->spans[span->fclass] = span->nextspan; + } +} + +static void +add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, + dsa_pointer span_pointer, + int fclass) +{ + dsa_area_pool *pool = dsa_get_address(area, span->pool); + + if (DsaPointerIsValid(pool->spans[fclass])) + { + dsa_area_span *head = dsa_get_address(area, + pool->spans[fclass]); + + head->prevspan = span_pointer; + } + span->prevspan = InvalidDsaPointer; + span->nextspan = pool->spans[fclass]; + pool->spans[fclass] = span_pointer; + span->fclass = fclass; +} + +/* + * Detach from an area that was either created or attached to by this process. + */ +void +dsa_detach(dsa_area *area) +{ + int i; + + /* Detach from all segments. */ + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_detach(area->segment_maps[i].segment); + + /* + * Note that 'detaching' (= detaching from DSM segments) doesn't include + * 'releasing' (= adjusting the reference count). It would be nice to + * combine these operations, but client code might never get around to + * calling dsa_detach because of an error path, and a detach hook on any + * particular segment is too late to detach other segments in the area + * without risking a 'leak' warning in the non-error path. + */ + + /* Free the backend-local area object. */ + pfree(area); +} + +/* + * Unlink a segment from the bin that contains it. 
+ */
+static void
+unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
+{
+	if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
+	{
+		dsa_segment_map *prev;
+
+		prev = get_segment_by_index(area, segment_map->header->prev);
+		prev->header->next = segment_map->header->next;
+	}
+	else
+	{
+		Assert(area->control->segment_bins[segment_map->header->bin] ==
+			   get_segment_index(area, segment_map));	/* it must be the bin's head */
+		area->control->segment_bins[segment_map->header->bin] =
+			segment_map->header->next;
+	}
+	if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+	{
+		dsa_segment_map *next;
+
+		next = get_segment_by_index(area, segment_map->header->next);
+		next->header->prev = segment_map->header->prev;
+	}
+}
+
+/*
+ * Find a segment that could satisfy a request for 'npages' of contiguous
+ * memory, or return NULL if none can be found. This may involve attaching to
+ * segments that weren't previously attached so that we can query their free
+ * pages map.  The caller must hold the area lock (asserted below).
+ */
+static dsa_segment_map *
+get_best_segment(dsa_area *area, Size npages)
+{
+	Size		bin;
+
+	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+
+	/*
+	 * Start searching from the first bin that *might* have enough contiguous
+	 * pages.
+	 */
+	for (bin = contiguous_pages_to_segment_bin(npages);
+		 bin < DSA_NUM_SEGMENT_BINS;
+		 ++bin)
+	{
+		/*
+		 * The minimum contiguous size that any segment in this bin should
+		 * have. We'll re-bin if we see segments with fewer.
+		 */
+		Size		threshold = 1 << (bin - 1);	/* NOTE(review): int shift; relies on bin >= 1 */
+		dsa_segment_index segment_index;
+
+		/* Search this bin for a segment with enough contiguous space. */
+		segment_index = area->control->segment_bins[bin];
+		while (segment_index != DSA_SEGMENT_INDEX_NONE)
+		{
+			dsa_segment_map *segment_map;
+			dsa_segment_index next_segment_index;
+			Size		contiguous_pages;
+
+			segment_map = get_segment_by_index(area, segment_index);
+			next_segment_index = segment_map->header->next;	/* saved: re-binning rewrites header->next */
+			contiguous_pages = fpm_largest(segment_map->fpm);
+
+			/* Not enough for the request, still enough for this bin. */
+			if (contiguous_pages >= threshold && contiguous_pages < npages)
+			{
+				segment_index = next_segment_index;
+				continue;
+			}
+
+			/* Re-bin it if it's no longer in the appropriate bin. */
+			if (contiguous_pages < threshold)
+			{
+				Size		new_bin;
+
+				new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
+
+				/* Remove it from its current bin. */
+				unlink_segment(area, segment_map);
+
+				/* Push it onto the front of its new bin. */
+				segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+				segment_map->header->next =
+					area->control->segment_bins[new_bin];
+				segment_map->header->bin = new_bin;
+				area->control->segment_bins[new_bin] = segment_index;
+				if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+				{
+					dsa_segment_map *next;
+
+					next = get_segment_by_index(area,
+												segment_map->header->next);
+					Assert(next->header->bin == new_bin);
+					next->header->prev = segment_index;
+				}
+
+				/*
+				 * But fall through to see if it's enough to satisfy this
+				 * request anyway....
+				 */
+			}
+
+			/* Check if we are done. */
+			if (contiguous_pages >= npages)
+				return segment_map;
+
+			/* Continue searching the same bin, using the saved successor. */
+			segment_index = next_segment_index;
+		}
+	}
+
+	/* Not found. */
+	return NULL;
+}
+
+/*
+ * Create a new segment that can handle at least requested_pages. Returns
+ * NULL if the requested total size limit or maximum allowed number of
+ * segments would be exceeded.
+ */
+static dsa_segment_map *
+make_new_segment(dsa_area *area, Size requested_pages)
+{
+	dsa_segment_index new_index;
+	Size		metadata_bytes;
+	Size		total_size;
+	Size		total_pages;
+	Size		usable_pages;
+	dsa_segment_map *segment_map;
+	dsm_segment *segment;
+	ResourceOwner oldowner;
+
+	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+
+	/* Find a segment slot that is not in use (linearly for now, from 1). */
+	for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
+	{
+		if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
+			break;
+	}
+	if (new_index == DSA_MAX_SEGMENTS)
+		return NULL;
+
+	/*
+	 * If the total size limit is already exceeded, then we exit early and
+	 * avoid arithmetic wraparound in the unsigned expressions below.
+	 */
+	if (area->control->total_segment_size >=
+		area->control->max_total_segment_size)
+		return NULL;
+
+	/*
+	 * The size should be at least as big as requested, and at least big
+	 * enough to follow a geometric series that approximately doubles the
+	 * total storage each time we create a new segment. We use geometric
+	 * growth because the underlying DSM system isn't designed for large
+	 * numbers of segments (otherwise we might even consider just using one
+	 * DSM segment for each large allocation and for each superblock, and then
+	 * we wouldn't need to use FreePageManager).
+	 *
+	 * We decide on a total segment size first, so that we produce tidy
+	 * power-of-two sized segments. This is a good property to have if we
+	 * move to huge pages in the future. Then we work back to the number of
+	 * pages we can fit.
+	 */
+	total_size = DSA_INITIAL_SEGMENT_SIZE *
+		((Size) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
+	total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
+	total_size = Min(total_size,
+					 area->control->max_total_segment_size -
+					 area->control->total_segment_size);
+
+	total_pages = total_size / FPM_PAGE_SIZE;
+	metadata_bytes =
+		MAXALIGN(sizeof(dsa_segment_header)) +
+		MAXALIGN(sizeof(FreePageManager)) +
+		sizeof(dsa_pointer) * total_pages;	/* one pagemap entry per page */
+
+	/* Add padding up to next page boundary. */
+	if (metadata_bytes % FPM_PAGE_SIZE != 0)
+		metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+	if (total_size <= metadata_bytes)
+		return NULL;			/* nothing would be left for data pages */
+	usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
+	Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
+
+	/* See if that is enough... */
+	if (requested_pages > usable_pages)
+	{
+		/*
+		 * We'll make an odd-sized segment, working forward from the requested
+		 * number of pages.
+		 */
+		usable_pages = requested_pages;
+		metadata_bytes =
+			MAXALIGN(sizeof(dsa_segment_header)) +
+			MAXALIGN(sizeof(FreePageManager)) +
+			usable_pages * sizeof(dsa_pointer);
+
+		/* Add padding up to next page boundary. */
+		if (metadata_bytes % FPM_PAGE_SIZE != 0)
+			metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+		total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
+
+		/* Is that too large for dsa_pointer's addressing scheme? */
+		if (total_size > DSA_MAX_SEGMENT_SIZE)
+			return NULL;
+
+		/* Would that exceed the limit? */
+		if (total_size > area->control->max_total_segment_size -
+			area->control->total_segment_size)
+			return NULL;
+	}
+
+	/* Create the segment under the area's resource owner, and pin it. */
+	oldowner = CurrentResourceOwner;
+	CurrentResourceOwner = area->resowner;
+	segment = dsm_create(total_size);
+	CurrentResourceOwner = oldowner;
+	if (segment == NULL)
+		return NULL;
+	dsm_pin_segment(segment);	/* destroy_superblock unpins it again */
+
+	/* Store the handle in shared memory to be found by index. */
+	area->control->segment_handles[new_index] =
+		dsm_segment_handle(segment);
+	/* Track the highest segment index in the history of the area. */
+	if (area->control->high_segment_index < new_index)
+		area->control->high_segment_index = new_index;
+	/* Track the highest segment index this backend has ever mapped. */
+	if (area->high_segment_index < new_index)
+		area->high_segment_index = new_index;
+	/* Track total size of all segments. */
+	area->control->total_segment_size += total_size;
+	Assert(area->control->total_segment_size <=
+		   area->control->max_total_segment_size);
+
+	/* Build a segment map for this segment in this backend. */
+	segment_map = &area->segment_maps[new_index];
+	segment_map->segment = segment;
+	segment_map->mapped_address = dsm_segment_address(segment);
+	segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
+	segment_map->fpm = (FreePageManager *)
+		(segment_map->mapped_address +
+		 MAXALIGN(sizeof(dsa_segment_header)));
+	segment_map->pagemap = (dsa_pointer *)
+		(segment_map->mapped_address +
+		 MAXALIGN(sizeof(dsa_segment_header)) +
+		 MAXALIGN(sizeof(FreePageManager)));
+
+	/* Set up the free page map: the pages after the metadata start free. */
+	FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+	FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+					   usable_pages);
+
+	/* Set up the segment header and put it at the head of the right bin. */
+	segment_map->header->magic =
+		DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
+	segment_map->header->usable_pages = usable_pages;
+	segment_map->header->size = total_size;
+	segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+	segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+	segment_map->header->next =
+		area->control->segment_bins[segment_map->header->bin];
+	segment_map->header->freed = false;
+	area->control->segment_bins[segment_map->header->bin] = new_index;
+	if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+	{
+		dsa_segment_map *next =
+		get_segment_by_index(area, segment_map->header->next);
+
+		Assert(next->header->bin == segment_map->header->bin);
+		next->header->prev = new_index;	/* back-link the previous head */
+	}
+
+	return segment_map;
+}
+
+/*
+ * Check if any segments have been freed by destroy_superblock, so we can
+ * detach from them in this backend. This function is called by
+ * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
+ * received can be resolved to the correct segment.
+ *
+ * The danger we want to defend against is that there could be an old segment
+ * mapped into a given slot in this backend, and the dsa_pointer they have
+ * might refer to some new segment in the same slot. So those functions must
+ * be sure to process all instructions to detach from a freed segment that had
+ * been generated by the time this process received the dsa_pointer, before
+ * they call get_segment_by_index.
+ */
+static void
+check_for_freed_segments(dsa_area *area)
+{
+	Size		freed_segment_counter;
+
+	/*
+	 * Any other process that has freed a segment has incremented
+	 * freed_segment_counter while holding an LWLock, and that must precede any
+	 * backend creating a new segment in the same slot while holding an
+	 * LWLock, and that must precede the creation of any dsa_pointer pointing
+	 * into the new segment which might reach us here, and the caller must
+	 * have sent the dsa_pointer to this process using appropriate memory
+	 * synchronization (some kind of locking or atomic primitive or system
+	 * call). So all we need to do on the reading side is ask for the load of
+	 * freed_segment_counter to follow the caller's load of the dsa_pointer it
+	 * has, and we can be sure to detect any segments that had been freed as
+	 * of the time that the dsa_pointer reached this process.
+	 */
+	pg_read_barrier();
+	freed_segment_counter = area->control->freed_segment_counter;
+	if (unlikely(area->freed_segment_counter != freed_segment_counter))
+	{
+		int			i;
+
+		/* Check all currently mapped segments to find what's been freed. */
+		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+		for (i = 0; i <= area->high_segment_index; ++i)
+		{
+			if (area->segment_maps[i].header != NULL &&
+				area->segment_maps[i].header->freed)
+			{
+				dsm_detach(area->segment_maps[i].segment);
+				area->segment_maps[i].segment = NULL;
+				area->segment_maps[i].header = NULL;
+				area->segment_maps[i].mapped_address = NULL;
+			}
+		}
+		LWLockRelease(DSA_AREA_LOCK(area));
+		area->freed_segment_counter = freed_segment_counter;	/* caught up to the value read above */
+	}
+}
diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c
new file mode 100644
index 000000000000..8c017a425a44
--- /dev/null
+++ b/src/backend/utils/mmgr/freepage.c
@@ -0,0 +1,1886 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.c
+ *	  Management of free memory pages.
+ * + * The intention of this code is to provide infrastructure for memory + * allocators written specifically for PostgreSQL. At least in the case + * of dynamic shared memory, we can't simply use malloc() or even + * relatively thin wrappers like palloc() which sit on top of it, because + * no allocator built into the operating system will deal with relative + * pointers. In the future, we may find other cases in which greater + * control over our own memory management seems desirable. + * + * A FreePageManager keeps track of which 4kB pages of memory are currently + * unused from the point of view of some higher-level memory allocator. + * Unlike a user-facing allocator such as palloc(), a FreePageManager can + * only allocate and free in units of whole pages, and freeing an + * allocation can only be done given knowledge of its length in pages. + * + * Since a free page manager has only a fixed amount of dedicated memory, + * and since there is no underlying allocator, it uses the free pages + * it is given to manage to store its bookkeeping data. It keeps multiple + * freelists of runs of pages, sorted by the size of the run; the head of + * each freelist is stored in the FreePageManager itself, and the first + * page of each run contains a relative pointer to the next run. See + * FreePageManagerGetInternal for more details on how the freelists are + * managed. + * + * To avoid memory fragmentation, it's important to consolidate adjacent + * spans of pages whenever possible; otherwise, large allocation requests + * might not be satisfied even when sufficient contiguous space is + * available. Therefore, in addition to the freelists, we maintain an + * in-memory btree of free page ranges ordered by page number. 
If a + * range being freed precedes or follows a range that is already free, + * the existing range is extended; if it exactly bridges the gap between + * free ranges, then the two existing ranges are consolidated with the + * newly-freed range to form one great big range of free pages. + * + * When there is only one range of free pages, the btree is trivial and + * is stored within the FreePageManager proper; otherwise, pages are + * allocated from the area under management as needed. Even in cases + * where memory fragmentation is very severe, only a tiny fraction of + * the pages under management are consumed by this btree. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/freepage.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" + +#include "utils/freepage.h" +#include "utils/relptr.h" + + +/* Magic numbers to identify various page types */ +#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0 +#define FREE_PAGE_LEAF_MAGIC 0x98eae728 +#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9 + +/* Doubly linked list of spans of free pages; stored in first page of span. */ +struct FreePageSpanLeader +{ + int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */ + Size npages; /* number of pages in span */ + RelptrFreePageSpanLeader prev; + RelptrFreePageSpanLeader next; +}; + +/* Common header for btree leaf and internal pages. */ +typedef struct FreePageBtreeHeader +{ + int magic; /* FREE_PAGE_LEAF_MAGIC or + * FREE_PAGE_INTERNAL_MAGIC */ + Size nused; /* number of items used */ + RelptrFreePageBtree parent; /* uplink */ +} FreePageBtreeHeader; + +/* Internal key; points to next level of btree. 
*/ +typedef struct FreePageBtreeInternalKey +{ + Size first_page; /* low bound for keys on child page */ + RelptrFreePageBtree child; /* downlink */ +} FreePageBtreeInternalKey; + +/* Leaf key; no payload data. */ +typedef struct FreePageBtreeLeafKey +{ + Size first_page; /* first page in span */ + Size npages; /* number of pages in span */ +} FreePageBtreeLeafKey; + +/* Work out how many keys will fit on a page. */ +#define FPM_ITEMS_PER_INTERNAL_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeInternalKey)) +#define FPM_ITEMS_PER_LEAF_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeLeafKey)) + +/* A btree page of either sort */ +struct FreePageBtree +{ + FreePageBtreeHeader hdr; + union + { + FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE]; + FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE]; + } u; +}; + +/* Results of a btree search */ +typedef struct FreePageBtreeSearchResult +{ + FreePageBtree *page; + Size index; + bool found; + unsigned split_pages; +} FreePageBtreeSearchResult; + +/* Helper functions */ +static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, + FreePageBtree *btp); +static Size FreePageBtreeCleanup(FreePageManager *fpm); +static FreePageBtree *FreePageBtreeFindLeftSibling(char *base, + FreePageBtree *btp); +static FreePageBtree *FreePageBtreeFindRightSibling(char *base, + FreePageBtree *btp); +static Size FreePageBtreeFirstKey(FreePageBtree *btp); +static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm); +static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, + Size index, Size first_page, FreePageBtree *child); +static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, + Size first_page, Size npages); +static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno); +static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, + Size index); +static void 
FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp); +static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result); +static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page); +static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page); +static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm, + FreePageBtree *btp); +static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp); +static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf); +static void FreePageManagerDumpSpans(FreePageManager *fpm, + FreePageSpanLeader *span, Size expected_pages, + StringInfo buf); +static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages, + Size *first_page); +static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, + Size npages, bool soft); +static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno); +static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, + Size npages); +static Size FreePageManagerLargestContiguous(FreePageManager *fpm); +static void FreePageManagerUpdateLargest(FreePageManager *fpm); + +#if FPM_EXTRA_ASSERTS +static Size sum_free_pages(FreePageManager *fpm); +#endif + +/* + * Initialize a new, empty free page manager. + * + * 'fpm' should reference caller-provided memory large enough to contain a + * FreePageManager. We'll initialize it here. + * + * 'base' is the address to which all pointers are relative. When managing + * a dynamic shared memory segment, it should normally be the base of the + * segment. When managing backend-private memory, it can be either NULL or, + * if managing a single contiguous extent of memory, the start of that extent. 
+ */ +void +FreePageManagerInitialize(FreePageManager *fpm, char *base) +{ + Size f; + + relptr_store(base, fpm->self, fpm); + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL); + fpm->btree_depth = 0; + fpm->btree_recycle_count = 0; + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->contiguous_pages = 0; + fpm->contiguous_pages_dirty = true; +#ifdef FPM_EXTRA_ASSERTS + fpm->free_pages = 0; +#endif + + for (f = 0; f < FPM_NUM_FREELISTS; f++) + relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL); +} + +/* + * Allocate a run of pages of the given length from the free page manager. + * The return value indicates whether we were able to satisfy the request; + * if true, the first page of the allocation is stored in *first_page. + */ +bool +FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) +{ + bool result; + Size contiguous_pages; + + result = FreePageManagerGetInternal(fpm, npages, first_page); + + /* + * It's a bit counterintuitive, but allocating pages can actually create + * opportunities for cleanup that create larger ranges. We might pull a + * key out of the btree that enables the item at the head of the btree + * recycle list to be inserted; and then if there are more items behind it + * one of those might cause two currently-separated ranges to merge, + * creating a single range of contiguous pages larger than any that + * existed previously. It might be worth trying to improve the cleanup + * algorithm to avoid such corner cases, but for now we just notice the + * condition and do the appropriate reporting. + */ + contiguous_pages = FreePageBtreeCleanup(fpm); + if (fpm->contiguous_pages < contiguous_pages) + fpm->contiguous_pages = contiguous_pages; + + /* + * FreePageManagerGetInternal may have set contiguous_pages_dirty. + * Recompute contiguous_pages if so. 
+ */ + FreePageManagerUpdateLargest(fpm); + +#ifdef FPM_EXTRA_ASSERTS + if (result) + { + Assert(fpm->free_pages >= npages); + fpm->free_pages -= npages; + } + Assert(fpm->free_pages == sum_free_pages(fpm)); + Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); +#endif + return result; +} + +#ifdef FPM_EXTRA_ASSERTS +static void +sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum) +{ + char *base = fpm_segment_base(fpm); + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC || + btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + ++*sum; /* count this btree page itself, leaf or internal */ + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + Size index; + + + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + sum_free_pages_recurse(fpm, child, sum); + } + } +} +static Size +sum_free_pages(FreePageManager *fpm) +{ + FreePageSpanLeader *recycle; + char *base = fpm_segment_base(fpm); + Size sum = 0; + int list; + + /* Count the spans by scanning the freelists. */ + for (list = 0; list < FPM_NUM_FREELISTS; ++list) + { + + if (!relptr_is_null(fpm->freelist[list])) + { + FreePageSpanLeader *candidate = + relptr_access(base, fpm->freelist[list]); + + do + { + sum += candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + } + + /* Count the pages occupied by the btree itself (internal and leaf). */ + if (fpm->btree_depth > 0) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + sum_free_pages_recurse(fpm, root, &sum); + } + + /* Count the recycle list. */ + for (recycle = relptr_access(base, fpm->btree_recycle); + recycle != NULL; + recycle = relptr_access(base, recycle->next)) + { + Assert(recycle->npages == 1); + ++sum; + } + + return sum; +} +#endif + +/* + * Compute the size of the largest run of pages that the user could + * successfully get. 
+ */ +static Size +FreePageManagerLargestContiguous(FreePageManager *fpm) +{ + char *base; + Size largest; + + base = fpm_segment_base(fpm); + largest = 0; + if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1])) + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]); + do + { + if (candidate->npages > largest) + largest = candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + else + { + Size f = FPM_NUM_FREELISTS - 1; + + do + { + --f; + if (!relptr_is_null(fpm->freelist[f])) + { + largest = f + 1; /* fixed-size list f holds spans of f + 1 pages */ + break; + } + } while (f > 0); + } + + return largest; +} + +/* + * Recompute the size of the largest run of pages that the user could + * successfully get, if it has been marked dirty. + */ +static void +FreePageManagerUpdateLargest(FreePageManager *fpm) +{ + if (!fpm->contiguous_pages_dirty) + return; + + fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm); + fpm->contiguous_pages_dirty = false; +} + +/* + * Transfer a run of pages to the free page manager. + */ +void +FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages) +{ + Size contiguous_pages; + + Assert(npages > 0); + + /* Record the new pages. */ + contiguous_pages = + FreePageManagerPutInternal(fpm, first_page, npages, false); + + /* + * If the new range we inserted into the page manager was contiguous with + * an existing range, it may have opened up cleanup opportunities. + */ + if (contiguous_pages > npages) + { + Size cleanup_contiguous_pages; + + cleanup_contiguous_pages = FreePageBtreeCleanup(fpm); + if (cleanup_contiguous_pages > contiguous_pages) + contiguous_pages = cleanup_contiguous_pages; + } + + /* See if we now have a new largest chunk. 
*/ + if (fpm->contiguous_pages < contiguous_pages) + fpm->contiguous_pages = contiguous_pages; + + /* + * The earlier call to FreePageManagerPutInternal may have set + * contiguous_pages_dirty if it needed to allocate internal pages, so + * recompute contiguous_pages if necessary. + */ + FreePageManagerUpdateLargest(fpm); + +#ifdef FPM_EXTRA_ASSERTS + fpm->free_pages += npages; + Assert(fpm->free_pages == sum_free_pages(fpm)); + Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); +#endif +} + +/* + * Produce a debugging dump of the state of a free page manager. + */ +char * +FreePageManagerDump(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + StringInfoData buf; + FreePageSpanLeader *recycle; + bool dumped_any_freelist = false; + Size f; + + /* Initialize output buffer. */ + initStringInfo(&buf); + + /* Dump general stuff. */ + appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n", + fpm->self.relptr_off, fpm->contiguous_pages); + + /* Dump btree. */ + if (fpm->btree_depth > 0) + { + FreePageBtree *root; + + appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth); + root = relptr_access(base, fpm->btree_root); + FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf); + } + else if (fpm->singleton_npages > 0) + { + appendStringInfo(&buf, "singleton: %zu(%zu)\n", + fpm->singleton_first_page, fpm->singleton_npages); + } + + /* Dump btree recycle list. */ + recycle = relptr_access(base, fpm->btree_recycle); + if (recycle != NULL) + { + appendStringInfo(&buf, "btree recycle:"); + FreePageManagerDumpSpans(fpm, recycle, 1, &buf); + } + + /* Dump free lists. 
*/ + for (f = 0; f < FPM_NUM_FREELISTS; ++f) + { + FreePageSpanLeader *span; + + if (relptr_is_null(fpm->freelist[f])) + continue; + if (!dumped_any_freelist) + { + appendStringInfo(&buf, "freelists:\n"); + dumped_any_freelist = true; + } + appendStringInfo(&buf, " %zu:", f + 1); + span = relptr_access(base, fpm->freelist[f]); + FreePageManagerDumpSpans(fpm, span, f + 1, &buf); + } + + /* And return result to caller. */ + return buf.data; +} + + +/* + * The first_page value stored at index zero in any non-root page must match + * the first_page value stored in its parent at the index which points to that + * page. So when the value stored at index zero in a btree page changes, we've + * got to walk up the tree adjusting ancestor keys until we reach an ancestor + * where that key isn't index zero. This function should be called after + * updating the first key on the target page; it will propagate the change + * upward as far as needed. + * + * We assume here that the first key on the page has not changed enough to + * require changes in the ordering of keys on its ancestor pages. Thus, + * if we search the parent page for the first key greater than or equal to + * the first key on the current page, the downlink to this page will be either + * the exact index returned by the search (if the first key decreased) + * or one less (if the first key increased). + */ +static void +FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + Size first_page; + FreePageBtree *parent; + FreePageBtree *child; + + /* This might be either a leaf or an internal page. 
*/ + Assert(btp->hdr.nused > 0); + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + first_page = btp->u.leaf_key[0].first_page; + } + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + first_page = btp->u.internal_key[0].first_page; + } + child = btp; + + /* Loop until we find an ancestor that does not require adjustment. */ + for (;;) + { + Size s; + + parent = relptr_access(base, child->hdr.parent); + if (parent == NULL) + break; + s = FreePageBtreeSearchInternal(parent, first_page); + + /* Key is either at index s or index s-1; figure out which. */ + if (s >= parent->hdr.nused) + { + Assert(s == parent->hdr.nused); + --s; + } + else + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + if (check != child) + { + Assert(s > 0); + --s; + } + } + +#ifdef USE_ASSERT_CHECKING + /* Debugging double-check. */ + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + Assert(s < parent->hdr.nused); + Assert(child == check); + } +#endif + + /* Update the parent key. */ + parent->u.internal_key[s].first_page = first_page; + + /* + * If this is the first key in the parent, go up another level; else + * done. + */ + if (s > 0) + break; + child = parent; + } +} + +/* + * Attempt to reclaim space from the free-page btree. The return value is + * the largest range of contiguous pages created by the cleanup operation. + */ +static Size +FreePageBtreeCleanup(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + Size max_contiguous_pages = 0; + + /* Attempt to shrink the depth of the btree. */ + while (!relptr_is_null(fpm->btree_root)) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + /* If the root contains only one key, reduce depth by one. */ + if (root->hdr.nused == 1) + { + /* Shrink depth of tree by one. 
*/ + Assert(fpm->btree_depth > 0); + --fpm->btree_depth; + if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + /* If root is a leaf, convert only entry to singleton range. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages; + } + else + { + FreePageBtree *newroot; + + /* If root is an internal page, make only child the root. */ + Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + relptr_copy(fpm->btree_root, root->u.internal_key[0].child); + newroot = relptr_access(base, fpm->btree_root); + relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL); + } + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root)); + } + else if (root->hdr.nused == 2 && + root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Size end_of_first; + Size start_of_second; + + end_of_first = root->u.leaf_key[0].first_page + + root->u.leaf_key[0].npages; + start_of_second = root->u.leaf_key[1].first_page; + + if (end_of_first + 1 == start_of_second) + { + Size root_page = fpm_pointer_to_page(base, root); + + if (end_of_first == root_page) + { + FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page); + FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages + + root->u.leaf_key[1].npages + 1; + fpm->btree_depth = 0; + relptr_store(base, fpm->btree_root, + (FreePageBtree *) NULL); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + Assert(max_contiguous_pages == 0); + max_contiguous_pages = fpm->singleton_npages; + } + } + + /* Whether it worked or not, it's time to stop. */ + break; + } + else + { + /* Nothing more to do. Stop. */ + break; + } + } + + /* + * Attempt to free recycled btree pages. 
We skip this if releasing the + * recycled page would require a btree page split, because the page we're + * trying to recycle would be consumed by the split, which would be + * counterproductive. + * + * We also currently only ever attempt to recycle the first page on the + * list; that could be made more aggressive, but it's not clear that the + * complexity would be worthwhile. + */ + while (fpm->btree_recycle_count > 0) + { + FreePageBtree *btp; + Size first_page; + Size contiguous_pages; + + btp = FreePageBtreeGetRecycled(fpm); + first_page = fpm_pointer_to_page(base, btp); + contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true); + if (contiguous_pages == 0) + { + FreePageBtreeRecycle(fpm, first_page); + break; + } + else + { + if (contiguous_pages > max_contiguous_pages) + max_contiguous_pages = contiguous_pages; + } + } + + return max_contiguous_pages; +} + +/* + * Consider consolidating the given page with its left or right sibling, + * if it's fairly empty. + */ +static void +FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *np; + Size max; + + /* + * We only try to consolidate pages that are less than a third full. We + * could be more aggressive about this, but that might risk performing + * consolidation only to end up splitting again shortly thereafter. Since + * the btree should be very small compared to the space under management, + * our goal isn't so much to ensure that it always occupies the absolutely + * smallest possible number of pages as to reclaim pages before things get + * too egregiously out of hand. + */ + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + max = FPM_ITEMS_PER_LEAF_PAGE; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + max = FPM_ITEMS_PER_INTERNAL_PAGE; + } + if (btp->hdr.nused >= max / 3) + return; + + /* + * If we can fit our right sibling's keys onto this page, consolidate. 
+ */ + np = FreePageBtreeFindRightSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + } + else + { + memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + FreePageBtreeUpdateParentPointers(base, btp); + } + FreePageBtreeRemovePage(fpm, np); + return; + } + + /* + * If we can fit our keys onto our left sibling's page, consolidate. In + * this case, we move our keys onto the other page rather than vice + * versa, to avoid having to adjust ancestor keys. + */ + np = FreePageBtreeFindLeftSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + } + else + { + memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + FreePageBtreeUpdateParentPointers(base, np); + } + FreePageBtreeRemovePage(fpm, btp); + return; + } +} + +/* + * Find the passed page's left sibling; that is, the page at the same level + * of the tree whose keyspace immediately precedes ours. + */ +static FreePageBtree * +FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move left. 
*/ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the leftmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index > 0) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index - 1].child); + break; + } + Assert(index == 0); + ++levels; + } + + /* Descend right, following the last downlink at each level. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Find the passed page's right sibling; that is, the page at the same level + * of the tree whose keyspace immediately follows ours. + */ +static FreePageBtree * +FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move right. */ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index < p->hdr.nused - 1) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index + 1].child); + break; + } + Assert(index == p->hdr.nused - 1); + ++levels; + } + + /* Descend left, following the first downlink at each level. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[0].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Get the first key on a btree page. 
+ */ +static Size +FreePageBtreeFirstKey(FreePageBtree *btp) +{ + Assert(btp->hdr.nused > 0); + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + return btp->u.leaf_key[0].first_page; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + return btp->u.internal_key[0].first_page; + } +} + +/* + * Get a page from the btree recycle list for use as a btree page. + */ +static FreePageBtree * +FreePageBtreeGetRecycled(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *newhead; + + Assert(victim != NULL); + newhead = relptr_access(base, victim->next); + if (newhead != NULL) + relptr_copy(newhead->prev, victim->prev); + relptr_store(base, fpm->btree_recycle, newhead); + Assert(fpm_pointer_is_page_aligned(base, victim)); + fpm->btree_recycle_count--; + return (FreePageBtree *) victim; +} + +/* + * Insert an item into an internal page. + */ +static void +FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, + Size first_page, FreePageBtree *child) +{ + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index], + sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index)); + btp->u.internal_key[index].first_page = first_page; + relptr_store(base, btp->u.internal_key[index].child, child); + ++btp->hdr.nused; +} + +/* + * Insert an item into a leaf page. 
+ */ +static void +FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, + Size npages) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + btp->u.leaf_key[index].first_page = first_page; + btp->u.leaf_key[index].npages = npages; + ++btp->hdr.nused; +} + +/* + * Put a page on the btree recycle list. + */ +static void +FreePageBtreeRecycle(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = 1; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->btree_recycle, span); + fpm->btree_recycle_count++; +} + +/* + * Remove an item from the btree at the given position on the given page. + */ +static void +FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(index < btp->hdr.nused); + + /* When last item is removed, extirpate entire page from btree. */ + if (btp->hdr.nused == 1) + { + FreePageBtreeRemovePage(fpm, btp); + return; + } + + /* Physically remove the key from the page. */ + --btp->hdr.nused; + if (index < btp->hdr.nused) + memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + + /* If we just removed the first key, adjust ancestor keys. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, btp); + + /* Consider whether to consolidate this page with a sibling. 
*/ + FreePageBtreeConsolidate(fpm, btp); +} + +/* + * Remove a page from the btree. Caller is responsible for having relocated + * any keys from this page that are still wanted. The page is placed on the + * recycled list. + */ +static void +FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *parent; + Size index; + Size first_page; + + for (;;) + { + /* Find parent page. */ + parent = relptr_access(base, btp->hdr.parent); + if (parent == NULL) + { + /* We are removing the root page. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->btree_depth = 0; + Assert(fpm->singleton_first_page == 0); + Assert(fpm->singleton_npages == 0); + return; + } + + /* + * If the parent contains only one item, we need to remove it as well. + */ + if (parent->hdr.nused > 1) + break; + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + btp = parent; + } + + /* Find and remove the downlink. */ + first_page = FreePageBtreeFirstKey(btp); + if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + index = FreePageBtreeSearchLeaf(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.leaf_key[index], + &parent->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) + * (parent->hdr.nused - index - 1)); + } + else + { + index = FreePageBtreeSearchInternal(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.internal_key[index], + &parent->u.internal_key[index + 1], + sizeof(FreePageBtreeInternalKey) + * (parent->hdr.nused - index - 1)); + } + parent->hdr.nused--; + Assert(parent->hdr.nused > 0); + + /* Recycle the page. */ + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + + /* Adjust ancestor keys if needed. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + + /* Consider whether to consolidate the parent with a sibling. 
*/ + FreePageBtreeConsolidate(fpm, parent); +} + +/* + * Search the btree for an entry for the given first page and initialize + * *result with the results of the search. result->page and result->index + * indicate either the position of an exact match or the position at which + * the new key should be inserted. result->found is true for an exact match, + * otherwise false. result->split_pages will contain the number of additional + * btree pages that will be needed when performing a split to insert a key. + * Except as described above, the contents of fields in the result object are + * undefined on return. + */ +static void +FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *btp = relptr_access(base, fpm->btree_root); + Size index; + + result->split_pages = 1; + + /* If the btree is empty, there's nothing to find. */ + if (btp == NULL) + { + result->page = NULL; + result->found = false; + return; + } + + /* Descend until we hit a leaf. */ + while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + FreePageBtree *child; + bool found_exact; + + index = FreePageBtreeSearchInternal(btp, first_page); + found_exact = index < btp->hdr.nused && + btp->u.internal_key[index].first_page == first_page; + + /* + * If we found an exact match we descend directly. Otherwise, we + * descend into the child to the left if possible so that we can find + * the insertion point at that child's high end. + */ + if (!found_exact && index > 0) + --index; + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Descend to appropriate child page. 
*/ + Assert(index < btp->hdr.nused); + child = relptr_access(base, btp->u.internal_key[index].child); + Assert(relptr_access(base, child->hdr.parent) == btp); + btp = child; + } + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE); /* btp is a leaf here, so use the leaf capacity */ + result->split_pages++; + } + else + result->split_pages = 0; + + /* Search leaf page. */ + index = FreePageBtreeSearchLeaf(btp, first_page); + + /* Assemble results. */ + result->page = btp; + result->index = index; + result->found = index < btp->hdr.nused && + first_page == btp->u.leaf_key[index].first_page; +} + +/* + * Search an internal page for the first key greater than or equal to a given + * page number. Returns the index of that key, or the number of keys on the + * page (one past the last valid index) if none. + */ +static Size +FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.internal_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Search a leaf page for the first key greater than or equal to a given + * page number. Returns the index of that key, or the number of keys on the + * page (one past the last valid index) if none. 
+ */ +static Size +FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.leaf_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Allocate a new btree page and move half the keys from the provided page + * to the new page. Caller is responsible for making sure that there's a + * page available from fpm->btree_recycle. Returns a pointer to the new page, + * to which caller must add a downlink. + */ +static FreePageBtree * +FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp) +{ + FreePageBtree *newsibling; + + newsibling = FreePageBtreeGetRecycled(fpm); + newsibling->hdr.magic = btp->hdr.magic; + newsibling->hdr.nused = btp->hdr.nused / 2; + relptr_copy(newsibling->hdr.parent, btp->hdr.parent); + btp->hdr.nused -= newsibling->hdr.nused; + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + memcpy(&newsibling->u.leaf_key, + &btp->u.leaf_key[btp->hdr.nused], + sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused); + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + memcpy(&newsibling->u.internal_key, + &btp->u.internal_key[btp->hdr.nused], + sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused); + FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling); + } + + return newsibling; +} + +/* + * When internal pages are split or merged, the parent pointers of their + * children must be updated. 
+ */ +static void +FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp) +{ + Size i; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + for (i = 0; i < btp->hdr.nused; ++i) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[i].child); + relptr_store(base, child->hdr.parent, btp); + } +} + +/* + * Debugging dump of btree data. + */ +static void +FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + Size pageno = fpm_pointer_to_page(base, btp); + Size index; + FreePageBtree *check_parent; + + check_stack_depth(); + check_parent = relptr_access(base, btp->hdr.parent); + appendStringInfo(buf, " %zu@%d %c", pageno, level, + btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l'); + if (parent != check_parent) + appendStringInfo(buf, " [actual parent %zu, expected %zu]", + fpm_pointer_to_page(base, check_parent), + fpm_pointer_to_page(base, parent)); + appendStringInfoChar(buf, ':'); + for (index = 0; index < btp->hdr.nused; ++index) + { + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + appendStringInfo(buf, " %zu->%zu", + btp->u.internal_key[index].first_page, + btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE); + else + appendStringInfo(buf, " %zu(%zu)", + btp->u.leaf_key[index].first_page, + btp->u.leaf_key[index].npages); + } + appendStringInfo(buf, "\n"); + + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf); + } + } +} + +/* + * Debugging dump of free-span data. 
+ */ +static void +FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, + Size expected_pages, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + + while (span != NULL) + { + if (span->npages != expected_pages) + appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span), + span->npages); + else + appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span)); + span = relptr_access(base, span->next); + } + + appendStringInfo(buf, "\n"); +} + +/* + * This function allocates a run of pages of the given length from the free + * page manager. Returns false if no allocatable span is found; otherwise stores the chosen span's first page number in *first_page and returns true. + */ +static bool +FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = NULL; + FreePageSpanLeader *prev; + FreePageSpanLeader *next; + FreePageBtreeSearchResult result; + Size victim_page = 0; /* placate compiler */ + Size f; + + /* + * Search for a free span. + * + * Right now, we use a simple best-fit policy here, but it's possible for + * this to result in memory fragmentation if we're repeatedly asked to + * allocate chunks just a little smaller than what we have available. + * Hopefully, this is unlikely, because we expect most requests to be + * single pages or superblock-sized chunks -- but no policy can be optimal + * under all circumstances unless it has knowledge of future allocation + * patterns. + */ + for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f) + { + /* Skip empty freelists. */ + if (relptr_is_null(fpm->freelist[f])) + continue; + + /* + * All of the freelists except the last one contain only items of a + * single size, so we just take the first one. But the final free + * list contains everything too big for any of the other lists, so we + * need to search the list.
+ */ + if (f < FPM_NUM_FREELISTS - 1) + victim = relptr_access(base, fpm->freelist[f]); + else + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[f]); + do + { + if (candidate->npages >= npages && (victim == NULL || + victim->npages > candidate->npages)) + { + victim = candidate; + if (victim->npages == npages) + break; + } + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + break; + } + + /* If we didn't find an allocatable span, return failure. */ + if (victim == NULL) + return false; + + /* Remove span from free list. */ + Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC); + prev = relptr_access(base, victim->prev); + next = relptr_access(base, victim->next); + if (prev != NULL) + relptr_copy(prev->next, victim->next); + else + relptr_copy(fpm->freelist[f], victim->next); + if (next != NULL) + relptr_copy(next->prev, victim->prev); + victim_page = fpm_pointer_to_page(base, victim); + + /* Decide whether we might be invalidating contiguous_pages. */ + if (f == FPM_NUM_FREELISTS - 1 && + victim->npages == fpm->contiguous_pages) + { + /* + * The victim span came from the oversized freelist, and had the same + * size as the longest span. There may or may not be another one of + * the same size, so contiguous_pages must be recomputed just to be + * safe. + */ + fpm->contiguous_pages_dirty = true; + } + else if (f + 1 == fpm->contiguous_pages && + relptr_is_null(fpm->freelist[f])) + { + /* + * The victim span came from a fixed sized freelist, and it was the + * list for spans of the same size as the current longest span, and + * the list is now empty after removing the victim. So + * contiguous_pages must be recomputed without a doubt. + */ + fpm->contiguous_pages_dirty = true; + } + + /* + * If we haven't initialized the btree yet, the victim must be the single + * span stored within the FreePageManager itself. Otherwise, we need to + * update the btree.
+ */ + if (relptr_is_null(fpm->btree_root)) + { + Assert(victim_page == fpm->singleton_first_page); + Assert(victim->npages == fpm->singleton_npages); + Assert(victim->npages >= npages); + fpm->singleton_first_page += npages; + fpm->singleton_npages -= npages; + if (fpm->singleton_npages > 0) + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + } + else + { + /* + * If the span we found is exactly the right size, remove it from the + * btree completely. Otherwise, adjust the btree entry to reflect the + * still-unallocated portion of the span, and put that portion on the + * appropriate free list. + */ + FreePageBtreeSearch(fpm, victim_page, &result); + Assert(result.found); + if (victim->npages == npages) + FreePageBtreeRemove(fpm, result.page, result.index); + else + { + FreePageBtreeLeafKey *key; + + /* Adjust btree to reflect remaining pages. */ + Assert(victim->npages > npages); + key = &result.page->u.leaf_key[result.index]; + Assert(key->npages == victim->npages); + key->first_page += npages; + key->npages -= npages; + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put the unallocated pages back on the appropriate free list. */ + FreePagePushSpanLeader(fpm, victim_page + npages, + victim->npages - npages); + } + } + + /* Return results to caller. */ + *first_page = fpm_pointer_to_page(base, victim); + return true; +} + +/* + * Put a range of pages into the btree and freelists, consolidating it with + * existing free spans just before and/or after it. If 'soft' is true, + * only perform the insertion if it can be done without allocating new btree + * pages; if false, do it always. Returns 0 if the soft flag caused the + * insertion to be skipped, or otherwise the size of the contiguous span + * created by the insertion. This may be larger than npages if we're able + * to consolidate with an adjacent range.
(No flag is returned for pages + * the btree takes for its own use: those come from FreePageManagerGetInternal, + * which sets contiguous_pages_dirty itself when the largest run may change.) + */ +static Size +FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, + bool soft) +{ + char *base = fpm_segment_base(fpm); + FreePageBtreeSearchResult result; + FreePageBtreeLeafKey *prevkey = NULL; + FreePageBtreeLeafKey *nextkey = NULL; + FreePageBtree *np; + Size nindex; + + Assert(npages > 0); + + /* We can store a single free span without initializing the btree. */ + if (fpm->btree_depth == 0) + { + if (fpm->singleton_npages == 0) + { + /* Don't have a span yet; store this one. */ + fpm->singleton_first_page = first_page; + fpm->singleton_npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return fpm->singleton_npages; + } + else if (fpm->singleton_first_page + fpm->singleton_npages == + first_page) + { + /* New span immediately follows sole existing span. */ + fpm->singleton_npages += npages; + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else if (first_page + npages == fpm->singleton_first_page) + { + /* New span immediately precedes sole existing span. */ + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + fpm->singleton_first_page = first_page; + fpm->singleton_npages += npages; + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else + { + /* Not contiguous; we need to initialize the btree. */ + Size root_page; + FreePageBtree *root; + + if (!relptr_is_null(fpm->btree_recycle)) + root = FreePageBtreeGetRecycled(fpm); + else if (FreePageManagerGetInternal(fpm, 1, &root_page)) + root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); + else + { + /* We'd better be able to get a page from the existing range.
*/ + elog(FATAL, "free page manager btree is corrupt"); + } + + /* Create the btree and move the preexisting range into it. */ + root->hdr.magic = FREE_PAGE_LEAF_MAGIC; + root->hdr.nused = 1; + relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL); + root->u.leaf_key[0].first_page = fpm->singleton_first_page; + root->u.leaf_key[0].npages = fpm->singleton_npages; + relptr_store(base, fpm->btree_root, root); + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->btree_depth = 1; + + /* + * Corner case: it may be that the btree root took the very last + * free page. In that case, the sole btree entry covers a zero + * page run, which is invalid. Overwrite it with the entry we're + * trying to insert and get out. + */ + if (root->u.leaf_key[0].npages == 0) + { + root->u.leaf_key[0].first_page = first_page; + root->u.leaf_key[0].npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return npages; + } + + /* Fall through to insert the new key. */ + } + } + + /* Search the btree. */ + FreePageBtreeSearch(fpm, first_page, &result); + Assert(!result.found); + if (result.index > 0) + prevkey = &result.page->u.leaf_key[result.index - 1]; + if (result.index < result.page->hdr.nused) + { + np = result.page; + nindex = result.index; + nextkey = &result.page->u.leaf_key[result.index]; + } + else + { + np = FreePageBtreeFindRightSibling(base, result.page); + nindex = 0; + if (np != NULL) + nextkey = &np->u.leaf_key[0]; + } + + /* Consolidate with the previous entry if possible. */ + if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page) + { + bool remove_next = false; + Size result; + + Assert(prevkey->first_page + prevkey->npages == first_page); + prevkey->npages = (first_page - prevkey->first_page) + npages; + + /* Check whether we can *also* consolidate with the following entry.
*/ + if (nextkey != NULL && + prevkey->first_page + prevkey->npages >= nextkey->first_page) + { + Assert(prevkey->first_page + prevkey->npages == + nextkey->first_page); + prevkey->npages = (nextkey->first_page - prevkey->first_page) + + nextkey->npages; + FreePagePopSpanLeader(fpm, nextkey->first_page); + remove_next = true; + } + + /* Put the span on the correct freelist and save size. */ + FreePagePopSpanLeader(fpm, prevkey->first_page); + FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages); + result = prevkey->npages; + + /* + * If we consolidated with both the preceding and following entries, + * we must remove the following entry. We do this last, because + * removing an element from the btree may invalidate pointers we hold + * into the current data structure. + * + * NB: The btree is technically in an invalid state at this point + * because we've already updated prevkey to cover the same key space + * as nextkey. FreePageBtreeRemove() shouldn't notice that, though. + */ + if (remove_next) + FreePageBtreeRemove(fpm, np, nindex); + + return result; + } + + /* Consolidate with the next entry if possible. */ + if (nextkey != NULL && first_page + npages >= nextkey->first_page) + { + Size newpages; + + /* Compute new size for span. */ + Assert(first_page + npages == nextkey->first_page); + newpages = (nextkey->first_page - first_page) + nextkey->npages; + + /* Put span on correct free list. */ + FreePagePopSpanLeader(fpm, nextkey->first_page); + FreePagePushSpanLeader(fpm, first_page, newpages); + + /* Update key in place. */ + nextkey->first_page = first_page; + nextkey->npages = newpages; + + /* If reducing first key on page, ancestors might need adjustment. */ + if (nindex == 0) + FreePageBtreeAdjustAncestorKeys(fpm, np); + + return nextkey->npages; + } + + /* Split leaf page and as many of its ancestors as necessary.
*/ + if (result.split_pages > 0) + { + /* + * NB: We could consider various coping strategies here to avoid a + * split; most obviously, if np != result.page, we could target that + * page instead. More complicated shuffling strategies could be + * possible as well; basically, unless every single leaf page is 100% + * full, we can jam this key in there if we try hard enough. It's + * unlikely that trying that hard is worthwhile, but it's possible we + * might need to make more than no effort. For now, we just do the + * easy thing, which is nothing. + */ + + /* If this is a soft insert, it's time to give up. */ + if (soft) + return 0; + + /* Check whether we need to allocate more btree pages to split. */ + if (result.split_pages > fpm->btree_recycle_count) + { + Size pages_needed; + Size recycle_page; + Size i; + + /* + * Allocate the required number of pages and split each one in + * turn. This should never fail, because if we've got enough + * spans of free pages kicking around that we need additional + * storage space just to remember them all, then we should + * certainly have enough to expand the btree, which should only + * ever use a tiny number of pages compared to the number under + * management. If it does, something's badly screwed up. + */ + pages_needed = result.split_pages - fpm->btree_recycle_count; + for (i = 0; i < pages_needed; ++i) + { + if (!FreePageManagerGetInternal(fpm, 1, &recycle_page)) + elog(FATAL, "free page manager btree is corrupt"); + FreePageBtreeRecycle(fpm, recycle_page); + } + + /* + * The act of allocating pages to recycle may have invalidated the + * results of our previous btree research, so repeat it. (We could + * recheck whether any of our split-avoidance strategies that were + * not viable before now are, but it hardly seems worthwhile, so + * we don't bother. Consolidation can't be possible now if it + * wasn't previously.)
+ */ + FreePageBtreeSearch(fpm, first_page, &result); + + /* + * The act of allocating pages for use in constructing our btree + * should never cause any page to become more full, so the new + * split depth should be no greater than the old one, and perhaps + * less if we fortuitously allocated a chunk that freed up a slot + * on the page we need to update. + */ + Assert(result.split_pages <= fpm->btree_recycle_count); + } + + /* If we still need to perform a split, do it. */ + if (result.split_pages > 0) + { + FreePageBtree *split_target = result.page; + FreePageBtree *child = NULL; + Size key = first_page; + + for (;;) + { + FreePageBtree *newsibling; + FreePageBtree *parent; + + /* Identify parent page, which must receive downlink. */ + parent = relptr_access(base, split_target->hdr.parent); + + /* Split the page - downlink not added yet. */ + newsibling = FreePageBtreeSplitPage(fpm, split_target); + + /* + * At this point in the loop, we're always carrying a pending + * insertion. On the first pass, it's the actual key we're + * trying to insert; on subsequent passes, it's the downlink + * that needs to be added as a result of the split performed + * during the previous loop iteration. Since we've just split + * the page, there's definitely room on one of the two + * resulting pages. + */ + if (child == NULL) + { + Size index; + FreePageBtree *insert_into; + + insert_into = key < newsibling->u.leaf_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchLeaf(insert_into, key); + FreePageBtreeInsertLeaf(insert_into, index, key, npages); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + else + { + Size index; + FreePageBtree *insert_into; + + insert_into = + key < newsibling->u.internal_key[0].first_page ?
+ split_target : newsibling; + index = FreePageBtreeSearchInternal(insert_into, key); + FreePageBtreeInsertInternal(base, insert_into, index, + key, child); + relptr_store(base, child->hdr.parent, insert_into); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + + /* If the page we just split has no parent, split the root. */ + if (parent == NULL) + { + FreePageBtree *newroot; + + newroot = FreePageBtreeGetRecycled(fpm); + newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC; + newroot->hdr.nused = 2; + relptr_store(base, newroot->hdr.parent, + (FreePageBtree *) NULL); + newroot->u.internal_key[0].first_page = + FreePageBtreeFirstKey(split_target); + relptr_store(base, newroot->u.internal_key[0].child, + split_target); + relptr_store(base, split_target->hdr.parent, newroot); + newroot->u.internal_key[1].first_page = + FreePageBtreeFirstKey(newsibling); + relptr_store(base, newroot->u.internal_key[1].child, + newsibling); + relptr_store(base, newsibling->hdr.parent, newroot); + relptr_store(base, fpm->btree_root, newroot); + fpm->btree_depth++; + + break; + } + + /* If the parent page isn't full, insert the downlink. */ + key = newsibling->u.internal_key[0].first_page; + if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE) + { + Size index; + + index = FreePageBtreeSearchInternal(parent, key); + FreePageBtreeInsertInternal(base, parent, index, + key, newsibling); + relptr_store(base, newsibling->hdr.parent, parent); + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + break; + } + + /* The parent also needs to be split, so loop around. */ + child = newsibling; + split_target = parent; + } + + /* + * The loop above did the insert, so just need to update the free + * list, and we're done. + */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; + } + } + + /* Physically add the key to the page.
*/ + Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE); + FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages); + + /* If new first key on page, ancestors might need adjustment. */ + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put it on the free list. */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; +} + +/* + * Remove a FreePageSpanLeader from the linked-list that contains it, either + * because we're changing the size of the span, or because we're allocating it. + */ +static void +FreePagePopSpanLeader(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *span; + FreePageSpanLeader *next; + FreePageSpanLeader *prev; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + + next = relptr_access(base, span->next); + prev = relptr_access(base, span->prev); + if (next != NULL) + relptr_copy(next->prev, span->prev); + if (prev != NULL) + relptr_copy(prev->next, span->next); + else + { + Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1; + + Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE); + relptr_copy(fpm->freelist[f], span->next); + } +} + +/* + * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
+ */ +static void +FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages) +{ + char *base = fpm_segment_base(fpm); + Size f = Min(npages, FPM_NUM_FREELISTS) - 1; /* spans of FPM_NUM_FREELISTS or more pages all share the last list */ + FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = npages; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); /* no predecessor: span becomes the new list head */ + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->freelist[f], span); +} diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 13955b8a94cd..673f6d3bcea1 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -889,6 +889,7 @@ extractPageInfo(XLogRecord *record) switch (info) { case XLOG_SMGR_CREATE: + case XLOG_SMGR_CREATE_PDL: /* * We can safely ignore these. The local file will be * removed, if it doesn't exist in remote system.
If a diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index e9d5e866cc6c..d96d981030c6 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -108,4 +108,6 @@ extern void getTwoPhasePreparedTransactionData(prepared_transaction_agg_state ** extern void SetupCheckpointPreparedTransactionList(prepared_transaction_agg_state *ptas); +extern bool RemovePendingDeletesForPreparedTransactions(void); + #endif /* TWOPHASE_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 8246686e7563..732f20739ba8 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -78,6 +78,7 @@ typedef struct CheckPoint #define XLOG_FPI 0xA0 #define XLOG_NEXTRELFILENODE 0xB0 #define XLOG_OVERWRITE_CONTRECORD 0xC0 +#define XLOG_PENDING_DELETE 0xD0 /* diff --git a/src/include/catalog/storage_pending_deletes.h b/src/include/catalog/storage_pending_deletes.h new file mode 100644 index 000000000000..587b33125b1d --- /dev/null +++ b/src/include/catalog/storage_pending_deletes.h @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * storage_pending_deletes.h + * prototypes for functions in backend/catalog/storage_pending_deletes.c + * + * Copyright (c) 2025 Greengage Community + * + * src/include/catalog/storage_pending_deletes.h + * + *------------------------------------------------------------------------- + */ +#ifndef STORAGE_PENDING_DELETES_H +#define STORAGE_PENDING_DELETES_H + +#include "postgres.h" + +#include "storage/relfilenode.h" +#include "utils/dsa.h" + +/* Pending delete node linked to xact which created it */ +typedef struct PendingRelXactDelete +{ + RelFileNodePendingDelete relnode; + TransactionId xid; +} PendingRelXactDelete; + +typedef struct PendingRelXactDeleteArray +{ + Size count; /* number of entries in array[] */ + PendingRelXactDelete array[FLEXIBLE_ARRAY_MEMBER]; +} PendingRelXactDeleteArray; + +static inline Size +PdlDumpSize(Size count) +{ +
Size array_size = sizeof(PendingRelXactDelete) * count; + + return offsetof(PendingRelXactDeleteArray, array) + array_size; /* bytes for the header plus count entries */ +} + +extern Size PdlShmemSize(void); +extern void PdlShmemInit(void); +extern dsa_pointer PdlShmemAdd(const RelFileNodePendingDelete * relnode, + TransactionId xid); +extern void PdlShmemRemove(dsa_pointer node_ptr); +extern PendingRelXactDeleteArray *PdlXLogShmemDump(void); + +#endif /* STORAGE_PENDING_DELETES_H */ diff --git a/src/include/catalog/storage_pending_deletes_redo.h b/src/include/catalog/storage_pending_deletes_redo.h new file mode 100644 index 000000000000..aa3f7b7a4edf --- /dev/null +++ b/src/include/catalog/storage_pending_deletes_redo.h @@ -0,0 +1,31 @@ +/*------------------------------------------------------------------------- + * + * storage_pending_deletes_redo.h + * prototypes for functions in backend/catalog/storage_pending_deletes_redo.c + * + * Copyright (c) 2025 Greengage Community + * + * src/include/catalog/storage_pending_deletes_redo.h + * + *------------------------------------------------------------------------- + */ +#ifndef STORAGE_PENDING_DELETES_REDO_H +#define STORAGE_PENDING_DELETES_REDO_H + +#include "postgres.h" + +#include "access/xlog.h" +#include "catalog/storage_pending_deletes.h" + +extern void PdlXLogInsert(void); + +extern void PdlRedoAdd(PendingRelXactDelete * pd); + +extern void PdlRedoXLogRecord(XLogRecord *record); + +extern void PdlRedoRemoveTree(TransactionId xid, + TransactionId *sub_xids, int nsubxacts); + +extern void PdlRedoDropFiles(void); + +#endif /* STORAGE_PENDING_DELETES_REDO_H */ diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index f3916190ce29..2cf8a3ced7ca 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -26,22 +26,34 @@ */ /* XLOG gives us high 4 bits */ -#define XLOG_SMGR_CREATE 0x10 -#define XLOG_SMGR_TRUNCATE 0x20 +#define XLOG_SMGR_CREATE 0x10 +#define XLOG_SMGR_TRUNCATE 0x20 +#define
XLOG_SMGR_CREATE_PDL 0x30 +/* + * We do not create `xl_smgr_create` records anymore. We use + * `xl_smgr_create_pdl` instead. But we still process `xl_smgr_create` records + * for backward compatibility. + */ typedef struct xl_smgr_create { RelFileNode rnode; ForkNumber forkNum; } xl_smgr_create; +typedef struct xl_smgr_create_pdl +{ + xl_smgr_create createrec; /* keep first: smgr_desc() reads PDL records through an xl_smgr_create pointer */ + char relstorage; +} xl_smgr_create_pdl; + typedef struct xl_smgr_truncate { BlockNumber blkno; RelFileNode rnode; } xl_smgr_truncate; -extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum); +extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum, char relstorage); extern void smgr_redo(XLogRecPtr beginLoc, XLogRecPtr lsn, XLogRecord *record); extern void smgr_desc(StringInfo buf, XLogRecord *record); diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h index 9ff49162c18d..766f94e3d64a 100644 --- a/src/include/storage/dsm.h +++ b/src/include/storage/dsm.h @@ -41,6 +41,7 @@ extern void dsm_detach(dsm_segment *seg); /* Resource management functions. */ extern void dsm_pin_mapping(dsm_segment *seg); extern void dsm_pin_segment(dsm_segment *seg); +extern void dsm_unpin_segment(dsm_handle h); extern dsm_segment *dsm_find_mapping(dsm_handle h); /* Informational functions. */ diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h index 32cfed2ee9c4..37155354a6e9 100644 --- a/src/include/storage/dsm_impl.h +++ b/src/include/storage/dsm_impl.h @@ -72,7 +72,7 @@ extern bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, /* Some implementations cannot resize segments. Can this one? */ extern bool dsm_impl_can_resize(void); -/* Implementation-dependent actions required to keep segment until shudown. */ +/* Implementation-dependent actions required to keep segment until shutdown.
*/ extern void dsm_impl_pin_segment(dsm_handle handle, void *impl_private); #endif /* DSM_IMPL_H */ diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h new file mode 100644 index 000000000000..4ef5c241c912 --- /dev/null +++ b/src/include/utils/dsa.h @@ -0,0 +1,108 @@ +/*------------------------------------------------------------------------- + * + * dsa.h + * Dynamic shared memory areas. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/utils/dsa.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSA_H +#define DSA_H + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/dsm.h" + +/* The opaque type used for an area. */ +struct dsa_area; +typedef struct dsa_area dsa_area; + +/* + * If this system doesn't support atomic operations on 64 bit values then + * we fall back to 32 bit dsa_pointer. For testing purposes, + * USE_SMALL_DSA_POINTER can be defined to force the use of 32 bit + * dsa_pointer even on systems that support 64 bit atomics. + */ +#ifndef PG_HAVE_ATOMIC_U64_SUPPORT +#define SIZEOF_DSA_POINTER 4 +#else +#ifdef USE_SMALL_DSA_POINTER +#define SIZEOF_DSA_POINTER 4 +#else +#define SIZEOF_DSA_POINTER 8 +#endif +#endif + +/* + * The type of 'relative pointers' to memory allocated by a dynamic shared + * area. dsa_pointer values can be shared with other processes, but must be + * converted to backend-local pointers before they can be dereferenced. See + * dsa_get_address. Also, an atomic version and appropriately sized atomic + * operations. 
+ */ +#if SIZEOF_DSA_POINTER == 4 /* must test the macro defined by the preceding #ifndef block; an undefined name would silently evaluate to 0 */ +typedef uint32 dsa_pointer; +typedef pg_atomic_uint32 dsa_pointer_atomic; +#define dsa_pointer_atomic_init pg_atomic_init_u32 +#define dsa_pointer_atomic_read pg_atomic_read_u32 +#define dsa_pointer_atomic_write pg_atomic_write_u32 +#define dsa_pointer_atomic_fetch_add pg_atomic_fetch_add_u32 +#define dsa_pointer_atomic_compare_exchange pg_atomic_compare_exchange_u32 +#else +typedef uint64 dsa_pointer; +typedef pg_atomic_uint64 dsa_pointer_atomic; +#define dsa_pointer_atomic_init pg_atomic_init_u64 +#define dsa_pointer_atomic_read pg_atomic_read_u64 +#define dsa_pointer_atomic_write pg_atomic_write_u64 +#define dsa_pointer_atomic_fetch_add pg_atomic_fetch_add_u64 +#define dsa_pointer_atomic_compare_exchange pg_atomic_compare_exchange_u64 +#endif + +/* A sentinel value for dsa_pointer used to indicate failure to allocate. */ +#define InvalidDsaPointer ((dsa_pointer) 0) + +/* Check if a dsa_pointer value is valid. */ +#define DsaPointerIsValid(x) ((x) != InvalidDsaPointer) + +/* + * The type used for dsa_area handles. dsa_handle values can be shared with + * other processes, so that they can attach to them. This provides a way to + * share allocated storage with other processes. + * + * The handle for a dsa_area is currently implemented as the dsm_handle + * for the first DSM segment backing this dynamic storage area, but client + * code shouldn't assume that is true.
+ */ +typedef dsm_handle dsa_handle; + +extern void dsa_startup(void); + +extern dsa_area *dsa_create(int tranche_id, const char *tranche_name); +extern dsa_area *dsa_create_in_place(void *place, Size size, + int tranche_id, const char *tranche_name, + dsm_segment *segment); +extern dsa_area *dsa_attach(dsa_handle handle); +extern dsa_area *dsa_attach_in_place(void *place, dsm_segment *segment); +extern void dsa_release_in_place(void *place); +extern void dsa_on_dsm_detach_release_in_place(dsm_segment *, Datum); +extern void dsa_on_shmem_exit_release_in_place(int, Datum); +extern void dsa_pin_mapping(dsa_area *area); +extern void dsa_detach(dsa_area *area); +extern void dsa_pin(dsa_area *area); +extern void dsa_unpin(dsa_area *area); +extern void dsa_set_size_limit(dsa_area *area, Size limit); +extern Size dsa_minimum_size(void); +extern dsa_handle dsa_get_handle(dsa_area *area); +extern dsa_pointer dsa_allocate(dsa_area *area, Size size); +extern void dsa_free(dsa_area *area, dsa_pointer dp); +extern void *dsa_get_address(dsa_area *area, dsa_pointer dp); +extern void dsa_trim(dsa_area *area); +extern void dsa_dump(dsa_area *area); + +#endif /* DSA_H */ diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h new file mode 100644 index 000000000000..5e1305bc25cd --- /dev/null +++ b/src/include/utils/freepage.h @@ -0,0 +1,99 @@ +/*------------------------------------------------------------------------- + * + * freepage.h + * Management of page-organized free memory. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/freepage.h + * + *------------------------------------------------------------------------- + */ + +#ifndef FREEPAGE_H +#define FREEPAGE_H + +#include "storage/lwlock.h" +#include "utils/relptr.h" + +/* Forward declarations. 
*/ +typedef struct FreePageSpanLeader FreePageSpanLeader; +typedef struct FreePageBtree FreePageBtree; +typedef struct FreePageManager FreePageManager; + +/* + * PostgreSQL normally uses 8kB pages for most things, but many common + * architecture/operating system pairings use a 4kB page size for memory + * allocation, so we do that here also. + */ +#define FPM_PAGE_SIZE 4096 + +/* + * Each freelist except for the last contains only spans of one particular + * size. Everything larger goes on the last one. In some sense this seems + * like a waste since most allocations are in a few common sizes, but it + * means that small allocations can simply pop the head of the relevant list + * without needing to worry about whether the object we find there is of + * precisely the correct size (because we know it must be). + */ +#define FPM_NUM_FREELISTS 129 + +/* Define relative pointer types. */ +relptr_declare(FreePageBtree, RelptrFreePageBtree); +relptr_declare(FreePageManager, RelptrFreePageManager); +relptr_declare(FreePageSpanLeader, RelptrFreePageSpanLeader); + +/* Everything we need in order to manage free pages (see freepage.c) */ +struct FreePageManager +{ + RelptrFreePageManager self; /* own offset from the segment base; see fpm_segment_base() */ + RelptrFreePageBtree btree_root; + RelptrFreePageSpanLeader btree_recycle; + unsigned btree_depth; + unsigned btree_recycle_count; + Size singleton_first_page; /* sole free span, tracked here while btree_depth == 0 */ + Size singleton_npages; + Size contiguous_pages; + bool contiguous_pages_dirty; /* set when contiguous_pages must be recomputed */ + RelptrFreePageSpanLeader freelist[FPM_NUM_FREELISTS]; +#ifdef FPM_EXTRA_ASSERTS + /* For debugging only, pages put minus pages gotten. */ + Size free_pages; +#endif +}; + +/* Macros to convert between page numbers (expressed as Size) and pointers. */ +#define fpm_page_to_pointer(base, page) \ + (AssertVariableIsOfTypeMacro(page, Size), \ + (base) + FPM_PAGE_SIZE * (page)) +#define fpm_pointer_to_page(base, ptr) \ + (((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE) + +/* Macro to convert an allocation size to a number of pages.
+ */ +#define fpm_size_to_pages(sz) \ + (((sz) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) + +/* Macros to check alignment of absolute and relative pointers. */ +#define fpm_pointer_is_page_aligned(base, ptr) \ + (((Size) (((char *) (ptr)) - (base))) % FPM_PAGE_SIZE == 0) +#define fpm_relptr_is_page_aligned(base, relptr) \ + ((relptr).relptr_off % FPM_PAGE_SIZE == 0) + +/* Macro to find base address of the segment containing a FreePageManager; the argument is parenthesized so any pointer expression may be passed. */ +#define fpm_segment_base(fpm) \ + (((char *) (fpm)) - (fpm)->self.relptr_off) + +/* Macro to access a FreePageManager's largest consecutive run of pages; the argument is parenthesized so any pointer expression may be passed. */ +#define fpm_largest(fpm) \ + ((fpm)->contiguous_pages) + +/* Functions to manipulate the free page map. */ +extern void FreePageManagerInitialize(FreePageManager *fpm, char *base); +extern bool FreePageManagerGet(FreePageManager *fpm, Size npages, + Size *first_page); +extern void FreePageManagerPut(FreePageManager *fpm, Size first_page, + Size npages); +extern char *FreePageManagerDump(FreePageManager *fpm); + +#endif /* FREEPAGE_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 6c8995f8004e..712fd7bc6557 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -602,6 +602,8 @@ extern bool gp_log_endpoints; extern bool gp_allow_date_field_width_5digits; +extern bool gp_track_pending_delete; + typedef enum { INDEX_CHECK_NONE, diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h new file mode 100644 index 000000000000..f01924a1edf5 --- /dev/null +++ b/src/include/utils/relptr.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * relptr.h + * This file contains basic declarations for relative pointers.
+ * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/relptr.h + * + *------------------------------------------------------------------------- + */ + +#ifndef RELPTR_H +#define RELPTR_H + +/* + * Relative pointers are intended to be used when storing an address that may + * be relative either to the base of the process's address space or some + * dynamic shared memory segment mapped therein. + * + * The idea here is that you declare a relative pointer as relptr(type) + * and then use relptr_access to dereference it and relptr_store to change + * it. The use of a union here is a hack, because what's stored in the + * relptr is always a Size, never an actual pointer. But including a pointer + * in the union allows us to use stupid macro tricks to provide some measure + * of type-safety. + */ +#define relptr(type) union { type *relptr_type; Size relptr_off; } + +/* + * pgindent gets confused by declarations of the type relptr(type), so it's + * useful to give them a name that doesn't include parentheses. + */ +#define relptr_declare(type, name) \ + typedef union { type *relptr_type; Size relptr_off; } name; + +#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P +#define relptr_access(base, rp) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \ + (base + (rp).relptr_off))) +#else +/* + * If we don't have __builtin_types_compatible_p, assume we might not have + * __typeof__ either. + */ +#define relptr_access(base, rp) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + (void *) ((rp).relptr_off == 0 ?
NULL : (base + (rp).relptr_off))) +#endif + +#define relptr_is_null(rp) \ + ((rp).relptr_off == 0) + +#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P +#define relptr_store(base, rp, val) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \ + (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base))) +#else +/* + * If we don't have __builtin_types_compatible_p, assume we might not have + * __typeof__ either. + */ +#define relptr_store(base, rp, val) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base))) +#endif + +#define relptr_copy(rp1, rp2) \ + ((rp1).relptr_off = (rp2).relptr_off) + +#endif /* RELPTR_H */ diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h index f41f3320cef9..1dcad86fcc01 100644 --- a/src/include/utils/unsync_guc_name.h +++ b/src/include/utils/unsync_guc_name.h @@ -565,3 +565,4 @@ "xmlbinary", "xmloption", "zero_damaged_pages", + "gp_track_pending_delete", diff --git a/src/test/isolation2/expected/gp_orphaned_files.out b/src/test/isolation2/expected/gp_orphaned_files.out new file mode 100644 index 000000000000..39195e2e4381 --- /dev/null +++ b/src/test/isolation2/expected/gp_orphaned_files.out @@ -0,0 +1,352 @@ +-- start_ignore +-- end_ignore + + +-- Test case 1 +-- Check that orphaned files are not left on the coordinator and the standby +-- when the files are created before checkpoint + +-- Create tables of different access methods and return command to check their +-- files existence on the coordinator and the standby +1: create or replace function createTables(n text) returns text as $$ declare cmd text; /**/ begin execute 'create table t_orphaned_h'||n||'(i int) distributed by (i)'; /**/ +execute 'create table t_orphaned_r'||n||'(i int) with (appendonly=true, orientation=row) distributed by (i)'; /**/ -- Create index to create block directory table execute 'create index 
t_orphaned_r'||n||'_i on t_orphaned_r'||n||'(i)'; /**/ +execute 'create table t_orphaned_c'||n||'(i int) with (appendonly=true, orientation=column) distributed by (i)'; /**/ /* Create index to create block directory table */ execute 'create index t_orphaned_c'||n||'_i on t_orphaned_c'||n||'(i)'; /**/ +/* Ensure that the mirrors have applied the filesystem changes */ perform force_mirrors_to_catch_up(); /**/ +/* The command do not output PGDATA directories to make it possible to run the test without docker */ select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) into cmd from ( select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') || ' 2>/dev/null | wc -l' lswc from ( select unnest(array[('t_orphaned_h'||n)::regclass, ('t_orphaned_r'||n)::regclass, ('t_orphaned_r'||n||'_i')::regclass, ('t_orphaned_c'||n)::regclass, ('t_orphaned_c'||n||'_i')::regclass]) union all select unnest(array[segrelid, blkdirrelid, blkdiridxid, visimaprelid, visimapidxid]) from pg_catalog.pg_appendonly where relid in (('t_orphaned_r'||n)::regclass, ('t_orphaned_c'||n)::regclass) ) a ) f, (select datadir from gp_segment_configuration where content = -1) d; /**/ +return cmd; /**/ end $$ language plpgsql; +CREATE + +-- Start transaction and create tables in it before checkpoint +1: begin; +BEGIN +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : select createTables('1') check_files; + +2: begin; +BEGIN +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : select createTables('2') check_files; + +1: checkpoint; +CHECKPOINT + +-- Make sure that the tables files exist on the coordinator and the standby +1: ! 
sh /tmp/gp_orphaned_files.sh; +15 +15 +15 +15 + + +-- Get segfault on the coordinator and reconnect after its restart +1: select gp_inject_fault('exec_simple_query_start', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault +----------------- + Success: +(1 row) + +-- The error message can be different, so ignore it +1: @post_run 'echo ""' : select 1; + +-- Wait for the coordinator to be recovered +! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done; + +1q: ... +2q: ... + +1: select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Check that the tables files don't exist on the coordinator and the standby +! sh /tmp/gp_orphaned_files.sh; +0 +0 +0 +0 + + +-- Cleanup +! rm /tmp/gp_orphaned_files.sh; + +1: drop function createTables(n text); +DROP + + +-- Test case 2 +-- Check that orphaned files are not left on segments when the files are created +-- before checkpoint + +1: create or replace function getTableSegFiles (t regclass, out gp_contentid smallint, out filepath text) as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' language sql execute on all segments; +CREATE + +1: create or replace function createTables(n text) returns text as $$ declare cmd text; /**/ begin /* Minimal fillfactor to minimize rows number for creating second main fork file */ execute 'create table t_orphaned_h'||n||'(i int) with (fillfactor=10) distributed by (i)'; /**/ /* Create the .1 file. Separate insert to create FSM. 
*/ execute 'insert into t_orphaned_h'||n||' select generate_series(1,9000000)'; /**/ +execute 'create table t_orphaned_r'||n||'(i int) with (appendonly=true, orientation=row) distributed by (i)'; /**/ /* Create the .1 file */ execute 'insert into t_orphaned_r'||n||' select generate_series(1,100)'; /**/ +/* Create the .128 file */ execute 'create table t_orphaned_c'||n||' with (appendonly=true, orientation=column) as select i as i, i*2 as j from generate_series(1,100) i distributed by (i)'; /**/ /* Create the .1 and .129 files */ execute 'insert into t_orphaned_c'||n||' select i as i, i*2 as j from generate_series(1,100) i'; /**/ +/* Ensure that the mirrors have applied the filesystem changes */ perform force_mirrors_to_catch_up(); /**/ +/* The command do not output PGDATA directories to make it possible to run the test without docker */ select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) into cmd from ( select gp_contentid, 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc from ( select gp_contentid, filepath || suf f from getTableSegFiles('t_orphaned_h'||n), (values(''), ('.1'), ('_fsm')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_r'||n), (values(''), ('.1')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_c'||n), (values(''), ('.1'), ('.128'), ('.129')) v(suf) ) a group by gp_contentid ) f, (select content, datadir from gp_segment_configuration where content > -1) d where f.gp_contentid = d.content; /**/ +return cmd; /**/ end $$ language plpgsql; +CREATE + +-- Test case 2.1 +-- Segfault on all segments + +-- Start transaction and create tables in it before checkpoint +1: begin; +BEGIN +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : select createTables('1') check_files; + +2: begin; +BEGIN +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : select createTables('2') check_files; + +1: 
checkpoint; +CHECKPOINT + +-- Make sure that all the tables files exist on the segments +1: ! sh /tmp/gp_orphaned_files.sh; +9 +9 +9 +9 +9 +9 +9 +9 +9 +9 +9 +9 + + +-- Get segfault on all segments +1: select gp_inject_fault('qe_exec_finished', 'segv', dbid) from gp_segment_configuration where role = 'p' and content != -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-- The error message can be different, so ignore it +1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id'); + + +-- Rollback the transaction to make it possible to run queries after the error +1: rollback; +ROLLBACK +2: rollback; +ROLLBACK + +1: select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Check that the tables files don't exist on the segments +! sh /tmp/gp_orphaned_files.sh; +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + + + +-- Test case 2.2 +-- Segfault on one segment + +-- Start transaction and create tables in it before checkpoint +1: begin; +BEGIN +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : select createTables('1') check_files; + +2: begin; +BEGIN +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : select createTables('2') check_files; + +1: checkpoint; +CHECKPOINT + +-- Make sure that all the tables files exist on the segments +1: ! 
sh /tmp/gp_orphaned_files.sh; +9 +9 +9 +9 +9 +9 +9 +9 +9 +9 +9 +9 + + +-- Get segfault on a segment +1: select gp_inject_fault('qe_exec_finished', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault +----------------- + Success: +(1 row) + +-- The error message can be different, so ignore it +1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id'); + + +-- Rollback the transaction to make it possible to run queries after the error +1: rollback; +ROLLBACK +2: rollback; +ROLLBACK + +1: select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Make a checkpoint to remove orphaned files from segments where segfault did +-- not happen +1: select gp_inject_fault_infinite('checkpoint', 'reset', dbid) from gp_segment_configuration where role = 'p' and content > -1; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) +1: checkpoint; +CHECKPOINT + +-- Check that the tables files don't exist on the segments +! sh /tmp/gp_orphaned_files.sh; +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + + + +-- Cleanup +! 
rm /tmp/gp_orphaned_files.sh; + +1: drop function createTables(n text); +DROP +1: drop function getTableSegFiles (t regclass, out gp_contentid smallint, out filepath text); +DROP + + +-- Test case 3 +-- Check that table files are not deleted in the case of prepared transaction + +-- Don't create checkpoints on the segment number 1 +1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- Stop after `MyPgXact->delayChkpt = false` and before `PostPrepare_smgr()` +-- Stop at the beginning of the checkpointer loop +1: select gp_inject_fault_infinite('end_prepare_two_phase', 'suspend', dbid), gp_inject_fault_infinite('ckpt_loop_begin', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite | gp_inject_fault_infinite +--------------------------+-------------------------- + Success: | Success: +(1 row) + +1&: select gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role = 'p' and content = 1; + +2&: create table t(i int) distributed by (i); +1<: <... completed> + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) + +1&: select gp_wait_until_triggered_fault('ckpt_loop_begin', 1, dbid) from gp_segment_configuration where role = 'p' and content = 1; + +-- Create a checkpoint and the XLOG_PENDING_DELETE WAL record with RelFileNode +-- of the created table. No more creating checkpoint +3: select gp_inject_fault_infinite('checkpoint', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +3&: checkpoint; +1<: <... 
completed> + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +1: select gp_inject_fault_infinite('ckpt_loop_end', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +1: select gp_inject_fault_infinite('ckpt_loop_begin', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +1: select gp_wait_until_triggered_fault('ckpt_loop_end', 1, dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +3<: <... completed> +CHECKPOINT +3q: ... +1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +1: select gp_inject_fault_infinite('ckpt_loop_end', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- Get a segfault on the segment number 1 at the beginning of the prepared +-- transaction commit +1: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +1: select gp_inject_fault_infinite('end_prepare_two_phase', 'resume', dbid) from gp_segment_configuration where role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +1q: ... +2<: <... completed> +CREATE +2q: ... 
+ +-- Check that the table files are not removed +1: select * from t; + i +--- +(0 rows) + +-- Cleanup +1: drop table t; +DROP diff --git a/src/test/isolation2/expected/gp_orphaned_files_fts_promote.out b/src/test/isolation2/expected/gp_orphaned_files_fts_promote.out new file mode 100644 index 000000000000..66a045587581 --- /dev/null +++ b/src/test/isolation2/expected/gp_orphaned_files_fts_promote.out @@ -0,0 +1,356 @@ + +include: helpers/server_helpers.sql; +CREATE + +1: create or replace function getTableSegFiles (t regclass, out gp_contentid smallint, out filepath text) as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' language sql execute on all segments; +CREATE + +1: create or replace function createTables(n text, mirror_catch_up bool default true) returns text as $$ declare cmd text; /**/ begin execute 'create table t_orphaned_h'||n||'(i int) distributed by (i)'; /**/ execute 'insert into t_orphaned_h'||n||' select generate_series(1,100)'; /**/ +execute 'create table t_orphaned_r'||n||'(i int) with (appendonly=true, orientation=row) distributed by (i)'; /**/ /* Create the .1 file */ execute 'insert into t_orphaned_r'||n||' select generate_series(1,100)'; /**/ +/* Create the .128 file */ execute 'create table t_orphaned_c'||n||' with (appendonly=true, orientation=column) as select i as i, i*2 as j from generate_series(1,100) i distributed by (i)'; /**/ /* Create the .1 and .129 files */ execute 'insert into t_orphaned_c'||n||' select i as i, i*2 as j from generate_series(1,100) i'; /**/ +if mirror_catch_up then /* Ensure that the mirrors have applied the filesystem changes */ perform force_mirrors_to_catch_up(); /**/ end if; /**/ +/* The command do not output PGDATA directories to make it possible to run the test without docker */ select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) into cmd from ( select gp_contentid, 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc from ( select gp_contentid, 
filepath || suf f from getTableSegFiles('t_orphaned_h'||n), (values(''), ('.1'), ('_fsm')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_r'||n), (values(''), ('.1')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_c'||n), (values(''), ('.1'), ('.128'), ('.129')) v(suf) ) a group by gp_contentid ) f, (select content, datadir from gp_segment_configuration where content > -1) d where f.gp_contentid = d.content; /**/ +return cmd; /**/ end $$ language plpgsql; +CREATE + +-- A copy of standard 'force_mirrors_to_catch_up()', but it forces all mirrors +-- except the one specified by the argument +-- (should be used in case one of mirrors is currently down). +1: create or replace function force_mirrors_to_catch_up_with_exception(excluded_content int) returns void as $$ begin perform pg_switch_xlog(); /**/ perform pg_switch_xlog() from gp_dist_random('gp_id'); /**/ perform gp_inject_fault('after_xlog_redo_noop', 'sleep', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ perform insert_noop_xlog_record(); /**/ perform insert_noop_xlog_record() from gp_dist_random('gp_id'); /**/ perform gp_wait_until_triggered_fault('after_xlog_redo_noop', 1, dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ perform gp_inject_fault('after_xlog_redo_noop', 'reset', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ end $$ language plpgsql; +CREATE + +-- Test case 1 +-- Check removal of orphaned files together with mirror promotion + +-- Start transaction and create tables in it before checkpoint +1: begin; +BEGIN +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1'); + +-- Let 2nd transaction to commit +2: begin; +BEGIN +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' : select createTables('_tx2'); +2: commit; 
+COMMIT +1: checkpoint; +CHECKPOINT + +-- Create another bunch of tables after savepoint +1: savepoint sp1; +SAVEPOINT +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1_sp1'); + +-- Make sure that all the tables files exist on the segments +1: ! sh /tmp/gp_orphaned_files_tx1.sh; +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 + + +-- shutdown primary and make sure the segment is down +-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where c.role='p' and c.content=0), 'stop', 'immediate'); + pg_ctl +-------- + OK +(1 row) +select gp_request_fts_probe_scan(); + gp_request_fts_probe_scan +--------------------------- + t +(1 row) +select role, preferred_role, status from gp_segment_configuration where content = 0; + role | preferred_role | status +------+----------------+-------- + m | p | d + p | m | u +(2 rows) + +-- Rollback the transaction to make it possible to run queries after the error +1: rollback; +ROLLBACK + +-- Make a checkpoint to remove orphaned files from segments that are still up +1: checkpoint; +CHECKPOINT + +1: select force_mirrors_to_catch_up_with_exception(0); + force_mirrors_to_catch_up_with_exception +------------------------------------------ + +(1 row) + +-- Check that the tables files don't exist on the segments (except ex-primary 0, which is yet down) +! sh /tmp/gp_orphaned_files_tx1.sh; +7 +0 +0 +0 +0 +0 +7 +0 +0 +0 +0 +0 + + +-- recovery the nodes +!\retcode gprecoverseg -a; +(exited with code 0) +select wait_until_segment_synchronized(0); + wait_until_segment_synchronized +--------------------------------- + OK +(1 row) + +-- Check that the tables files don't exist on all segments now +! 
sh /tmp/gp_orphaned_files_tx1.sh; +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + + +!\retcode gprecoverseg -ar; +(exited with code 0) +select wait_until_segment_synchronized(0); + wait_until_segment_synchronized +--------------------------------- + OK +(1 row) + +-- verify the first segment is recovered to the original state. +select role, preferred_role, status from gp_segment_configuration where content = 0; + role | preferred_role | status +------+----------------+-------- + p | p | u + m | m | u +(2 rows) + +-- Check that the tables from the committed transaction still exist +! sh /tmp/gp_orphaned_files_tx2.sh; +7 +7 +7 +7 +7 +7 + + +drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2; +DROP + +-- Test case 2 +-- Check that orphaned files are not removed after prepare is done +-- together with mirror promotion +-- and with orphaned files created (and later cleaned up) when the mirror is down. + +-- Start transaction and create tables in it before checkpoint +1: begin; +BEGIN +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1'); + +-- Let 2nd transaction to commit +2: begin; +BEGIN +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' : select createTables('_tx2'); +2: commit; +COMMIT +1: checkpoint; +CHECKPOINT + +-- Create another bunch of tables after savepoint +1: savepoint sp1; +SAVEPOINT +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1_sp1'); + +-- Make sure that all the tables files exist on the segments +1: ! 
sh /tmp/gp_orphaned_files_tx1.sh; +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 + + +-- Suspend commit after prepare +select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault +----------------- + Success: +(1 row) + +1&: commit; +select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) + +-- shutdown primary and make sure the segment is down +-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where c.role='p' and c.content=0), 'stop', 'immediate'); + pg_ctl +-------- + OK +(1 row) +select gp_request_fts_probe_scan(); + gp_request_fts_probe_scan +--------------------------- + t +(1 row) +select role, preferred_role, status from gp_segment_configuration where content = 0; + role | preferred_role | status +------+----------------+-------- + m | p | d + p | m | u +(2 rows) + +3: begin; +BEGIN +3: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx3.sh' : select createTables('_tx3', false); + +-- Get segfault on a segment +3: select gp_inject_fault('qe_exec_finished', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = 0; + gp_inject_fault +----------------- + Success: +(1 row) + +-- The error message can be different, so ignore it +3: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id'); + + +3: rollback; +ROLLBACK +3: checkpoint; +CHECKPOINT + +3: select force_mirrors_to_catch_up_with_exception(0); + force_mirrors_to_catch_up_with_exception +------------------------------------------ + +(1 row) + +! 
sh /tmp/gp_orphaned_files_tx3.sh; +0 +0 +0 +0 +0 +0 + + +-- recovery the nodes +!\retcode gprecoverseg -a; +(exited with code 0) +select wait_until_segment_synchronized(0); + wait_until_segment_synchronized +--------------------------------- + OK +(1 row) + +!\retcode gprecoverseg -ar; +(exited with code 0) +select wait_until_segment_synchronized(0); + wait_until_segment_synchronized +--------------------------------- + OK +(1 row) + +-- verify the first segment is recovered to the original state. +select role, preferred_role, status from gp_segment_configuration where content = 0; + role | preferred_role | status +------+----------------+-------- + p | p | u + m | m | u +(2 rows) + +select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +COMMIT + +-- Check that the tables from the committed transactions still exist +! sh /tmp/gp_orphaned_files_tx1.sh; +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 +7 + +! sh /tmp/gp_orphaned_files_tx2.sh; +7 +7 +7 +7 +7 +7 + + +-- Check that the tables from the not committed transaction don't exist +! sh /tmp/gp_orphaned_files_tx3.sh; +0 +0 +0 +0 +0 +0 + + +-- Cleanup +drop table t_orphaned_h_tx1, t_orphaned_r_tx1, t_orphaned_c_tx1; +DROP +drop table t_orphaned_h_tx1_sp1, t_orphaned_r_tx1_sp1, t_orphaned_c_tx1_sp1; +DROP +drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2; +DROP + +drop function force_mirrors_to_catch_up_with_exception(excluded_content int); +DROP +drop function createTables(n text, mirror_catch_up bool); +DROP +drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text); +DROP + +! rm /tmp/gp_orphaned_files_tx1.sh; + +! rm /tmp/gp_orphaned_files_tx2.sh; + +! 
rm /tmp/gp_orphaned_files_tx3.sh; + diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule index d5673be13995..48e6fa37fd0e 100644 --- a/src/test/isolation2/isolation2_schedule +++ b/src/test/isolation2/isolation2_schedule @@ -358,4 +358,8 @@ test: copy_interrupt # test pg_locks view and pg_lock_status() function test: lock_status + test: dependency + +test: gp_orphaned_files +test: gp_orphaned_files_fts_promote diff --git a/src/test/isolation2/sql/gp_orphaned_files.sql b/src/test/isolation2/sql/gp_orphaned_files.sql new file mode 100644 index 000000000000..1678f2ce7d87 --- /dev/null +++ b/src/test/isolation2/sql/gp_orphaned_files.sql @@ -0,0 +1,325 @@ +-- start_ignore +1: create extension if not exists gp_inject_fault; +1: drop index if exists t_orphaned_r1_i, t_orphaned_c1_i, + t_orphaned_r2_i, t_orphaned_c2_i; +1: drop table if exists t_orphaned_h1, t_orphaned_r1, t_orphaned_c1, + t_orphaned_h2, t_orphaned_r2, t_orphaned_c2, t; +-- end_ignore + + +-- Test case 1 +-- Check that orphaned files are not left on the coordinator and the standby +-- when the files are created before checkpoint + +-- Create tables of different access methods and return command to check their +-- files existence on the coordinator and the standby +1: create or replace function createTables(n text) returns text as +$$ +declare + cmd text; /**/ +begin + execute 'create table t_orphaned_h'||n||'(i int) distributed by (i)'; /**/ + + execute 'create table t_orphaned_r'||n||'(i int) + with (appendonly=true, orientation=row) + distributed by (i)'; /**/ + -- Create index to create block directory table + execute 'create index t_orphaned_r'||n||'_i on t_orphaned_r'||n||'(i)'; /**/ + + execute 'create table t_orphaned_c'||n||'(i int) + with (appendonly=true, orientation=column) + distributed by (i)'; /**/ + /* Create index to create block directory table */ + execute 'create index t_orphaned_c'||n||'_i on t_orphaned_c'||n||'(i)'; /**/ + + /* Ensure that the 
mirrors have applied the filesystem changes */ + perform force_mirrors_to_catch_up(); /**/ + + /* The command do not output PGDATA directories to make it possible to run + the test without docker */ + select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') + || ' 2>/dev/null | wc -l' lswc + from ( + select unnest(array[('t_orphaned_h'||n)::regclass, + ('t_orphaned_r'||n)::regclass, + ('t_orphaned_r'||n||'_i')::regclass, + ('t_orphaned_c'||n)::regclass, + ('t_orphaned_c'||n||'_i')::regclass]) + union all + select unnest(array[segrelid, + blkdirrelid, blkdiridxid, + visimaprelid, visimapidxid]) + from pg_catalog.pg_appendonly + where relid in (('t_orphaned_r'||n)::regclass, + ('t_orphaned_c'||n)::regclass) + ) a + ) f, + (select datadir from gp_segment_configuration where content = -1) d; /**/ + + return cmd; /**/ +end +$$ language plpgsql; + +-- Start transaction and create tables in it before checkpoint +1: begin; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : + select createTables('1') check_files; + +2: begin; +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : + select createTables('2') check_files; + +1: checkpoint; + +-- Make sure that the tables files exist on the coordinator and the standby +1: ! sh /tmp/gp_orphaned_files.sh; + +-- Get segfault on the coordinator and reconnect after its restart +1: select gp_inject_fault('exec_simple_query_start', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + +-- The error message can be different, so ignore it +1: @post_run 'echo ""' : select 1; +-- Wait for the coordinator to be recovered +! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done; +1q: +2q: + +1: select force_mirrors_to_catch_up(); + +-- Check that the tables files don't exist on the coordinator and the standby +! 
sh /tmp/gp_orphaned_files.sh; + +-- Cleanup +! rm /tmp/gp_orphaned_files.sh; +1: drop function createTables(n text); + + +-- Test case 2 +-- Check that orphaned files are not left on segments when the files are created +-- before checkpoint + +1: create or replace function getTableSegFiles +(t regclass, out gp_contentid smallint, out filepath text) +as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' +language sql +execute on all segments; + +1: create or replace function createTables(n text) returns text as +$$ +declare + cmd text; /**/ +begin + /* Minimal fillfactor to minimize rows number for creating second main fork + file */ + execute 'create table t_orphaned_h'||n||'(i int) + with (fillfactor=10) + distributed by (i)'; /**/ + /* Create the .1 file. Separate insert to create FSM. */ + execute 'insert into t_orphaned_h'||n||' + select generate_series(1,9000000)'; /**/ + + execute 'create table t_orphaned_r'||n||'(i int) + with (appendonly=true, orientation=row) + distributed by (i)'; /**/ + /* Create the .1 file */ + execute 'insert into t_orphaned_r'||n||' + select generate_series(1,100)'; /**/ + + /* Create the .128 file */ + execute 'create table t_orphaned_c'||n||' + with (appendonly=true, orientation=column) as + select i as i, i*2 as j from generate_series(1,100) i + distributed by (i)'; /**/ + /* Create the .1 and .129 files */ + execute 'insert into t_orphaned_c'||n||' + select i as i, i*2 as j from generate_series(1,100) i'; /**/ + + /* Ensure that the mirrors have applied the filesystem changes */ + perform force_mirrors_to_catch_up(); /**/ + + /* The command do not output PGDATA directories to make it possible to run + the test without docker */ + select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select gp_contentid, + 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc + from ( + select gp_contentid, filepath || suf f + from getTableSegFiles('t_orphaned_h'||n), + 
(values(''), ('.1'), ('_fsm')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_r'||n), + (values(''), ('.1')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_c'||n), + (values(''), ('.1'), ('.128'), ('.129')) v(suf) + ) a + group by gp_contentid + ) f, + (select content, datadir from gp_segment_configuration where content > -1) d + where f.gp_contentid = d.content; /**/ + + return cmd; /**/ +end +$$ language plpgsql; + +-- Test case 2.1 +-- Segfault on all segments + +-- Start transaction and create tables in it before checkpoint +1: begin; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : + select createTables('1') check_files; + +2: begin; +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : + select createTables('2') check_files; + +1: checkpoint; + +-- Make sure that all the tables files exist on the segments +1: ! sh /tmp/gp_orphaned_files.sh; + +-- Get segfault on all segments +1: select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content != -1; + +-- The error message can be different, so ignore it +1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id'); + +-- Rollback the transaction to make it possible to run queries after the error +1: rollback; +2: rollback; + +1: select force_mirrors_to_catch_up(); + +-- Check that the tables files don't exist on the segments +! 
sh /tmp/gp_orphaned_files.sh; + + +-- Test case 2.2 +-- Segfault on one segment + +-- Start transaction and create tables in it before checkpoint +1: begin; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : + select createTables('1') check_files; + +2: begin; +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : + select createTables('2') check_files; + +1: checkpoint; + +-- Make sure that all the tables files exist on the segments +1: ! sh /tmp/gp_orphaned_files.sh; + +-- Get segfault on a segment +1: select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +-- The error message can be different, so ignore it +1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id'); + +-- Rollback the transaction to make it possible to run queries after the error +1: rollback; +2: rollback; + +1: select force_mirrors_to_catch_up(); + +-- Make a checkpoint to remove orphaned files from segments where segfault did +-- not happen +1: select gp_inject_fault_infinite('checkpoint', 'reset', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; +1: checkpoint; + +-- Check that the tables files don't exist on the segments +! sh /tmp/gp_orphaned_files.sh; + + +-- Cleanup +! 
rm /tmp/gp_orphaned_files.sh; +1: drop function createTables(n text); +1: drop function getTableSegFiles + (t regclass, out gp_contentid smallint, out filepath text); + + +-- Test case 3 +-- Check that table files are not deleted in the case of prepared transaction + +-- Don't create checkpoints on the segment number 1 +1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +-- Stop after `MyPgXact->delayChkpt = false` and before `PostPrepare_smgr()` +-- Stop at the beginning of the checkpointer loop +1: select gp_inject_fault_infinite('end_prepare_two_phase', 'suspend', dbid), + gp_inject_fault_infinite('ckpt_loop_begin', 'suspend', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +1&: select gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +2&: create table t(i int) distributed by (i); +1<: + +1&: select gp_wait_until_triggered_fault('ckpt_loop_begin', 1, dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +-- Create a checkpoint and the XLOG_PENDING_DELETE WAL record with RelFileNode +-- of the created table. 
No more creating checkpoint +3: select gp_inject_fault_infinite('checkpoint', 'reset', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +3&: checkpoint; +1<: +1: select gp_inject_fault_infinite('ckpt_loop_end', 'suspend', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +1: select gp_inject_fault_infinite('ckpt_loop_begin', 'reset', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +1: select gp_wait_until_triggered_fault('ckpt_loop_end', 1, dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +3<: +3q: +1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +1: select gp_inject_fault_infinite('ckpt_loop_end', 'reset', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +-- Get a segfault on the segment number 1 at the beginning of the prepared +-- transaction commit +1: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +1: select gp_inject_fault_infinite('end_prepare_two_phase', 'resume', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; +1q: +2<: +2q: + +-- Check that the table files are not removed +1: select * from t; + +-- Cleanup +1: drop table t; diff --git a/src/test/isolation2/sql/gp_orphaned_files_fts_promote.sql b/src/test/isolation2/sql/gp_orphaned_files_fts_promote.sql new file mode 100644 index 000000000000..12257b9f528b --- /dev/null +++ b/src/test/isolation2/sql/gp_orphaned_files_fts_promote.sql @@ -0,0 +1,248 @@ +-- start_ignore +-- Increase the number of connection attempts to a segment to 120, reduce +-- the interval between attempts to 1 second. So the segments will have 120 +-- seconds to recover after segfault. +! gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly; +! 
gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly; +! gpstop -u; +1: create extension if not exists gp_inject_fault; +-- end_ignore + +include: helpers/server_helpers.sql; + +1: create or replace function getTableSegFiles +(t regclass, out gp_contentid smallint, out filepath text) +as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' +language sql +execute on all segments; + +1: create or replace function createTables(n text, mirror_catch_up bool default true) returns text as +$$ +declare + cmd text; /**/ +begin + execute 'create table t_orphaned_h'||n||'(i int) + distributed by (i)'; /**/ + execute 'insert into t_orphaned_h'||n||' + select generate_series(1,100)'; /**/ + + execute 'create table t_orphaned_r'||n||'(i int) + with (appendonly=true, orientation=row) + distributed by (i)'; /**/ + /* Create the .1 file */ + execute 'insert into t_orphaned_r'||n||' + select generate_series(1,100)'; /**/ + + /* Create the .128 file */ + execute 'create table t_orphaned_c'||n||' + with (appendonly=true, orientation=column) as + select i as i, i*2 as j from generate_series(1,100) i + distributed by (i)'; /**/ + /* Create the .1 and .129 files */ + execute 'insert into t_orphaned_c'||n||' + select i as i, i*2 as j from generate_series(1,100) i'; /**/ + + if mirror_catch_up then + /* Ensure that the mirrors have applied the filesystem changes */ + perform force_mirrors_to_catch_up(); /**/ + end if; /**/ + + /* The command do not output PGDATA directories to make it possible to run + the test without docker */ + select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select gp_contentid, + 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc + from ( + select gp_contentid, filepath || suf f + from getTableSegFiles('t_orphaned_h'||n), + (values(''), ('.1'), ('_fsm')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_r'||n), + 
(values(''), ('.1')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_c'||n), + (values(''), ('.1'), ('.128'), ('.129')) v(suf) + ) a + group by gp_contentid + ) f, + (select content, datadir from gp_segment_configuration where content > -1) d + where f.gp_contentid = d.content; /**/ + + return cmd; /**/ +end +$$ language plpgsql; + +-- A copy of standard 'force_mirrors_to_catch_up()', but it forces all mirrors +-- except the one specified by the argument +-- (should be used in case one of mirrors is currently down). +1: create or replace function force_mirrors_to_catch_up_with_exception(excluded_content int) returns void as +$$ +begin + perform pg_switch_xlog(); /**/ + perform pg_switch_xlog() from gp_dist_random('gp_id'); /**/ + perform gp_inject_fault('after_xlog_redo_noop', 'sleep', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ + perform insert_noop_xlog_record(); /**/ + perform insert_noop_xlog_record() from gp_dist_random('gp_id'); /**/ + perform gp_wait_until_triggered_fault('after_xlog_redo_noop', 1, dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ + perform gp_inject_fault('after_xlog_redo_noop', 'reset', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ +end +$$ language plpgsql; + +-- Test case 1 +-- Check removal of orphaned files together with mirror promotion + +-- Start transaction and create tables in it before checkpoint +1: begin; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' : + select createTables('_tx1'); + +-- Let 2nd transaction to commit +2: begin; +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' : + select createTables('_tx2'); +2: commit; +1: checkpoint; + +-- Create another bunch of tables after savepoint +1: savepoint sp1; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' : 
+ select createTables('_tx1_sp1'); + +-- Make sure that all the tables files exist on the segments +1: ! sh /tmp/gp_orphaned_files_tx1.sh; + +-- shutdown primary and make sure the segment is down +-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c + where c.role='p' and c.content=0), 'stop', 'immediate'); +select gp_request_fts_probe_scan(); +select role, preferred_role, status from gp_segment_configuration where content = 0; + +-- Rollback the transaction to make it possible to run queries after the error +1: rollback; + +-- Make a checkpoint to remove orphaned files from segments that are still up +1: checkpoint; + +1: select force_mirrors_to_catch_up_with_exception(0); + +-- Check that the tables files don't exist on the segments (except ex-primary 0, which is yet down) +! sh /tmp/gp_orphaned_files_tx1.sh; + +-- recovery the nodes +!\retcode gprecoverseg -a; +select wait_until_segment_synchronized(0); + +-- Check that the tables files don't exist on all segments now +! sh /tmp/gp_orphaned_files_tx1.sh; + +!\retcode gprecoverseg -ar; +select wait_until_segment_synchronized(0); + +-- verify the first segment is recovered to the original state. +select role, preferred_role, status from gp_segment_configuration where content = 0; + +-- Check that the tables from the committed transaction still exist +! sh /tmp/gp_orphaned_files_tx2.sh; + +drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2; + +-- Test case 2 +-- Check that orphaned files are not removed after prepare is done +-- together with mirror promotion +-- and with orphaned files created (and later cleaned up) when the mirror is down. 
+ +-- Start transaction and create tables in it before checkpoint +1: begin; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' : + select createTables('_tx1'); + +-- Let 2nd transaction to commit +2: begin; +2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' : + select createTables('_tx2'); +2: commit; +1: checkpoint; + +-- Create another bunch of tables after savepoint +1: savepoint sp1; +1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' : + select createTables('_tx1_sp1'); + +-- Make sure that all the tables files exist on the segments +1: ! sh /tmp/gp_orphaned_files_tx1.sh; + +-- Suspend commit after prepare +select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) + from gp_segment_configuration where role = 'p' and content = -1; + +1&: commit; +select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) + from gp_segment_configuration where role = 'p' and content = -1; + +-- shutdown primary and make sure the segment is down +-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c + where c.role='p' and c.content=0), 'stop', 'immediate'); +select gp_request_fts_probe_scan(); +select role, preferred_role, status from gp_segment_configuration where content = 0; + +3: begin; +3: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx3.sh' : + select createTables('_tx3', false); + +-- Get segfault on a segment +3: select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = 0; + +-- The error message can be different, so ignore it +3: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id'); + +3: rollback; +3: checkpoint; + +3: select force_mirrors_to_catch_up_with_exception(0); + +! 
sh /tmp/gp_orphaned_files_tx3.sh; + +-- recovery the nodes +!\retcode gprecoverseg -a; +select wait_until_segment_synchronized(0); + +!\retcode gprecoverseg -ar; +select wait_until_segment_synchronized(0); + +-- verify the first segment is recovered to the original state. +select role, preferred_role, status from gp_segment_configuration where content = 0; + +select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) + from gp_segment_configuration where role = 'p' and content = -1; +1<: + +-- Check that the tables from the committed transactions still exist +! sh /tmp/gp_orphaned_files_tx1.sh; +! sh /tmp/gp_orphaned_files_tx2.sh; + +-- Check that the tables from the not committed transaction don't exist +! sh /tmp/gp_orphaned_files_tx3.sh; + +-- Cleanup +drop table t_orphaned_h_tx1, t_orphaned_r_tx1, t_orphaned_c_tx1; +drop table t_orphaned_h_tx1_sp1, t_orphaned_r_tx1_sp1, t_orphaned_c_tx1_sp1; +drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2; + +drop function force_mirrors_to_catch_up_with_exception(excluded_content int); +drop function createTables(n text, mirror_catch_up bool); +drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text); + +! rm /tmp/gp_orphaned_files_tx1.sh; +! rm /tmp/gp_orphaned_files_tx2.sh; +! rm /tmp/gp_orphaned_files_tx3.sh; +-- start_ignore +! gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly; +! gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly; +! 
gpstop -u; +-- end_ignore diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 2a940fe40fcb..255c443b1d80 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -7,5 +7,6 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = test_planner SUBDIRS += connection SUBDIRS += test_extensions +SUBDIRS += test_dsa $(recurse) diff --git a/src/test/modules/test_dsa/Makefile b/src/test/modules/test_dsa/Makefile new file mode 100644 index 000000000000..bcddb84b0618 --- /dev/null +++ b/src/test/modules/test_dsa/Makefile @@ -0,0 +1,28 @@ +# src/test/modules/test_dsa/Makefile + +MODULE_big = test_dsa +OBJS = \ + $(WIN32RES) \ + test_dsa.o +PGFILEDESC = "test_dsa - test code for dynamic shared memory areas" + +EXTENSION = test_dsa +DATA = test_dsa--1.0.sql + +REGRESS = test_dsa + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_dsa +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +installcheck: install + +test: clean all install + psql postgres -f sql/test_dsa.sql 2>&1 diff --git a/src/test/modules/test_dsa/expected/test_dsa.out b/src/test/modules/test_dsa/expected/test_dsa.out new file mode 100644 index 000000000000..266010e77fe9 --- /dev/null +++ b/src/test/modules/test_dsa/expected/test_dsa.out @@ -0,0 +1,13 @@ +CREATE EXTENSION test_dsa; +SELECT test_dsa_basic(); + test_dsa_basic +---------------- + +(1 row) + +SELECT test_dsa_resowners(); + test_dsa_resowners +-------------------- + +(1 row) + diff --git a/src/test/modules/test_dsa/meson.build b/src/test/modules/test_dsa/meson.build new file mode 100644 index 000000000000..21738290ad58 --- /dev/null +++ b/src/test/modules/test_dsa/meson.build @@ -0,0 +1,33 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +test_dsa_sources = files( + 'test_dsa.c', +) + +if host_system == 'windows' + test_dsa_sources += 
rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_dsa', + '--FILEDESC', 'test_dsa - test code for dynamic shared memory areas',]) +endif + +test_dsa = shared_module('test_dsa', + test_dsa_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_dsa + +test_install_data += files( + 'test_dsa.control', + 'test_dsa--1.0.sql', +) + +tests += { + 'name': 'test_dsa', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_dsa', + ], + }, +} diff --git a/src/test/modules/test_dsa/sql/test_dsa.sql b/src/test/modules/test_dsa/sql/test_dsa.sql new file mode 100644 index 000000000000..c3d8db943720 --- /dev/null +++ b/src/test/modules/test_dsa/sql/test_dsa.sql @@ -0,0 +1,4 @@ +CREATE EXTENSION test_dsa; + +SELECT test_dsa_basic(); +SELECT test_dsa_resowners(); diff --git a/src/test/modules/test_dsa/test_dsa--1.0.sql b/src/test/modules/test_dsa/test_dsa--1.0.sql new file mode 100644 index 000000000000..2904cb23525e --- /dev/null +++ b/src/test/modules/test_dsa/test_dsa--1.0.sql @@ -0,0 +1,12 @@ +/* src/test/modules/test_dsa/test_dsa--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_dsa" to load this file. 
\quit + +CREATE FUNCTION test_dsa_basic() + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION test_dsa_resowners() + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c new file mode 100644 index 000000000000..5ad4c405d79f --- /dev/null +++ b/src/test/modules/test_dsa/test_dsa.c @@ -0,0 +1,111 @@ +/*-------------------------------------------------------------------------- + * + * test_dsa.c + * Test dynamic shared memory areas (DSAs) + * + * Copyright (c) 2022-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_dsa/test_dsa.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "fmgr.h" +#include "utils/dsa.h" +#include "storage/lwlock.h" +#include "utils/resowner.h" + +PG_MODULE_MAGIC; + +/* Test basic DSA functionality */ +PG_FUNCTION_INFO_V1(test_dsa_basic); +Datum +test_dsa_basic(PG_FUNCTION_ARGS) +{ + int tranche_id; + dsa_area *a; + dsa_pointer p[100]; + + /* XXX: this tranche is leaked */ + tranche_id = LWLockNewTrancheId(); + + a = dsa_create(tranche_id, "test_dsa"); + for (int i = 0; i < 100; i++) + { + p[i] = dsa_allocate(a, 1000); + snprintf(dsa_get_address(a, p[i]), 1000, "foobar%d", i); + } + + for (int i = 0; i < 100; i++) + { + char buf[100]; + + snprintf(buf, 100, "foobar%d", i); + if (strcmp(dsa_get_address(a, p[i]), buf) != 0) + elog(ERROR, "no match"); + } + + for (int i = 0; i < 100; i++) + { + dsa_free(a, p[i]); + } + + dsa_detach(a); + + PG_RETURN_VOID(); +} + +/* Test using DSA across different resource owners */ +PG_FUNCTION_INFO_V1(test_dsa_resowners); +Datum +test_dsa_resowners(PG_FUNCTION_ARGS) +{ + int tranche_id; + dsa_area *a; + dsa_pointer p[10000]; + ResourceOwner oldowner; + ResourceOwner childowner; + + /* XXX: this tranche is leaked */ + tranche_id = LWLockNewTrancheId(); + + /* Create DSA in parent 
resource owner */ + a = dsa_create(tranche_id, "test_dsa"); + + /* + * Switch to child resource owner, and do a bunch of allocations in the + * DSA + */ + oldowner = CurrentResourceOwner; + childowner = ResourceOwnerCreate(oldowner, "test_dsa temp owner"); + CurrentResourceOwner = childowner; + + for (int i = 0; i < 10000; i++) + { + p[i] = dsa_allocate(a, 1000); + snprintf(dsa_get_address(a, p[i]), 1000, "foobar%d", i); + } + + /* Also test freeing, by freeing some of the allocations. */ + for (int i = 0; i < 500; i++) + dsa_free(a, p[i]); + + /* Release the child resource owner */ + CurrentResourceOwner = oldowner; + ResourceOwnerRelease(childowner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, false); + ResourceOwnerRelease(childowner, + RESOURCE_RELEASE_LOCKS, + true, false); + ResourceOwnerRelease(childowner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, false); + ResourceOwnerDelete(childowner); + + dsa_detach(a); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_dsa/test_dsa.control b/src/test/modules/test_dsa/test_dsa.control new file mode 100644 index 000000000000..ac9674b2193d --- /dev/null +++ b/src/test/modules/test_dsa/test_dsa.control @@ -0,0 +1,4 @@ +comment = 'Test code for dynamic shared memory areas' +default_version = '1.0' +module_pathname = '$libdir/test_dsa' +relocatable = true diff --git a/src/test/regress/expected/gp_orphaned_files.out b/src/test/regress/expected/gp_orphaned_files.out new file mode 100644 index 000000000000..c8cc49b76b8f --- /dev/null +++ b/src/test/regress/expected/gp_orphaned_files.out @@ -0,0 +1,437 @@ +-- start_ignore +-- end_ignore +-- start_matchsubs +-- m/ERROR: Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/ +-- s/ERROR: Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/ERROR: Error on receive from segX sliceX X.X.X.X:X pid=X: server closed the connection unexpectedly/ +-- end_matchsubs +-- Test 
case 1 +-- Check that orphaned files are not left on the coordinator and the standby +-- when the files are created after checkpoint +-- Create tables of different access methods and return command to check their +-- files existence on the coordinator and the standby +create or replace function createTables() returns text as +$$ +declare + cmd text; +begin + create table t_orphaned_h(i int) + distributed by (i); + + create table t_orphaned_r(i int) + with (appendonly=true, orientation=row) + distributed by (i); + -- Create index to create block directory table + create index t_orphaned_r_i on t_orphaned_r(i); + + create table t_orphaned_c(i int) + with (appendonly=true, orientation=column) + distributed by (i); + -- Create index to create block directory table + create index t_orphaned_c_i on t_orphaned_c(i); + + -- Ensure that the mirrors have applied the filesystem changes + perform force_mirrors_to_catch_up(); + + -- The command do not output PGDATA directories to make it possible to run + -- the test without docker + select '\! 
' || + string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') + || ' 2>/dev/null | wc -l' lswc + from ( + select unnest(array['t_orphaned_h'::regclass, + 't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass, + 't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass]) + union all + select unnest(array[segrelid, + blkdirrelid, blkdiridxid, + visimaprelid, visimapidxid]) + from pg_catalog.pg_appendonly + where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass) + ) a + ) f, + (select datadir from gp_segment_configuration where content = -1) d; + + return cmd; +end +$$ language plpgsql; +checkpoint; +-- Skip checkpoints on the coordinator +select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- Create tables in subtransactions +begin; +create table t_top(i int) distributed by (i); +savepoint sp1; +create table t_sub1(i int) distributed by (i); +savepoint sp2; +create table t_sub2(i int) distributed by (i); +commit; +-- Start transaction and create tables in it +begin; +select createTables() check_files +\gset + +-- Make sure that the tables files exist on the coordinator and the standby +:check_files +15 +15 +-- Get segfault on the coordinator and reconnect after its restart +select gp_inject_fault('exec_simple_query_start', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + gp_inject_fault +----------------- + Success: +(1 row) + +-- The error message from psql can be different, so ignore it +\! psql postgres -c "select 1" 2> /dev/null +-- Wait for the coordinator to be recovered +\! 
while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done; +\c regression +-- All the inject faults have been reset after the coordinator restart +select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Check that the tables files don't exist on the coordinator and the standby +:check_files +0 +0 +-- Check that the coordinator recovery didn't remove files of the tables which +-- were created in subtransactions +table t_sub1; + i +--- +(0 rows) + +table t_sub2; + i +--- +(0 rows) + +-- Clean up +drop table t_top, t_sub1, t_sub2; +\unset check_files +-- Test case 2 +-- Check that files are left untouched on the coordinator and the standby +-- when the corresponding distributed commit record exists in WAL +select gp_inject_fault('dtm_xlog_distributed_commit', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + gp_inject_fault +----------------- + Success: +(1 row) + +-- Create tables in a transaction. Get segfault right after the distributed +-- commit record is flushed +\! psql regression -c "begin; select createTables(); commit;" +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +connection to server was lost +-- Wait for the coordinator to be recovered +\! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done; +\c regression +select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Check that all the tables and its indexes files exist +select '\! 
' || + string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) lswc + from ( + select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') + || ' 2>/dev/null | wc -l' lswc + from ( + select unnest(array['t_orphaned_h'::regclass, + 't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass, + 't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass]) + union all + select unnest(array[segrelid, + blkdirrelid, blkdiridxid, + visimaprelid, visimapidxid]) + from pg_catalog.pg_appendonly + where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass) + ) a + ) f, + (select datadir from gp_segment_configuration where content = -1) d +\gset +:lswc +15 +15 +-- Check that we can read data from the tables +table t_orphaned_h; + i +--- +(0 rows) + +table t_orphaned_r; + i +--- +(0 rows) + +table t_orphaned_c; + i +--- +(0 rows) + +-- Clean up +drop table t_orphaned_h, t_orphaned_r, t_orphaned_c; +drop function createTables(); +-- Test case 3 +-- Check that orphaned files are not left on segments when the files are +-- created after checkpoint +create or replace function getTableSegFiles +(t regclass, out gp_contentid smallint, out filepath text) +as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' +language sql +execute on all segments; +-- Get list of the tables file names on each segment +create or replace function createTables() returns text as +$$ +declare + cmd text; +begin + -- Minimal fillfactor to minimize rows number for creating second main fork file + create table t_orphaned_h(i int) + with (fillfactor=10) + distributed by (i); + -- Create the .1 file. Separate insert to create FSM. 
+ insert into t_orphaned_h select generate_series(1,9000000); + + create table t_orphaned_r(i int) + with (appendonly=true, orientation=row) + distributed by (i); + -- Create the .1 file + insert into t_orphaned_r select generate_series(1,100); + + -- Create the .128 file + create table t_orphaned_c + with (appendonly=true, orientation=column) as + select i as i, i*2 as j from generate_series(1,100) i + distributed by (i); + -- Create the .1 and .129 files + insert into t_orphaned_c + select i as i, i*2 as j from generate_series(1,100) i; + + -- Ensure that the mirrors have applied the filesystem changes + perform force_mirrors_to_catch_up(); + + -- The command do not output PGDATA directories to make it possible to run + -- the test without docker + select '\! ' || + string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select gp_contentid, + 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc + from ( + select gp_contentid, filepath || suf f + from getTableSegFiles('t_orphaned_h'), + (values(''), ('.1'), ('_fsm')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_r'), + (values(''), ('.1')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_c'), + (values(''), ('.1'), ('.128'), ('.129')) v(suf) + ) a + group by gp_contentid + ) f, + (select content, datadir from gp_segment_configuration where content > -1) d + where f.gp_contentid = d.content; + + return cmd; +end +$$ language plpgsql; +-- Test case 3.1 +-- Segfault on all segments +checkpoint; +-- Skip checkpoints +select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +-- Create tables in subtransactions +begin; +create table t_top(i int) distributed by (i); +savepoint sp1; +create table t_sub1(i int) distributed 
by (i); +savepoint sp2; +create table t_sub2(i int) distributed by (i); +commit; +-- Start transaction and create tables in it +begin; +select createTables() check_files +\gset +-- Make sure that all the tables files exist on the segments +:check_files +9 +9 +9 +9 +9 +9 +-- Get segfault on all segments +select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content != -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +select 1 from gp_dist_random('gp_id'); +ERROR: Error on receive from seg0 slice1 127.0.0.1:6002 pid=45456: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-- Rollback the transaction to make it possible to run queries after the error +rollback; +select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Check that the tables files don't exist on the segments +:check_files +0 +0 +0 +0 +0 +0 +-- Check that the segments recovery didn't remove files of the tables which +-- were created in subtransactions +table t_sub1; + i +--- +(0 rows) + +table t_sub2; + i +--- +(0 rows) + +-- Clean up +drop table t_top, t_sub1, t_sub2; +-- Test case 3.2 +-- Segfault on one segment +checkpoint; +-- Skip checkpoints +select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +-- Create tables in subtransactions +begin; +create table t_top(i int) distributed by (i); +savepoint sp1; +create table t_sub1(i int) distributed by (i); +savepoint sp2; +create table t_sub2(i int) distributed by (i); +commit; +-- Start transaction and create tables in it +begin; +select createTables() check_files +\gset +-- Make sure that all the tables files exist on the segments +:check_files +9 
+9 +9 +9 +9 +9 +-- Get segfault on a segment +select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + gp_inject_fault +----------------- + Success: +(1 row) + +select 1 from gp_dist_random('gp_id'); +ERROR: Error on receive from seg1 slice1 127.0.0.1:6003 pid=64482: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-- Rollback the transaction to make it possible to run queries after the error +rollback; +select force_mirrors_to_catch_up(); + force_mirrors_to_catch_up +--------------------------- + +(1 row) + +-- Make a checkpoint to remove orphaned files from segments where segfault did +-- not happen +select gp_inject_fault_infinite('checkpoint', 'reset', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +checkpoint; +-- Check that the tables files don't exist on the segments +:check_files +0 +0 +0 +0 +0 +0 +-- Check that the segment recovery didn't remove files of the tables which +-- were created in subtransactions +table t_sub1; + i +--- +(0 rows) + +table t_sub2; + i +--- +(0 rows) + +-- Clean up +\unset check_files +drop table t_top, t_sub1, t_sub2; +drop function createTables(); +drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text); +-- start_ignore +-- end_ignore diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule index 42216d195377..bb84028de92a 100755 --- a/src/test/regress/greenplum_schedule +++ b/src/test/regress/greenplum_schedule @@ -313,4 +313,6 @@ test: gp_check_files test: gp_query_id +test: gp_orphaned_files + # end of tests diff --git a/src/test/regress/sql/gp_orphaned_files.sql b/src/test/regress/sql/gp_orphaned_files.sql new file mode 100644 index 000000000000..7af4dd17fc30 --- /dev/null +++ 
b/src/test/regress/sql/gp_orphaned_files.sql @@ -0,0 +1,360 @@ +-- start_ignore +create extension if not exists gp_inject_fault; +drop index if exists t_orphaned_r_i, t_orphaned_c_i; +drop table if exists t_orphaned_h, t_orphaned_r, t_orphaned_c, + t_top, t_sub1, t_sub2; +-- Increase the number of connection attempts to a segment to 120, reduce +-- the interval between attempts to 1 second. So the segments will have 120 +-- seconds to recover after segfault. The demo cluster doesn't fail over to +-- a mirror if 120 seconds is enough for recovery +\! gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly +\! gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly +\! gpstop -u +-- end_ignore + +-- start_matchsubs +-- m/ERROR: Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/ +-- s/ERROR: Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/ERROR: Error on receive from segX sliceX X.X.X.X:X pid=X: server closed the connection unexpectedly/ +-- end_matchsubs + +-- Test case 1 +-- Check that orphaned files are not left on the coordinator and the standby +-- when the files are created after checkpoint + +-- Create tables of different access methods and return command to check their +-- files existence on the coordinator and the standby +create or replace function createTables() returns text as +$$ +declare + cmd text; +begin + create table t_orphaned_h(i int) + distributed by (i); + + create table t_orphaned_r(i int) + with (appendonly=true, orientation=row) + distributed by (i); + -- Create index to create block directory table + create index t_orphaned_r_i on t_orphaned_r(i); + + create table t_orphaned_c(i int) + with (appendonly=true, orientation=column) + distributed by (i); + -- Create index to create block directory table + create index t_orphaned_c_i on t_orphaned_c(i); + + -- Ensure that the mirrors have 
applied the filesystem changes + perform force_mirrors_to_catch_up(); + + -- The command do not output PGDATA directories to make it possible to run + -- the test without docker + select '\! ' || + string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') + || ' 2>/dev/null | wc -l' lswc + from ( + select unnest(array['t_orphaned_h'::regclass, + 't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass, + 't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass]) + union all + select unnest(array[segrelid, + blkdirrelid, blkdiridxid, + visimaprelid, visimapidxid]) + from pg_catalog.pg_appendonly + where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass) + ) a + ) f, + (select datadir from gp_segment_configuration where content = -1) d; + + return cmd; +end +$$ language plpgsql; + +checkpoint; + +-- Skip checkpoints on the coordinator +select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + +-- Create tables in subtransactions +begin; +create table t_top(i int) distributed by (i); +savepoint sp1; +create table t_sub1(i int) distributed by (i); +savepoint sp2; +create table t_sub2(i int) distributed by (i); +commit; + +-- Start transaction and create tables in it +begin; +select createTables() check_files +\gset + +-- Make sure that the tables files exist on the coordinator and the standby +:check_files + +-- Get segfault on the coordinator and reconnect after its restart +select gp_inject_fault('exec_simple_query_start', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + +-- The error message from psql can be different, so ignore it +\! psql postgres -c "select 1" 2> /dev/null +-- Wait for the coordinator to be recovered +\! 
while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done; +\c regression + +-- All the inject faults have been reset after the coordinator restart + +select force_mirrors_to_catch_up(); + +-- Check that the tables files don't exist on the coordinator and the standby +:check_files + +-- Check that the coordinator recovery didn't remove files of the tables which +-- were created in subtransactions +table t_sub1; +table t_sub2; + +-- Clean up +drop table t_top, t_sub1, t_sub2; +\unset check_files + + +-- Test case 2 +-- Check that files are left untouched on the coordinator and the standby +-- when the corresponding distributed commit record exists in WAL +select gp_inject_fault('dtm_xlog_distributed_commit', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = -1; + +-- Create tables in a transaction. Get segfault right after the distributed +-- commit record is flushed +\! psql regression -c "begin; select createTables(); commit;" +-- Wait for the coordinator to be recovered +\! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done; +\c regression + +select force_mirrors_to_catch_up(); + +-- Check that all the tables and its indexes files exist +select '\! 
' || + string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) lswc + from ( + select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') + || ' 2>/dev/null | wc -l' lswc + from ( + select unnest(array['t_orphaned_h'::regclass, + 't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass, + 't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass]) + union all + select unnest(array[segrelid, + blkdirrelid, blkdiridxid, + visimaprelid, visimapidxid]) + from pg_catalog.pg_appendonly + where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass) + ) a + ) f, + (select datadir from gp_segment_configuration where content = -1) d +\gset + +:lswc + +-- Check that we can read data from the tables +table t_orphaned_h; +table t_orphaned_r; +table t_orphaned_c; + +-- Clean up +drop table t_orphaned_h, t_orphaned_r, t_orphaned_c; +drop function createTables(); + + +-- Test case 3 +-- Check that orphaned files are not left on segments when the files are +-- created after checkpoint + +create or replace function getTableSegFiles +(t regclass, out gp_contentid smallint, out filepath text) +as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' +language sql +execute on all segments; + +-- Get list of the tables file names on each segment +create or replace function createTables() returns text as +$$ +declare + cmd text; +begin + -- Minimal fillfactor to minimize rows number for creating second main fork file + create table t_orphaned_h(i int) + with (fillfactor=10) + distributed by (i); + -- Create the .1 file. Separate insert to create FSM. 
+ insert into t_orphaned_h select generate_series(1,9000000); + + create table t_orphaned_r(i int) + with (appendonly=true, orientation=row) + distributed by (i); + -- Create the .1 file + insert into t_orphaned_r select generate_series(1,100); + + -- Create the .128 file + create table t_orphaned_c + with (appendonly=true, orientation=column) as + select i as i, i*2 as j from generate_series(1,100) i + distributed by (i); + -- Create the .1 and .129 files + insert into t_orphaned_c + select i as i, i*2 as j from generate_series(1,100) i; + + -- Ensure that the mirrors have applied the filesystem changes + perform force_mirrors_to_catch_up(); + + -- The command do not output PGDATA directories to make it possible to run + -- the test without docker + select '\! ' || + string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) + into cmd + from ( + select gp_contentid, + 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc + from ( + select gp_contentid, filepath || suf f + from getTableSegFiles('t_orphaned_h'), + (values(''), ('.1'), ('_fsm')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_r'), + (values(''), ('.1')) v(suf) + union all + select gp_contentid, filepath || suf + from getTableSegFiles('t_orphaned_c'), + (values(''), ('.1'), ('.128'), ('.129')) v(suf) + ) a + group by gp_contentid + ) f, + (select content, datadir from gp_segment_configuration where content > -1) d + where f.gp_contentid = d.content; + + return cmd; +end +$$ language plpgsql; + +-- Test case 3.1 +-- Segfault on all segments +checkpoint; + +-- Skip checkpoints +select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; + +-- Create tables in subtransactions +begin; +create table t_top(i int) distributed by (i); +savepoint sp1; +create table t_sub1(i int) distributed by (i); +savepoint sp2; +create table t_sub2(i int) distributed by (i); +commit; + +-- Start 
transaction and create tables in it +begin; +select createTables() check_files +\gset + +-- Make sure that all the tables files exist on the segments +:check_files + +-- Get segfault on all segments +select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content != -1; + +select 1 from gp_dist_random('gp_id'); + +-- Rollback the transaction to make it possible to run queries after the error +rollback; + +select force_mirrors_to_catch_up(); + +-- Check that the tables files don't exist on the segments +:check_files + +-- Check that the segments recovery didn't remove files of the tables which +-- were created in subtransactions +table t_sub1; +table t_sub2; + +-- Clean up +drop table t_top, t_sub1, t_sub2; + + +-- Test case 3.2 +-- Segfault on one segment +checkpoint; + +-- Skip checkpoints +select gp_inject_fault_infinite('checkpoint', 'skip', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; + +-- Create tables in subtransactions +begin; +create table t_top(i int) distributed by (i); +savepoint sp1; +create table t_sub1(i int) distributed by (i); +savepoint sp2; +create table t_sub2(i int) distributed by (i); +commit; + +-- Start transaction and create tables in it +begin; +select createTables() check_files +\gset + +-- Make sure that all the tables files exist on the segments +:check_files + +-- Get segfault on a segment +select gp_inject_fault('qe_exec_finished', 'segv', dbid) + from gp_segment_configuration + where role = 'p' and content = 1; + +select 1 from gp_dist_random('gp_id'); + +-- Rollback the transaction to make it possible to run queries after the error +rollback; + +select force_mirrors_to_catch_up(); + +-- Make a checkpoint to remove orphaned files from segments where segfault did +-- not happen +select gp_inject_fault_infinite('checkpoint', 'reset', dbid) + from gp_segment_configuration + where role = 'p' and content > -1; +checkpoint; + +-- Check that the tables 
files don't exist on the segments +:check_files + +-- Check that the segment recovery didn't remove files of the tables which +-- were created in subtransactions +table t_sub1; +table t_sub2; + + +-- Clean up +\unset check_files +drop table t_top, t_sub1, t_sub2; +drop function createTables(); +drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text); +-- start_ignore +\! gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly +\! gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly +\! gpstop -u +-- end_ignore diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 760e74516117..684d8092aceb 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -637,6 +637,13 @@ Form_pg_ts_template Form_pg_type Form_pg_user_mapping FormatNode +FreePageBtree +FreePageBtreeHeader +FreePageBtreeInternalKey +FreePageBtreeLeafKey +FreePageBtreeSearchResult +FreePageManager +FreePageSpanLeader FromCharDateMode FromExpr FuncCall @@ -2132,6 +2139,12 @@ dlist_iter dlist_mutable_iter dlist_node ds_state +dsa_area +dsa_area_control +dsa_area_pool +dsa_area_span +dsa_segment_header +dsa_segment_map dsm_control_header dsm_control_item dsm_handle