diff --git a/.abi-check/6.27.1_arenadata63/postgres.symbols.ignore b/.abi-check/6.27.1_arenadata63/postgres.symbols.ignore
new file mode 100644
index 000000000000..2c3d6c6c1abe
--- /dev/null
+++ b/.abi-check/6.27.1_arenadata63/postgres.symbols.ignore
@@ -0,0 +1,2 @@
+ConfigureNamesBool_gp
+log_smgrcreate
diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c
index 65ff67428611..2a18785431b2 100644
--- a/src/backend/access/rmgrdesc/smgrdesc.c
+++ b/src/backend/access/rmgrdesc/smgrdesc.c
@@ -24,7 +24,7 @@ smgr_desc(StringInfo buf, XLogRecord *record)
 	uint8           info = record->xl_info & ~XLR_INFO_MASK;
 	char            *rec = XLogRecGetData(record);
 
-	if (info == XLOG_SMGR_CREATE)
+	if ((info == XLOG_SMGR_CREATE) || (info == XLOG_SMGR_CREATE_PDL))
 	{
 		xl_smgr_create *xlrec = (xl_smgr_create *) rec;
 		char	   *path = relpathperm(xlrec->rnode, xlrec->forkNum);
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index e68c2b11639d..898f5a9fed78 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -17,6 +17,7 @@
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #include "catalog/pg_control.h"
+#include "catalog/storage_pending_deletes.h"
 #include "utils/guc.h"
 #include "utils/timestamp.h"
 
@@ -233,6 +234,11 @@ xlog_desc(StringInfo buf, XLogRecord *record)
 						 (uint32) xlrec.overwritten_lsn,
 						 timestamptz_to_str(xlrec.overwrite_time));
 	}
+	else if (info == XLOG_PENDING_DELETE)
+	{
+		appendStringInfo(buf, "orphaned relfilenodes to delete: %zu",
+						 ((PendingRelXactDeleteArray *)rec)->count);
+	}
 	else
 		appendStringInfoString(buf, "UNKNOWN");
 }
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 8e6f3a4daf20..c99a8ae10f98 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -56,6 +56,7 @@
 #include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "catalog/storage.h"
+#include "catalog/storage_pending_deletes_redo.h"
 #include "catalog/storage_tablespace.h"
 #include "catalog/storage_database.h"
 #include "funcapi.h"
@@ -2459,3 +2460,85 @@ getTwoPhaseOldestPreparedTransactionXLogRecPtr(prepared_transaction_agg_state *p
 	return oldest;
 
 }  /* end getTwoPhaseOldestPreparedTransactionXLogRecPtr */
+
+/*
+ * RemovePendingDeletesForPreparedTransactions
+ *
+ * For every entry of crashRecoverPostCheckpointPreparedTransactions_map_ht
+ * that carries a valid WAL pointer, re-read the referenced record, take the
+ * TwoPhaseFileHeader from its data and call PdlRedoRemoveTree() for the
+ * transaction and its subtransactions.
+ *
+ * Returns true if the map does not exist or every entry was processed.
+ * Returns false (after writing a LOG message) as soon as XLogReadRecord()
+ * raises an error for some entry; the remaining entries are not visited.
+ * A record that is read without raising an error but comes back as NULL is
+ * reported with ereport(ERROR).
+ */
+bool
+RemovePendingDeletesForPreparedTransactions()
+{
+	HASH_SEQ_STATUS scan_status;
+	prpt_map   *entry;
+	XLogReaderState *xlogreader;
+	volatile bool result = true;
+	XLogRecord *xlogrec = NULL;
+	MemoryContext oldcontext = CurrentMemoryContext;
+
+	if (NULL == crashRecoverPostCheckpointPreparedTransactions_map_ht)
+		return result;
+
+	xlogreader = XLogReaderAllocate(&read_local_xlog_page, NULL);
+	if (!xlogreader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory"),
+		   errdetail("Failed while allocating an XLog reading processor.")));
+
+	hash_seq_init(&scan_status,
+				  crashRecoverPostCheckpointPreparedTransactions_map_ht);
+	while ((entry = (prpt_map *) hash_seq_search(&scan_status)) != NULL)
+	{
+		char	   *errormsg = NULL;
+		TwoPhaseFileHeader *hdr;
+
+		if (entry->xlogrecptr == InvalidXLogRecPtr)
+			continue;
+
+		/*
+		 * An error thrown while reading the record is swallowed here and
+		 * turned into a "false" result; restore the state that the error
+		 * path may have left modified.
+		 */
+		int savedInterruptHoldoffCount = InterruptHoldoffCount;
+		PG_TRY();
+		{
+			xlogrec = XLogReadRecord(xlogreader, entry->xlogrecptr, &errormsg);
+		}
+		PG_CATCH();
+		{
+			MemoryContextSwitchTo(oldcontext);
+			InterruptHoldoffCount = savedInterruptHoldoffCount;
+			FlushErrorState();
+			result = false;
+		}
+		PG_END_TRY();
+
+		if (!result)
+		{
+			elog(LOG, "Failed to read WAL record %X/%X for XID %u in %s",
+				 (uint32) (entry->xlogrecptr >> 32),
+				 (uint32) entry->xlogrecptr,
+				 entry->xid,
+				 __func__);
+
+			/*
+			 * The scan is abandoned before hash_seq_search() returned NULL,
+			 * so it has to be terminated explicitly; otherwise the scan
+			 * stays registered in dynahash.
+			 */
+			hash_seq_term(&scan_status);
+			break;
+		}
+
+		if (NULL == xlogrec)
+		{
+			if (errormsg)
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("xlog record is invalid"),
+						 errdetail("%s", errormsg)));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("xlog record is invalid")));
+		}
+
+		hdr = (TwoPhaseFileHeader *) XLogRecGetData(xlogrec);
+
+		/* The subxact XID array, if any, follows the aligned header. */
+		TransactionId *subxids = (hdr->nsubxacts > 0) ?
+			(TransactionId *)
+				((char *) hdr + MAXALIGN(sizeof(TwoPhaseFileHeader))) :
+			NULL;
+
+		PdlRedoRemoveTree(hdr->xid, subxids, hdr->nsubxacts);
+	}
+
+	XLogReaderFree(xlogreader);
+
+	return result;
+}  /* end RemovePendingDeletesForPreparedTransactions */
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 618ce3f536ae..c9dad916f323 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -32,6 +32,7 @@
 #include "catalog/namespace.h"
 #include "catalog/oid_dispatch.h"
 #include "catalog/storage.h"
+#include "catalog/storage_pending_deletes_redo.h"
 #include "catalog/storage_tablespace.h"
 #include "catalog/storage_database.h"
 #include "commands/async.h"
@@ -6223,6 +6224,8 @@ xact_redo_commit_internal(TransactionId xid, XLogRecPtr lsn,
 
 	DoTablespaceDeletionForRedoXlog(tablespace_oid_to_delete);
 
+	PdlRedoRemoveTree(xid, sub_xids, nsubxacts);
+
 	/*
 	 * We issue an XLogFlush() for the same reason we emit ForceSyncCommit()
 	 * in normal operation. For example, in CREATE DATABASE, we copy all files
@@ -6380,6 +6383,8 @@ xact_redo_distributed_commit(xl_xact_commit *xlrec, TransactionId xid)
 		DropRelationFiles(xlrec->xnodes, xlrec->nrels, true);
 		DropDatabaseDirectories(deldbs, xlrec->ndeldbs, true);
 		DoTablespaceDeletionForRedoXlog(xlrec->tablespace_oid_to_delete_on_commit);
+
+		PdlRedoRemoveTree(xid, sub_xids, xlrec->nsubxacts);
 	}
 
 	/*
@@ -6455,6 +6460,8 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
 	DropRelationFiles(xlrec->xnodes, xlrec->nrels, true);
 	DropDatabaseDirectories(deldbs, xlrec->ndeldbs, true);
 	DoTablespaceDeletionForRedoXlog(xlrec->tablespace_oid_to_delete_on_abort);
+
+	PdlRedoRemoveTree(xid, sub_xids, xlrec->nsubxacts);
 }
 
 static void
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 31a97a196adb..a5ec3a9e31c3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -36,6 +36,7 @@
 #include "catalog/catversion.h"
 #include "catalog/pg_control.h"
 #include "catalog/pg_database.h"
+#include "catalog/storage_pending_deletes_redo.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
@@ -7451,6 +7452,40 @@ StartupXLOG(void)
 					TimeLineID	newTLI = ThisTimeLineID;
 					TimeLineID	prevTLI = ThisTimeLineID;
 
+					if ((info == XLOG_CHECKPOINT_SHUTDOWN) ||
+						(info == XLOG_END_OF_RECOVERY))
+					{
+						/*
+						 * At this point we may encounter a situation when some
+						 * prepared transaction is not yet committed/aborted,
+						 * but the respective WAL segment file is already
+						 * recycled. It may happen in some corner cases, like:
+						 * 1. Primary successfully performs Prepare for a
+						 * transaction;
+						 * 2. Primary stops responding and Mirror is promoted;
+						 * 3. New Primary (ex Mirror) commits the transaction;
+						 * 4. New Primary (ex Mirror) recycles WAL segment with
+						 * the Prepare record (because both Primary and Mirror
+						 * have done the Prepare);
+						 * 5. Ex Primary is recovered as new Mirror, it has
+						 * the transaction in the list of prepared transactions,
+						 * but doesn't have the WAL segment. And the new Mirror
+						 * should soon see the commit REDO record from the new
+						 * Primary (and remove the transaction from the list of
+						 * prepared transactions).
+						 *
+						 * In such a case
+						 * RemovePendingDeletesForPreparedTransactions() will
+						 * return FALSE, and we postpone the removal of orphaned
+						 * files until all such prepared transactions without
+						 * WAL segment files are wiped out from the list of
+						 * prepared transactions.
+						 */
+						if (RemovePendingDeletesForPreparedTransactions())
+							/* Clean up orphaned files */
+							PdlRedoDropFiles();
+					}
+
 					if (info == XLOG_CHECKPOINT_SHUTDOWN)
 					{
 						CheckPoint	checkPoint;
@@ -7969,6 +8004,21 @@ StartupXLOG(void)
 
 		UtilityModeCloseDtmRedoFile();
 
+		/*
+		 * By this moment, there shouldn't be any prepared transaction whose
+		 * respective WAL segment file is missing, meaning
+		 * RemovePendingDeletesForPreparedTransactions() should return TRUE.
+		 * If not, most likely the respective WAL segment file was recycled
+		 * illegally, and we do not perform orphaned files removal (as we might
+		 * remove something that is already committed). Instead, we emit a warning.
+		 */
+		if (RemovePendingDeletesForPreparedTransactions())
+			/* Clean up orphaned files */
+			PdlRedoDropFiles();
+		else
+			ereport(WARNING, (errmsg(
+					"Couldn't drop orphaned files")));
+
 		/*
 		 * And finally, execute the recovery_end_command, if any.
 		 */
@@ -9316,6 +9366,9 @@ CreateCheckPoint(int flags)
 	 */
 	getDtxCheckPointInfo(&dtxCheckPointInfo, &dtxCheckPointInfoSize);
 
+	if (!shutdown)
+		PdlXLogInsert();
+
 	CheckPointGuts(checkPoint.redo, flags);
 
 	/*
@@ -10782,6 +10835,10 @@ xlog_redo(XLogRecPtr beginLoc __attribute__((unused)), XLogRecPtr lsn __attribut
 		/* Keep track of full_page_writes */
 		lastFullPageWrites = fpw;
 	}
+	else if (info == XLOG_PENDING_DELETE)
+	{
+		PdlRedoXLogRecord(record);
+	}
 }
 
 /*
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index a766ce6a0ca6..2e6cdc3b4e1d 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -23,7 +23,8 @@ OBJS += pg_exttable.o pg_extprotocol.o \
        pg_attribute_encoding.o pg_compression.o aovisimap.o \
        pg_appendonly.o \
        oid_dispatch.o aocatalog.o storage_tablespace.o storage_database.o \
-       storage_tablespace_twophase.o storage_tablespace_xact.o
+       storage_tablespace_twophase.o storage_tablespace_xact.o \
+       storage_pending_deletes_redo.o storage_pending_deletes.o
 
 
 BKIFILES = postgres.bki postgres.description postgres.shdescription
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 773d3bc32d9f..88a96cf04c5e 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1895,7 +1895,9 @@ heap_create_init_fork(Relation rel)
 {
 	RelationOpenSmgr(rel);
 	smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
-	log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
+	log_smgrcreate(&rel->rd_smgr->smgr_rnode.node,
+				   INIT_FORKNUM,
+				   rel->rd_rel->relstorage);
 	smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
 }
 
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 20c2da39157c..bf03d1829208 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -19,11 +19,13 @@
 
 #include "postgres.h"
 
+#include "access/transam.h"
 #include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/storage.h"
+#include "catalog/storage_pending_deletes_redo.h"
 #include "catalog/storage_xlog.h"
 #include "common/relpath.h"
 #include "commands/dbcommands.h"
@@ -57,11 +59,21 @@ typedef struct PendingRelDelete
 	RelFileNodePendingDelete relnode;		/* relation that may need to be deleted */
 	bool		atCommit;		/* T=delete at commit; F=delete at abort */
 	int			nestLevel;		/* xact nesting level of request */
+	dsa_pointer shmemPtr;		/* ptr to shared pending delete list node */
 	struct PendingRelDelete *next;		/* linked-list link */
 } PendingRelDelete;
 
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
 
+/*
+ * Release a local pending-delete list entry.  If the entry has a
+ * counterpart in the shared pending deletes list, that node is removed
+ * first.
+ */
+static void
+PendingRelDeleteFree(PendingRelDelete *pending)
+{
+	dsa_pointer shared_node;
+
+	Assert(pending != NULL);
+
+	shared_node = pending->shmemPtr;
+	if (DsaPointerIsValid(shared_node))
+		PdlShmemRemove(shared_node);
+
+	pfree(pending);
+}
+
 /*
  * RelationCreateStorage
  *		Create physical storage for a relation.
@@ -80,6 +92,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, char relstorage)
 	SMgrRelation srel;
 	BackendId	backend;
 	bool		needs_wal;
+	TransactionId xid = InvalidTransactionId;
 
 	switch (relpersistence)
 	{
@@ -104,7 +117,14 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, char relstorage)
 	smgrcreate(srel, MAIN_FORKNUM, false);
 
 	if (needs_wal)
-		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
+	{
+		/* 
+		 * Call GetCurrentTransactionId before log_smgrcreate, because
+		 * XLOG_SMGR_CREATE_PDL WAL record should be always linked to XID
+		 */
+		xid = GetCurrentTransactionId();
+		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM, relstorage);
+	}
 
 	/* Add the relation to the list of stuff to delete at abort */
 	pending = (PendingRelDelete *)
@@ -115,30 +135,34 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, char relstorage)
 	pending->atCommit = false;	/* delete if abort */
 	pending->nestLevel = GetCurrentTransactionNestLevel();
 	pending->next = pendingDeletes;
+	pending->shmemPtr = PdlShmemAdd(&pending->relnode, xid);
 	pendingDeletes = pending;
 }
 
 /*
- * Perform XLogInsert of a XLOG_SMGR_CREATE record to WAL.
+ * Perform XLogInsert of a XLOG_SMGR_CREATE_PDL record to WAL.
+ *
+ * rnode and forkNum identify the created file; relstorage is stored in the
+ * record next to them.  The record is flushed before the function returns.
  */
 void
-log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
+log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum, char relstorage)
 {
-	xl_smgr_create xlrec;
+	xl_smgr_create_pdl xlrec;
 	XLogRecData rdata;
 
 	/*
 	 * Make an XLOG entry reporting the file creation.
 	 */
-	xlrec.rnode = *rnode;
-	xlrec.forkNum = forkNum;
+	xlrec.createrec.rnode = *rnode;
+	xlrec.createrec.forkNum = forkNum;
+	xlrec.relstorage = relstorage;
 
 	rdata.data = (char *) &xlrec;
 	rdata.len = sizeof(xlrec);
 	rdata.buffer = InvalidBuffer;
 	rdata.next = NULL;
 
-	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata);
+	XLogRecPtr recptr = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE_PDL, &rdata);
+
+	/*
+	 * NOTE(review): presumably the flush guarantees the create record is on
+	 * disk before the caller proceeds, so recovery always learns about the
+	 * file - confirm the intended durability requirement.
+	 */
+	XLogFlush(recptr);
 }
 
 /*
@@ -159,6 +183,7 @@ RelationDropStorage(Relation rel)
 	pending->atCommit = true;	/* delete if commit */
 	pending->nestLevel = GetCurrentTransactionNestLevel();
 	pending->next = pendingDeletes;
+	pending->shmemPtr = InvalidDsaPointer;
 	pendingDeletes = pending;
 
 	/*
@@ -210,7 +235,7 @@ RelationPreserveStorage(RelFileNode rnode, bool atCommit)
 				prev->next = next;
 			else
 				pendingDeletes = next;
-			pfree(pending);
+			PendingRelDeleteFree(pending);
 			/* prev does not change */
 		}
 		else
@@ -366,7 +391,7 @@ smgrDoPendingDeletes(bool isCommit)
 				srels[nrels++] = srel;
 			}
 			/* must explicitly free the list entry */
-			pfree(pending);
+			PendingRelDeleteFree(pending);
 			/* prev does not change */
 		}
 	}
@@ -467,7 +492,7 @@ PostPrepare_smgr(void)
 		next = pending->next;
 		pendingDeletes = next;
 		/* must explicitly free the list entry */
-		pfree(pending);
+		PendingRelDeleteFree(pending);
 	}
 }
 
@@ -518,6 +543,30 @@ smgr_redo(XLogRecPtr beginLoc, XLogRecPtr lsn, XLogRecord *record)
 		reln = smgropen(xlrec->rnode, InvalidBackendId);
 		smgrcreate(reln, xlrec->forkNum, true);
 	}
+	else if (info == XLOG_SMGR_CREATE_PDL)
+	{
+		xl_smgr_create_pdl *xlrec =
+			(xl_smgr_create_pdl *) XLogRecGetData(record);
+		PendingRelXactDelete pd =
+		{
+			.relnode =
+			{
+				.node = xlrec->createrec.rnode,
+				/*
+				 * Temp relations are not logged in WAL, so it is always false
+				 * here.
+				 */
+				.isTempRelation = false,
+				.relstorage = xlrec->relstorage
+			},
+			.xid = record->xl_xid
+		};
+
+		SMgrRelation reln = smgropen(xlrec->createrec.rnode, InvalidBackendId);
+		smgrcreate(reln, xlrec->createrec.forkNum, true);
+
+		PdlRedoAdd(&pd);
+	}
 	else if (info == XLOG_SMGR_TRUNCATE)
 	{
 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
diff --git a/src/backend/catalog/storage_pending_deletes.c b/src/backend/catalog/storage_pending_deletes.c
new file mode 100644
index 000000000000..bf1d4573f77f
--- /dev/null
+++ b/src/backend/catalog/storage_pending_deletes.c
@@ -0,0 +1,303 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes.c
+ *	  code to support collecting of pending deletes from backends
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ *	  src/backend/catalog/storage_pending_deletes.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/storage_pending_deletes.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/dsa.h"
+#include "utils/guc.h"
+
+/*
+ * One element of a backend's pending deletes list.  Nodes are allocated
+ * from the DSA area and linked in both directions through dsa_pointers.
+ */
+typedef struct PendingDeleteListNode
+{
+	PendingRelXactDelete xrelnode;	/* relfilenode plus the xid that created it */
+	dsa_pointer next;			/* next node, or InvalidDsaPointer */
+	dsa_pointer prev;			/* previous node, or InvalidDsaPointer for head */
+}	PendingDeleteListNode;
+
+/* Head of the pending deletes list of a single backend. */
+typedef struct PendingDeletesList
+{
+	LWLock	   *lock;			/* protects the list */
+	dsa_pointer head;			/* ptr to PendingDeleteListNode list head */
+}	PendingDeletesList;
+
+/*
+ * Top-level shared structure: a pointer to the separately allocated array of
+ * per-backend lists, followed by the memory used for the in-place DSA area.
+ */
+typedef struct BackendsPendingDeletesArray
+{
+	PendingDeletesList *array;	/* MaxBackends lists, see PdlShmemInit() */
+	char		dsa_mem[FLEXIBLE_ARRAY_MEMBER];	/* in-place DSA area */
+}	BackendsPendingDeletesArray;
+
+static BackendsPendingDeletesArray *BackendsPendingDeletes = NULL;
+
+/*
+ * Shared-memory tracking of pending deletes is active only outside of
+ * bootstrap mode, with gp_track_pending_delete switched on and with a
+ * dynamic shared memory implementation other than DSM_IMPL_NONE.
+ */
+static inline bool
+is_tracking_enabled()
+{
+	if (IsBootstrapProcessingMode())
+		return false;
+
+	if (!gp_track_pending_delete)
+		return false;
+
+	return dynamic_shared_memory_type != DSM_IMPL_NONE;
+}
+
+/*
+ * Bytes needed for BackendsPendingDeletesArray: the fixed header up to
+ * dsa_mem plus the minimum size of a DSA area placed in dsa_mem.
+ */
+static inline Size
+PdlStructSize(void)
+{
+	const Size	header_bytes = offsetof(BackendsPendingDeletesArray, dsa_mem);
+	const Size	dsa_bytes = dsa_minimum_size();
+
+	return add_size(header_bytes, dsa_bytes);
+}
+
+/* Bytes needed for the array holding one PendingDeletesList per backend */
+static inline Size
+PdlListArraySize(void)
+{
+	const Size	per_backend = sizeof(PendingDeletesList);
+
+	return mul_size(per_backend, MaxBackends);
+}
+
+/*
+ * Report how much shared memory the pending deletes machinery needs:
+ * nothing when gp_track_pending_delete is off, otherwise the top-level
+ * structure (which embeds the DSA area) plus the per-backend list array.
+ */
+Size
+PdlShmemSize(void)
+{
+	Size		total = 0;
+
+	if (gp_track_pending_delete)
+		total = add_size(PdlStructSize(), PdlListArraySize());
+
+	return total;
+}
+
+/*
+ * Initialize shared memory pending delete lists for all backends.
+ *
+ * Does nothing when tracking is not enabled.  Otherwise it creates (or
+ * finds) the "Pending deletes array" structure; when the structure is newly
+ * created it also allocates the per-backend list array, gives every list an
+ * LWLock and an empty head, and creates the DSA area in place inside
+ * dsa_mem.
+ */
+void
+PdlShmemInit(void)
+{
+	if (!is_tracking_enabled())
+		return;
+
+	bool		found;
+
+	BackendsPendingDeletes = (BackendsPendingDeletesArray *)
+		ShmemInitStruct("Pending deletes array", PdlStructSize(), &found);
+	/* Already initialized: only the pointer had to be set up. */
+	if (found)
+		return;
+
+	BackendsPendingDeletes->array = (PendingDeletesList *)
+		ShmemAlloc(PdlListArraySize());
+	if (BackendsPendingDeletes->array == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("Not enough memory to create pending deletes lists.")));
+	
+	/* Every list starts empty and gets its own lock. */
+	for (int i = 0; i < MaxBackends; i++)
+		BackendsPendingDeletes->array[i] = (PendingDeletesList)
+		{
+			.head = InvalidDsaPointer,
+			.lock = LWLockAssign()
+		};
+
+	dsa_area   *dsa = dsa_create_in_place(
+						 BackendsPendingDeletes->dsa_mem, dsa_minimum_size(),
+						 LWLockNewTrancheId(), "storage_pending_deletes", NULL);
+
+	/*
+	 * The creating process only sets the area up: register the release
+	 * callback for its reference and detach the local handle.  Users attach
+	 * on demand later.
+	 */
+	on_shmem_exit(dsa_on_shmem_exit_release_in_place,
+				  (Datum) BackendsPendingDeletes->dsa_mem);
+	dsa_detach(dsa);
+}
+
+/*
+ * Return the pending deletes list owned by the current backend.
+ *
+ * Valid backend ids are numbered from 1, while the list array allocated in
+ * PdlShmemInit() has MaxBackends slots numbered from 0 (and
+ * PdlXLogShmemDump() scans exactly slots 0 .. MaxBackends - 1).  Indexing
+ * the array with MyBackendId directly would leave slot 0 unused and address
+ * one element past the end for the last backend, hence the "- 1".
+ *
+ * Callers must have checked that MyBackendId is not InvalidBackendId.
+ */
+static inline PendingDeletesList *
+PdlMyList(void)
+{
+	Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
+
+	return &BackendsPendingDeletes->array[MyBackendId - 1];
+}
+
+/*
+ * Process-exit callback registered by PdlAttachDsa().
+ *
+ * Releases this process' reference to the in-place DSA area.  The backend's
+ * list is expected to be empty by now; if it is not, that is an assertion
+ * failure in debug builds and a WARNING (followed by resetting the list
+ * head) in release builds.
+ */
+static void
+pdl_beshutdown_hook(int code, Datum arg)
+{
+	dsa_release_in_place(BackendsPendingDeletes->dsa_mem);
+
+	/* Processes without a backend id own no list. */
+	if (MyBackendId == InvalidBackendId)
+		return;
+
+	PendingDeletesList *list = PdlMyList();
+
+	if (!DsaPointerIsValid(list->head))
+		return;
+
+	/* Assert on debug build and warning on release */
+	Assert(false);
+	ereport(WARNING,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("Pending deletes list is not empty. "
+					"MyBackend: %d, MyProcPid: %d", MyBackendId, MyProcPid)));
+	list->head = InvalidDsaPointer;
+}
+
+/*
+ * Attach DSA once per process.
+ *
+ * The first call attaches to the in-place area, pins the mapping and
+ * registers pdl_beshutdown_hook(); later calls return the cached handle.
+ */
+static dsa_area *
+PdlAttachDsa(void)
+{
+	static dsa_area *dsa = NULL;	/* ptr to DSA area attached by
+									 * current process */
+
+	if (dsa)
+		return dsa;
+
+	/*
+	 * Keep the DSA area ptr in TopMemoryContext to avoid excessive
+	 * attach/detach at every add/remove
+	 */
+	MemoryContext oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+
+	dsa = dsa_attach_in_place(BackendsPendingDeletes->dsa_mem, NULL);
+	MemoryContextSwitchTo(oldcxt);
+
+	/* pin mappings, so they can survive res owner life end */
+	dsa_pin_mapping(dsa);
+
+	on_shmem_exit(pdl_beshutdown_hook, 0);
+
+	return dsa;
+}
+
+/*
+ * Add pending delete node to the list of current backend.
+ * Return DSA ptr of a created node. This ptr can be passed to PdlShmemRemove.
+ *
+ * Returns InvalidDsaPointer without doing anything when tracking is not
+ * enabled, when xid is InvalidTransactionId or when the process has no
+ * backend id.  Raises ERROR if the node cannot be allocated.
+ */
+dsa_pointer
+PdlShmemAdd(const RelFileNodePendingDelete * relnode, TransactionId xid)
+{
+	if (!is_tracking_enabled() || xid == InvalidTransactionId ||
+		MyBackendId == InvalidBackendId)
+		return InvalidDsaPointer;
+
+	PendingDeleteListNode *node;
+	dsa_area   *dsa = PdlAttachDsa();
+	const dsa_pointer node_dsa = dsa_allocate(dsa, sizeof(*node));
+
+	if (!DsaPointerIsValid(node_dsa))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("Not enough memory to add pending delete node. "
+				   "MyBackend: %d, MyProcPid: %d", MyBackendId, MyProcPid)));
+
+	node = dsa_get_address(dsa, node_dsa);
+	*node = (PendingDeleteListNode)
+	{
+		.xrelnode =
+		{
+			.relnode = *relnode,
+			.xid = xid
+		},
+		.prev = InvalidDsaPointer
+	};
+
+	PendingDeletesList *list = PdlMyList();
+
+	/* Push the new node in front of the current head. */
+	LWLockAcquire(list->lock, LW_EXCLUSIVE);
+	node->next = list->head;
+	if (DsaPointerIsValid(node->next))
+	{
+		PendingDeleteListNode *next_node = (PendingDeleteListNode *)
+			dsa_get_address(dsa, node->next);
+
+		next_node->prev = node_dsa;
+	}
+	list->head = node_dsa;
+	LWLockRelease(list->lock);
+
+	return node_dsa;
+}
+
+/*
+ * Remove pending delete node from the list of current backend.
+ * node_ptr is a ptr to already added node (see PdlShmemAdd)
+ *
+ * The node is unlinked under the list lock and then returned to the DSA
+ * area.  Does nothing when tracking is not enabled or the process has no
+ * backend id.
+ */
+void
+PdlShmemRemove(dsa_pointer node_ptr)
+{
+	if (!is_tracking_enabled() || MyBackendId == InvalidBackendId)
+		return;
+
+	Assert(DsaPointerIsValid(node_ptr));
+
+	dsa_area   *dsa = PdlAttachDsa();
+	PendingDeletesList *list = PdlMyList();
+	const PendingDeleteListNode *node = dsa_get_address(dsa, node_ptr);
+
+	LWLockAcquire(list->lock, LW_EXCLUSIVE);
+	if (DsaPointerIsValid(node->next))
+	{
+		PendingDeleteListNode *next_node = dsa_get_address(dsa, node->next);
+
+		next_node->prev = node->prev;
+	}
+
+	if (DsaPointerIsValid(node->prev))
+	{
+		PendingDeleteListNode *prev_node = dsa_get_address(dsa, node->prev);
+
+		prev_node->next = node->next;
+	}
+	else
+		list->head = node->next;	/* the node was the list head */
+
+	LWLockRelease(list->lock);
+
+	dsa_free(dsa, node_ptr);
+}
+
+/*
+ * Collect info about pending deletes from all backends and return
+ * the accumulated result. Return NULL if there are no nodes in the lists
+ * or if the shared structures were never set up.
+ * Note: the returned result is palloc'ed. Caller is responsible for
+ * freeing it.
+ *
+ * Each backend's list is copied while holding that list's lock in shared
+ * mode; the lists are visited one after another, so the result is not a
+ * single atomic snapshot across backends.
+ */
+PendingRelXactDeleteArray *
+PdlXLogShmemDump(void)
+{
+	/*
+	 * PdlShmemInit() leaves BackendsPendingDeletes unset when tracking is
+	 * not enabled (e.g. dynamic_shared_memory_type is DSM_IMPL_NONE), while
+	 * callers may only have checked the GUC.  There is nothing to dump then,
+	 * and attaching would dereference a NULL pointer.
+	 */
+	if (BackendsPendingDeletes == NULL)
+		return NULL;
+
+	dsa_area   *dsa = PdlAttachDsa();
+	PendingRelXactDeleteArray *ret = NULL;
+	Size		size = offsetof(PendingRelXactDeleteArray, array);
+	Size		step = sizeof(*ret->array) * 32;
+
+	for (int i = 0; i < MaxBackends; i++)
+	{
+		PendingDeletesList *list = &BackendsPendingDeletes->array[i];
+
+		LWLockAcquire(list->lock, LW_SHARED);
+
+		for (dsa_pointer pdl_node_dsa = list->head;
+			 DsaPointerIsValid(pdl_node_dsa);)
+		{
+			const PendingDeleteListNode *pdl_node = dsa_get_address(dsa,
+															   pdl_node_dsa);
+
+			if (ret == NULL)
+			{
+				/* First node seen: allocate room for the initial batch. */
+				size += step;
+				ret = palloc(size);
+				ret->count = 0;
+			}
+			else if (PdlDumpSize(ret->count + 1) > size)
+			{
+				/* Out of room: grow by a step that doubles every time. */
+				step *= 2;
+				size += step;
+				ret = repalloc(ret, size);
+			}
+
+			ret->array[ret->count++] = pdl_node->xrelnode;
+			pdl_node_dsa = pdl_node->next;
+		}
+
+		LWLockRelease(list->lock);
+	}
+
+	return ret;
+}
diff --git a/src/backend/catalog/storage_pending_deletes_redo.c b/src/backend/catalog/storage_pending_deletes_redo.c
new file mode 100644
index 000000000000..59d7505451d2
--- /dev/null
+++ b/src/backend/catalog/storage_pending_deletes_redo.c
@@ -0,0 +1,345 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes_redo.c
+ *	  code to support processing of pending deletes (orphaned files) in WAL
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ *	  src/backend/catalog/storage_pending_deletes_redo.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/transam.h"
+#include "catalog/storage_pending_deletes_redo.h"
+#include "miscadmin.h"
+#include "storage/md.h"
+#include "utils/elog.h"
+#include "utils/guc.h"
+#include "utils/hsearch.h"
+
+/*
+ * HTAB entry for pending deletes for the given xid.
+ *
+ * xid is the hash key (the table is created with
+ * keysize = sizeof(TransactionId)), so it has to stay the first member.
+ */
+typedef struct PendingDeleteHtabNode
+{
+	TransactionId xid;			/* hash key: xid that created the relfilenodes */
+	List	   *relnode_list;	/* list of RelFileNodePendingDelete */
+}	PendingDeleteHtabNode;
+
+/*
+ * Hash table for pending deletes relfilenodes for a given xid.
+ */
+static HTAB *pendingDeletesRedo = NULL;
+
+/*
+ * Redo-side tracking is off in bootstrap mode and whenever
+ * gp_track_pending_delete is switched off.
+ */
+static bool
+PdlTrackingDisabled()
+{
+	if (IsBootstrapProcessingMode())
+		return true;
+
+	return !gp_track_pending_delete;
+}
+
+/*
+ * Write an XLOG_PENDING_DELETE record holding the pending deletes currently
+ * registered by all backends, and flush it.  Nothing is written when
+ * tracking is disabled or when there are no pending deletes to report.
+ */
+void
+PdlXLogInsert()
+{
+	PendingRelXactDeleteArray *dump;
+	XLogRecData rdata = {0};
+	XLogRecPtr	recptr;
+
+	if (PdlTrackingDisabled())
+		return;
+
+	dump = PdlXLogShmemDump();
+	if (dump == NULL)
+		return;
+
+	/* The whole dump is the record payload. */
+	rdata.data = (char *) dump;
+	rdata.len = PdlDumpSize(dump->count);
+	rdata.buffer = InvalidBuffer;
+	rdata.buffer_std = false;
+	rdata.next = NULL;
+
+	recptr = XLogInsert(RM_XLOG_ID, XLOG_PENDING_DELETE, &rdata);
+	XLogFlush(recptr);
+
+	elog(DEBUG1, "Pending delete XLog record inserted");
+
+	pfree(dump);
+}
+
+/*
+ * This function adds pending delete node to a pendingDeletesRedo hash-table
+ * during WAL redo processing.
+ *
+ * The hash table is created on first use and is keyed by pd->xid; the
+ * relfilenode is copied and appended to the list kept for that xid.
+ * Nothing is done when tracking is disabled or pd->xid is
+ * InvalidTransactionId.
+ */
+void
+PdlRedoAdd(PendingRelXactDelete * pd)
+{
+	Assert(pd);
+
+	if (PdlTrackingDisabled() || (pd->xid == InvalidTransactionId))
+		return;
+
+	if (NULL == pendingDeletesRedo)
+	{
+		/*
+		 * The key is a binary TransactionId, so a binary hash function has
+		 * to be requested explicitly.  Without HASH_FUNCTION, hash_create()
+		 * on this code base falls back to string hashing and string
+		 * comparison of keys, which makes distinct xids containing zero
+		 * bytes share one entry.
+		 */
+		HASHCTL		info =
+		{
+			.keysize = sizeof(TransactionId),
+			.entrysize = sizeof(PendingDeleteHtabNode),
+			.hash = tag_hash
+		};
+
+		pendingDeletesRedo = hash_create("pendingDeletesRedo hash",
+										 32,
+										 &info,
+										 HASH_ELEM | HASH_FUNCTION);
+	}
+
+	bool		found = false;
+
+	PendingDeleteHtabNode *entry = (PendingDeleteHtabNode *)
+		hash_search(pendingDeletesRedo, &pd->xid, HASH_ENTER, &found);
+
+	if (!found)
+	{
+		/* New entry: only the key is set by hash_search(). */
+		entry->xid = pd->xid;
+		entry->relnode_list = NIL;
+	}
+
+	RelFileNodePendingDelete *data = (RelFileNodePendingDelete *)
+		palloc(sizeof(*data));
+
+	*data = pd->relnode;
+	entry->relnode_list = lappend(entry->relnode_list, data);
+}
+
+/*
+ * This function replays XLOG_PENDING_DELETE xlog record.
+ *
+ * Every element of the record is added to the pendingDeletesRedo hash
+ * table, but only after its transaction status has been double-checked.
+ *
+ * The record is built without aggressive locking of the shared lists (to
+ * avoid slowing down concurrent commits and aborts), so commit or abort
+ * records inserted concurrently with the checkpoint may make the recorded
+ * list out of date.  Therefore an element is accepted only if its
+ * transaction is still TRANSACTION_STATUS_IN_PROGRESS, i.e. its files may
+ * be orphaned; for any other status the element is left alone.
+ *
+ * Elements whose xid precedes the current oldestXid are skipped as well:
+ * clog might have been truncated between the REDO point and the replay of
+ * this record (unlikely, but possible), so the status of such transactions
+ * cannot be relied upon.  Since clog is not truncated while a transaction
+ * is in progress, such a transaction has already committed or aborted.
+ */
+void
+PdlRedoXLogRecord(XLogRecord *record)
+{
+	PendingRelXactDeleteArray *dump;
+	TransactionId horizon;
+
+	Assert(record);
+
+	if (PdlTrackingDisabled())
+		return;
+
+	dump = (PendingRelXactDeleteArray *) XLogRecGetData(record);
+	horizon = ShmemVariableCache->oldestXid;
+
+	Assert(dump->count);
+
+	for (int idx = 0; idx < dump->count; idx++)
+	{
+		PendingRelXactDelete *item = &(dump->array[idx]);
+		XLogRecPtr	status_lsn;
+		XidStatus	xid_status;
+
+		/* Behind the freeze horizon: clog status is not trustworthy. */
+		if (TransactionIdPrecedes(item->xid, horizon))
+		{
+			ereport(LOG, (errmsg(
+					"Prevented adding node for XLOG_PENDING_DELETE "
+					"record for xid: %u, oldestXid: %u",
+					item->xid, horizon)));
+			continue;
+		}
+
+		/* Already finished transactions take care of their own files. */
+		xid_status = TransactionIdGetStatus(item->xid, &status_lsn);
+		if (xid_status != TRANSACTION_STATUS_IN_PROGRESS)
+		{
+			ereport(LOG, (errmsg(
+					"Prevented adding node for XLOG_PENDING_DELETE "
+					"record for xid: %u, status: %d",
+					item->xid, xid_status)));
+			continue;
+		}
+
+		PdlRedoAdd(item);
+	}
+}
+
+/*
+ * Forget the pending deletes recorded for a single xid: remove its entry
+ * from pendingDeletesRedo (if there is one) and free the relfilenode list.
+ */
+static void
+PdlRedoRemove(TransactionId xid)
+{
+	PendingDeleteHtabNode *removed;
+
+	if (xid == InvalidTransactionId)
+		return;
+
+	if (NULL == pendingDeletesRedo)
+		return;
+
+	removed = (PendingDeleteHtabNode *)
+		hash_search(pendingDeletesRedo, &xid, HASH_REMOVE, NULL);
+
+	if (removed != NULL)
+		list_free_deep(removed->relnode_list);
+}
+
+/*
+ * Drop the redo hash-table (pendingDeletesRedo) entries of a whole
+ * transaction tree: first the nsubxacts sub-transactions listed in
+ * sub_xids, then the top-level transaction xid itself.
+ */
+void
+PdlRedoRemoveTree(TransactionId xid,
+				  TransactionId *sub_xids, int nsubxacts)
+{
+	int			idx = 0;
+
+	if (PdlTrackingDisabled())
+		return;
+
+	while (idx < nsubxacts)
+	{
+		PdlRedoRemove(sub_xids[idx]);
+		idx++;
+	}
+
+	PdlRedoRemove(xid);
+}
+
+/*
+ * This function serializes the contents of hash table entry into a structure
+ * suitable to pass into DropRelationFiles() functions.
+ *
+ * Duplicate relfilenodes are first removed from hnode->relnode_list (the
+ * list is modified in place).  The remaining nodes are copied into a
+ * palloc'ed array that the caller has to free; its length is stored in
+ * *ndelrels.  Returns NULL (with *ndelrels set to 0) if the list is empty.
+ */
+static RelFileNodePendingDelete *
+PdlRedoPrepareArrayForDrop(PendingDeleteHtabNode *hnode, int *ndelrels)
+{
+	ListCell   *cell;
+
+	/*
+	 * For every node, delete all later nodes that refer to the same
+	 * relfilenode.  Only cells after the current one are deleted, so the
+	 * outer iteration stays valid.
+	 */
+	foreach(cell, hnode->relnode_list)
+	{
+		RelFileNodePendingDelete *pending_delete_node =
+			(RelFileNodePendingDelete *) lfirst(cell);
+		ListCell   *i_cell = lnext(cell);
+		ListCell   *i_cell_prev = cell;
+
+		while (i_cell)
+		{
+			/* remember the successor before i_cell is possibly deleted */
+			ListCell   *i_cell_next = lnext(i_cell);
+			RelFileNodePendingDelete *i_relnode =
+				(RelFileNodePendingDelete *) lfirst(i_cell);
+
+			if (RelFileNodeEquals(pending_delete_node->node, i_relnode->node))
+			{
+				elog(DEBUG1,
+					 "Duplicate pending delete node found: "
+					 "(rel: (%u: %u: %u); xid: %u)",
+					 pending_delete_node->node.spcNode,
+					 pending_delete_node->node.dbNode,
+					 pending_delete_node->node.relNode,
+					 hnode->xid);
+
+				hnode->relnode_list =
+					list_delete_cell(hnode->relnode_list, i_cell, i_cell_prev);
+				pfree(i_relnode);
+			}
+			else
+				i_cell_prev = i_cell;
+
+			i_cell = i_cell_next;
+		}
+	}
+
+	*ndelrels = list_length(hnode->relnode_list);
+
+	if (*ndelrels <= 0)
+	{
+		ereport(WARNING, (errmsg("Empty list for xid: %u", hnode->xid)));
+		return NULL;
+	}
+
+	RelFileNodePendingDelete *delrels = (RelFileNodePendingDelete *)
+		palloc((*ndelrels) * sizeof(*delrels));
+
+	int			i = 0;
+
+	/* Copy the de-duplicated nodes into the flat array, logging each one. */
+	foreach_with_count(cell, hnode->relnode_list, i)
+	{
+		RelFileNodePendingDelete *pending_delete_node =
+			(RelFileNodePendingDelete *) lfirst(cell);
+
+		ereport(LOG, (errmsg(
+				"Prepare to drop node (%u: %u: %u) for xid: %u",
+				pending_delete_node->node.spcNode,
+				pending_delete_node->node.dbNode,
+				pending_delete_node->node.relNode,
+				hnode->xid)));
+
+		delrels[i] = *pending_delete_node;
+	}
+
+	return delrels;
+}
+
+/*
+ * This function deletes files for pending delete nodes.
+ *
+ * Every entry of pendingDeletesRedo is examined once.  Files are dropped
+ * only for transactions that are not older than oldestXid and whose status
+ * is still TRANSACTION_STATUS_IN_PROGRESS; other entries are reported with
+ * a WARNING and skipped.  Afterwards all lists are freed and the hash table
+ * is destroyed, so a later call starts from scratch.
+ *
+ * Nothing is done when tracking is disabled or the table is missing or
+ * empty.
+ */
+void
+PdlRedoDropFiles()
+{
+	if (PdlTrackingDisabled() ||
+		(NULL == pendingDeletesRedo) ||
+		(hash_get_num_entries(pendingDeletesRedo) == 0))
+		return;
+
+	TransactionId oldest_xid = ShmemVariableCache->oldestXid;
+	HASH_SEQ_STATUS scan_status = {0};
+	PendingDeleteHtabNode *node;
+
+	hash_seq_init(&scan_status, pendingDeletesRedo);
+	while ((node = (PendingDeleteHtabNode *) hash_seq_search(&scan_status)) != NULL)
+	{
+		if (TransactionIdPrecedes(node->xid, oldest_xid))
+			ereport(WARNING, (errmsg(
+					"Prevented drop files for xid: %u, oldestXid: %u",
+					node->xid, oldest_xid)));
+		else
+		{
+			XLogRecPtr	result;
+			XidStatus	status = TransactionIdGetStatus(node->xid, &result);
+
+			if (status != TRANSACTION_STATUS_IN_PROGRESS)
+				ereport(WARNING, (errmsg(
+						"Prevented drop files for xid: %u, status: %d",
+						node->xid, status)));
+			else
+			{
+				int			ndelrels = 0;
+				RelFileNodePendingDelete *delrels =
+					PdlRedoPrepareArrayForDrop(node, &ndelrels);
+
+				/*
+				 * The helper returns NULL for an empty list (it has already
+				 * warned about it); there is nothing to drop then and
+				 * pfree() must not be handed a NULL pointer.
+				 */
+				if (delrels != NULL)
+				{
+					DropRelationFiles(delrels, ndelrels, true);
+
+					/* xid is unsigned, print it as such */
+					ereport(LOG, (errmsg(
+							"Pending delete rels were dropped (count: %d; xid: %u).",
+							ndelrels,
+							node->xid)));
+
+					pfree(delrels);
+				}
+			}
+		}
+
+		list_free_deep(node->relnode_list);
+	}
+
+	hash_destroy(pendingDeletesRedo);
+	pendingDeletesRedo = NULL;
+}
diff --git a/src/backend/catalog/test/Makefile b/src/backend/catalog/test/Makefile
index 656951a3f0fa..50e08514554a 100644
--- a/src/backend/catalog/test/Makefile
+++ b/src/backend/catalog/test/Makefile
@@ -5,16 +5,22 @@ subdir = src/backend/catalog
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/src/Makefile.mock
 
-
 TARGETS += storage_tablespace
+TARGETS += storage_pending_deletes
+TARGETS += storage_pending_deletes_redo
 
+include $(top_builddir)/src/backend/mock.mk
 
 storage_tablespace.t: $(top_srcdir)/src/backend/catalog/storage_tablespace.o
 	make -C $(top_srcdir)/src/backend/catalog/ && \
 	$(CC) $(CFLAGS) $(LDFLAGS) $(CMOCKERY_OBJS) $(CPPFLAGS) \
 		$(top_srcdir)/src/backend/catalog/storage_tablespace.o \
 		storage_tablespace_test.c \
-		-o storage_tablespace_test.o && ./storage_tablespace_test.o
+		-o storage_tablespace.t
 
+storage_pending_deletes.t: \
+	$(top_builddir)/src/backend/catalog/storage_pending_deletes.o
 
-check: storage_tablespace.t
\ No newline at end of file
+storage_pending_deletes_redo.t: \
+	$(top_builddir)/src/backend/catalog/storage_pending_deletes_redo.o \
+	$(top_builddir)/src/backend/catalog/storage_pending_deletes.o
diff --git a/src/backend/catalog/test/storage_pending_deletes_redo_test.c b/src/backend/catalog/test/storage_pending_deletes_redo_test.c
new file mode 100644
index 000000000000..deb916b1e3f5
--- /dev/null
+++ b/src/backend/catalog/test/storage_pending_deletes_redo_test.c
@@ -0,0 +1,1103 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes_redo_test.c
+ *	  code to test functionality from storage_pending_deletes_redo.c
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ *	  src/backend/catalog/test/storage_pending_deletes_redo_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include "cmockery.h"
+
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/transam.h"
+#include "catalog/storage_pending_deletes_redo.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+#define TEST_TABLESPACE_OID1	11111
+#define TEST_TABLESPACE_OID2	11112
+
+#define TEST_DB_OID1			11121
+#define TEST_DB_OID2			11122
+
+#define TEST_REL_OID1			11211
+#define TEST_REL_OID2			11212
+
+#define TEST_XID 10
+
+#define TEST_XLOG_REC_PTR 100
+
+void
+__wrap_DropRelationFiles(RelFileNodePendingDelete *delrels,
+								int ndelrels,
+								bool isRedo);
+
+XidStatus
+__wrap_TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
+
+PendingRelXactDeleteArray *
+__wrap_PdlXLogShmemDump(void);
+
+XLogRecPtr
+__wrap_XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
+
+void
+			__wrap_XLogFlush(XLogRecPtr record);
+
+/* id of test, which is currently being executed */
+static int	test_number = 0;
+
+/*
+ * counter to accumulate how many times DropRelationFiles() was called during
+ * test
+ */
+static int	DropRelationFiles_call_count = 0;
+
+/* counter to accumulate how many times XLogInsert() was called during test */
+static int	XLogInsert_call_count = 0;
+
+/*
+ * counter to accumulate how many times PdlXLogShmemDump() was called during
+ * test
+ */
+static int	PdlXLogShmemDump_call_count = 0;
+
+/*
+ * array of relnodes expected by test, in case there are more than 1-2 nodes
+ * involved
+ */
+#define TEST_EXPECTED_NOTES_COUNT 20
+static RelFileNode test_expected_relnodes[TEST_EXPECTED_NOTES_COUNT];
+
+/*
+ * List with transaction IDs, that will report complete status from
+ * TransactionIdGetStatus().
+ */
+static List *ls_transactions_comlpete = NIL;
+
+/*
+ * Common per-test preparation: remember which test is running (the __wrap_*
+ * mocks switch on it), zero the mock call counters and point
+ * ShmemVariableCache at a private static instance.
+ */
+static void
+setup(int test)
+{
+	static VariableCacheData fake_variable_cache = {0};
+
+	test_number = test;
+
+	PdlXLogShmemDump_call_count = 0;
+	XLogInsert_call_count = 0;
+	DropRelationFiles_call_count = 0;
+
+	ShmemVariableCache = &fake_variable_cache;
+}
+
+/*
+ * Mock of DropRelationFiles().
+ *
+ * Counts every call in DropRelationFiles_call_count and validates the
+ * arguments against the expectations of the test identified by test_number.
+ * A call made while a test without a matching case is running fails that
+ * test.
+ */
+void
+__wrap_DropRelationFiles(RelFileNodePendingDelete *delrels,
+								int ndelrels,
+								bool isRedo)
+{
+	DropRelationFiles_call_count++;
+	switch (test_number)
+	{
+			/* Tests expecting exactly the single well-known relfilenode. */
+		case 1:
+		case 8:
+		case 9:
+		case 13:
+		case 18:
+		case 19:
+			{
+				assert_int_equal(ndelrels, 1);
+				assert_true(isRedo);
+				RelFileNodePendingDelete *pd = &(delrels[0]);
+
+				assert_false(pd->isTempRelation);
+				assert_int_equal(pd->node.spcNode, TEST_TABLESPACE_OID1);
+				assert_int_equal(pd->node.dbNode, TEST_DB_OID1);
+				assert_int_equal(pd->node.relNode, TEST_REL_OID1);
+				break;
+			}
+			/* Two calls, one relfilenode each, in an unspecified order. */
+		case 3:
+			{
+				/*
+				 * Static, so entries crossed out by the first call are still
+				 * crossed out when the second call arrives.
+				 */
+				static RelFileNode test_3_expected_results[] =
+				{
+					[0] =
+					{
+						.spcNode = TEST_TABLESPACE_OID1,
+						.dbNode = TEST_DB_OID1,
+						.relNode = TEST_REL_OID1,
+					},
+					[1] =
+					{
+						.spcNode = TEST_TABLESPACE_OID2,
+						.dbNode = TEST_DB_OID2,
+						.relNode = TEST_REL_OID2,
+					}
+				};
+
+				assert_int_equal(ndelrels, 1);
+				assert_true(isRedo);
+				RelFileNodePendingDelete *pd = &(delrels[0]);
+
+				/*
+				 * We can't guarantee that the order of relnodes dropping will
+				 * be the same as the order of adding the pending delete
+				 * nodes. So we just need to ensure that we got all the
+				 * expected relnodes (and only them). We check it by excluding
+				 * values from the array of expected relnodes by replacing
+				 * them with InvalidOid. And we will check that all values are
+				 * excluded as the last step.
+				 */
+				for (int i = 0; i < ARRAY_SIZE(test_3_expected_results); i++)
+				{
+					if (RelFileNodeEquals(test_3_expected_results[i], pd->node))
+					{
+						test_3_expected_results[i].spcNode = InvalidOid;
+						test_3_expected_results[i].dbNode = InvalidOid;
+						test_3_expected_results[i].relNode = InvalidOid;
+					}
+				}
+
+				/* On the second call every expected entry must be crossed out. */
+				if (DropRelationFiles_call_count == 2)
+				{
+					for (int i = 0; i < ARRAY_SIZE(test_3_expected_results); i++)
+					{
+						assert_int_equal(test_3_expected_results[i].spcNode,
+										 InvalidOid);
+						assert_int_equal(test_3_expected_results[i].dbNode,
+										 InvalidOid);
+						assert_int_equal(test_3_expected_results[i].relNode,
+										 InvalidOid);
+					}
+				}
+
+				break;
+			}
+			/* One call carrying both relfilenodes, in insertion order. */
+		case 4:
+			{
+				assert_int_equal(ndelrels, 2);
+				assert_true(isRedo);
+
+				RelFileNodePendingDelete *pd;
+
+				pd = &(delrels[0]);
+				assert_false(pd->isTempRelation);
+				assert_int_equal(pd->node.spcNode, TEST_TABLESPACE_OID1);
+				assert_int_equal(pd->node.dbNode, TEST_DB_OID1);
+				assert_int_equal(pd->node.relNode, TEST_REL_OID1);
+
+				pd = &(delrels[1]);
+				assert_false(pd->isTempRelation);
+				assert_int_equal(pd->node.spcNode, TEST_TABLESPACE_OID2);
+				assert_int_equal(pd->node.dbNode, TEST_DB_OID2);
+				assert_int_equal(pd->node.relNode, TEST_REL_OID2);
+
+				break;
+			}
+			/* Many calls, each matched against test_expected_relnodes. */
+		case 5:
+		case 11:
+		case 12:
+		case 14:
+			{
+				assert_int_equal(ndelrels, 1);
+				assert_true(isRedo);
+				RelFileNodePendingDelete *pd = &(delrels[0]);
+
+				assert_false(pd->isTempRelation);
+
+				/*
+				 * We can't guarantee that the order of relnodes dropping will
+				 * be the same as the order of adding the pending delete
+				 * nodes. So we just need to ensure that we got all the
+				 * expected relnodes (and only them). We check it by excluding
+				 * values from the array of expected relnodes by replacing
+				 * them with InvalidOid. And we will check that all values are
+				 * excluded in the end of the test.
+				 */
+				for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+				{
+					assert_true(pd->node.relNode != InvalidOid);
+					if (RelFileNodeEquals(test_expected_relnodes[i], pd->node))
+					{
+						/* Only relNode is cleared; the test checks just that field. */
+						test_expected_relnodes[i].relNode = InvalidOid;
+						return;
+					}
+				}
+
+				/*
+				 * If we are here, then we didn't find the relnode in the
+				 * expected data, and it is a problem, so fail.
+				 */
+				assert_true(false);
+				break;
+			}
+		default:
+			{
+				/* we shouldn't even get here */
+				assert_true(false);
+				break;
+			}
+	}
+}
+
+/*
+ * Mock of TransactionIdGetStatus(): an xid listed in
+ * ls_transactions_comlpete is reported as committed, any other xid as still
+ * in progress.  The lsn output argument is never written.
+ */
+XidStatus
+__wrap_TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
+{
+	XidStatus	status = TRANSACTION_STATUS_IN_PROGRESS;
+	ListCell   *lc;
+
+	foreach(lc, ls_transactions_comlpete)
+	{
+		if ((TransactionId) lfirst_int(lc) == xid)
+		{
+			status = TRANSACTION_STATUS_COMMITTED;
+			break;
+		}
+	}
+
+	return status;
+}
+
+/*
+ * Mock of PdlXLogShmemDump(): counts the call, returns NULL while test 16 is
+ * running and otherwise a palloc'ed dump holding one well-known entry.
+ */
+PendingRelXactDeleteArray *
+__wrap_PdlXLogShmemDump(void)
+{
+	int			entry_count = 1;
+	PendingRelXactDeleteArray *dump;
+	PendingRelXactDelete *entry;
+
+	PdlXLogShmemDump_call_count++;
+
+	/* test 16 exercises the "nothing to dump" path */
+	if (test_number == 16)
+		return NULL;
+
+	dump = (PendingRelXactDeleteArray *) palloc(PdlDumpSize(entry_count));
+	dump->count = entry_count;
+
+	entry = &(dump->array[0]);
+	entry->xid = TEST_XID;
+	entry->relnode.isTempRelation = false;
+	entry->relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	entry->relnode.node.dbNode = TEST_DB_OID1;
+	entry->relnode.node.relNode = TEST_REL_OID1;
+
+	return dump;
+}
+
+/*
+ * Mock of XLogInsert(): may only be reached from test_17.  Counts the call,
+ * verifies that a single XLOG_PENDING_DELETE record carrying the entry
+ * produced by the PdlXLogShmemDump() mock is being inserted, and returns
+ * TEST_XLOG_REC_PTR.
+ */
+XLogRecPtr
+__wrap_XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
+{
+	PendingRelXactDeleteArray *payload;
+	PendingRelXactDelete *entry;
+
+	/* currently we should get here only in test_17 */
+	assert_int_equal(test_number, 17);
+	XLogInsert_call_count++;
+
+	/* record header: resource manager, record type and rdata chain */
+	assert_int_equal(rmid, RM_XLOG_ID);
+	assert_int_equal(info, XLOG_PENDING_DELETE);
+	assert_int_equal(rdata->buffer, InvalidBuffer);
+	assert_false(rdata->buffer_std);
+	assert_true(rdata->next == NULL);
+	assert_true(rdata->len == (sizeof(Size) + sizeof(PendingRelXactDelete)));
+
+	/* record payload: exactly one pending delete entry */
+	payload = (PendingRelXactDeleteArray *) rdata->data;
+	assert_int_equal(payload->count, 1);
+
+	entry = &(payload->array[0]);
+	assert_int_equal(entry->xid, TEST_XID);
+	assert_false(entry->relnode.isTempRelation);
+	assert_int_equal(entry->relnode.node.spcNode, TEST_TABLESPACE_OID1);
+	assert_int_equal(entry->relnode.node.dbNode, TEST_DB_OID1);
+	assert_int_equal(entry->relnode.node.relNode, TEST_REL_OID1);
+
+	return TEST_XLOG_REC_PTR;
+}
+
+/*
+ * Mock of XLogFlush(): may only be reached from test_17 and must be asked to
+ * flush the position returned by the XLogInsert() mock.
+ */
+void
+__wrap_XLogFlush(XLogRecPtr record)
+{
+	/* currently we should get here only in test_17 */
+	assert_int_equal(test_number, 17);
+
+	assert_int_equal(record, TEST_XLOG_REC_PTR);
+}
+
+/*
+ * Tests
+ */
+
+/*
+ * test_1
+ *
+ * Register one pending delete node whose xid equals oldestXid and run the
+ * drop: DropRelationFiles() must be called exactly once.
+ */
+static void
+test_1(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(1);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoAdd(&pending);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+}
+
+/*
+ * test_2
+ *
+ * Register one pending delete node whose xid (1) is below oldestXid (2) and
+ * run the drop: DropRelationFiles() must not be called.
+ */
+static void
+test_2(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(2);
+	ShmemVariableCache->oldestXid = (TransactionId) 2;
+
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoAdd(&pending);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+}
+
+/*
+ * test_3
+ *
+ * Register two pending delete nodes that differ in both xid and relfilenode
+ * and run the drop: DropRelationFiles() must be called twice (the mock
+ * checks that each call carries one of the two nodes).
+ */
+static void
+test_3(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(3);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	/* first node: xid 1 */
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+	PdlRedoAdd(&pending);
+
+	/* second node: xid 2, reusing the same record object */
+	pending.xid = (TransactionId) 2;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	pending.relnode.node.dbNode = TEST_DB_OID2;
+	pending.relnode.node.relNode = TEST_REL_OID2;
+	PdlRedoAdd(&pending);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 2);
+}
+
+/*
+ * test_4
+ *
+ * Register two different relfilenodes under the same xid and run the drop:
+ * DropRelationFiles() must be called once (the mock checks that the call
+ * carries both nodes).
+ */
+static void
+test_4(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(4);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	/* first node of xid 1 */
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+	PdlRedoAdd(&pending);
+
+	/* second node of the same xid */
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	pending.relnode.node.dbNode = TEST_DB_OID2;
+	pending.relnode.node.relNode = TEST_REL_OID2;
+	PdlRedoAdd(&pending);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+}
+
+/*
+ * Scenario:
+ * add many pending delete nodes with different xids and different relnodes
+ * and some xids precede datfrozenxid
+ * and some transactions are not in progress
+ * and then drop files.
+ *
+ * Expected: DropRelationFiles() is called once for every node whose xid
+ * neither precedes oldestXid nor is reported as complete by the
+ * TransactionIdGetStatus() mock.
+ */
+static void
+test_5(void **state)
+{
+	setup(5);
+	ShmemVariableCache->oldestXid = (TransactionId) 5;
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		PendingRelXactDelete pd =
+		{
+			.xid = (TransactionId) i,
+			.relnode.isTempRelation = false,
+			.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+			.relnode.node.dbNode = TEST_DB_OID1,
+			.relnode.node.relNode = TEST_REL_OID1 + i
+		};
+
+		PdlRedoAdd(&pd);
+
+		/*
+		 * and fill data which is expected: nodes that must be skipped because
+		 * of oldestXid get a marker the DropRelationFiles() mock can never
+		 * match.
+		 */
+		if (TransactionIdPrecedes(pd.xid, ShmemVariableCache->oldestXid))
+			test_expected_relnodes[i].relNode = (Oid) -1;
+		else
+			test_expected_relnodes[i] = pd.relnode.node;
+	}
+
+	/* mark some transactions as complete, let's say XIDs: 10, 12, 15 */
+	TransactionId complete_xids[] = {10, 12, 15};
+	int			complete_xids_count = ARRAY_SIZE(complete_xids);
+
+	for (int i = 0; i < complete_xids_count; i++)
+	{
+		ls_transactions_comlpete = lappend_int(ls_transactions_comlpete,
+											   complete_xids[i]);
+	}
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count,
+					TEST_EXPECTED_NOTES_COUNT - ShmemVariableCache->oldestXid -
+					complete_xids_count);
+
+	/*
+	 * Check that data for complete xids is not touched by PdlRedoDropFiles.
+	 * The xids are not appended to ls_transactions_comlpete again here: the
+	 * drop has already run, so that would only add duplicates.
+	 */
+	for (int i = 0; i < complete_xids_count; i++)
+	{
+		assert_int_equal(test_expected_relnodes[complete_xids[i]].relNode,
+						 TEST_REL_OID1 + complete_xids[i]);
+		/* Replace it with InvalidOid to simplify further check */
+		test_expected_relnodes[complete_xids[i]].relNode = InvalidOid;
+	}
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		/*
+		 * Check that data for xids preceding datfrozenxid is not touched by
+		 * PdlRedoDropFiles, while all other is replaced with InvalidOid.
+		 */
+		if (TransactionIdPrecedes((TransactionId) i, ShmemVariableCache->oldestXid))
+			assert_int_equal(test_expected_relnodes[i].relNode, (Oid) -1);
+		else
+			assert_int_equal(test_expected_relnodes[i].relNode, InvalidOid);
+	}
+
+	/* Leave the shared list empty for the tests that follow. */
+	list_free(ls_transactions_comlpete);
+	ls_transactions_comlpete = NIL;
+}
+
+/*
+ * test_6
+ *
+ * Register one pending delete node whose transaction is reported as complete
+ * by the TransactionIdGetStatus() mock and run the drop: DropRelationFiles()
+ * must not be called.
+ */
+static void
+test_6(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(6);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+
+	/* make the mock report this xid as committed */
+	ls_transactions_comlpete = lappend_int(ls_transactions_comlpete, pending.xid);
+
+	PdlRedoAdd(&pending);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+
+	/* leave the shared list empty for the tests that follow */
+	list_free(ls_transactions_comlpete);
+	ls_transactions_comlpete = NIL;
+}
+
+/*
+ * test_7
+ *
+ * Register one pending delete node, remove the pending deletes of that same
+ * xid and run the drop: DropRelationFiles() must not be called.
+ */
+static void
+test_7(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(7);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoAdd(&pending);
+	PdlRedoRemoveTree(pending.xid, NULL, 0);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+}
+
+/*
+ * test_8
+ *
+ * Register one pending delete node, remove the pending deletes of a
+ * different xid and run the drop: DropRelationFiles() must still be called
+ * once.
+ */
+static void
+test_8(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(8);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoAdd(&pending);
+	PdlRedoRemoveTree(pending.xid + 1, NULL, 0);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+}
+
+/*
+ * test_9
+ *
+ * Register one pending delete node, request removal for
+ * InvalidTransactionId and run the drop: DropRelationFiles() must still be
+ * called once.
+ */
+static void
+test_9(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(9);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoAdd(&pending);
+	PdlRedoRemoveTree(InvalidTransactionId, NULL, 0);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+}
+
+/*
+ * test_10
+ *
+ * Register three different relfilenodes under one xid, remove the pending
+ * deletes of that xid and run the drop: DropRelationFiles() must not be
+ * called.
+ */
+static void
+test_10(void **state)
+{
+	PendingRelXactDelete pending = {0};
+
+	setup(10);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	/* node 1 */
+	pending.xid = (TransactionId) 1;
+	pending.relnode.isTempRelation = false;
+	pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	pending.relnode.node.dbNode = TEST_DB_OID1;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+	PdlRedoAdd(&pending);
+
+	/* node 2: same database, another relation */
+	pending.relnode.node.relNode = TEST_REL_OID2;
+	PdlRedoAdd(&pending);
+
+	/* node 3: another database, first relation */
+	pending.relnode.node.dbNode = TEST_DB_OID2;
+	pending.relnode.node.relNode = TEST_REL_OID1;
+	PdlRedoAdd(&pending);
+
+	PdlRedoRemoveTree(pending.xid, NULL, 0);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+}
+
+/*
+ * Scenario:
+ * add several pending delete nodes with the different xids
+ * and remove pending deletes for one of the xids
+ * and then drop files.
+ *
+ * Expected: DropRelationFiles() is called for every added node except the
+ * one whose xid was removed.
+ */
+static void
+test_11(void **state)
+{
+	setup(11);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	/*
+	 * Named distinctly so that the loop-local record below no longer shadows
+	 * it.
+	 */
+	PendingRelXactDelete zeroed_pd = {0};
+
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		PendingRelXactDelete pd =
+		{
+			/* add oldest xid here just to ensure that all nodes will be added */
+			.xid = ShmemVariableCache->oldestXid + (TransactionId) i,
+			.relnode.isTempRelation = false,
+			.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+			.relnode.node.dbNode = TEST_DB_OID1,
+			.relnode.node.relNode = TEST_REL_OID1 + i
+		};
+
+		PdlRedoAdd(&pd);
+
+		/* and fill data which is expected... */
+		test_expected_relnodes[i] = pd.relnode.node;
+	}
+
+	/*
+	 * NOTE(review): this passes an all-zero record (xid 0, relNode 0) in
+	 * addition to the nodes above.  The call count asserted below does not
+	 * include it, so it must not lead to a drop.  Confirm whether this extra
+	 * call is intentional.
+	 */
+	PdlRedoAdd(&zeroed_pd);
+
+	TransactionId xid_to_remove = 5;
+
+	PdlRedoRemoveTree(xid_to_remove, NULL, 0);
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, TEST_EXPECTED_NOTES_COUNT - 1);
+
+	/*
+	 * Check that data for removed xids is not touched by PdlRedoDropFiles and
+	 * replace it with InvalidOid to simplify further check.
+	 */
+	int			idx = xid_to_remove - ShmemVariableCache->oldestXid;
+
+	assert_int_equal(test_expected_relnodes[idx].relNode, TEST_REL_OID1 + idx);
+	test_expected_relnodes[idx].relNode = InvalidOid;
+
+	/*
+	 * Check that all other are replaced with InvalidOid.
+	 */
+	for (int i = 0; i < TEST_EXPECTED_NOTES_COUNT; i++)
+	{
+		assert_int_equal(test_expected_relnodes[i].relNode, InvalidOid);
+	}
+}
+
+/*
+ * test_12
+ *
+ * Register TEST_EXPECTED_NOTES_COUNT nodes with consecutive xids, remove the
+ * pending deletes of one xid together with four "subtransaction" xids and
+ * run the drop: DropRelationFiles() must be called for every remaining node
+ * and for none of the removed ones.
+ */
+static void
+test_12(void **state)
+{
+	TransactionId xid_to_remove = 5;
+	TransactionId sub_xids_to_remove[] = {10, 11, 12, 15};
+	int			nsubxacts = ARRAY_SIZE(sub_xids_to_remove);
+	int			slot;
+
+	setup(12);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	for (int n = 0; n < TEST_EXPECTED_NOTES_COUNT; n++)
+	{
+		PendingRelXactDelete pending = {0};
+
+		/* start at oldestXid just to ensure that all nodes will be added */
+		pending.xid = ShmemVariableCache->oldestXid + (TransactionId) n;
+		pending.relnode.isTempRelation = false;
+		pending.relnode.node.spcNode = TEST_TABLESPACE_OID1;
+		pending.relnode.node.dbNode = TEST_DB_OID1;
+		pending.relnode.node.relNode = TEST_REL_OID1 + n;
+
+		PdlRedoAdd(&pending);
+
+		/* remember what the DropRelationFiles() mock should see */
+		test_expected_relnodes[n] = pending.relnode.node;
+	}
+
+	PdlRedoRemoveTree(xid_to_remove, sub_xids_to_remove, nsubxacts);
+	PdlRedoDropFiles();
+
+	/* one top-level xid plus its subtransactions were removed */
+	assert_int_equal(DropRelationFiles_call_count,
+					 TEST_EXPECTED_NOTES_COUNT - (1 + nsubxacts));
+
+	/*
+	 * The slot of the removed top-level xid must be untouched; clear it so
+	 * that the final sweep can expect InvalidOid everywhere...
+	 */
+	slot = xid_to_remove - ShmemVariableCache->oldestXid;
+	assert_int_equal(test_expected_relnodes[slot].relNode, TEST_REL_OID1 + slot);
+	test_expected_relnodes[slot].relNode = InvalidOid;
+
+	/* ...and the same holds for every removed subtransaction. */
+	for (int s = 0; s < nsubxacts; s++)
+	{
+		slot = sub_xids_to_remove[s] - ShmemVariableCache->oldestXid;
+		assert_int_equal(test_expected_relnodes[slot].relNode, TEST_REL_OID1 + slot);
+		test_expected_relnodes[slot].relNode = InvalidOid;
+	}
+
+	/* Every expected node must now be crossed out. */
+	for (int n = 0; n < TEST_EXPECTED_NOTES_COUNT; n++)
+		assert_int_equal(test_expected_relnodes[n].relNode, InvalidOid);
+}
+
+/*
+ * Allocate a zero-filled buffer large enough for an XLogRecord header
+ * followed by a Size counter and pending_deletes_count PendingRelXactDelete
+ * entries.  The caller releases it with pfree().
+ */
+static XLogRecord *
+test_create_xlog_record(int pending_deletes_count)
+{
+	Size		payload_size = sizeof(Size);
+
+	payload_size += sizeof(PendingRelXactDelete) * pending_deletes_count;
+
+	return (XLogRecord *) palloc0(SizeOfXLogRecord + payload_size);
+}
+
+/*
+ * test_13
+ *
+ * Feed PdlRedoXLogRecord() a record carrying one pending delete node and run
+ * the drop: DropRelationFiles() must be called exactly once.
+ */
+static void
+test_13(void **state)
+{
+	int			node_count = 1;
+	XLogRecord *record;
+	PendingRelXactDeleteArray *payload;
+	PendingRelXactDelete *entry;
+
+	setup(13);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	/* the payload is placed right after the record header */
+	record = test_create_xlog_record(node_count);
+	payload = (PendingRelXactDeleteArray *) ((char *) record + SizeOfXLogRecord);
+	payload->count = node_count;
+
+	entry = &(payload->array[0]);
+	entry->xid = (TransactionId) 1;
+	entry->relnode.isTempRelation = false;
+	entry->relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	entry->relnode.node.dbNode = TEST_DB_OID1;
+	entry->relnode.node.relNode = TEST_REL_OID1;
+
+	PdlRedoXLogRecord(record);
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+
+	pfree(record);
+}
+
+
+/*
+ * test_14
+ *
+ * Feed PdlRedoXLogRecord() a record carrying five pending delete nodes with
+ * xids 1..5, where xid 1 is below oldestXid (2) and xid 3 is reported as
+ * complete by the TransactionIdGetStatus() mock, then run the drop:
+ * DropRelationFiles() must be called for the other three nodes only.
+ */
+static void
+test_14(void **state)
+{
+	int			node_count = 5;
+	TransactionId skipped_xids[] = {1, 3};
+	XLogRecord *record;
+	PendingRelXactDeleteArray *payload;
+
+	setup(14);
+	ShmemVariableCache->oldestXid = (TransactionId) 2;
+
+	/* the payload is placed right after the record header */
+	record = test_create_xlog_record(node_count);
+	payload = (PendingRelXactDeleteArray *) ((char *) record + SizeOfXLogRecord);
+	payload->count = node_count;
+
+	/* unused expectation slots must read as InvalidOid in the final sweep */
+	memset(test_expected_relnodes, 0, sizeof(test_expected_relnodes));
+
+	for (int n = 0; n < node_count; n++)
+	{
+		PendingRelXactDelete *entry = &(payload->array[n]);
+
+		entry->xid = (TransactionId) (n + 1);
+		entry->relnode.isTempRelation = false;
+		entry->relnode.node.spcNode = TEST_TABLESPACE_OID1;
+		entry->relnode.node.dbNode = TEST_DB_OID1;
+		entry->relnode.node.relNode = TEST_REL_OID1 + n;
+
+		test_expected_relnodes[n] = entry->relnode.node;
+	}
+
+	/* mark some transaction as complete, let's say XID: 3 */
+	ls_transactions_comlpete = lappend_int(ls_transactions_comlpete,
+										   (TransactionId) 3);
+
+	PdlRedoXLogRecord(record);
+	PdlRedoDropFiles();
+
+	/*
+	 * The nodes skipped because of oldestXid or transaction status must still
+	 * be present in the expectations.  Verify that and clear them so that the
+	 * final sweep can expect InvalidOid everywhere.
+	 */
+	for (int n = 0; n < ARRAY_SIZE(skipped_xids); n++)
+	{
+		int			slot = skipped_xids[n] - 1;
+
+		assert_int_equal(test_expected_relnodes[slot].relNode,
+						 TEST_REL_OID1 + slot);
+		test_expected_relnodes[slot].relNode = InvalidOid;
+	}
+
+	assert_int_equal(DropRelationFiles_call_count, 3);
+
+	/* Every expected node must now be crossed out. */
+	for (int n = 0; n < TEST_EXPECTED_NOTES_COUNT; n++)
+		assert_int_equal(test_expected_relnodes[n].relNode, InvalidOid);
+
+	pfree(record);
+
+	/* leave the shared list empty for the tests that follow */
+	list_free(ls_transactions_comlpete);
+	ls_transactions_comlpete = NIL;
+}
+
+
+/*
+ * test_16
+ *
+ * PdlXLogInsert() when the PdlXLogShmemDump() mock returns NULL: the dump is
+ * requested once and XLogInsert() is never called.
+ */
+static void
+test_16(void **state)
+{
+	setup(16);
+	PdlXLogInsert();
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 1);
+	assert_int_equal(XLogInsert_call_count, 0);
+}
+
+/*
+ * test_17
+ *
+ * PdlXLogInsert() when the PdlXLogShmemDump() mock returns one valid node:
+ * the dump is requested once and XLogInsert() is called once (its mock
+ * validates the record contents).
+ */
+static void
+test_17(void **state)
+{
+	setup(17);
+	PdlXLogInsert();
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 1);
+	assert_int_equal(XLogInsert_call_count, 1);
+}
+
+/*
+ * Scenario:
+ * guc is disabled
+ *
+ * Each public entry point is called in turn with gp_track_pending_delete
+ * switched off, and the assertions that follow check that the call had no
+ * effect. The flag is switched back on right after each such call.
+ */
+static void
+test_18(void **state)
+{
+	setup(18);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	/* An add made with the guc off must leave nothing to drop. */
+	gp_track_pending_delete = false;
+	PdlRedoAdd(&pd);
+	gp_track_pending_delete = true;
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+
+	/* A drop requested with the guc off must not drop a tracked node. */
+	PdlRedoAdd(&pd);
+
+	gp_track_pending_delete = false;
+	PdlRedoDropFiles();
+	gp_track_pending_delete = true;
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+
+	/*
+	 * A removal requested with the guc off must keep the node, so the next
+	 * drop (guc on) drops it.
+	 */
+	gp_track_pending_delete = false;
+	PdlRedoRemoveTree(pd.xid, NULL, 0);
+	gp_track_pending_delete = true;
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+
+	/* An xlog insert with the guc off must neither dump nor insert. */
+	gp_track_pending_delete = false;
+	PdlXLogInsert();
+	gp_track_pending_delete = true;
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 0);
+	assert_int_equal(XLogInsert_call_count, 0);
+}
+
+/*
+ * Scenario:
+ * IsBootstrapProcessingMode is true
+ *
+ * Each public entry point is called in turn with Mode set to
+ * BootstrapProcessing, and the assertions that follow check that the call
+ * had no effect. Mode is set back to NormalProcessing right after each such
+ * call.
+ */
+static void
+test_19(void **state)
+{
+	setup(19);
+	ShmemVariableCache->oldestXid = (TransactionId) 1;
+
+	PendingRelXactDelete pd =
+	{
+		.xid = (TransactionId) 1,
+		.relnode.isTempRelation = false,
+		.relnode.node.spcNode = TEST_TABLESPACE_OID1,
+		.relnode.node.dbNode = TEST_DB_OID1,
+		.relnode.node.relNode = TEST_REL_OID1
+	};
+
+	/* An add made in bootstrap mode must leave nothing to drop. */
+	Mode = BootstrapProcessing;
+	PdlRedoAdd(&pd);
+	Mode = NormalProcessing;
+
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+
+	/* A drop requested in bootstrap mode must not drop a tracked node. */
+	PdlRedoAdd(&pd);
+
+	Mode = BootstrapProcessing;
+	PdlRedoDropFiles();
+	Mode = NormalProcessing;
+
+	assert_int_equal(DropRelationFiles_call_count, 0);
+
+	/*
+	 * A removal requested in bootstrap mode must keep the node, so the next
+	 * drop (normal mode) drops it.
+	 */
+	Mode = BootstrapProcessing;
+	PdlRedoRemoveTree(pd.xid, NULL, 0);
+	Mode = NormalProcessing;
+	PdlRedoDropFiles();
+
+	assert_int_equal(DropRelationFiles_call_count, 1);
+
+	/* An xlog insert in bootstrap mode must neither dump nor insert. */
+	Mode = BootstrapProcessing;
+	PdlXLogInsert();
+	Mode = NormalProcessing;
+
+	assert_int_equal(PdlXLogShmemDump_call_count, 0);
+	assert_int_equal(XLogInsert_call_count, 0);
+}
+
+/*
+ * Test driver: parses the cmockery command line options, initializes the
+ * memory context machinery (the tests and mocks use palloc and lists) and
+ * runs every test in order.
+ */
+int
+main(int argc, char *argv[])
+{
+	cmockery_parse_arguments(argc, argv);
+
+	const UnitTest pdl_redo_tests[] = {
+		unit_test(test_1),
+		unit_test(test_2),
+		unit_test(test_3),
+		unit_test(test_4),
+		unit_test(test_5),
+		unit_test(test_6),
+		unit_test(test_7),
+		unit_test(test_8),
+		unit_test(test_9),
+		unit_test(test_10),
+		unit_test(test_11),
+		unit_test(test_12),
+		unit_test(test_13),
+		unit_test(test_14),
+		/* there is no test_15 in this file */
+		unit_test(test_16),
+		unit_test(test_17),
+		unit_test(test_18),
+		unit_test(test_19)
+	};
+
+	MemoryContextInit();
+
+	return run_tests(pdl_redo_tests);
+}
diff --git a/src/backend/catalog/test/storage_pending_deletes_test.c b/src/backend/catalog/test/storage_pending_deletes_test.c
new file mode 100644
index 000000000000..e65d0dfe023d
--- /dev/null
+++ b/src/backend/catalog/test/storage_pending_deletes_test.c
@@ -0,0 +1,567 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes_test.c
+ *	  code to test functionality from storage_pending_deletes.c
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ *	  src/backend/catalog/test/storage_pending_deletes_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include "cmockery.h"
+
+#include "catalog/storage_pending_deletes.h"
+#include "storage/pg_shmem.h"
+#include "storage/proc.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+enum
+{
+	TEST_TABLESPACE_OID1 = 11111,	/* values used as node.spcNode */
+	TEST_TABLESPACE_OID2 = 11112,
+
+	TEST_DB_OID1 = 11121,			/* values used as node.dbNode */
+	TEST_DB_OID2 = 11122,
+
+	TEST_REL_OID1 = 11211,			/* values used as node.relNode */
+	TEST_REL_OID2 = 11212,
+
+	TEST_XID1 = 10,					/* transaction ids passed to PdlShmemAdd() */
+	TEST_XID2 = TEST_XID1 + 1,
+	TEST_XID3 = TEST_XID1 + 3,
+	TEST_XID4 = TEST_XID1 + 8,
+};
+
+/* No-op replacement: don't try to read a non-existent postmaster.pid file */
+void		__wrap_AddToDataDirLockFile(int target_line, const char *str);
+void
+__wrap_AddToDataDirLockFile(int target_line, const char *str)
+{
+}
+
+
+/* qsort comparator: orders PendingRelXactDelete items by their raw bytes */
+static int
+cmp_pdl(const void *p1, const void *p2)
+{
+	return memcmp(p1, p2, sizeof(PendingRelXactDelete));
+}
+
+/* Verify that a dump holds exactly the expected entries, in any order */
+static void
+check_array(PendingRelXactDeleteArray *arr,
+			PendingRelXactDelete *expected, Size expectedCnt)
+{
+	const Size	elemSize = sizeof(PendingRelXactDelete);
+
+	assert_true(arr != NULL);
+	assert_int_equal(arr->count, expectedCnt);
+	/* Sort both sides in place so the byte-wise comparison ignores order */
+	qsort(arr->array, expectedCnt, elemSize, cmp_pdl);
+	qsort(expected, expectedCnt, elemSize, cmp_pdl);
+	assert_memory_equal(arr->array, expected, expectedCnt * elemSize);
+}
+
+/* Remove the pCnt nodes listed in p; afterwards the dump must be empty (NULL) */
+static void
+clean_lists(dsa_pointer *p, Size pCnt)
+{
+	for (Size i = 0; i < pCnt; i++)	/* Size index: no signed/unsigned mix */
+		PdlShmemRemove(p[i]);
+
+	/* Check whether cleanup is ok */
+	assert_true(PdlXLogShmemDump() == NULL);
+}
+
+/* Take a fresh dump, compare it with the expected entries, then free it */
+static void
+check_dump(PendingRelXactDelete *expected, Size expectedCnt)
+{
+	PendingRelXactDeleteArray *dump = PdlXLogShmemDump();
+
+	check_array(dump, expected, expectedCnt);
+	pfree(dump);
+}
+
+
+/* With nothing added, PdlXLogShmemDump() must return NULL */
+static void
+test_empty(void **state)
+{
+	assert_true(PdlXLogShmemDump() == NULL);
+}
+
+/* A single added node must come back from the dump with its relnode and xid */
+static void
+test_1(void **state)
+{
+	const RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		},
+		.relstorage = RELSTORAGE_HEAP
+	};
+
+	dsa_pointer p = PdlShmemAdd(&relnode, TEST_XID1);
+	
+	PendingRelXactDelete expected = 
+	{
+		.relnode = relnode,
+		.xid = TEST_XID1
+	};
+
+	check_dump(&expected, 1);
+	clean_lists(&p, 1);
+}
+
+/* Add 4 nodes, remove the first-added one, add another: dump holds the other 4 */
+static void
+test_remove_fisrt(void **state)
+{
+	RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	dsa_pointer p_first = PdlShmemAdd(&relnode, TEST_XID1);
+
+	dsa_pointer p[4];
+	
+	relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	p[0] = PdlShmemAdd(&relnode, TEST_XID2);
+
+	relnode.node.dbNode = TEST_DB_OID2;
+	p[1] = PdlShmemAdd(&relnode, TEST_XID3);
+
+	relnode.node.relNode = TEST_REL_OID2;
+	p[2] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	PdlShmemRemove(p_first);
+
+	relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	p[3] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	PendingRelXactDelete expected[] = 
+	{
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID2
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID1}},
+			.xid = TEST_XID3
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID2}},
+			.xid = TEST_XID1
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID2, TEST_REL_OID2}},
+			.xid = TEST_XID1
+		},
+	};
+
+	check_dump(expected, ARRAY_SIZE(expected));
+	clean_lists(p, ARRAY_SIZE(p));
+}
+
+/* Add 4 nodes, remove the third-added one, add another: dump holds the other 4 */
+static void
+test_remove_middle(void **state)
+{
+	RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	dsa_pointer p[4];
+
+	p[0] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	p[1] = PdlShmemAdd(&relnode, TEST_XID2);
+
+	relnode.node.dbNode = TEST_DB_OID2;
+	dsa_pointer p_middle = PdlShmemAdd(&relnode, TEST_XID1);
+
+	relnode.node.relNode = TEST_REL_OID2;
+	p[2] = PdlShmemAdd(&relnode, TEST_XID3);
+
+	PdlShmemRemove(p_middle);
+
+	relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	p[3] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	PendingRelXactDelete expected[] = 
+	{
+		{
+			.relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID1
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID2
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID2}},
+			.xid = TEST_XID3
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID2, TEST_REL_OID2}},
+			.xid = TEST_XID1
+		},
+	};
+
+	check_dump(expected, ARRAY_SIZE(expected));
+	clean_lists(p, ARRAY_SIZE(p));
+}
+
+/* Add 4 nodes, remove the last-added one, add another: dump holds the other 4 */
+static void
+test_remove_last(void **state)
+{
+	RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	dsa_pointer p[4];
+
+	p[0] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	p[1] = PdlShmemAdd(&relnode, TEST_XID2);
+
+	relnode.node.dbNode = TEST_DB_OID2;
+	p[2] = PdlShmemAdd(&relnode, TEST_XID3);
+
+	relnode.node.relNode = TEST_REL_OID2;
+	dsa_pointer p_last = PdlShmemAdd(&relnode, TEST_XID1);
+
+	PdlShmemRemove(p_last);
+
+	relnode.node.dbNode = TEST_DB_OID1;
+	p[3] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	PendingRelXactDelete expected[] = 
+	{
+		{
+			.relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID1
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID2
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID1}},
+			.xid = TEST_XID3
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID2}},
+			.xid = TEST_XID1
+		},
+	};
+
+	check_dump(expected, ARRAY_SIZE(expected));
+	clean_lists(p, ARRAY_SIZE(p));
+}
+
+/* InvalidTransactionId: add must return an invalid pointer and store nothing */
+static void
+test_invalid_xid(void **state)
+{
+	const RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	assert_false(DsaPointerIsValid(
+							   PdlShmemAdd(&relnode, InvalidTransactionId)));
+	assert_true(PdlXLogShmemDump() == NULL);
+}
+
+/* Add node when MyBackendId is invalid: nothing must be stored */
+static void
+test_invalid_backend(void **state)
+{
+	const RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	BackendId	old = MyBackendId;
+
+	MyBackendId = InvalidBackendId;
+	dsa_pointer p = PdlShmemAdd(&relnode, TEST_XID1);
+	PendingRelXactDeleteArray *dump = PdlXLogShmemDump();
+
+	MyBackendId = old;	/* restore before asserting: a failure aborts the test */
+	assert_false(DsaPointerIsValid(p));
+	assert_true(dump == NULL);
+}
+
+/* Add node when Mode == BootstrapProcessing: nothing must be stored */
+static void
+test_invalid_mode(void **state)
+{
+	const RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	ProcessingMode old = Mode;
+
+	Mode = BootstrapProcessing;
+	dsa_pointer p = PdlShmemAdd(&relnode, TEST_XID1);
+	PendingRelXactDeleteArray *dump = PdlXLogShmemDump();
+
+	Mode = old;			/* restore before asserting: a failure aborts the test */
+	assert_false(DsaPointerIsValid(p));
+	assert_true(dump == NULL);
+}
+
+/* Add node when tracking is disabled: nothing must be stored */
+static void
+test_tracking_disabled(void **state)
+{
+	const RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	bool		old = gp_track_pending_delete;
+
+	gp_track_pending_delete = false;
+	dsa_pointer p = PdlShmemAdd(&relnode, TEST_XID1);
+	PendingRelXactDeleteArray *dump = PdlXLogShmemDump();
+
+	gp_track_pending_delete = old;	/* restore first: a failed assert aborts */
+	assert_false(DsaPointerIsValid(p));
+	assert_true(dump == NULL);
+}
+
+/* Add node when dynamic_shared_memory_type == DSM_IMPL_NONE: nothing stored */
+static void
+test_shmem_type(void **state)
+{
+	const RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	int			old = dynamic_shared_memory_type;
+
+	dynamic_shared_memory_type = DSM_IMPL_NONE;
+	dsa_pointer p = PdlShmemAdd(&relnode, TEST_XID1);
+	PendingRelXactDeleteArray *dump = PdlXLogShmemDump();
+
+	dynamic_shared_memory_type = old;	/* restore first: a failed assert aborts */
+	assert_false(DsaPointerIsValid(p));
+	assert_true(dump == NULL);
+}
+
+/* Add nodes under two MyBackendId values; one dump must return all of them */
+static void
+test_2_backends(void **state)
+{
+	RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode  = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	dsa_pointer p[5];
+
+	p[0] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	relnode.node.spcNode = TEST_TABLESPACE_OID2;
+	p[1] = PdlShmemAdd(&relnode, TEST_XID2);
+
+	relnode.node.dbNode = TEST_DB_OID2;
+	p[2] = PdlShmemAdd(&relnode, TEST_XID1);
+
+	BackendId	old = MyBackendId;
+
+	MyBackendId = 3;	/* the remaining two nodes are added as backend 3 */
+
+	relnode.node.relNode = TEST_REL_OID2;
+	p[3] = PdlShmemAdd(&relnode, TEST_XID3);
+
+	relnode.node.spcNode = TEST_TABLESPACE_OID1;
+	p[4] = PdlShmemAdd(&relnode, TEST_XID4);
+
+	PendingRelXactDelete expected[] = 
+	{
+		{
+			.relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID1
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID1, TEST_REL_OID1}},
+			.xid = TEST_XID2
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID1}},
+			.xid = TEST_XID1
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID2, TEST_DB_OID2, TEST_REL_OID2}},
+			.xid = TEST_XID3
+		},
+		{
+			.relnode = {{TEST_TABLESPACE_OID1, TEST_DB_OID2, TEST_REL_OID2}},
+			.xid = TEST_XID4
+		},
+	};
+
+	PendingRelXactDeleteArray	*arr = PdlXLogShmemDump();
+
+	/*
+	 * Clean up before checking the dump taken above.
+	 * Elements which were added for backend 3 should be removed
+	 * when MyBackendId is 3. Other elements are removed in clean_lists
+	 * after restoring MyBackendId.
+	 */
+	PdlShmemRemove(p[3]);
+	PdlShmemRemove(p[4]);
+
+	MyBackendId = old;
+
+	check_array(arr, expected, ARRAY_SIZE(expected));
+	pfree(arr);
+
+	clean_lists(p, 3);
+}
+
+/* Add nodes to use repalloc twice in PdlXLogShmemDump() */
+static void
+test_repalloc(void **state)
+{
+	RelFileNodePendingDelete relnode =
+	{
+		.node =
+		{
+			.spcNode = TEST_TABLESPACE_OID1,
+			.dbNode = TEST_DB_OID1,
+			.relNode = TEST_REL_OID1
+		}
+	};
+
+	dsa_pointer p[100]; /* 100 > 32 + 64 */
+	PendingRelXactDelete expected[ARRAY_SIZE(p)];
+	
+	for(int i = 0; i < ARRAY_SIZE(p); i++)
+	{
+		relnode.node.spcNode += i;	/* cumulative: every node gets distinct ids */
+		relnode.node.dbNode  += i;
+		relnode.node.relNode += i;
+
+		p[i] = PdlShmemAdd(&relnode, TEST_XID1 + i);
+
+		expected[i].relnode = relnode;
+		expected[i].xid = TEST_XID1 + i;
+	}
+
+	check_dump(expected, ARRAY_SIZE(expected));
+	clean_lists(p, ARRAY_SIZE(p));
+}
+
+int
+main(int argc, char *argv[])
+{
+	cmockery_parse_arguments(argc, argv);
+
+	const UnitTest tests[] = {
+		unit_test(test_empty),
+		unit_test(test_1),
+		unit_test(test_remove_fisrt),
+		unit_test(test_remove_middle),
+		unit_test(test_remove_last),
+		unit_test(test_invalid_xid),
+		unit_test(test_invalid_backend),
+		unit_test(test_invalid_mode),
+		unit_test(test_tracking_disabled),
+		unit_test(test_shmem_type),
+		unit_test(test_2_backends),
+		unit_test(test_repalloc)
+	};
+
+	MemoryContextInit();
+
+	gp_track_pending_delete = true;
+	dynamic_shared_memory_type = DSM_IMPL_POSIX;
+	DataDir = ".";
+	MaxBackends = 5;	/* test_2_backends uses backend ids 1 and 3 */
+
+	PGShmemHeader *shim = NULL;
+
+	InitShmemAccess(PGSharedMemoryCreate(300000, 6000, &shim));
+	InitShmemAllocation();
+	CreateLWLocks();
+	InitShmemIndex();
+	dsm_postmaster_startup(shim);
+
+	PdlShmemInit();
+
+	IsUnderPostmaster = true;
+	MyBackendId = 1;
+
+	PGPROC		proc = {.backendId = MyBackendId};	/* lives until run_tests() returns */
+
+	MyProc = &proc;
+	return run_tests(tests);
+}
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index f33ebdbf7511..947f90980889 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -13196,7 +13196,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 			if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
 				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
 				 forkNum == INIT_FORKNUM))
-				log_smgrcreate(&newrnode, forkNum);
+				log_smgrcreate(&newrnode, forkNum, rel->rd_rel->relstorage);
 			copy_relation_data(rel->rd_smgr, dstrel, forkNum,
 							   rel->rd_rel->relpersistence);
 		}
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 1c0bd8aea7dc..e5e6d2aedb71 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -195,6 +195,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPW_CHANGE:
 		case XLOG_FPI:
 		case XLOG_OVERWRITE_CONTRECORD:
+		case XLOG_PENDING_DELETE:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
index 1cecbb205a7f..95726f21f267 100644
--- a/src/backend/storage/ipc/dsm.c
+++ b/src/backend/storage/ipc/dsm.c
@@ -82,6 +82,7 @@ typedef struct dsm_control_item
 {
 	dsm_handle	handle;
 	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
+	bool		pinned;
 } dsm_control_item;
 
 /* Layout of the dynamic shared memory control segment. */
@@ -467,8 +468,8 @@ dsm_create(Size size)
 	uint32		i;
 	uint32		nitems;
 
-	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
-	Assert(IsUnderPostmaster);
+	/* Unsafe in postmaster. */
+	Assert(!IsPostmasterEnvironment || IsUnderPostmaster);
 
 	if (!dsm_init_done)
 		dsm_backend_startup();
@@ -497,6 +498,7 @@ dsm_create(Size size)
 			dsm_control->item[i].handle = seg->handle;
 			/* refcnt of 1 triggers destruction, so start at 2 */
 			dsm_control->item[i].refcnt = 2;
+			dsm_control->item[i].pinned = false;
 			seg->control_slot = i;
 			LWLockRelease(DynamicSharedMemoryControlLock);
 			return seg;
@@ -522,6 +524,7 @@ dsm_create(Size size)
 	dsm_control->item[nitems].handle = seg->handle;
 	/* refcnt of 1 triggers destruction, so start at 2 */
 	dsm_control->item[nitems].refcnt = 2;
+	dsm_control->item[nitems].pinned = false;
 	seg->control_slot = nitems;
 	dsm_control->nitems++;
 	LWLockRelease(DynamicSharedMemoryControlLock);
@@ -765,6 +768,9 @@ dsm_detach(dsm_segment *seg)
 		/* If new reference count is 1, try to destroy the segment. */
 		if (refcnt == 1)
 		{
+			/* A pinned segment should never reach 1. */
+			Assert(!dsm_control->item[control_slot].pinned);
+
 			/*
 			 * If we fail to destroy the segment here, or are killed before we
 			 * finish doing so, the reference count will remain at 1, which
@@ -817,11 +823,11 @@ dsm_pin_mapping(dsm_segment *seg)
 }
 
 /*
- * Keep a dynamic shared memory segment until postmaster shutdown.
+ * Keep a dynamic shared memory segment until postmaster shutdown, or until
+ * dsm_unpin_segment is called.
  *
- * This function should not be called more than once per segment;
- * on Windows, doing so will create unnecessary handles which will
- * consume system resources to no benefit.
+ * This function should not be called more than once per segment, unless the
+ * segment is explicitly unpinned with dsm_unpin_segment in between calls.
  *
  * Note that this function does not arrange for the current process to
  * keep the segment mapped indefinitely; if that behavior is desired,
@@ -834,13 +840,98 @@ dsm_pin_segment(dsm_segment *seg)
 	/*
 	 * Bump reference count for this segment in shared memory. This will
 	 * ensure that even if there is no session which is attached to this
-	 * segment, it will remain until postmaster shutdown.
+	 * segment, it will remain until postmaster shutdown or an explicit call
+	 * to unpin.
 	 */
 	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	if (dsm_control->item[seg->control_slot].pinned)
+		elog(ERROR, "cannot pin a segment that is already pinned");
+	dsm_impl_pin_segment(seg->handle, seg->impl_private);
+	dsm_control->item[seg->control_slot].pinned = true;
 	dsm_control->item[seg->control_slot].refcnt++;
 	LWLockRelease(DynamicSharedMemoryControlLock);
+}
 
-	dsm_impl_pin_segment(seg->handle, seg->impl_private);
+/*
+ * Unpin a dynamic shared memory segment that was previously pinned with
+ * dsm_pin_segment.  This function should not be called unless dsm_pin_segment
+ * was previously called for this segment.
+ *
+ * The argument is a dsm_handle rather than a dsm_segment in case you want
+ * to unpin a segment to which you haven't attached.  This turns out to be
+ * useful if, for example, a reference to one shared memory segment is stored
+ * within another shared memory segment.  You might want to unpin the
+ * referenced segment before destroying the referencing segment.
+ */
+void
+dsm_unpin_segment(dsm_handle handle)
+{
+	uint32		control_slot = INVALID_CONTROL_SLOT;
+	bool		destroy = false;
+	uint32		i;
+
+	/* Find the control slot for the given handle. */
+	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	for (i = 0; i < dsm_control->nitems; ++i)
+	{
+		/* Skip unused slots. */
+		if (dsm_control->item[i].refcnt == 0)
+			continue;
+
+		/* If we've found our handle, we can stop searching. */
+		if (dsm_control->item[i].handle == handle)
+		{
+			control_slot = i;
+			break;
+		}
+	}
+
+	/*
+	 * We should definitely have found the slot, and it should not already be
+	 * in the process of going away, because this function should only be
+	 * called on a segment which is pinned.
+	 */
+	if (control_slot == INVALID_CONTROL_SLOT)
+		elog(ERROR, "cannot unpin unknown segment handle");
+	if (!dsm_control->item[control_slot].pinned)
+		elog(ERROR, "cannot unpin a segment that is not pinned");
+	Assert(dsm_control->item[control_slot].refcnt > 1);
+
+	/* Note that 1 means no references (0 means unused slot). */
+	if (--dsm_control->item[control_slot].refcnt == 1)
+		destroy = true;
+	dsm_control->item[control_slot].pinned = false;
+
+	/* Release the lock before the destroy call below. */
+	LWLockRelease(DynamicSharedMemoryControlLock);
+
+	/* Last reference gone: destroy segment, then free its control slot. */
+	if (destroy)
+	{
+		void	   *junk_impl_private = NULL;
+		void	   *junk_mapped_address = NULL;
+		Size		junk_mapped_size = 0;
+
+		/*
+		 * For an explanation of how error handling works in this case, see
+		 * comments in dsm_detach.  Note that if we reach this point, the
+		 * current process certainly does not have the segment mapped, because
+		 * if it did, the reference count would have still been greater than 1
+		 * even after releasing the reference count held by the pin.  The fact
+		 * that there can't be a dsm_segment for this handle makes it OK to
+		 * pass the mapped size, mapped address, and private data as NULL
+		 * here.
+		 */
+		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+						&junk_mapped_address, &junk_mapped_size, WARNING))
+		{
+			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+			Assert(dsm_control->item[control_slot].handle == handle);
+			Assert(dsm_control->item[control_slot].refcnt == 1);
+			dsm_control->item[control_slot].refcnt = 0;
+			LWLockRelease(DynamicSharedMemoryControlLock);
+		}
+	}
 }
 
 /*
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index ebe84618ad25..bf949fbd91da 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -1055,8 +1055,8 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
 #endif
 
 /*
- * Implementation-specific actions that must be performed when a segment
- * is to be preserved until postmaster shutdown.
+ * Implementation-specific actions that must be performed when a segment is to
+ * be preserved even when no backend has it attached.
  *
  * Except on Windows, we don't need to do anything at all.  But since Windows
  * cleans up segments automatically when no references remain, we duplicate
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 3c664ccf93eb..d2f9ae05d84a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -40,6 +40,7 @@
 #include "storage/bufmgr.h"
 #include "storage/dsm.h"
 #include "storage/ipc.h"
+#include "catalog/storage_pending_deletes.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/predicate.h"
@@ -216,6 +217,9 @@ CreateSharedMemoryAndSemaphores(int port)
 		/* size of parallel cursor count */
 		size = add_size(size, ParallelCursorCountSize());
 
+		/* size of pending deletes */
+		size = add_size(size, PdlShmemSize());
+
 		elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
 
 		/*
@@ -388,6 +392,8 @@ CreateSharedMemoryAndSemaphores(int port)
 	if (Gp_role == GP_ROLE_DISPATCH)
 		ParallelCursorCountInit();
 
+	PdlShmemInit();
+
 	/*
 	 * Now give loadable modules a chance to set up their shmem allocations
 	 */
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 56e5b85b8f16..b7aab2caa047 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -279,6 +279,9 @@ NumLWLocks(void)
 	lock_addin_request_allowed = false;
 	numLocks += Max(lock_addin_request, NUM_USER_DEFINED_LWLOCKS);
 
+	/* storage_pending_deletes.c needs one for each backend */
+	numLocks += MaxBackends;
+
 	return numLocks;
 }
 
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index 388d63881998..8d6b38368bf0 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -480,6 +480,8 @@ bool		gp_log_endpoints = false;
 /* optional reject to  parse ambigous 5-digits date in YYYMMDD format */
 bool		gp_allow_date_field_width_5digits = false;
 
+bool		gp_track_pending_delete = true;
+
 /* GUC to set interval for streaming archival status */
 int wal_sender_archiving_status_interval;
 
@@ -3419,6 +3421,19 @@ struct config_bool ConfigureNamesBool_gp[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"gp_track_pending_delete", PGC_POSTMASTER, CUSTOM_OPTIONS,
+			gettext_noop("Enable extended pending deletion tracking to avoid "
+						 "accumulation of orphaned files."),
+			gettext_noop("Disabling this turns off storing relation nodes in "
+						 "shmem, dumping them to WAL and removing of files "
+						 "during recovery.")
+		},
+		&gp_track_pending_delete,
+		true,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index 2d24fa0124cf..ee473f313d45 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS =  aset.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o
+OBJS =  aset.o dsa.o freepage.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o
 
 # In PostgreSQL, this is under src/common. It has been backported, but because
 # we haven't merged the changes that introduced the src/common directory, it
diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
new file mode 100644
index 000000000000..670b12f792d2
--- /dev/null
+++ b/src/backend/utils/mmgr/dsa.c
@@ -0,0 +1,2214 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsa.c
+ *	  Dynamic shared memory areas.
+ *
+ * This module provides dynamic shared memory areas which are built on top of
+ * DSM segments.  While dsm.c allows segments of shared memory to be
+ * created and shared between backends, it isn't designed to deal with small
+ * objects.  A DSA area is a shared memory heap usually backed by one or more
+ * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
+ * Alternatively, it can be created in pre-existing shared memory, including a
+ * DSM segment, and then create extra DSM segments as required.  Unlike the
+ * regular system heap, it deals in pseudo-pointers which must be converted to
+ * backend-local pointers before they are dereferenced.  These pseudo-pointers
+ * can however be shared with other backends, and can be used to construct
+ * shared data structures.
+ *
+ * Each DSA area manages a set of DSM segments, adding new segments as
+ * required and detaching them when they are no longer needed.  Each segment
+ * contains a number of 4KB pages, a free page manager for tracking
+ * consecutive runs of free pages, and a page map for tracking the source of
+ * objects allocated on each page.  Allocation requests above 8KB are handled
+ * by choosing a segment and finding consecutive free pages in its free page
+ * manager.  Allocation requests for smaller sizes are handled using pools of
+ * objects of a selection of sizes.  Each pool consists of a number of 16 page
+ * (64KB) superblocks allocated in the same way as large objects.  Allocation
+ * of large objects and new superblocks is serialized by a single LWLock, but
+ * allocation of small objects from pre-existing superblocks uses one LWLock
+ * per pool.  Currently there is one pool, and therefore one lock, per size
+ * class.  Per-core pools to increase concurrency and strategies for reducing
+ * the resulting fragmentation are areas for future research.  Each superblock
+ * is managed with a 'span', which tracks the superblock's freelist.  Free
+ * requests are handled by looking in the page map to find which span an
+ * address was allocated from, so that small objects can be returned to the
+ * appropriate free list, and large object pages can be returned directly to
+ * the free page map.  When allocating, simple heuristics for selecting
+ * segments and superblocks try to encourage occupied memory to be
+ * concentrated, increasing the likelihood that whole superblocks can become
+ * empty and be returned to the free page manager, and whole segments can
+ * become empty and be returned to the operating system.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/dsa.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/dsa.h"
+#include "utils/freepage.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
+
+/*
+ * The size of the initial DSM segment that backs a dsa_area created by
+ * dsa_create.  After creating some number of segments of this size we'll
+ * double this size, and so on.  Larger segments may be created if necessary
+ * to satisfy large requests.
+ */
+#define DSA_INITIAL_SEGMENT_SIZE ((Size) (1 * 1024 * 1024))
+
+/*
+ * How many segments to create before we double the segment size.  If this is
+ * low, then there is likely to be a lot of wasted space in the largest
+ * segment.  If it is high, then we risk running out of segment slots (see
+ * dsm.c's limits on total number of segments), or limiting the total size
+ * an area can manage when using small pointers.
+ */
+#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4
+
+/*
+ * The number of bits used to represent the offset part of a dsa_pointer.
+ * This controls the maximum size of a segment, the maximum possible
+ * allocation size and also the maximum number of segments per area.
+ */
+#if SIZEOF_DSA_POINTER == 4
+#define DSA_OFFSET_WIDTH 27		/* 32 segments of size up to 128MB */
+#else
+#define DSA_OFFSET_WIDTH 40		/* 1024 segments of size up to 1TB */
+#endif
+
+/*
+ * The maximum number of DSM segments that an area can own, determined by
+ * the number of bits remaining (but capped at 1024).
+ */
+#define DSA_MAX_SEGMENTS \
+	Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
+
+/* The bitmask for extracting the offset from a dsa_pointer. */
+#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
+
+/* The maximum size of a DSM segment. */
+#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
+
+/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
+#define DSA_PAGES_PER_SUPERBLOCK		16
+
+/*
+ * A magic number used as a sanity check for following DSM segments belonging
+ * to a DSA area (this number will be XORed with the area handle and
+ * the segment index).
+ */
+#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
+
+/* Build a dsa_pointer given a segment number and offset. */
+#define DSA_MAKE_POINTER(segment_number, offset) \
+	(((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
+
+/* Extract the segment number from a dsa_pointer. */
+#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
+
+/* Extract the offset from a dsa_pointer. */
+#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
+
+/* The type used for segment indexes (zero based). */
+typedef Size dsa_segment_index;
+
+/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
+#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
+
+/*
+ * How many bins of segments do we have?  The bins are used to categorize
+ * segments by their largest contiguous run of free pages.
+ */
+#define DSA_NUM_SEGMENT_BINS 16
+
+/*
+ * What is the lowest bin that holds segments that *might* have n contiguous
+ * free pages?	There is no point in looking in segments in lower bins; they
+ * definitely can't service a request for n free pages.
+ */
+#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
+
+/* Macros for access to locks ('area' is parenthesized for safe expansion). */
+#define DSA_AREA_LOCK(area) (&(area)->control->lock)
+#define DSA_SCLASS_LOCK(area, sclass) (&(area)->control->pools[sclass].lock)
+
+/*
+ * The header for an individual segment.  This lives at the start of each DSM
+ * segment owned by a DSA area, including the first segment (where it appears
+ * as the leading member of the dsa_area_control struct).
+ */
+typedef struct
+{
+	/* Sanity check: DSA_SEGMENT_HEADER_MAGIC ^ area handle ^ segment index. */
+	uint32		magic;
+	/* Total number of pages in this segment (excluding metadata area). */
+	Size		usable_pages;
+	/* Total size of this segment in bytes. */
+	Size		size;
+
+	/*
+	 * Index of the segment that precedes this one in the same segment bin, or
+	 * DSA_SEGMENT_INDEX_NONE if this is the first one.
+	 */
+	dsa_segment_index prev;
+
+	/*
+	 * Index of the segment that follows this one in the same segment bin, or
+	 * DSA_SEGMENT_INDEX_NONE if this is the last one.
+	 */
+	dsa_segment_index next;
+	/* The index of the bin (0 .. DSA_NUM_SEGMENT_BINS - 1) holding this segment. */
+	Size		bin;
+
+	/*
+	 * A flag raised to indicate that this segment is being returned to the
+	 * operating system and has been unpinned.
+	 */
+	bool		freed;
+} dsa_segment_header;
+
+/*
+ * Metadata for one superblock.
+ *
+ * For most blocks, span objects are stored out-of-line; that is, the span
+ * object is not stored within the block itself.  But, as an exception, for a
+ * "span of spans", the span object is stored "inline".  The allocation is
+ * always exactly one page, and the dsa_area_span object is located at
+ * the beginning of that page.  The size class is DSA_SCLASS_BLOCK_OF_SPANS,
+ * and the remaining fields are used just as they would be in an ordinary
+ * block.  We can't allocate spans out of ordinary superblocks because
+ * creating an ordinary superblock requires us to be able to allocate a span
+ * *first*.  Doing it this way avoids that circularity.
+ */
+typedef struct
+{
+	dsa_pointer pool;			/* Containing pool. */
+	dsa_pointer prevspan;		/* Previous span, or InvalidDsaPointer. */
+	dsa_pointer nextspan;		/* Next span. */
+	dsa_pointer start;			/* dsa_pointer to start of the span's memory. */
+	Size		npages;			/* Length of span in pages. */
+	uint16		size_class;		/* Index into dsa_size_classes. */
+	uint16		ninitialized;	/* Maximum number of objects ever allocated. */
+	uint16		nallocatable;	/* Number of objects currently allocatable. */
+	uint16		firstfree;		/* Index of first object on free list. */
+	uint16		nmax;			/* Maximum number of objects ever possible. */
+	uint16		fclass;			/* Current fullness class. */
+} dsa_area_span;
+
+/*
+ * Given a pointer to an object in a span, access the index of the next free
+ * object in the same span (ie in the span's freelist) as an L-value.
+ */
+#define NextFreeObjectIndex(object) (* (uint16 *) (object))
+
+/*
+ * Small allocations are handled by dividing a single block of memory into
+ * many small objects of equal size.  The possible allocation sizes are
+ * defined by the following array.  Larger size classes are spaced more widely
+ * than smaller size classes.  We fudge the spacing for size classes >1kB to
+ * avoid space wastage: based on the knowledge that we plan to allocate 64kB
+ * blocks, we bump the maximum object size up to the largest multiple of
+ * 8 bytes that still lets us fit the same number of objects into one block.
+ *
+ * NB: Because of this fudging, if we were ever to use differently-sized blocks
+ * for small allocations, these size classes would need to be reworked to be
+ * optimal for the new size.
+ *
+ * NB: The optimal spacing for size classes, as well as the size of the blocks
+ * out of which small objects are allocated, is not a question that has one
+ * right answer.  Some allocators (such as tcmalloc) use more closely-spaced
+ * size classes than we do here, while others (like aset.c) use more
+ * widely-spaced classes.  Spacing the classes more closely avoids wasting
+ * memory within individual chunks, but also means a larger number of
+ * potentially-unfilled blocks.
+ */
+static const uint16 dsa_size_classes[] = {
+	sizeof(dsa_area_span), 0,	/* special: block of spans, large span */
+	8, 16, 24, 32, 40, 48, 56, 64,		/* 8 classes separated by 8 bytes */
+	80, 96, 112, 128,			/* 4 classes separated by 16 bytes */
+	160, 192, 224, 256,			/* 4 classes separated by 32 bytes */
+	320, 384, 448, 512,			/* 4 classes separated by 64 bytes */
+	640, 768, 896, 1024,		/* 4 classes separated by 128 bytes */
+	1280, 1560, 1816, 2048,		/* 4 classes separated by ~256 bytes */
+	2616, 3120, 3640, 4096,		/* 4 classes separated by ~512 bytes */
+	5456, 6552, 7280, 8192		/* 4 classes separated by ~1024 bytes */
+};
+#define DSA_NUM_SIZE_CLASSES				lengthof(dsa_size_classes)
+
+/* Special size classes. */
+#define DSA_SCLASS_BLOCK_OF_SPANS		0
+#define DSA_SCLASS_SPAN_LARGE			1
+
+/*
+ * The following lookup table is used to map the size of small objects
+ * (less than 1kB) onto the corresponding size class.  To use this table,
+ * round the size of the object up to the next multiple of 8 bytes, divide by
+ * 8 and subtract 1 to get the index into this array.  The table is read-only.
+ */
+static const uint8 dsa_size_class_map[] = {
+	2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+	14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
+	18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+	20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
+};
+#define DSA_SIZE_CLASS_MAP_QUANTUM	8
+
+/*
+ * Superblocks are binned by how full they are.  Generally, each fullness
+ * class corresponds to one quartile, but the block being used for
+ * allocations is always at the head of the list for fullness class 1,
+ * regardless of how full it really is.
+ */
+#define DSA_FULLNESS_CLASSES		4
+
+/*
+ * Maximum length of a DSA name.
+ */
+#define DSA_MAXLEN					64
+
+/*
+ * A dsa_area_pool represents a set of objects of a given size class.
+ *
+ * Perhaps there should be multiple pools for the same size class for
+ * contention avoidance, but for now there is just one per size class!
+ */
+typedef struct
+{
+	/* A lock protecting access to this pool (see DSA_SCLASS_LOCK). */
+	LWLock		lock;
+	/* Heads of the linked lists of spans, one list per fullness class. */
+	dsa_pointer spans[DSA_FULLNESS_CLASSES];
+	/* Should we pad this out to a cacheline boundary? */
+} dsa_area_pool;
+
+/*
+ * The control block for an area.  This lives in shared memory, at the start of
+ * the first segment controlled by this area (a DSM segment or in-place memory).
+ */
+typedef struct
+{
+	/* The segment header for the first segment. */
+	dsa_segment_header segment_header;
+	/* The handle for this area (DSM_HANDLE_INVALID for in-place areas). */
+	dsa_handle	handle;
+	/* The handles of the segments owned by this area. */
+	dsm_handle	segment_handles[DSA_MAX_SEGMENTS];
+	/* Lists of segments, binned by maximum contiguous run of free pages. */
+	dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
+	/* The object pools for each size class. */
+	dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
+	/* The total size of all active segments. */
+	Size		total_segment_size;
+	/* The maximum total size of backing storage we are allowed. */
+	Size		max_total_segment_size;
+	/* Highest used segment index in the history of this area. */
+	dsa_segment_index high_segment_index;
+	/* The reference count for this area; see dsa_release_in_place. */
+	int			refcnt;
+	/* A flag indicating that this area has been pinned. */
+	bool		pinned;
+	/* The number of times that segments have been freed. */
+	Size		freed_segment_counter;
+	/* The LWLock tranche ID. */
+	int			lwlock_tranche_id;
+	char		lwlock_tranche_name[DSA_MAXLEN];	/* LWLock tranche name. */
+	/* The general lock (protects everything except object pools). */
+	LWLock		lock;
+} dsa_area_control;
+
+/* Given a pointer to a pool inside the control object, find a dsa_pointer. */
+#define DsaAreaPoolToDsaPointer(area, p)	\
+	DSA_MAKE_POINTER(0, (char *) (p) - (char *) (area)->control)
+
+/*
+ * A dsa_segment_map is stored within the backend-private memory of each
+ * individual backend.  It holds the base address of the segment within that
+ * backend, plus the addresses of key objects within the segment.  Those
+ * could instead be derived from the base address but it's handy to have them
+ * around.
+ */
+typedef struct
+{
+	dsm_segment *segment;		/* DSM segment, or NULL if none */
+	char	   *mapped_address; /* Mapping address; NULL until mapped */
+	dsa_segment_header *header; /* Header (same as mapped_address) */
+	FreePageManager *fpm;		/* Free page manager within segment. */
+	dsa_pointer *pagemap;		/* Page map: page number -> span pointer. */
+} dsa_segment_map;
+
+/*
+ * Per-backend state for a storage area.  Backends obtain one of these by
+ * creating an area or attaching to an existing one using a handle.  Each
+ * process that needs to use an area uses its own object to track where the
+ * segments are mapped.
+ */
+struct dsa_area
+{
+	/* Pointer to the control object in shared memory. */
+	dsa_area_control *control;
+
+	/* The lock tranche for this process. */
+	LWLockTranche lwlock_tranche;
+
+	/*
+	 * All the mappings are owned by this.  The dsa_area itself is not
+	 * directly tracked by the ResourceOwner, but the effect is the same. NULL
+	 * if the attachment has session lifespan, i.e. if dsa_pin_mapping() has
+	 * been called.
+	 */
+	ResourceOwner resowner;
+
+	/*
+	 * This backend's array of segment maps, ordered by segment index
+	 * corresponding to control->segment_handles.  Some of the area's segments
+	 * may not be mapped in this backend yet, and some slots may have been
+	 * freed and need to be detached; these operations happen on demand.
+	 */
+	dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
+
+	/* The highest segment index this backend has ever mapped. */
+	dsa_segment_index high_segment_index;
+
+	/* The last observed value of control->freed_segment_counter. */
+	Size		freed_segment_counter;
+};
+
+#define DSA_SPAN_NOTHING_FREE	((uint16) -1)
+#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
+
+/* Given a pointer into area->segment_maps, obtain its segment index number. */
+#define get_segment_index(area, segment_map_ptr) \
+	((segment_map_ptr) - &(area)->segment_maps[0])
+
+static void init_span(dsa_area *area, dsa_pointer span_pointer,
+		  dsa_area_pool *pool, dsa_pointer start, Size npages,
+		  uint16 size_class);
+static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
+					int fromclass, int toclass);
+static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
+static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
+						 int size_class);
+static dsa_segment_map *get_segment_by_index(dsa_area *area,
+					 dsa_segment_index index);
+static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
+static void unlink_span(dsa_area *area, dsa_area_span *span);
+static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
+						   dsa_pointer span_pointer, int fclass);
+static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
+static dsa_segment_map *get_best_segment(dsa_area *area, Size npages);
+static dsa_segment_map *make_new_segment(dsa_area *area, Size requested_pages);
+static dsa_area *create_internal(void *place, size_t size,
+				int tranche_id, const char *tranche_name,
+				dsm_handle control_handle,
+				dsm_segment *control_segment);
+static dsa_area *attach_internal(void *place, dsm_segment *segment,
+				dsa_handle handle);
+static void check_for_freed_segments(dsa_area *area);
+
+/*
+ * Create a new shared area in a new DSM segment.  Further DSM segments will
+ * be allocated as required to extend the available space.  Returns the area.
+ *
+ * We can't allocate a LWLock tranche_id within this function, because tranche
+ * IDs are a scarce resource; there are only 64k available, using low numbers
+ * when possible matters, and we have no provision for recycling them.  So,
+ * we require the caller to provide one.  The caller must also provide the
+ * tranche name, so that we can distinguish LWLocks belonging to different
+ * DSAs.
+ */
+dsa_area *
+dsa_create(int tranche_id, const char *tranche_name)
+{
+	dsm_segment *segment;
+	dsa_area   *area;
+
+	/*
+	 * Create the DSM segment that will hold the shared control object and the
+	 * first segment of usable space.
+	 */
+	segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE);
+
+	/*
+	 * All segments backing this area are pinned, so that DSA can explicitly
+	 * control their lifetime (otherwise a newly created segment belonging to
+	 * this area might be freed when the only backend that happens to have it
+	 * mapped in ends, corrupting the area).
+	 */
+	dsm_pin_segment(segment);
+
+	/* Create a new DSA area with the control object in this segment. */
+	area = create_internal(dsm_segment_address(segment),
+						   DSA_INITIAL_SEGMENT_SIZE,
+						   tranche_id, tranche_name,
+						   dsm_segment_handle(segment), segment);
+
+	/* Release our reference to the area when the control segment detaches. */
+	on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+				  PointerGetDatum(dsm_segment_address(segment)));
+
+	return area;
+}
+
+/*
+ * Create a new shared area in an existing shared memory space, which may be
+ * either DSM or Postmaster-initialized memory.  DSM segments will be
+ * allocated as required to extend the available space, though that can be
+ * prevented with dsa_set_size_limit(area, size) using the same size provided
+ * to dsa_create_in_place.
+ *
+ * Areas created in-place must eventually be released by the backend that
+ * created them and all backends that attach to them.  This can be done
+ * explicitly with dsa_release_in_place, or, in the special case that 'place'
+ * happens to be in a pre-existing DSM segment, by passing in a pointer to the
+ * segment so that a detach hook can be registered with the containing DSM
+ * segment.  Pass NULL for 'segment' if no such hook is wanted.
+ *
+ * See dsa_create() for a note about the tranche arguments.
+ */
+dsa_area *
+dsa_create_in_place(void *place, size_t size,
+					int tranche_id, const char *tranche_name,
+					dsm_segment *segment)
+{
+	dsa_area   *area;
+
+	area = create_internal(place, size, tranche_id, tranche_name,
+						   DSM_HANDLE_INVALID, NULL);	/* no area handle */
+
+	/*
+	 * Clean up when the control segment detaches, if a containing DSM segment
+	 * was provided.
+	 */
+	if (segment != NULL)
+		on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+					  PointerGetDatum(place));
+
+	return area;
+}
+
+/*
+ * Obtain a handle that can be passed to other processes so that they can
+ * attach to the given area.  Cannot be called for areas created with
+ * dsa_create_in_place: those carry DSM_HANDLE_INVALID, which is asserted.
+ */
+dsa_handle
+dsa_get_handle(dsa_area *area)
+{
+	Assert(area->control->handle != DSM_HANDLE_INVALID);
+	return area->control->handle;
+}
+
+/*
+ * Attach to an area given a handle generated (possibly in another process) by
+ * dsa_get_handle.  The area must have been created with dsa_create (not
+ * dsa_create_in_place).  Raises ERROR if the DSM segment cannot be attached.
+ */
+dsa_area *
+dsa_attach(dsa_handle handle)
+{
+	dsm_segment *segment;
+	dsa_area   *area;
+
+	/*
+	 * An area handle is really a DSM segment handle for the first segment, so
+	 * we go ahead and attach to that.
+	 */
+	segment = dsm_attach(handle);
+	if (segment == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not attach to dsa_handle")));
+
+	area = attach_internal(dsm_segment_address(segment), segment, handle);
+
+	/* Release our reference to the area when the control segment detaches. */
+	on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+				  PointerGetDatum(dsm_segment_address(segment)));
+
+	return area;
+}
+
+/*
+ * Attach to an area that was created with dsa_create_in_place.  The caller
+ * must somehow know the location in memory that was used when the area was
+ * created, though it may be mapped at a different virtual address in this
+ * process.
+ *
+ * See dsa_create_in_place for a note about releasing in-place areas, and the
+ * optional 'segment' argument which can be provided to allow automatic
+ * release if the containing memory happens to be a DSM segment.
+ */
+dsa_area *
+dsa_attach_in_place(void *place, dsm_segment *segment)
+{
+	dsa_area   *area;
+
+	area = attach_internal(place, NULL, DSM_HANDLE_INVALID);	/* no handle */
+
+	/*
+	 * Clean up when the control segment detaches, if a containing DSM segment
+	 * was provided.  'segment' is used only to register this hook.
+	 */
+	if (segment != NULL)
+		on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+					  PointerGetDatum(place));
+
+	return area;
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place.  The 'segment' argument is ignored but provides an
+ * interface suitable for on_dsm_detach, for the convenience of users who want
+ * to create a DSA area inside an existing DSM segment and have it
+ * automatically released when the containing DSM segment is detached.
+ * 'place' should be the address of the place where the area was created.
+ *
+ * This callback is automatically registered for the DSM segment containing
+ * the control object of in-place areas when a segment is provided to
+ * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
+ * with dsa_create or attached with dsa_attach.
+ */
+void
+dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
+{
+	dsa_release_in_place(DatumGetPointer(place));
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place.  The 'code' argument is ignored but provides an
+ * interface suitable for on_shmem_exit or before_shmem_exit, for the
+ * convenience of users who want to create a DSA area inside shared memory
+ * other than a DSM segment and have it automatically released at backend exit.
+ * 'place' should be the address of the place where the area was created.
+ */
+void
+dsa_on_shmem_exit_release_in_place(int code, Datum place)
+{
+	dsa_release_in_place(DatumGetPointer(place));
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place.  It is preferable to use one of the 'dsa_on_XXX'
+ * callbacks so that this is managed automatically, because failure to release
+ * an area created in-place leaks its segments permanently.
+ *
+ * This is also called automatically for areas produced by dsa_create or
+ * dsa_attach as an implementation detail.
+ */
+void
+dsa_release_in_place(void *place)
+{
+	dsa_area_control *control = (dsa_area_control *) place;
+	int			i;
+
+	LWLockAcquire(&control->lock, LW_EXCLUSIVE);
+	Assert(control->segment_header.magic ==
+		   (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));	/* index 0 */
+	Assert(control->refcnt > 0);
+	if (--control->refcnt == 0)	/* last reference: unpin every segment */
+	{
+		for (i = 0; i <= control->high_segment_index; ++i)
+		{
+			dsm_handle	handle;
+
+			handle = control->segment_handles[i];
+			if (handle != DSM_HANDLE_INVALID)	/* skip slots with no segment */
+				dsm_unpin_segment(handle);
+		}
+	}
+	LWLockRelease(&control->lock);
+}
+
+/*
+ * Keep a DSA area attached until end of session or explicit detach.
+ *
+ * By default, areas are owned by the current resource owner, which means they
+ * are detached automatically when that scope ends.  Repeated calls are no-ops.
+ */
+void
+dsa_pin_mapping(dsa_area *area)
+{
+	int			i;
+
+	if (area->resowner != NULL)
+	{
+		area->resowner = NULL;	/* NULL marks session lifespan */
+
+		for (i = 0; i <= area->high_segment_index; ++i)
+			if (area->segment_maps[i].segment != NULL)	/* has a DSM segment */
+				dsm_pin_mapping(area->segment_maps[i].segment);
+	}
+}
+
+/*
+ * Allocate 'size' bytes (must be > 0) in this storage area.  The return value
+ * is a dsa_pointer that can be passed to other processes, and converted to a
+ * local pointer with dsa_get_address.  If no memory is available, returns
+ * InvalidDsaPointer.
+ */
+dsa_pointer
+dsa_allocate(dsa_area *area, Size size)
+{
+	uint16		size_class;
+	dsa_pointer start_pointer;
+	dsa_segment_map *segment_map;
+
+	Assert(size > 0);
+
+	/*
+	 * If bigger than the largest size class, just grab a run of pages from
+	 * the free page manager, instead of allocating an object from a pool.
+	 * There will still be a span, but it's a special class of span that
+	 * manages this whole allocation and simply gives all pages back to the
+	 * free page manager when dsa_free is called.
+	 */
+	if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
+	{
+		Size		npages = fpm_size_to_pages(size);
+		Size		first_page;
+		dsa_pointer span_pointer;
+		dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
+
+		/* Obtain a span object from the block-of-spans size class. */
+		span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
+		if (!DsaPointerIsValid(span_pointer))
+			return InvalidDsaPointer;
+
+		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+
+		/* Find a segment from which to allocate, creating one if needed. */
+		segment_map = get_best_segment(area, npages);
+		if (segment_map == NULL)
+			segment_map = make_new_segment(area, npages);
+		if (segment_map == NULL)
+		{
+			/* Can't make any more segments: give the span back and fail. */
+			LWLockRelease(DSA_AREA_LOCK(area));
+			dsa_free(area, span_pointer);
+			return InvalidDsaPointer;
+		}
+
+		/*
+		 * Ask the free page manager for a run of pages.  This should always
+		 * succeed, since both get_best_segment and make_new_segment should
+		 * only return a non-NULL pointer if it actually contains enough
+		 * contiguous freespace.  If it does fail, something in our backend
+		 * private state is out of whack, so use FATAL to kill the process.
+		 */
+		if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
+			elog(FATAL,
+				 "dsa_allocate could not find %zu free pages", npages);
+		LWLockRelease(DSA_AREA_LOCK(area));
+
+		start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
+										 first_page * FPM_PAGE_SIZE);
+
+		/* Initialize span and pagemap, under the large-span pool's lock. */
+		LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
+					  LW_EXCLUSIVE);
+		init_span(area, span_pointer, pool, start_pointer, npages,
+				  DSA_SCLASS_SPAN_LARGE);
+		segment_map->pagemap[first_page] = span_pointer;
+		LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
+
+		return start_pointer;
+	}
+
+	/* Map allocation to a size class (sizes the lookup table covers first). */
+	if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
+	{
+		int			mapidx;
+
+		/* For smaller sizes we have a lookup table... */
+		mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
+				  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
+		size_class = dsa_size_class_map[mapidx];
+	}
+	else
+	{
+		uint16		min;
+		uint16		max;
+
+		/* ... and for the rest we binary chop from the table's last class up. */
+		min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
+		max = lengthof(dsa_size_classes) - 1;
+
+		while (min < max)
+		{
+			uint16		mid = (min + max) / 2;
+			uint16		class_size = dsa_size_classes[mid];
+
+			if (class_size < size)
+				min = mid + 1;
+			else
+				max = mid;
+		}
+
+		size_class = min;
+	}
+	Assert(size <= dsa_size_classes[size_class]);	/* object fits ... */
+	Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);	/* tightly */
+
+	/*
+	 * Attempt to allocate an object from the appropriate pool.  This might
+	 * return InvalidDsaPointer if there's no space available.
+	 */
+	return alloc_object(area, size_class);
+}
+
+/*
+ * Free memory obtained with dsa_allocate on the same area ('dp' is its result).
+ */
+void
+dsa_free(dsa_area *area, dsa_pointer dp)
+{
+	dsa_segment_map *segment_map;
+	int			pageno;
+	dsa_pointer span_pointer;
+	dsa_area_span *span;
+	char	   *superblock;
+	char	   *object;
+	Size		size;
+	int			size_class;
+
+	/* Make sure we don't have a stale segment in the slot 'dp' refers to. */
+	check_for_freed_segments(area);
+
+	/* Locate the object, span and pool. */
+	segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
+	pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
+	span_pointer = segment_map->pagemap[pageno];
+	span = dsa_get_address(area, span_pointer);
+	superblock = dsa_get_address(area, span->start);
+	object = dsa_get_address(area, dp);
+	size_class = span->size_class;
+	size = dsa_size_classes[size_class];	/* 0 for DSA_SCLASS_SPAN_LARGE */
+
+	/*
+	 * Special case for large objects that live in a special span: we return
+	 * those pages directly to the free page manager and free the span.
+	 */
+	if (span->size_class == DSA_SCLASS_SPAN_LARGE)
+	{
+
+#ifdef CLOBBER_FREED_MEMORY
+		memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
+#endif
+
+		/* Give pages back to free page manager. */
+		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+		FreePageManagerPut(segment_map->fpm,
+						   DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
+						   span->npages);
+		LWLockRelease(DSA_AREA_LOCK(area));
+		/* Unlink span, under the large-span pool's lock. */
+		LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
+					  LW_EXCLUSIVE);
+		unlink_span(area, span);
+		LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
+		/* Free the span object itself (a recursive call) so it can be reused. */
+		dsa_free(area, span_pointer);
+		return;
+	}
+
+#ifdef CLOBBER_FREED_MEMORY
+	memset(object, 0x7f, size);
+#endif
+
+	LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+	/* Put the object on the head of the span's freelist. */
+	Assert(object >= superblock);
+	Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
+	Assert((object - superblock) % size == 0);
+	NextFreeObjectIndex(object) = span->firstfree;
+	span->firstfree = (object - superblock) / size;
+	++span->nallocatable;
+
+	/*
+	 * See if the span needs to be moved to a different fullness class, or be
+	 * freed so its pages can be given back to the segment.
+	 */
+	if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
+	{
+		/*
+		 * The block was completely full and is located in the
+		 * highest-numbered fullness class, which is never scanned for free
+		 * chunks.  We must move it to the next-lower fullness class.
+		 */
+		unlink_span(area, span);
+		add_span_to_fullness_class(area, span, span_pointer,
+								   DSA_FULLNESS_CLASSES - 2);
+
+		/*
+		 * NOTE (not implemented here): if this is the only span, and there is
+		 * no active span, then we should probably move this span to fullness
+		 * class 1.  (Otherwise if you allocate exactly all the objects in the
+		 * only span, it moves to class 3, then you free them all, it moves
+		 * to 2, and then is given back, leaving no active span).
+		 */
+	}
+	else if (span->nallocatable == span->nmax &&
+			 (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
+	{
+		/*
+		 * This entire block is free, and it's not the active block for this
+		 * size class.  Return the memory to the free page manager. We don't
+		 * do this for the active block to prevent hysteresis: if we
+		 * repeatedly allocate and free the only chunk in the active block, it
+		 * will be very inefficient if we deallocate and reallocate the block
+		 * every time.
+		 */
+		destroy_superblock(area, span_pointer);
+	}
+
+	LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+}
+
+/*
+ * Translate a dsa_pointer into an address usable by this backend.  The
+ * pointer 'dp' has to refer to memory handed out by this area (by any
+ * attached process) and not yet freed.  As a side effect the owning segment
+ * may get mapped into this process, and segments already freed may get
+ * unmapped.  InvalidDsaPointer translates to NULL.
+ */
+void *
+dsa_get_address(dsa_area *area, dsa_pointer dp)
+{
+	dsa_segment_index segno;
+	Size		segoff;
+	dsa_segment_map *map;
+
+	/* An invalid pointer has no address. */
+	if (!DsaPointerIsValid(dp))
+		return NULL;
+
+	/* Honor pending requests to detach from freed segments first. */
+	check_for_freed_segments(area);
+
+	/* Split the pointer into segment number and offset in that segment. */
+	segno = DSA_EXTRACT_SEGMENT_NUMBER(dp);
+	segoff = DSA_EXTRACT_OFFSET(dp);
+	Assert(segno < DSA_MAX_SEGMENTS);
+
+	/* Map the segment in on first use; the helper fills in the map slot. */
+	map = &area->segment_maps[segno];
+	if (unlikely(map->mapped_address == NULL))
+		get_segment_by_index(area, segno);
+
+	return map->mapped_address + segoff;
+}
+
+/*
+ * Pin the area: it then survives even with no backend attached, and can be
+ * re-attached through a handle recorded elsewhere.  Errors if already pinned.
+ */
+void
+dsa_pin(dsa_area *area)
+{
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	if (area->control->pinned)
+	{
+		LWLockRelease(DSA_AREA_LOCK(area));
+		elog(ERROR, "dsa_area already pinned");
+	}
+	/* The pin is accounted for as one extra reference on the area. */
+	area->control->refcnt++;
+	area->control->pinned = true;
+	LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Undo the effects of dsa_pin, so that the given area can be freed when no
+ * backends are attached to it.  Raises an error, rather than tripping the
+ * reference-count assertion, if the area is not currently pinned.
+ */
+void
+dsa_unpin(dsa_area *area)
+{
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	if (!area->control->pinned)
+	{
+		LWLockRelease(DSA_AREA_LOCK(area));
+		elog(ERROR, "dsa_area not pinned");
+	}
+	Assert(area->control->refcnt > 1);	/* the pin plus our own reference */
+	area->control->pinned = false;
+	--area->control->refcnt;
+	LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Change the cap on the total size of all segments backing this area.  The
+ * cap is consulted only when a new segment has to be requested from the
+ * operating system, so lowering it below the current usage does nothing by
+ * itself.  Virtual memory use can briefly exceed the cap while freed segments
+ * are still mapped by backends that have not detached from them yet.
+ */
+void
+dsa_set_size_limit(dsa_area *area, Size limit)
+{
+	dsa_area_control *control = area->control;
+
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	control->max_total_segment_size = limit;
+	LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Aggressively give back all spare memory, in the hope that this lets whole
+ * DSM segments be returned to the operating system.  Only superblocks that
+ * are completely empty are affected.
+ */
+void
+dsa_trim(dsa_area *area)
+{
+	int			sclass = DSA_NUM_SIZE_CLASSES;
+
+	/*
+	 * Walk the pools from the highest size class down to the lowest, so that
+	 * the spans-of-spans pool is reached last: spans released while trimming
+	 * the other pools may leave some of its blocks entirely free.
+	 */
+	while (--sclass >= 0)
+	{
+		dsa_area_pool *pool = &area->control->pools[sclass];
+		dsa_pointer cursor;
+
+		/* Freeing a large object already gives its pages back right away. */
+		if (sclass == DSA_SCLASS_SPAN_LARGE)
+			continue;
+
+		/*
+		 * Look at fullness class 1 only.  That is where an entirely empty
+		 * superblock is expected to be, because dsa_free itself returns empty
+		 * superblocks of the other fullness classes to the free page map.
+		 */
+		LWLockAcquire(DSA_SCLASS_LOCK(area, sclass), LW_EXCLUSIVE);
+		for (cursor = pool->spans[1]; DsaPointerIsValid(cursor);)
+		{
+			dsa_area_span *span = dsa_get_address(area, cursor);
+			dsa_pointer victim = cursor;
+
+			/* Step past the span first: destroying it may free it. */
+			cursor = span->nextspan;
+
+			if (span->nallocatable == span->nmax)
+				destroy_superblock(area, victim);
+		}
+		LWLockRelease(DSA_SCLASS_LOCK(area, sclass));
+	}
+}
+
+/*
+ * Print out debugging information about the internal state of the shared
+ * memory area.  All output goes to stderr.
+ */
+void
+dsa_dump(dsa_area *area)
+{
+	Size		i,
+				j;
+
+	/*
+	 * Note: This gives an inconsistent snapshot as it acquires and releases
+	 * individual locks as it goes...
+	 */
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
+	fprintf(stderr, "  max_total_segment_size: %zu\n",
+			area->control->max_total_segment_size);
+	fprintf(stderr, "  total_segment_size: %zu\n",
+			area->control->total_segment_size);
+	fprintf(stderr, "  refcnt: %d\n", area->control->refcnt);
+	fprintf(stderr, "  pinned: %c\n", area->control->pinned ? 't' : 'f');
+	fprintf(stderr, "  segment bins:\n");
+	for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
+	{
+		if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
+		{
+			dsa_segment_index segment_index;
+
+			/* 'i' is unsigned: bin 0 must not be computed as 1 << (0 - 1). */
+			fprintf(stderr,
+				"    segment bin %zu (at least %zu contiguous pages free):\n",
+					i, (i == 0) ? (Size) 0 : ((Size) 1 << (i - 1)));
+			segment_index = area->control->segment_bins[i];
+			while (segment_index != DSA_SEGMENT_INDEX_NONE)
+			{
+				dsa_segment_map *segment_map;
+
+				segment_map = get_segment_by_index(area, segment_index);
+
+				fprintf(stderr,
+						"      segment index %zu, usable_pages = %zu, "
+						"contiguous_pages = %zu, mapped at %p\n",
+						segment_index,
+						segment_map->header->usable_pages,
+						fpm_largest(segment_map->fpm),
+						segment_map->mapped_address);
+				segment_index = segment_map->header->next;
+			}
+		}
+	}
+	LWLockRelease(DSA_AREA_LOCK(area));
+
+	fprintf(stderr, "  pools:\n");
+	for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
+	{
+		bool		found = false;
+
+		LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
+		for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
+			if (DsaPointerIsValid(area->control->pools[i].spans[j]))
+				found = true;
+		if (found)
+		{
+			if (i == DSA_SCLASS_BLOCK_OF_SPANS)
+				fprintf(stderr, "    pool for blocks of span objects:\n");
+			else if (i == DSA_SCLASS_SPAN_LARGE)
+				fprintf(stderr, "    pool for large object spans:\n");
+			else
+				fprintf(stderr,
+					"    pool for size class %zu (object size %hu bytes):\n",
+						i, dsa_size_classes[i]);
+			for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
+			{
+				if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
+					fprintf(stderr, "      fullness class %zu is empty\n", j);
+				else
+				{
+					dsa_pointer span_pointer = area->control->pools[i].spans[j];
+
+					fprintf(stderr, "      fullness class %zu:\n", j);
+					while (DsaPointerIsValid(span_pointer))
+					{
+						dsa_area_span *span;
+
+						span = dsa_get_address(area, span_pointer);
+						fprintf(stderr,
+								"        span descriptor at %016llx, "
+								"superblock at %016llx, pages = %zu, "
+								"objects free = %hu/%hu\n",
+								(unsigned long long) span_pointer,
+								(unsigned long long) span->start, span->npages,
+								span->nallocatable, span->nmax);
+						span_pointer = span->nextspan;
+					}
+				}
+			}
+		}
+		LWLockRelease(DSA_SCLASS_LOCK(area, i));
+	}
+}
+
+/*
+ * Compute the minimum amount of space that dsa_create_in_place will accept:
+ * room for the control object, the free page manager and a page map with one
+ * entry per page, rounded up to a whole number of pages.
+ */
+Size
+dsa_minimum_size(void)
+{
+	int			npages;
+	Size		nbytes = MAXALIGN(sizeof(dsa_area_control)) +
+		MAXALIGN(sizeof(FreePageManager));
+
+	/*
+	 * Every page added needs one more page map entry, which can in turn
+	 * push the total onto another page; iterate until it no longer does.
+	 */
+	for (npages = 0; (nbytes + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE > npages;
+		 ++npages)
+		nbytes += sizeof(dsa_pointer);
+
+	return npages * FPM_PAGE_SIZE;
+}
+
+/*
+ * Workhorse function for dsa_create and dsa_create_in_place.  Lays out the
+ * control object, free page manager and page map at the start of the 'size'
+ * bytes at 'place', and returns a palloc'd backend-local dsa_area for them.
+ * 'control_segment' is recorded as the mapping of segment 0 in this process.
+ */
+static dsa_area *
+create_internal(void *place, size_t size,
+				int tranche_id, const char *tranche_name,
+				dsm_handle control_handle,
+				dsm_segment *control_segment)
+{
+	dsa_area_control *control;
+	dsa_area   *area;
+	dsa_segment_map *segment_map;
+	Size		usable_pages;
+	Size		total_pages;
+	Size		metadata_bytes;
+	int			i;
+
+	/* Sanity check on the space we have to work in. */
+	if (size < dsa_minimum_size())
+		elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
+			 dsa_minimum_size(), size);
+
+	/* Now figure out how much space is usable */
+	total_pages = size / FPM_PAGE_SIZE;
+	metadata_bytes =
+		MAXALIGN(sizeof(dsa_area_control)) +
+		MAXALIGN(sizeof(FreePageManager)) +
+		total_pages * sizeof(dsa_pointer);
+	/* Add padding up to next page boundary. */
+	if (metadata_bytes % FPM_PAGE_SIZE != 0)
+		metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+	Assert(metadata_bytes <= size);
+	usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
+
+	/*
+	 * Initialize the dsa_area_control object at the start of the space.  The
+	 * header records the real 'size', matching total_segment_size below.
+	 */
+	control = (dsa_area_control *) place;
+	control->segment_header.magic =
+		DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
+	control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
+	control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
+	control->segment_header.usable_pages = usable_pages;
+	control->segment_header.freed = false;
+	control->segment_header.size = size;
+	control->handle = control_handle;
+	control->max_total_segment_size = SIZE_MAX;
+	control->total_segment_size = size;
+	memset(&control->segment_handles[0], 0,
+		   sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
+	control->segment_handles[0] = control_handle;
+	for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
+		control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
+	control->high_segment_index = 0;
+	control->refcnt = 1;
+	control->freed_segment_counter = 0;
+	control->lwlock_tranche_id = tranche_id;
+	strlcpy(control->lwlock_tranche_name, tranche_name, DSA_MAXLEN);
+
+	/*
+	 * Create the dsa_area object that this backend will use to access the
+	 * area.  Other backends will need to obtain their own dsa_area object by
+	 * attaching.
+	 */
+	area = palloc(sizeof(dsa_area));
+	area->control = control;
+	area->resowner = CurrentResourceOwner;
+	memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
+	area->high_segment_index = 0;
+	area->lwlock_tranche.array_base = &area->control->pools[0];
+	area->lwlock_tranche.array_stride = sizeof(dsa_area_pool);
+	area->lwlock_tranche.name = control->lwlock_tranche_name;
+	LWLockRegisterTranche(control->lwlock_tranche_id, &area->lwlock_tranche);
+	LWLockInitialize(&control->lock, control->lwlock_tranche_id);
+	for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
+		LWLockInitialize(DSA_SCLASS_LOCK(area, i),
+						 control->lwlock_tranche_id);
+
+	/* Set up the segment map for this process's mapping. */
+	segment_map = &area->segment_maps[0];
+	segment_map->segment = control_segment;
+	segment_map->mapped_address = place;
+	segment_map->header = (dsa_segment_header *) place;
+	segment_map->fpm = (FreePageManager *)
+		(segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
+	segment_map->pagemap = (dsa_pointer *)
+		(segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
+		 MAXALIGN(sizeof(FreePageManager)));
+
+	/* Set up the free page map. */
+	FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+	/* There can be 0 usable pages if size is dsa_minimum_size(). */
+	if (usable_pages > 0)
+		FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+						   usable_pages);
+
+	/* Put this segment into the appropriate bin. */
+	control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
+	segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+
+	return area;
+}
+
+/*
+ * Common code behind dsa_attach and dsa_attach_in_place: build this backend's
+ * dsa_area for the existing area whose control object lives at 'place' and
+ * take a reference on it.  'segment' is NULL when attaching in place.
+ */
+static dsa_area *
+attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
+{
+	dsa_area_control *control = (dsa_area_control *) place;
+	dsa_area   *area;
+	dsa_segment_map *map0;
+
+	Assert(control->handle == handle);
+	Assert(control->segment_handles[0] == handle);
+	Assert(control->segment_header.magic ==
+		   (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
+
+	/* Allocate and fill in the backend-local area object. */
+	area = palloc(sizeof(dsa_area));
+	area->control = control;
+	area->resowner = CurrentResourceOwner;
+	memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
+	area->high_segment_index = 0;
+	area->lwlock_tranche.array_base = &control->pools[0];
+	area->lwlock_tranche.array_stride = sizeof(dsa_area_pool);
+	area->lwlock_tranche.name = control->lwlock_tranche_name;
+	LWLockRegisterTranche(control->lwlock_tranche_id, &area->lwlock_tranche);
+
+	/* Segment 0 is the control segment, which is already mapped at 'place'. */
+	map0 = &area->segment_maps[0];
+	map0->segment = segment;
+	map0->mapped_address = place;
+	map0->header = (dsa_segment_header *) place;
+	map0->fpm = (FreePageManager *)
+		(map0->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
+	map0->pagemap = (dsa_pointer *)
+		(map0->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
+		 MAXALIGN(sizeof(FreePageManager)));
+
+	/* Account for this backend's attachment. */
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	control->refcnt++;
+	LWLockRelease(DSA_AREA_LOCK(area));
+
+	return area;
+}
+
+/*
+ * Set up the span descriptor at 'span_pointer' for a new superblock starting
+ * at 'start', and link it in at the head of fullness class 1 of 'pool'.
+ */
+static void
+init_span(dsa_area *area,
+		  dsa_pointer span_pointer,
+		  dsa_area_pool *pool, dsa_pointer start, Size npages,
+		  uint16 size_class)
+{
+	dsa_area_span *span = dsa_get_address(area, span_pointer);
+	dsa_pointer old_head = pool->spans[1];
+	Size		obsize = dsa_size_classes[size_class];
+
+	/* We edit this pool's span list, so its lock has to be held already. */
+	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+	/* Push the span onto the front of the fullness class 1 list. */
+	if (DsaPointerIsValid(old_head))
+	{
+		dsa_area_span *head = dsa_get_address(area, old_head);
+
+		head->prevspan = span_pointer;
+	}
+	span->prevspan = InvalidDsaPointer;
+	span->nextspan = old_head;
+	pool->spans[1] = span_pointer;
+
+	/* Describe the superblock itself. */
+	span->pool = DsaAreaPoolToDsaPointer(area, pool);
+	span->start = start;
+	span->npages = npages;
+	span->size_class = size_class;
+	span->fclass = 1;
+	span->firstfree = DSA_SPAN_NOTHING_FREE;
+	span->ninitialized = 0;
+
+	/* Work out how many objects the superblock can hand out. */
+	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+	{
+		/*
+		 * The first object of a block-of-spans is the descriptor of the
+		 * block itself: count it as initialized and not allocatable.  As
+		 * nmax is copied from nallocatable below, it is one lower too,
+		 * which is important for freeing the block at the correct time.
+		 */
+		span->ninitialized = 1;
+		span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
+	}
+	else if (size_class != DSA_SCLASS_SPAN_LARGE)
+		span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
+	span->nmax = span->nallocatable;
+}
+
+/*
+ * Move the span heading fullness class 'fromclass' to the head of fullness
+ * class 'toclass'.  Returns false, changing nothing, if 'fromclass' is empty.
+ */
+static bool
+transfer_first_span(dsa_area *area,
+					dsa_area_pool *pool, int fromclass, int toclass)
+{
+	dsa_pointer moved = pool->spans[fromclass];
+	dsa_pointer neighbor;
+	dsa_area_span *span;
+	dsa_area_span *succ;
+
+	/* Nothing to move if the source list is empty. */
+	if (!DsaPointerIsValid(moved))
+		return false;
+	span = dsa_get_address(area, moved);
+
+	/* Pop it: its successor, if any, becomes the new head of the source. */
+	neighbor = span->nextspan;
+	pool->spans[fromclass] = neighbor;
+	if (DsaPointerIsValid(neighbor))
+	{
+		succ = dsa_get_address(area, neighbor);
+		succ->prevspan = InvalidDsaPointer;
+	}
+
+	/* Push it in front of whatever currently heads the target list. */
+	neighbor = pool->spans[toclass];
+	span->nextspan = neighbor;
+	pool->spans[toclass] = moved;
+	if (DsaPointerIsValid(neighbor))
+	{
+		succ = dsa_get_address(area, neighbor);
+		succ->prevspan = moved;
+	}
+	span->fclass = toclass;
+
+	return true;
+}
+
+/*
+ * Allocate one object of the requested size class from the given area.
+ * Returns InvalidDsaPointer if neither an existing nor a new superblock with
+ * room for it can be obtained.
+ */
+static inline dsa_pointer
+alloc_object(dsa_area *area, int size_class)
+{
+	dsa_area_pool *pool = &area->control->pools[size_class];
+	dsa_pointer result = InvalidDsaPointer;
+
+	/*
+	 * The pool lock is held for the whole function.  That is fine even though
+	 * ensure_active_superblock may call back into alloc_object to get a new
+	 * span: it does so for a different pool only, and the locks are always
+	 * taken in the same order.
+	 */
+	Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+	LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+	/*
+	 * Without an active superblock we have to obtain one; if that is not
+	 * possible the request fails and 'result' stays invalid.
+	 */
+	if (DsaPointerIsValid(pool->spans[1]) ||
+		ensure_active_superblock(area, pool, size_class))
+	{
+		dsa_area_span *active;
+		dsa_pointer block;
+		Size		obsize;
+
+		/*
+		 * Fullness class 1 now has a block at its head, and that block is
+		 * never completely full.  Take an object off its free list or, if
+		 * the list is empty, carve out one that was never handed out before.
+		 */
+		Assert(DsaPointerIsValid(pool->spans[1]));
+		active = (dsa_area_span *)
+			dsa_get_address(area, pool->spans[1]);
+		Assert(active->nallocatable > 0);
+		block = active->start;
+		Assert(size_class < DSA_NUM_SIZE_CLASSES);
+		obsize = dsa_size_classes[size_class];
+		if (active->firstfree == DSA_SPAN_NOTHING_FREE)
+		{
+			result = block + active->ninitialized * obsize;
+			active->ninitialized++;
+		}
+		else
+		{
+			char	   *reused;
+
+			result = block + active->firstfree * obsize;
+			reused = dsa_get_address(area, result);
+			active->firstfree = NextFreeObjectIndex(reused);
+		}
+		active->nallocatable--;
+
+		/* A block that just became full belongs in the last fullness class. */
+		if (active->nallocatable == 0)
+			transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
+	}
+
+	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+	LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+
+	return result;
+}
+
+/*
+ * Ensure an active (i.e. fullness class 1) superblock, unless all existing
+ * superblocks are completely full and no more can be allocated.
+ *
+ * The caller must hold the lock of 'pool', which is the pool for 'size_class'.
+ * Returns true once fullness class 1 is non-empty.  Returns false if no span
+ * with free space exists and a new superblock cannot be allocated; a span
+ * object obtained along the way is given back before reporting failure.
+ *
+ * Fullness classes K of 0..N are loosely intended to represent blocks whose
+ * utilization percentage is at least K/N, but we only enforce this rigorously
+ * for the highest-numbered fullness class, which always contains exactly
+ * those blocks that are completely full.  It's otherwise acceptable for a
+ * block to be in a higher-numbered fullness class than the one to which it
+ * logically belongs.  In addition, the active block, which is always the
+ * first block in fullness class 1, is permitted to have a higher allocation
+ * percentage than would normally be allowable for that fullness class; we
+ * don't move it until it's completely full, and then it goes to the
+ * highest-numbered fullness class.
+ *
+ * It might seem odd that the active block is the head of fullness class 1
+ * rather than fullness class 0, but experience with other allocators has
+ * shown that it's usually better to allocate from a block that's moderately
+ * full rather than one that's nearly empty.  Insofar as is reasonably
+ * possible, we want to avoid performing new allocations in a block that would
+ * otherwise become empty soon.
+ */
+static bool
+ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
+						 int size_class)
+{
+	dsa_pointer span_pointer;
+	dsa_pointer start_pointer;
+	Size		obsize = dsa_size_classes[size_class];
+	Size		nmax;
+	int			fclass;
+	Size		npages = 1;
+	Size		first_page;
+	Size		i;
+	dsa_segment_map *segment_map;
+
+	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+	/*
+	 * Compute the number of objects that will fit in a block of this size
+	 * class.  Span-of-spans blocks are just a single page, and the first
+	 * object isn't available for use because it describes the block-of-spans
+	 * itself.
+	 */
+	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+		nmax = FPM_PAGE_SIZE / obsize - 1;
+	else
+		nmax = DSA_SUPERBLOCK_SIZE / obsize;
+
+	/*
+	 * If fullness class 1 is empty, try to find a span to put in it by
+	 * scanning higher-numbered fullness classes (excluding the last one,
+	 * whose blocks are certain to all be completely full).
+	 */
+	for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
+	{
+		span_pointer = pool->spans[fclass];
+
+		while (DsaPointerIsValid(span_pointer))
+		{
+			int			tfclass;
+			dsa_area_span *span;
+			dsa_area_span *nextspan;
+			dsa_area_span *prevspan;
+			dsa_pointer next_span_pointer;
+
+			span = (dsa_area_span *)
+				dsa_get_address(area, span_pointer);
+			next_span_pointer = span->nextspan;
+
+			/*
+			 * Figure out what fullness class should contain this span, from
+			 * the fraction of its objects that are currently allocated.
+			 */
+			tfclass = (nmax - span->nallocatable)
+				* (DSA_FULLNESS_CLASSES - 1) / nmax;
+
+			/* Look up next span. */
+			if (DsaPointerIsValid(span->nextspan))
+				nextspan = (dsa_area_span *)
+					dsa_get_address(area, span->nextspan);
+			else
+				nextspan = NULL;
+
+			/*
+			 * If utilization has dropped enough that this now belongs in some
+			 * other fullness class, move it there.
+			 */
+			if (tfclass < fclass)
+			{
+				/* Remove from the current fullness class list. */
+				if (pool->spans[fclass] == span_pointer)
+				{
+					/* It was the head; remove it. */
+					Assert(!DsaPointerIsValid(span->prevspan));
+					pool->spans[fclass] = span->nextspan;
+					if (nextspan != NULL)
+						nextspan->prevspan = InvalidDsaPointer;
+				}
+				else
+				{
+					/* It was not the head. */
+					Assert(DsaPointerIsValid(span->prevspan));
+					prevspan = (dsa_area_span *)
+						dsa_get_address(area, span->prevspan);
+					prevspan->nextspan = span->nextspan;
+				}
+				if (nextspan != NULL)
+					nextspan->prevspan = span->prevspan;
+
+				/* Push onto the head of the new fullness class list. */
+				span->nextspan = pool->spans[tfclass];
+				pool->spans[tfclass] = span_pointer;
+				span->prevspan = InvalidDsaPointer;
+				if (DsaPointerIsValid(span->nextspan))
+				{
+					nextspan = (dsa_area_span *)
+						dsa_get_address(area, span->nextspan);
+					nextspan->prevspan = span_pointer;
+				}
+				span->fclass = tfclass;
+			}
+
+			/* Advance to next span on list. */
+			span_pointer = next_span_pointer;
+		}
+
+		/* Stop now if we found a suitable block. */
+		if (DsaPointerIsValid(pool->spans[1]))
+			return true;
+	}
+
+	/*
+	 * If there are no blocks that properly belong in fullness class 1, pick
+	 * one from some other fullness class and move it there anyway, so that we
+	 * have an allocation target.  Our last choice is to transfer a block
+	 * that's almost empty (and might become completely empty soon if left
+	 * alone), but even that is better than failing, which is what we must do
+	 * if there are no blocks at all with freespace.
+	 */
+	Assert(!DsaPointerIsValid(pool->spans[1]));
+	for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
+		if (transfer_first_span(area, pool, fclass, 1))
+			return true;
+	if (!DsaPointerIsValid(pool->spans[1]) &&
+		transfer_first_span(area, pool, 0, 1))
+		return true;
+
+	/*
+	 * We failed to find an existing span with free objects, so we need to
+	 * allocate a new superblock and construct a new span to manage it.
+	 *
+	 * First, get a dsa_area_span object to describe the new superblock block
+	 * ... unless this allocation is for a dsa_area_span object, in which case
+	 * that's surely not going to work.  We handle that case by storing the
+	 * span describing a block-of-spans inline.
+	 */
+	if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+	{
+		span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
+		if (!DsaPointerIsValid(span_pointer))
+			return false;
+		npages = DSA_PAGES_PER_SUPERBLOCK;
+	}
+
+	/* Find or create a segment and allocate the superblock. */
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	segment_map = get_best_segment(area, npages);
+	if (segment_map == NULL)
+		segment_map = make_new_segment(area, npages);
+	if (segment_map == NULL ||
+		!FreePageManagerGet(segment_map->fpm, npages, &first_page))
+	{
+		/*
+		 * No space to be had.  Give back the span object allocated above
+		 * (if any) before reporting failure, so that it is not leaked.
+		 */
+		LWLockRelease(DSA_AREA_LOCK(area));
+		if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+			dsa_free(area, span_pointer);
+		return false;
+	}
+	LWLockRelease(DSA_AREA_LOCK(area));
+
+	/* Compute the start of the superblock. */
+	start_pointer =
+		DSA_MAKE_POINTER(get_segment_index(area, segment_map),
+						 first_page * FPM_PAGE_SIZE);
+
+	/*
+	 * If this is a block-of-spans, carve the descriptor right out of the
+	 * allocated space.
+	 */
+	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+		span_pointer = start_pointer;
+
+	/* Initialize span and pagemap. */
+	init_span(area, span_pointer, pool, start_pointer, npages, size_class);
+	for (i = 0; i < npages; ++i)
+		segment_map->pagemap[first_page + i] = span_pointer;
+
+	return true;
+}
+
+/*
+ * Look up the segment map for a segment index, attaching to and mapping the
+ * segment first if this backend has not done so yet.  Internal segment
+ * book-keeping calls this with the area lock held.  dsa_free and
+ * dsa_get_address call it without any lock: they are given a segment index
+ * known to be live, and they always run check_for_freed_segments first, so
+ * a freed segment that used to occupy the same slot is detached beforehand.
+ */
+static dsa_segment_map *
+get_segment_by_index(dsa_area *area, dsa_segment_index index)
+{
+	dsa_segment_map *map = &area->segment_maps[index];
+	ResourceOwner saved_owner;
+	dsm_handle	handle;
+	dsm_segment *seg;
+
+	/* The common case: this backend has the segment mapped already. */
+	if (!unlikely(map->mapped_address == NULL))
+		return map;
+
+	/*
+	 * When we are reached from dsa_free or dsa_get_address, the referenced
+	 * segment must hold at least one allocated object; otherwise the caller
+	 * has a double-free or access-after-free bug, which we have no hope of
+	 * detecting.  Hence the handle slot cannot change underneath us and it
+	 * is safe to read it without holding a lock.  We are also sure to see
+	 * the latest contents of the slot, as explained in
+	 * check_for_freed_segments, which those functions call before arriving
+	 * here.
+	 */
+	handle = area->control->segment_handles[index];
+
+	/* It's an error to try to access an unused slot. */
+	if (handle == DSM_HANDLE_INVALID)
+		elog(ERROR,
+			 "dsa_area could not attach to a segment that has been freed");
+
+	/* Attach with the area's resource owner rather than the current one. */
+	saved_owner = CurrentResourceOwner;
+	CurrentResourceOwner = area->resowner;
+	seg = dsm_attach(handle);
+	CurrentResourceOwner = saved_owner;
+	if (seg == NULL)
+		elog(ERROR, "dsa_area could not attach to segment");
+
+	/* Fill in the map: header first, then the FPM, then the page map. */
+	map->segment = seg;
+	map->mapped_address = dsm_segment_address(seg);
+	map->header = (dsa_segment_header *) map->mapped_address;
+	map->fpm = (FreePageManager *)
+		(map->mapped_address + MAXALIGN(sizeof(dsa_segment_header)));
+	map->pagemap = (dsa_pointer *)
+		(map->mapped_address + MAXALIGN(sizeof(dsa_segment_header)) +
+		 MAXALIGN(sizeof(FreePageManager)));
+
+	/* Remember the highest index this backend has ever mapped. */
+	if (index > area->high_segment_index)
+		area->high_segment_index = index;
+
+	Assert(map->header->magic ==
+		   (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
+
+	return map;
+}
+
+/*
+ * Return the superblock described by 'span_pointer' to the free page manager.
+ * If that leaves the underlying segment entirely free, return the segment to
+ * the operating system, unless it is segment 0, which holds the control data.
+ * The appropriate pool lock must be held.
+ */
+static void
+destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
+{
+	dsa_area_span *span = dsa_get_address(area, span_pointer);
+	int			size_class = span->size_class;
+	dsa_segment_map *segment_map;
+
+	segment_map =
+		get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
+
+	/* Remove it from its fullness class list. */
+	unlink_span(area, span);
+
+	/*
+	 * Note: Here we acquire the area lock while we already hold a per-pool
+	 * lock.  We never hold the area lock and then take a pool lock, or we
+	 * could deadlock.
+	 */
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+	FreePageManagerPut(segment_map->fpm,
+					   DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
+					   span->npages);
+	/* Check if the segment is now entirely free. */
+	if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
+	{
+		dsa_segment_index index = get_segment_index(area, segment_map);
+
+		/* If it's not the segment with extra control data, free it. */
+		if (index != 0)
+		{
+			/*
+			 * Give it back to the OS, and allow other backends to detect that
+			 * they need to detach.
+			 */
+			unlink_segment(area, segment_map);
+			segment_map->header->freed = true;
+			Assert(area->control->total_segment_size >=
+				   segment_map->header->size);
+			area->control->total_segment_size -=
+				segment_map->header->size;
+			dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
+			dsm_detach(segment_map->segment);
+			area->control->segment_handles[index] = DSM_HANDLE_INVALID;
+			++area->control->freed_segment_counter;
+			segment_map->segment = NULL;
+			segment_map->header = NULL;
+			segment_map->mapped_address = NULL;
+		}
+	}
+	LWLockRelease(DSA_AREA_LOCK(area));
+
+	/*
+	 * Span-of-spans blocks store the span which describes them within the
+	 * block itself, so freeing the storage implicitly frees the descriptor
+	 * also.  If this is a block of any other type, we need to separately free
+	 * the span object also.  This recursive call to dsa_free will acquire the
+	 * span pool's lock.  We can't deadlock because the acquisition order is
+	 * always some other pool and then the span pool.
+	 */
+	if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+		dsa_free(area, span_pointer);
+}
+
+/*
+ * Take a span out of the list of its fullness class, fixing up its
+ * neighbors or, when it was the first one, the pool's list head.
+ */
+static void
+unlink_span(dsa_area *area, dsa_area_span *span)
+{
+	dsa_pointer before = span->prevspan;
+	dsa_pointer after = span->nextspan;
+
+	if (DsaPointerIsValid(after))
+		((dsa_area_span *) dsa_get_address(area, after))->prevspan = before;
+
+	if (DsaPointerIsValid(before))
+		((dsa_area_span *) dsa_get_address(area, before))->nextspan = after;
+	else
+	{
+		dsa_area_pool *pool = dsa_get_address(area, span->pool);
+
+		pool->spans[span->fclass] = after;
+	}
+}
+
+/*
+ * Push a span onto the head of fullness class 'fclass' of its own pool.
+ */
+static void
+add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
+						   dsa_pointer span_pointer,
+						   int fclass)
+{
+	dsa_area_pool *pool = dsa_get_address(area, span->pool);
+	dsa_pointer old_head = pool->spans[fclass];
+
+	if (DsaPointerIsValid(old_head))
+		((dsa_area_span *) dsa_get_address(area, old_head))->prevspan =
+			span_pointer;
+	span->fclass = fclass;
+	span->prevspan = InvalidDsaPointer;
+	span->nextspan = old_head;
+	pool->spans[fclass] = span_pointer;
+}
+
+/*
+ * Detach from an area that was either created or attached to by this process.
+ */
+void
+dsa_detach(dsa_area *area)
+{
+	int			slot = 0;
+
+	/* Drop this backend's mapping of every segment it has attached. */
+	while (slot <= area->high_segment_index)
+	{
+		dsm_segment *mapped = area->segment_maps[slot++].segment;
+
+		if (mapped != NULL)
+			dsm_detach(mapped);
+	}
+
+	/*
+	 * Only the DSM mappings are dropped here; the reference count is not
+	 * adjusted ('released').  Client code may never reach dsa_detach on an
+	 * error path, and a detach hook on one segment is too late to detach the
+	 * others without risking a 'leak' warning in the non-error path.
+	 */
+	pfree(area);				/* release the backend-local area object */
+}
+
+/*
+ * Unlink a segment from the bin that contains it.
+ */
+static void
+unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
+{
+	dsa_segment_header *self = segment_map->header;
+	dsa_segment_map *neighbor;
+
+	if (self->prev == DSA_SEGMENT_INDEX_NONE)
+	{
+		/* No predecessor: we must be the first segment in our bin. */
+		Assert(area->control->segment_bins[self->bin] ==
+			   get_segment_index(area, segment_map));
+		area->control->segment_bins[self->bin] = self->next;
+	}
+	else
+	{
+		neighbor = get_segment_by_index(area, self->prev);
+		neighbor->header->next = self->next;
+	}
+	/* Make the successor, if any, point back past us. */
+	if (self->next != DSA_SEGMENT_INDEX_NONE)
+	{
+		neighbor = get_segment_by_index(area, self->next);
+		neighbor->header->prev = self->prev;
+	}
+}
+
+/*
+ * Find a segment that could satisfy a request for 'npages' of contiguous
+ * memory, or return NULL if none can be found.  This may involve attaching to
+ * segments that weren't previously attached so that we can query their free
+ * pages map.
+ */
+static dsa_segment_map *
+get_best_segment(dsa_area *area, Size npages)
+{
+	Size		bin;
+
+	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+
+	/*
+	 * Start searching from the first bin that *might* have enough contiguous
+	 * pages.
+	 */
+	for (bin = contiguous_pages_to_segment_bin(npages);
+		 bin < DSA_NUM_SEGMENT_BINS;
+		 ++bin)
+	{
+		/*
+		 * Smallest contiguous run a segment in this bin should have; fewer
+		 * means re-bin.  NOTE(review): assumes bin >= 1 -- confirm callers.
+		 */
+		Size		threshold = 1 << (bin - 1);
+		dsa_segment_index segment_index;
+
+		/* Walk this bin's list, starting from the head stored in control. */
+		segment_index = area->control->segment_bins[bin];
+		while (segment_index != DSA_SEGMENT_INDEX_NONE)
+		{
+			dsa_segment_map *segment_map;
+			dsa_segment_index next_segment_index;
+			Size		contiguous_pages;
+
+			segment_map = get_segment_by_index(area, segment_index);
+			next_segment_index = segment_map->header->next;
+			contiguous_pages = fpm_largest(segment_map->fpm);
+
+			/* Too small for the request but correctly binned: skip it. */
+			if (contiguous_pages >= threshold && contiguous_pages < npages)
+			{
+				segment_index = next_segment_index;
+				continue;
+			}
+
+			/* Re-bin it if it's no longer in the appropriate bin. */
+			if (contiguous_pages < threshold)
+			{
+				Size		new_bin;
+
+				new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
+
+				/* Remove it from its current bin. */
+				unlink_segment(area, segment_map);
+
+				/* Push it onto the front of its new bin. */
+				segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+				segment_map->header->next =
+					area->control->segment_bins[new_bin];
+				segment_map->header->bin = new_bin;
+				area->control->segment_bins[new_bin] = segment_index;
+				if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+				{
+					dsa_segment_map *next;
+
+					next = get_segment_by_index(area,
+												segment_map->header->next);
+					Assert(next->header->bin == new_bin);
+					next->header->prev = segment_index;
+				}
+
+				/*
+				 * But fall through to see if it's enough to satisfy this
+				 * request anyway....
+				 */
+			}
+
+			/* Check if we are done. */
+			if (contiguous_pages >= npages)
+				return segment_map;
+
+			/* Continue with the successor saved before any re-binning. */
+			segment_index = next_segment_index;
+		}
+	}
+
+	/* Not found. */
+	return NULL;
+}
+
+/*
+ * Create a new segment that can handle at least requested_pages.  Returns
+ * NULL if the requested total size limit or maximum allowed number of
+ * segments would be exceeded.
+ */
+static dsa_segment_map *
+make_new_segment(dsa_area *area, Size requested_pages)
+{
+	dsa_segment_index new_index;
+	Size		metadata_bytes;
+	Size		total_size;
+	Size		total_pages;
+	Size		usable_pages;
+	dsa_segment_map *segment_map;
+	dsm_segment *segment;
+	ResourceOwner oldowner;
+
+	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+
+	/* Find an unused segment slot, scanning linearly from slot 1 for now. */
+	for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
+	{
+		if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
+			break;
+	}
+	if (new_index == DSA_MAX_SEGMENTS)
+		return NULL;			/* every slot from 1 upwards is taken */
+
+	/*
+	 * If the total size limit is already exceeded, then we exit early and
+	 * avoid arithmetic wraparound in the unsigned expressions below.
+	 */
+	if (area->control->total_segment_size >=
+		area->control->max_total_segment_size)
+		return NULL;
+
+	/*
+	 * The size should be at least as big as requested, and at least big
+	 * enough to follow a geometric series that approximately doubles the
+	 * total storage each time we create a new segment.  We use geometric
+	 * growth because the underlying DSM system isn't designed for large
+	 * numbers of segments (otherwise we might even consider just using one
+	 * DSM segment for each large allocation and for each superblock, and then
+	 * we wouldn't need to use FreePageManager).
+	 *
+	 * We decide on a total segment size first, so that we produce tidy
+	 * power-of-two sized segments.  This is a good property to have if we
+	 * move to huge pages in the future.  Then we work back to the number of
+	 * pages we can fit.
+	 */
+	total_size = DSA_INITIAL_SEGMENT_SIZE *
+		((Size) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
+	total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
+	total_size = Min(total_size,
+					 area->control->max_total_segment_size -
+					 area->control->total_segment_size);
+
+	total_pages = total_size / FPM_PAGE_SIZE;
+	metadata_bytes =
+		MAXALIGN(sizeof(dsa_segment_header)) +
+		MAXALIGN(sizeof(FreePageManager)) +
+		sizeof(dsa_pointer) * total_pages;
+
+	/* Add padding up to next page boundary. */
+	if (metadata_bytes % FPM_PAGE_SIZE != 0)
+		metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+	if (total_size <= metadata_bytes)
+		return NULL;
+	usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
+	Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
+
+	/* See if that is enough... */
+	if (requested_pages > usable_pages)
+	{
+		/*
+		 * We'll make an odd-sized segment, working forward from the requested
+		 * number of pages.
+		 */
+		usable_pages = requested_pages;
+		metadata_bytes =
+			MAXALIGN(sizeof(dsa_segment_header)) +
+			MAXALIGN(sizeof(FreePageManager)) +
+			usable_pages * sizeof(dsa_pointer);
+
+		/* Add padding up to next page boundary. */
+		if (metadata_bytes % FPM_PAGE_SIZE != 0)
+			metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+		total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
+
+		/* Is that too large for dsa_pointer's addressing scheme? */
+		if (total_size > DSA_MAX_SEGMENT_SIZE)
+			return NULL;
+
+		/* Would that exceed the limit? */
+		if (total_size > area->control->max_total_segment_size -
+			area->control->total_segment_size)
+			return NULL;
+	}
+
+	/* Create the segment under the area's resource owner, and pin it. */
+	oldowner = CurrentResourceOwner;
+	CurrentResourceOwner = area->resowner;
+	segment = dsm_create(total_size);
+	CurrentResourceOwner = oldowner;
+	if (segment == NULL)
+		return NULL;
+	dsm_pin_segment(segment);
+
+	/* Store the handle in shared memory to be found by index. */
+	area->control->segment_handles[new_index] =
+		dsm_segment_handle(segment);
+	/* Track the highest segment index in the history of the area. */
+	if (area->control->high_segment_index < new_index)
+		area->control->high_segment_index = new_index;
+	/* Track the highest segment index this backend has ever mapped. */
+	if (area->high_segment_index < new_index)
+		area->high_segment_index = new_index;
+	/* Track total size of all segments. */
+	area->control->total_segment_size += total_size;
+	Assert(area->control->total_segment_size <=
+		   area->control->max_total_segment_size);
+
+	/* Build a segment map for this segment in this backend. */
+	segment_map = &area->segment_maps[new_index];
+	segment_map->segment = segment;
+	segment_map->mapped_address = dsm_segment_address(segment);
+	segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
+	segment_map->fpm = (FreePageManager *)
+		(segment_map->mapped_address +
+		 MAXALIGN(sizeof(dsa_segment_header)));
+	segment_map->pagemap = (dsa_pointer *)
+		(segment_map->mapped_address +
+		 MAXALIGN(sizeof(dsa_segment_header)) +
+		 MAXALIGN(sizeof(FreePageManager)));
+
+	/* Set up the free page map: usable pages start after the metadata. */
+	FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+	FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+					   usable_pages);
+
+	/* Set up the segment header and put it in the appropriate bin. */
+	segment_map->header->magic =
+		DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
+	segment_map->header->usable_pages = usable_pages;
+	segment_map->header->size = total_size;
+	segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+	segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+	segment_map->header->next =
+		area->control->segment_bins[segment_map->header->bin];
+	segment_map->header->freed = false;
+	area->control->segment_bins[segment_map->header->bin] = new_index;
+	if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+	{
+		dsa_segment_map *next =
+		get_segment_by_index(area, segment_map->header->next);
+
+		Assert(next->header->bin == segment_map->header->bin);
+		next->header->prev = new_index;
+	}
+
+	return segment_map;
+}
+
+/*
+ * Check if any segments have been freed by destroy_superblock, so we can
+ * detach from them in this backend.  This function is called by
+ * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
+ * received can be resolved to the correct segment.
+ *
+ * The danger we want to defend against is that there could be an old segment
+ * mapped into a given slot in this backend, and the dsa_pointer they have
+ * might refer to some new segment in the same slot.  So those functions must
+ * be sure to process all instructions to detach from a freed segment that had
+ * been generated by the time this process received the dsa_pointer, before
+ * they call get_segment_by_index.
+ */
+static void
+check_for_freed_segments(dsa_area *area)
+{
+	Size		freed_segment_counter;
+
+	/*
+	 * Any other process that has freed a segment has incremented
+	 * freed_segment_counter while holding an LWLock, and that must precede any
+	 * backend creating a new segment in the same slot while holding an
+	 * LWLock, and that must precede the creation of any dsa_pointer pointing
+	 * into the new segment which might reach us here, and the caller must
+	 * have sent the dsa_pointer to this process using appropriate memory
+	 * synchronization (some kind of locking or atomic primitive or system
+	 * call).  So all we need to do on the reading side is ask for the load of
+	 * freed_segment_counter to follow the caller's load of the dsa_pointer it
+	 * has, and we can be sure to detect any segments that had been freed as
+	 * of the time that the dsa_pointer reached this process.
+	 */
+	pg_read_barrier();
+	freed_segment_counter = area->control->freed_segment_counter;
+	if (unlikely(area->freed_segment_counter != freed_segment_counter))
+	{
+		int			i;
+
+		/* Check all currently mapped segments to find what's been freed. */
+		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+		for (i = 0; i <= area->high_segment_index; ++i)
+		{
+			if (area->segment_maps[i].header != NULL &&
+				area->segment_maps[i].header->freed)
+			{
+				dsm_detach(area->segment_maps[i].segment);
+				area->segment_maps[i].segment = NULL;
+				area->segment_maps[i].header = NULL;
+				area->segment_maps[i].mapped_address = NULL;
+			}
+		}
+		LWLockRelease(DSA_AREA_LOCK(area));
+		area->freed_segment_counter = freed_segment_counter;	/* caught up */
+	}
+}
diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c
new file mode 100644
index 000000000000..8c017a425a44
--- /dev/null
+++ b/src/backend/utils/mmgr/freepage.c
@@ -0,0 +1,1886 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.c
+ *	  Management of free memory pages.
+ *
+ * The intention of this code is to provide infrastructure for memory
+ * allocators written specifically for PostgreSQL.  At least in the case
+ * of dynamic shared memory, we can't simply use malloc() or even
+ * relatively thin wrappers like palloc() which sit on top of it, because
+ * no allocator built into the operating system will deal with relative
+ * pointers.  In the future, we may find other cases in which greater
+ * control over our own memory management seems desirable.
+ *
+ * A FreePageManager keeps track of which 4kB pages of memory are currently
+ * unused from the point of view of some higher-level memory allocator.
+ * Unlike a user-facing allocator such as palloc(), a FreePageManager can
+ * only allocate and free in units of whole pages, and freeing an
+ * allocation can only be done given knowledge of its length in pages.
+ *
+ * Since a free page manager has only a fixed amount of dedicated memory,
+ * and since there is no underlying allocator, it uses the free pages
+ * it is given to manage to store its bookkeeping data.  It keeps multiple
+ * freelists of runs of pages, sorted by the size of the run; the head of
+ * each freelist is stored in the FreePageManager itself, and the first
+ * page of each run contains a relative pointer to the next run. See
+ * FreePageManagerGetInternal for more details on how the freelists are
+ * managed.
+ *
+ * To avoid memory fragmentation, it's important to consolidate adjacent
+ * spans of pages whenever possible; otherwise, large allocation requests
+ * might not be satisfied even when sufficient contiguous space is
+ * available.  Therefore, in addition to the freelists, we maintain an
+ * in-memory btree of free page ranges ordered by page number.  If a
+ * range being freed precedes or follows a range that is already free,
+ * the existing range is extended; if it exactly bridges the gap between
+ * free ranges, then the two existing ranges are consolidated with the
+ * newly-freed range to form one great big range of free pages.
+ *
+ * When there is only one range of free pages, the btree is trivial and
+ * is stored within the FreePageManager proper; otherwise, pages are
+ * allocated from the area under management as needed.  Even in cases
+ * where memory fragmentation is very severe, only a tiny fraction of
+ * the pages under management are consumed by this btree.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/freepage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+
+#include "utils/freepage.h"
+#include "utils/relptr.h"
+
+
+/* Magic numbers to identify various page types */
+#define FREE_PAGE_SPAN_LEADER_MAGIC		0xea4020f0
+#define FREE_PAGE_LEAF_MAGIC			0x98eae728
+#define FREE_PAGE_INTERNAL_MAGIC		0x19aa32c9
+
+/* Doubly linked list of spans of free pages; stored in first page of span. */
+struct FreePageSpanLeader
+{
+	int			magic;			/* always FREE_PAGE_SPAN_LEADER_MAGIC */
+	Size		npages;			/* number of pages in span */
+	RelptrFreePageSpanLeader prev;	/* previous span in the same list */
+	RelptrFreePageSpanLeader next;	/* next span in the same list */
+};
+
+/* Common header for btree leaf and internal pages. */
+typedef struct FreePageBtreeHeader
+{
+	int			magic;			/* FREE_PAGE_LEAF_MAGIC or
+								 * FREE_PAGE_INTERNAL_MAGIC */
+	Size		nused;			/* number of items used */
+	RelptrFreePageBtree parent; /* uplink */
+} FreePageBtreeHeader;
+
+/* Internal key; points to next level of btree. */
+typedef struct FreePageBtreeInternalKey
+{
+	Size		first_page;		/* low bound for keys on child page */
+	RelptrFreePageBtree child;	/* downlink */
+} FreePageBtreeInternalKey;
+
+/* Leaf key; no payload data. */
+typedef struct FreePageBtreeLeafKey
+{
+	Size		first_page;		/* first page in span */
+	Size		npages;			/* number of pages in span */
+} FreePageBtreeLeafKey;
+
+/* Work out how many keys will fit on a page. */
+#define FPM_ITEMS_PER_INTERNAL_PAGE \
+	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+		sizeof(FreePageBtreeInternalKey))
+#define FPM_ITEMS_PER_LEAF_PAGE \
+	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+		sizeof(FreePageBtreeLeafKey))
+
+/* A btree page of either sort */
+struct FreePageBtree
+{
+	FreePageBtreeHeader hdr;
+	union
+	{
+		FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
+		FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
+	}			u;				/* hdr.magic tells which member is in use */
+};
+
+/* Results of a btree search */
+typedef struct FreePageBtreeSearchResult
+{
+	FreePageBtree *page;
+	Size		index;
+	bool		found;
+	unsigned	split_pages;
+} FreePageBtreeSearchResult;
+
+/* Helper functions */
+static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
+								FreePageBtree *btp);
+static Size FreePageBtreeCleanup(FreePageManager *fpm);
+static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
+							 FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
+							  FreePageBtree *btp);
+static Size FreePageBtreeFirstKey(FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
+static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
+						  Size index, Size first_page, FreePageBtree *child);
+static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
+						Size first_page, Size npages);
+static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
+static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
+					Size index);
+static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
+static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+					FreePageBtreeSearchResult *result);
+static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
+static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
+static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
+					   FreePageBtree *btp);
+static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
+static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+						 FreePageBtree *parent, int level, StringInfo buf);
+static void FreePageManagerDumpSpans(FreePageManager *fpm,
+						 FreePageSpanLeader *span, Size expected_pages,
+						 StringInfo buf);
+static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
+						   Size *first_page);
+static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
+						   Size npages, bool soft);
+static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
+static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
+					   Size npages);
+static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
+static void FreePageManagerUpdateLargest(FreePageManager *fpm);
+
+#ifdef FPM_EXTRA_ASSERTS		/* same test as the definition and its callers */
+static Size sum_free_pages(FreePageManager *fpm);
+#endif
+
+/*
+ * Initialize a new, empty free page manager.
+ *
+ * 'fpm' should reference caller-provided memory large enough to contain a
+ * FreePageManager.  We'll initialize it here.
+ *
+ * 'base' is the address to which all pointers are relative.  When managing
+ * a dynamic shared memory segment, it should normally be the base of the
+ * segment.  When managing backend-private memory, it can be either NULL or,
+ * if managing a single contiguous extent of memory, the start of that extent.
+ */
+void
+FreePageManagerInitialize(FreePageManager *fpm, char *base)
+{
+	Size		list;
+
+	relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
+	relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+	relptr_store(base, fpm->self, fpm);
+	fpm->btree_recycle_count = 0;
+	fpm->btree_depth = 0;
+	fpm->singleton_npages = 0;
+	fpm->singleton_first_page = 0;
+#ifdef FPM_EXTRA_ASSERTS
+	fpm->free_pages = 0;
+#endif
+	fpm->contiguous_pages_dirty = true;
+	fpm->contiguous_pages = 0;
+	/* Every freelist starts out empty. */
+	for (list = 0; list != FPM_NUM_FREELISTS; ++list)
+		relptr_store(base, fpm->freelist[list], (FreePageSpanLeader *) NULL);
+}
+
+/*
+ * Allocate a run of pages of the given length from the free page manager.
+ * The return value indicates whether we were able to satisfy the request;
+ * if true, the first page of the allocation is stored in *first_page.
+ */
+bool
+FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
+{
+	Size		merged_run;
+	bool		satisfied;
+
+	satisfied = FreePageManagerGetInternal(fpm, npages, first_page);
+
+	/*
+	 * Perhaps surprisingly, handing pages out can make a larger free range
+	 * appear.  Removing a key from the btree may let the page at the head of
+	 * the btree recycle list be inserted, and a later page on that list may
+	 * then join two ranges that used to be separate into one contiguous
+	 * range bigger than anything that existed before.  A smarter cleanup
+	 * algorithm might avoid such corner cases; for now we simply notice when
+	 * it happens and report the new maximum.
+	 */
+	merged_run = FreePageBtreeCleanup(fpm);
+	if (merged_run > fpm->contiguous_pages)
+		fpm->contiguous_pages = merged_run;
+
+	/*
+	 * FreePageManagerGetInternal may have marked contiguous_pages dirty; if
+	 * so, recompute it now.
+	 */
+	FreePageManagerUpdateLargest(fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+	/* Keep the debugging page count in step with what we handed out. */
+	if (satisfied)
+	{
+		Assert(fpm->free_pages >= npages);
+		fpm->free_pages -= npages;
+	}
+	Assert(fpm->free_pages == sum_free_pages(fpm));
+	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
+#endif
+
+	return satisfied;
+}
+
+#ifdef FPM_EXTRA_ASSERTS
+static void
+sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
+{
+	char	   *base = fpm_segment_base(fpm);
+	Size		slot;
+
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
+		   btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	/* Count this btree page itself. */
+	*sum += 1;
+
+	/* Only internal pages have children to visit. */
+	if (btp->hdr.magic != FREE_PAGE_INTERNAL_MAGIC)
+		return;
+	for (slot = 0; slot < btp->hdr.nused; ++slot)
+	{
+		FreePageBtree *kid = relptr_access(base,
+										   btp->u.internal_key[slot].child);
+
+		sum_free_pages_recurse(fpm, kid, sum);
+	}
+}
+/*
+ * Recount, from scratch, the pages the free page manager is holding: the
+ * length of every span on every freelist, one page for each page of the
+ * btree, and one page for each entry on the btree recycle list.  This is
+ * compiled only when FPM_EXTRA_ASSERTS is defined, where callers assert
+ * that the result equals fpm->free_pages.  The manager's state is only
+ * read, never modified.
+ */
+static Size
+sum_free_pages(FreePageManager *fpm)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *span;
+	Size		total = 0;
+	int			f;
+
+	/* Add up the length of every span on every freelist. */
+	for (f = 0; f < FPM_NUM_FREELISTS; ++f)
+	{
+		for (span = relptr_access(base, fpm->freelist[f]);
+			 span != NULL;
+			 span = relptr_access(base, span->next))
+			total += span->npages;
+	}
+
+	/* Pages holding the btree itself count too, if there is a btree. */
+	if (fpm->btree_depth > 0)
+	{
+		FreePageBtree *top = relptr_access(base, fpm->btree_root);
+
+		sum_free_pages_recurse(fpm, top, &total);
+	}
+
+	/* Finally, one page for each entry on the btree recycle list. */
+	span = relptr_access(base, fpm->btree_recycle);
+	while (span != NULL)
+	{
+		Assert(span->npages == 1);
+		total += 1;
+		span = relptr_access(base, span->next);
+	}
+
+	return total;
+}
+#endif
+
+/*
+ * Compute the size of the largest run of pages that the user could
+ * successfully get.
+ */
+static Size
+FreePageManagerLargestContiguous(FreePageManager *fpm)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *span;
+	Size		best = 0;
+	Size		f;
+
+	/*
+	 * When the last freelist is non-empty, the answer is the longest span on
+	 * it, so every span on that list has to be inspected.  The remaining
+	 * freelists are not consulted in that case.
+	 */
+	span = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
+	if (span != NULL)
+	{
+		for (; span != NULL; span = relptr_access(base, span->next))
+		{
+			if (best < span->npages)
+				best = span->npages;
+		}
+		return best;
+	}
+
+	/*
+	 * Otherwise scan downwards from the second-to-last freelist for the
+	 * first one that is non-empty; freelist f stands for runs of f + 1
+	 * pages.  If all of them are empty the answer is zero.
+	 */
+	for (f = FPM_NUM_FREELISTS - 1; f-- > 0;)
+	{
+		if (!relptr_is_null(fpm->freelist[f]))
+			return f + 1;
+	}
+
+	return best;
+}
+
+/*
+ * Recompute the size of the largest run of pages that the user could
+ * successfully get, if it has been marked dirty.
+ */
+static void
+FreePageManagerUpdateLargest(FreePageManager *fpm)
+{
+	if (fpm->contiguous_pages_dirty)
+	{
+		fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
+		fpm->contiguous_pages_dirty = false;
+	}
+}
+
+/*
+ * Transfer a run of pages to the free page manager.
+ */
+void
+FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
+{
+	Size		run_length;
+
+	Assert(npages > 0);
+
+	/* Record the new pages, noting the contiguous run size reported back. */
+	run_length = FreePageManagerPutInternal(fpm, first_page, npages, false);
+
+	/*
+	 * A result larger than what we put in means the new pages joined up
+	 * with an existing range, which may have opened up cleanup
+	 * opportunities in the btree.
+	 */
+	if (run_length > npages)
+	{
+		Size		after_cleanup = FreePageBtreeCleanup(fpm);
+
+		if (run_length < after_cleanup)
+			run_length = after_cleanup;
+	}
+
+	/* Remember it if this is the largest run we have seen. */
+	if (run_length > fpm->contiguous_pages)
+		fpm->contiguous_pages = run_length;
+
+	/*
+	 * FreePageManagerPutInternal may have set contiguous_pages_dirty (for
+	 * instance if it had to allocate internal pages); recompute
+	 * contiguous_pages in that case.
+	 */
+	FreePageManagerUpdateLargest(fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+	/* Cross-check the bookkeeping against a full recount. */
+	fpm->free_pages += npages;
+	Assert(fpm->free_pages == sum_free_pages(fpm));
+	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
+#endif
+}
+
+/*
+ * Produce a debugging dump of the state of a free page manager.
+ */
+char *
+FreePageManagerDump(FreePageManager *fpm)
+{
+	StringInfoData out;
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *spans;
+	bool		need_freelist_title = true;
+	Size		list;
+
+	initStringInfo(&out);
+
+	/* Headline: the manager's own offset and its largest known free run. */
+	appendStringInfo(&out, "metadata: self %zu max contiguous pages = %zu\n",
+					 fpm->self.relptr_off, fpm->contiguous_pages);
+
+	/*
+	 * Show the whole btree when there is one; failing that, show the single
+	 * range recorded in the manager itself, provided it is non-empty.
+	 */
+	if (fpm->btree_depth > 0)
+	{
+		FreePageBtree *top = relptr_access(base, fpm->btree_root);
+
+		appendStringInfo(&out, "btree depth %u:\n", fpm->btree_depth);
+		FreePageManagerDumpBtree(fpm, top, NULL, 0, &out);
+	}
+	else if (fpm->singleton_npages > 0)
+	{
+		appendStringInfo(&out, "singleton: %zu(%zu)\n",
+						 fpm->singleton_first_page, fpm->singleton_npages);
+	}
+
+	/* Pages on the btree recycle list, if there are any. */
+	spans = relptr_access(base, fpm->btree_recycle);
+	if (spans != NULL)
+	{
+		appendStringInfo(&out, "btree recycle:");
+		FreePageManagerDumpSpans(fpm, spans, 1, &out);
+	}
+
+	/* Every non-empty freelist, under a title printed before the first. */
+	for (list = 0; list < FPM_NUM_FREELISTS; ++list)
+	{
+		spans = relptr_access(base, fpm->freelist[list]);
+		if (spans == NULL)
+			continue;
+
+		if (need_freelist_title)
+		{
+			appendStringInfo(&out, "freelists:\n");
+			need_freelist_title = false;
+		}
+		appendStringInfo(&out, "  %zu:", list + 1);
+		FreePageManagerDumpSpans(fpm, spans, list + 1, &out);
+	}
+
+	/* The caller receives the buffer's string. */
+	return out.data;
+}
+
+
+/*
+ * The first_page value stored at index zero in any non-root page must match
+ * the first_page value stored in its parent at the index which points to that
+ * page.  So when the value stored at index zero in a btree page changes, we've
+ * got to walk up the tree adjusting ancestor keys until we reach an ancestor
+ * where that key isn't index zero.  This function should be called after
+ * updating the first key on the target page; it will propagate the change
+ * upward as far as needed.
+ *
+ * We assume here that the first key on the page has not changed enough to
+ * require changes in the ordering of keys on its ancestor pages.  Thus,
+ * if we search the parent page for the first key greater than or equal to
+ * the first key on the current page, the downlink to this page will be either
+ * the exact index returned by the search (if the first key decreased)
+ * or one less (if the first key increased).
+ */
+static void
+FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
+{
+	char	   *base = fpm_segment_base(fpm);
+	Size		first_page;
+	FreePageBtree *parent;
+	FreePageBtree *child;
+
+	/* Fetch the page's first key; it may be a leaf or an internal page. */
+	Assert(btp->hdr.nused > 0);
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+	{
+		Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+		first_page = btp->u.leaf_key[0].first_page;
+	}
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+		first_page = btp->u.internal_key[0].first_page;
+	}
+	child = btp;
+
+	/* Loop until we find an ancestor that does not require adjustment. */
+	for (;;)
+	{
+		Size		s;
+
+		parent = relptr_access(base, child->hdr.parent);
+		if (parent == NULL)
+			break;				/* reached the root; nothing above it */
+		s = FreePageBtreeSearchInternal(parent, first_page);
+
+		/* Key is either at index s or index s-1; figure out which. */
+		if (s >= parent->hdr.nused)
+		{
+			Assert(s == parent->hdr.nused);
+			--s;
+		}
+		else
+		{
+			FreePageBtree *check;
+
+			check = relptr_access(base, parent->u.internal_key[s].child);
+			if (check != child)
+			{
+				Assert(s > 0);
+				--s;
+			}
+		}
+
+#ifdef USE_ASSERT_CHECKING
+		/* Debugging double-check. */
+		{
+			FreePageBtree *check;
+
+			check = relptr_access(base, parent->u.internal_key[s].child);
+			Assert(s < parent->hdr.nused);
+			Assert(child == check);
+		}
+#endif
+
+		/* Copy the new first key into the parent entry for this child. */
+		parent->u.internal_key[s].first_page = first_page;
+
+		/*
+		 * If this is the first key in the parent, go up another level; else
+		 * done.
+		 */
+		if (s > 0)
+			break;
+		child = parent;
+	}
+}
+
+/*
+ * Attempt to reclaim space from the free-page btree.  The return value is the
+ * largest run of contiguous pages created by the cleanup, or 0 if none was.
+ */
+static Size
+FreePageBtreeCleanup(FreePageManager *fpm)
+{
+	char	   *base = fpm_segment_base(fpm);
+	Size		max_contiguous_pages = 0;
+
+	/* Phase 1: attempt to shrink the depth of the btree. */
+	while (!relptr_is_null(fpm->btree_root))
+	{
+		FreePageBtree *root = relptr_access(base, fpm->btree_root);
+
+		/* If the root contains only one key, reduce depth by one. */
+		if (root->hdr.nused == 1)
+		{
+			/* Shrink depth of tree by one. */
+			Assert(fpm->btree_depth > 0);
+			--fpm->btree_depth;
+			if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+			{
+				/* If root is a leaf, convert only entry to singleton range. */
+				relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+				fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+				fpm->singleton_npages = root->u.leaf_key[0].npages;
+			}
+			else
+			{
+				FreePageBtree *newroot;
+
+				/* If root is an internal page, make only child the root. */
+				Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+				relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
+				newroot = relptr_access(base, fpm->btree_root);
+				relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
+			}
+			FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));	/* old root */
+		}
+		else if (root->hdr.nused == 2 &&
+				 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		{
+			Size		end_of_first;
+			Size		start_of_second;
+
+			end_of_first = root->u.leaf_key[0].first_page +
+				root->u.leaf_key[0].npages;
+			start_of_second = root->u.leaf_key[1].first_page;
+			/* Proceed only if exactly one page lies between the two spans... */
+			if (end_of_first + 1 == start_of_second)
+			{
+				Size		root_page = fpm_pointer_to_page(base, root);
+				/* ...and that page is the root itself: merge all three. */
+				if (end_of_first == root_page)
+				{
+					FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
+					FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
+					fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+					fpm->singleton_npages = root->u.leaf_key[0].npages +
+						root->u.leaf_key[1].npages + 1;	/* +1 for the root page */
+					fpm->btree_depth = 0;
+					relptr_store(base, fpm->btree_root,
+								 (FreePageBtree *) NULL);
+					FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+										   fpm->singleton_npages);
+					Assert(max_contiguous_pages == 0);
+					max_contiguous_pages = fpm->singleton_npages;
+				}
+			}
+
+			/* Whether it worked or not, it's time to stop. */
+			break;
+		}
+		else
+		{
+			/* Nothing more to do.  Stop. */
+			break;
+		}
+	}
+
+	/*
+	 * Phase 2: attempt to free recycled btree pages.  We skip this if
+	 * releasing the recycled page would require a btree page split, because
+	 * the page we're trying to recycle would be consumed by the split, which
+	 * would be counterproductive.
+	 *
+	 * We also currently only ever attempt to recycle the first page on the
+	 * list; that could be made more aggressive, but it's not clear that the
+	 * complexity would be worthwhile.
+	 */
+	while (fpm->btree_recycle_count > 0)
+	{
+		FreePageBtree *btp;
+		Size		first_page;
+		Size		contiguous_pages;
+
+		btp = FreePageBtreeGetRecycled(fpm);
+		first_page = fpm_pointer_to_page(base, btp);
+		contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
+		if (contiguous_pages == 0)
+		{
+			FreePageBtreeRecycle(fpm, first_page);	/* soft put refused; put it back */
+			break;
+		}
+		else
+		{
+			if (contiguous_pages > max_contiguous_pages)
+				max_contiguous_pages = contiguous_pages;
+		}
+	}
+
+	return max_contiguous_pages;
+}
+
+/*
+ * If the given page is fairly empty, try to merge it with its right sibling
+ * or, failing that, its left sibling.  Does nothing if neither merge fits.
+ */
+static void
+FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageBtree *np;
+	Size		max;
+
+	/*
+	 * We only try to consolidate pages that are less than a third full. We
+	 * could be more aggressive about this, but that might risk performing
+	 * consolidation only to end up splitting again shortly thereafter.  Since
+	 * the btree should be very small compared to the space under management,
+	 * our goal isn't so much to ensure that it always occupies the absolutely
+	 * smallest possible number of pages as to reclaim pages before things get
+	 * too egregiously out of hand.
+	 */
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		max = FPM_ITEMS_PER_LEAF_PAGE;
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		max = FPM_ITEMS_PER_INTERNAL_PAGE;
+	}
+	if (btp->hdr.nused >= max / 3)
+		return;
+
+	/*
+	 * If we can fit our right sibling's keys onto this page, consolidate.
+	 */
+	np = FreePageBtreeFindRightSibling(base, btp);
+	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+	{
+		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		{
+			memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
+				   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
+			btp->hdr.nused += np->hdr.nused;
+		}
+		else
+		{
+			memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
+				   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
+			btp->hdr.nused += np->hdr.nused;
+			FreePageBtreeUpdateParentPointers(base, btp);	/* adopt moved children */
+		}
+		FreePageBtreeRemovePage(fpm, np);	/* sibling's keys now live on btp */
+		return;
+	}
+
+	/*
+	 * If we can fit our keys onto our left sibling's page, consolidate. In
+	 * this case, we move our keys onto the other page rather than vice
+	 * versa, to avoid having to adjust ancestor keys.
+	 */
+	np = FreePageBtreeFindLeftSibling(base, btp);
+	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+	{
+		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		{
+			memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
+				   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
+			np->hdr.nused += btp->hdr.nused;
+		}
+		else
+		{
+			memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
+				   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
+			np->hdr.nused += btp->hdr.nused;
+			FreePageBtreeUpdateParentPointers(base, np);	/* adopt moved children */
+		}
+		FreePageBtreeRemovePage(fpm, btp);	/* our keys now live on np */
+		return;
+	}
+}
+
+/*
+ * Find the passed page's left sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately precedes ours (NULL if none).
+ */
+static FreePageBtree *
+FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
+{
+	FreePageBtree *p = btp;
+	int			levels = 0;		/* how far above btp's level we have climbed */
+
+	/* Move up until we can move left. */
+	for (;;)
+	{
+		Size		first_page;
+		Size		index;
+
+		first_page = FreePageBtreeFirstKey(p);
+		p = relptr_access(base, p->hdr.parent);
+
+		if (p == NULL)
+			return NULL;		/* we were passed the leftmost page */
+
+		index = FreePageBtreeSearchInternal(p, first_page);
+		if (index > 0)
+		{
+			Assert(p->u.internal_key[index].first_page == first_page);
+			p = relptr_access(base, p->u.internal_key[index - 1].child);
+			break;
+		}
+		Assert(index == 0);
+		++levels;
+	}
+
+	/* Descend along rightmost (last) children back to the starting level. */
+	while (levels > 0)
+	{
+		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
+		--levels;
+	}
+	Assert(p->hdr.magic == btp->hdr.magic);
+
+	return p;
+}
+
+/*
+ * Find the passed page's right sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately follows ours (NULL if none).
+ */
+static FreePageBtree *
+FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
+{
+	FreePageBtree *p = btp;
+	int			levels = 0;		/* how far above btp's level we have climbed */
+
+	/* Move up until we can move right. */
+	for (;;)
+	{
+		Size		first_page;
+		Size		index;
+
+		first_page = FreePageBtreeFirstKey(p);
+		p = relptr_access(base, p->hdr.parent);
+
+		if (p == NULL)
+			return NULL;		/* we were passed the rightmost page */
+
+		index = FreePageBtreeSearchInternal(p, first_page);
+		if (index < p->hdr.nused - 1)
+		{
+			Assert(p->u.internal_key[index].first_page == first_page);
+			p = relptr_access(base, p->u.internal_key[index + 1].child);
+			break;
+		}
+		Assert(index == p->hdr.nused - 1);
+		++levels;
+	}
+
+	/* Descend along leftmost (first) children back to the starting level. */
+	while (levels > 0)
+	{
+		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		p = relptr_access(base, p->u.internal_key[0].child);
+		--levels;
+	}
+	Assert(p->hdr.magic == btp->hdr.magic);
+
+	return p;
+}
+
+/*
+ * Return the first_page value of the lowest key stored on a btree page.
+ * Works for both leaf and internal pages; the page must not be empty.
+ */
+static Size
+FreePageBtreeFirstKey(FreePageBtree *btp)
+{
+	Assert(btp->hdr.nused > 0);
+
+	if (btp->hdr.magic != FREE_PAGE_LEAF_MAGIC)
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		return btp->u.internal_key[0].first_page;
+	}
+	return btp->u.leaf_key[0].first_page;
+}
+
+/*
+ * Pop the head of the (non-empty) btree recycle list for use as a btree page.
+ */
+static FreePageBtree *
+FreePageBtreeGetRecycled(FreePageManager *fpm)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
+	FreePageSpanLeader *successor;
+
+	Assert(head != NULL);
+	Assert(fpm_pointer_is_page_aligned(base, head));
+	successor = relptr_access(base, head->next);
+	if (successor != NULL)
+		relptr_copy(successor->prev, head->prev);
+	relptr_store(base, fpm->btree_recycle, successor);
+	--fpm->btree_recycle_count;
+	return (FreePageBtree *) head;
+}
+
+/*
+ * Insert a (first_page, child) downlink at position 'index' of an internal page.
+ */
+static void
+FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
+							Size first_page, FreePageBtree *child)
+{
+	FreePageBtreeInternalKey *slot = &btp->u.internal_key[index];
+
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	Assert(index <= btp->hdr.nused && btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+	memmove(slot + 1, slot, sizeof(*slot) * (btp->hdr.nused - index));
+	slot->first_page = first_page;
+	relptr_store(base, slot->child, child);
+	btp->hdr.nused++;
+}
+
+/*
+ * Insert a (first_page, npages) entry at position 'index' of a leaf page.
+ */
+static void
+FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
+						Size npages)
+{
+	FreePageBtreeLeafKey *slot = &btp->u.leaf_key[index];
+
+	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	Assert(index <= btp->hdr.nused && btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+	memmove(slot + 1, slot, sizeof(*slot) * (btp->hdr.nused - index));
+	slot->first_page = first_page;
+	slot->npages = npages;
+	btp->hdr.nused++;
+}
+
+/*
+ * Push the given page onto the front of the btree recycle list.
+ */
+static void
+FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *old_head = relptr_access(base, fpm->btree_recycle);
+	FreePageSpanLeader *leader;
+
+	leader = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+	leader->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+	leader->npages = 1;
+	relptr_store(base, leader->prev, (FreePageSpanLeader *) NULL);
+	relptr_store(base, leader->next, old_head);
+	if (old_head != NULL)
+		relptr_store(base, old_head->prev, leader);
+	relptr_store(base, fpm->btree_recycle, leader);
+	++fpm->btree_recycle_count;
+}
+
+/*
+ * Remove the key at position 'index' from leaf page 'btp' of the btree.
+ */
+static void
+FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
+{
+	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	Assert(index < btp->hdr.nused);
+
+	/* When last item is removed, extirpate entire page from btree. */
+	if (btp->hdr.nused == 1)
+	{
+		FreePageBtreeRemovePage(fpm, btp);
+		return;
+	}
+
+	/* Drop the key: slide later keys down (nothing to move if it was last). */
+	--btp->hdr.nused;
+	if (index < btp->hdr.nused)
+		memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
+				sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+
+	/* If we just removed the first key, adjust ancestor keys. */
+	if (index == 0)
+		FreePageBtreeAdjustAncestorKeys(fpm, btp);
+
+	/* Consider whether to consolidate this page with a sibling. */
+	FreePageBtreeConsolidate(fpm, btp);
+}
+
+/*
+ * Remove a page from the btree.  Caller is responsible for having relocated
+ * any keys from this page that are still wanted.  The page is placed on the
+ * recycled list, as is every ancestor whose only downlink led to it.
+ */
+static void
+FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageBtree *parent;
+	Size		index;
+	Size		first_page;
+
+	for (;;)
+	{
+		/* Find parent page. */
+		parent = relptr_access(base, btp->hdr.parent);
+		if (parent == NULL)
+		{
+			/* Removing the root: btree becomes empty (not recycled here). */
+			relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+			fpm->btree_depth = 0;
+			Assert(fpm->singleton_first_page == 0);
+			Assert(fpm->singleton_npages == 0);
+			return;
+		}
+
+		/*
+		 * If the parent contains only one item, we need to remove it as well.
+		 */
+		if (parent->hdr.nused > 1)
+			break;
+		FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+		btp = parent;
+	}
+
+	/* Find and remove the downlink (located via btp's first key). */
+	first_page = FreePageBtreeFirstKey(btp);
+	if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)	/* NOTE(review): leaf parent looks unreachable -- confirm */
+	{
+		index = FreePageBtreeSearchLeaf(parent, first_page);
+		Assert(index < parent->hdr.nused);
+		if (index < parent->hdr.nused - 1)
+			memmove(&parent->u.leaf_key[index],
+					&parent->u.leaf_key[index + 1],
+					sizeof(FreePageBtreeLeafKey)
+					* (parent->hdr.nused - index - 1));
+	}
+	else
+	{
+		index = FreePageBtreeSearchInternal(parent, first_page);
+		Assert(index < parent->hdr.nused);
+		if (index < parent->hdr.nused - 1)
+			memmove(&parent->u.internal_key[index],
+					&parent->u.internal_key[index + 1],
+					sizeof(FreePageBtreeInternalKey)
+					* (parent->hdr.nused - index - 1));
+	}
+	parent->hdr.nused--;
+	Assert(parent->hdr.nused > 0);
+
+	/* Recycle the page. */
+	FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+
+	/* Adjust ancestor keys if needed (the parent's first key changed). */
+	if (index == 0)
+		FreePageBtreeAdjustAncestorKeys(fpm, parent);
+
+	/* Consider whether to consolidate the parent with a sibling. */
+	FreePageBtreeConsolidate(fpm, parent);
+}
+
+/*
+ * Search the btree for an entry whose key is the given first page and fill in
+ * *result.  result->page and result->index give either the position of an
+ * exact match or the position at which the new key should be inserted;
+ * result->found is true only for an exact match.  result->split_pages is the
+ * number of additional btree pages a split would need in order to insert a
+ * key.  If the btree is empty, result->page is NULL and result->index is not
+ * set.  Except as described above, the contents of fields in the result
+ * object are undefined on return.
+ */
+static void
+FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+					FreePageBtreeSearchResult *result)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageBtree *btp = relptr_access(base, fpm->btree_root);
+	Size		index;
+
+	result->split_pages = 1;
+
+	/* If the btree is empty, there's nothing to find. */
+	if (btp == NULL)
+	{
+		result->page = NULL;
+		result->found = false;
+		return;
+	}
+
+	/* Descend until we hit a leaf. */
+	while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+	{
+		FreePageBtree *child;
+		bool		found_exact;
+
+		index = FreePageBtreeSearchInternal(btp, first_page);
+		found_exact = index < btp->hdr.nused &&
+			btp->u.internal_key[index].first_page == first_page;
+
+		/*
+		 * If we found an exact match we descend directly.  Otherwise, we
+		 * descend into the child to the left if possible so that we can find
+		 * the insertion point at that child's high end.
+		 */
+		if (!found_exact && index > 0)
+			--index;
+
+		/* Count consecutive full pages on the path; a non-full one resets. */
+		if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
+		{
+			Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
+			result->split_pages++;
+		}
+		else
+			result->split_pages = 0;
+
+		/* Descend to appropriate child page. */
+		Assert(index < btp->hdr.nused);
+		child = relptr_access(base, btp->u.internal_key[index].child);
+		Assert(relptr_access(base, child->hdr.parent) == btp);
+		btp = child;
+	}
+
+	/* Same accounting for the leaf; it must be checked against the leaf limit. */
+	if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
+	{
+		Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
+		result->split_pages++;
+	}
+	else
+		result->split_pages = 0;
+
+	/* Search leaf page. */
+	index = FreePageBtreeSearchLeaf(btp, first_page);
+
+	/* Assemble results. */
+	result->page = btp;
+	result->index = index;
+	result->found = index < btp->hdr.nused &&
+		first_page == btp->u.leaf_key[index].first_page;
+}
+
+/*
+ * Binary-search an internal page for the first key greater than or equal to
+ * the given page number.  Returns that key's index; if every key on the page
+ * is smaller, returns the number of keys on the page.
+ */
+static Size
+FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
+{
+	Size		lo = 0;
+	Size		hi = btp->hdr.nused;
+
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	Assert(hi > 0 && hi <= FPM_ITEMS_PER_INTERNAL_PAGE);
+
+	while (lo < hi)
+	{
+		Size		probe = lo + (hi - lo) / 2;
+		Size		key = btp->u.internal_key[probe].first_page;
+
+		if (key < first_page)
+			lo = probe + 1;
+		else if (key > first_page)
+			hi = probe;
+		else
+			return probe;
+	}
+
+	return lo;
+}
+
+/*
+ * Binary-search a leaf page for the first key greater than or equal to the
+ * given page number.  Returns that key's index; if every key on the page is
+ * smaller, returns the number of keys on the page.
+ */
+static Size
+FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
+{
+	Size		lo = 0;
+	Size		hi = btp->hdr.nused;
+
+	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+	Assert(hi > 0 && hi <= FPM_ITEMS_PER_LEAF_PAGE);
+
+	while (lo < hi)
+	{
+		Size		probe = lo + (hi - lo) / 2;
+		Size		key = btp->u.leaf_key[probe].first_page;
+
+		if (key < first_page)
+			lo = probe + 1;
+		else if (key > first_page)
+			hi = probe;
+		else
+			return probe;
+	}
+
+	return lo;
+}
+
+/*
+ * Take a page from fpm->btree_recycle (caller must ensure one is available)
+ * and move the upper half of the keys on the provided page -- nused / 2 of
+ * them -- onto it.  The new page inherits the old page's type and parent link.
+ * Returns a pointer to the new page, to which caller must add a downlink.
+ */
+static FreePageBtree *
+FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
+{
+	FreePageBtree *newsibling;
+
+	newsibling = FreePageBtreeGetRecycled(fpm);
+	newsibling->hdr.magic = btp->hdr.magic;
+	newsibling->hdr.nused = btp->hdr.nused / 2;
+	relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
+	btp->hdr.nused -= newsibling->hdr.nused;	/* btp keeps the lower keys */
+
+	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+		memcpy(&newsibling->u.leaf_key,
+			   &btp->u.leaf_key[btp->hdr.nused],
+			   sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
+	else
+	{
+		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+		memcpy(&newsibling->u.internal_key,
+			   &btp->u.internal_key[btp->hdr.nused],
+			   sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
+		FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
+	}
+
+	return newsibling;
+}
+
+/*
+ * Point the parent link of every child of the given internal page back at
+ * that page.  Needed after internal pages are split or merged.
+ */
+static void
+FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
+{
+	Size		slot = 0;
+
+	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	while (slot < btp->hdr.nused)
+	{
+		FreePageBtree *kid = relptr_access(base, btp->u.internal_key[slot].child);
+
+		relptr_store(base, kid->hdr.parent, btp);
+		++slot;
+	}
+}
+
+/*
+ * Debugging aid: append a dump of the btree page 'btp' and its subtree to buf.
+ */
+static void
+FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+						 FreePageBtree *parent, int level, StringInfo buf)
+{
+	char	   *base = fpm_segment_base(fpm);
+	Size		pageno = fpm_pointer_to_page(base, btp);
+	bool		is_internal;
+	FreePageBtree *actual_parent;
+	Size		i;
+
+	check_stack_depth();
+	actual_parent = relptr_access(base, btp->hdr.parent);
+	is_internal = (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+	appendStringInfo(buf, "  %zu@%d %c", pageno, level, is_internal ? 'i' : 'l');
+	if (actual_parent != parent)
+		appendStringInfo(buf, " [actual parent %zu, expected %zu]",
+						 fpm_pointer_to_page(base, actual_parent),
+						 fpm_pointer_to_page(base, parent));
+	appendStringInfoChar(buf, ':');
+	for (i = 0; i < btp->hdr.nused; ++i)
+	{
+		if (!is_internal)
+			appendStringInfo(buf, " %zu(%zu)",
+							 btp->u.leaf_key[i].first_page,
+							 btp->u.leaf_key[i].npages);
+		else
+			appendStringInfo(buf, " %zu->%zu",
+							 btp->u.internal_key[i].first_page,
+							 btp->u.internal_key[i].child.relptr_off / FPM_PAGE_SIZE);
+	}
+	appendStringInfo(buf, "\n");
+
+	/* Only internal pages have children to recurse into. */
+	if (!is_internal)
+		return;
+	for (i = 0; i < btp->hdr.nused; ++i)
+	{
+		FreePageBtree *kid = relptr_access(base, btp->u.internal_key[i].child);
+
+		FreePageManagerDumpBtree(fpm, kid, btp, level + 1, buf);
+	}
+}
+
+/*
+ * Debugging aid: list the spans of one freelist, noting unexpected sizes.
+ */
+static void
+FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
+						 Size expected_pages, StringInfo buf)
+{
+	char	   *base = fpm_segment_base(fpm);
+
+	for (; span != NULL; span = relptr_access(base, span->next))
+	{
+		Size		pageno = fpm_pointer_to_page(base, span);
+
+		if (span->npages == expected_pages)
+			appendStringInfo(buf, " %zu", pageno);
+		else
+			appendStringInfo(buf, " %zu(%zu)", pageno, span->npages);
+	}
+
+	appendStringInfo(buf, "\n");
+}
+
+/*
+ * Allocate a run of 'npages' contiguous pages from the free page manager.
+ * Returns true and sets *first_page on success; false if no span is big enough.
+ */
+static bool
+FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *victim = NULL;
+	FreePageSpanLeader *prev;
+	FreePageSpanLeader *next;
+	FreePageBtreeSearchResult result;
+	Size		victim_page = 0;	/* placate compiler */
+	Size		f;
+
+	/*
+	 * Search for a free span.
+	 *
+	 * Right now, we use a simple best-fit policy here, but it's possible for
+	 * this to result in memory fragmentation if we're repeatedly asked to
+	 * allocate chunks just a little smaller than what we have available.
+	 * Hopefully, this is unlikely, because we expect most requests to be
+	 * single pages or superblock-sized chunks -- but no policy can be optimal
+	 * under all circumstances unless it has knowledge of future allocation
+	 * patterns.
+	 */
+	for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
+	{
+		/* Skip empty freelists. */
+		if (relptr_is_null(fpm->freelist[f]))
+			continue;
+
+		/*
+		 * All of the freelists except the last one contain only items of a
+		 * single size, so we just take the first one.  But the final free
+		 * list contains everything too big for any of the other lists, so we
+		 * need to search the list.
+		 */
+		if (f < FPM_NUM_FREELISTS - 1)
+			victim = relptr_access(base, fpm->freelist[f]);
+		else
+		{
+			FreePageSpanLeader *candidate;
+
+			candidate = relptr_access(base, fpm->freelist[f]);
+			do
+			{
+				if (candidate->npages >= npages && (victim == NULL ||
+										 victim->npages > candidate->npages))
+				{
+					victim = candidate;
+					if (victim->npages == npages)
+						break;	/* exact fit; cannot do better */
+				}
+				candidate = relptr_access(base, candidate->next);
+			} while (candidate != NULL);
+		}
+		break;					/* only the first non-empty list is examined */
+	}
+
+	/* If we didn't find an allocatable span, return failure. */
+	if (victim == NULL)
+		return false;
+
+	/* Remove span from free list. */
+	Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
+	prev = relptr_access(base, victim->prev);
+	next = relptr_access(base, victim->next);
+	if (prev != NULL)
+		relptr_copy(prev->next, victim->next);
+	else
+		relptr_copy(fpm->freelist[f], victim->next);
+	if (next != NULL)
+		relptr_copy(next->prev, victim->prev);
+	victim_page = fpm_pointer_to_page(base, victim);
+
+	/* Decide whether we might be invalidating contiguous_pages. */
+	if (f == FPM_NUM_FREELISTS - 1 &&
+		victim->npages == fpm->contiguous_pages)
+	{
+		/*
+		 * The victim span came from the oversized freelist, and had the same
+		 * size as the longest span.  There may or may not be another one of
+		 * the same size, so contiguous_pages must be recomputed just to be
+		 * safe.
+		 */
+		fpm->contiguous_pages_dirty = true;
+	}
+	else if (f + 1 == fpm->contiguous_pages &&
+			 relptr_is_null(fpm->freelist[f]))
+	{
+		/*
+		 * The victim span came from a fixed sized freelist, and it was the
+		 * list for spans of the same size as the current longest span, and
+		 * the list is now empty after removing the victim.  So
+		 * contiguous_pages must be recomputed without a doubt.
+		 */
+		fpm->contiguous_pages_dirty = true;
+	}
+
+	/*
+	 * If we haven't initialized the btree yet, the victim must be the single
+	 * span stored within the FreePageManager itself.  Otherwise, we need to
+	 * update the btree.
+	 */
+	if (relptr_is_null(fpm->btree_root))
+	{
+		Assert(victim_page == fpm->singleton_first_page);
+		Assert(victim->npages == fpm->singleton_npages);
+		Assert(victim->npages >= npages);
+		fpm->singleton_first_page += npages;
+		fpm->singleton_npages -= npages;
+		if (fpm->singleton_npages > 0)
+			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+								   fpm->singleton_npages);
+	}
+	else
+	{
+		/*
+		 * If the span we found is exactly the right size, remove it from the
+		 * btree completely.  Otherwise, adjust the btree entry to reflect the
+		 * still-unallocated portion of the span, and put that portion on the
+		 * appropriate free list.
+		 */
+		FreePageBtreeSearch(fpm, victim_page, &result);
+		Assert(result.found);
+		if (victim->npages == npages)
+			FreePageBtreeRemove(fpm, result.page, result.index);
+		else
+		{
+			FreePageBtreeLeafKey *key;
+
+			/* Shrink the btree entry from the front to the remaining pages. */
+			Assert(victim->npages > npages);
+			key = &result.page->u.leaf_key[result.index];
+			Assert(key->npages == victim->npages);
+			key->first_page += npages;
+			key->npages -= npages;
+			if (result.index == 0)
+				FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+			/* Put the unallocated pages back on the appropriate free list. */
+			FreePagePushSpanLeader(fpm, victim_page + npages,
+								   victim->npages - npages);
+		}
+	}
+
+	/* Return results to caller: the allocation starts at the victim's page. */
+	*first_page = fpm_pointer_to_page(base, victim);
+	return true;
+}
+
+/*
+ * Put a range of pages into the btree and freelists, consolidating it with
+ * existing free spans just before and/or after it.  If 'soft' is true,
+ * only perform the insertion if it can be done without allocating new btree
+ * pages; if false, do it always.  Returns 0 if the soft flag caused the
+ * insertion to be skipped, or otherwise the size of the contiguous span
+ * created by the insertion.  This may be larger than npages if we're able
+ * to consolidate with an adjacent range.  *internal_pages_used is set to
+ * true if the btree allocated pages for internal purposes, which might
+ * invalidate the current largest run requiring it to be recomputed.
+ */
+static Size
+FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
+						   bool soft)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageBtreeSearchResult result;
+	FreePageBtreeLeafKey *prevkey = NULL;
+	FreePageBtreeLeafKey *nextkey = NULL;
+	FreePageBtree *np;
+	Size		nindex;
+
+	Assert(npages > 0);
+
+	/* We can store a single free span without initializing the btree. */
+	if (fpm->btree_depth == 0)
+	{
+		if (fpm->singleton_npages == 0)
+		{
+			/* Don't have a span yet; store this one. */
+			fpm->singleton_first_page = first_page;
+			fpm->singleton_npages = npages;
+			FreePagePushSpanLeader(fpm, first_page, npages);
+			return fpm->singleton_npages;
+		}
+		else if (fpm->singleton_first_page + fpm->singleton_npages ==
+				 first_page)
+		{
+			/* New span immediately follows sole existing span. */
+			fpm->singleton_npages += npages;
+			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+								   fpm->singleton_npages);
+			return fpm->singleton_npages;
+		}
+		else if (first_page + npages == fpm->singleton_first_page)
+		{
+			/* New span immediately precedes sole existing span. */
+			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+			fpm->singleton_first_page = first_page;
+			fpm->singleton_npages += npages;
+			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+								   fpm->singleton_npages);
+			return fpm->singleton_npages;
+		}
+		else
+		{
+			/* Not contiguous; we need to initialize the btree. */
+			Size		root_page;
+			FreePageBtree *root;
+
+			if (!relptr_is_null(fpm->btree_recycle))
+				root = FreePageBtreeGetRecycled(fpm);
+			else if (FreePageManagerGetInternal(fpm, 1, &root_page))
+				root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
+			else
+			{
+				/* We'd better be able to get a page from the existing range. */
+				elog(FATAL, "free page manager btree is corrupt");
+			}
+
+			/* Create the btree and move the preexisting range into it. */
+			root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
+			root->hdr.nused = 1;
+			relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
+			root->u.leaf_key[0].first_page = fpm->singleton_first_page;
+			root->u.leaf_key[0].npages = fpm->singleton_npages;
+			relptr_store(base, fpm->btree_root, root);
+			fpm->singleton_first_page = 0;
+			fpm->singleton_npages = 0;
+			fpm->btree_depth = 1;
+
+			/*
+			 * Corner case: it may be that the btree root took the very last
+			 * free page.  In that case, the sole btree entry covers a zero
+			 * page run, which is invalid.  Overwrite it with the entry we're
+			 * trying to insert and get out.
+			 */
+			if (root->u.leaf_key[0].npages == 0)
+			{
+				root->u.leaf_key[0].first_page = first_page;
+				root->u.leaf_key[0].npages = npages;
+				FreePagePushSpanLeader(fpm, first_page, npages);
+				return npages;
+			}
+
+			/* Fall through to insert the new key. */
+		}
+	}
+
+	/* Search the btree. */
+	FreePageBtreeSearch(fpm, first_page, &result);
+	Assert(!result.found);
+	if (result.index > 0)
+		prevkey = &result.page->u.leaf_key[result.index - 1];
+	if (result.index < result.page->hdr.nused)
+	{
+		np = result.page;
+		nindex = result.index;
+		nextkey = &result.page->u.leaf_key[result.index];
+	}
+	else
+	{
+		np = FreePageBtreeFindRightSibling(base, result.page);
+		nindex = 0;
+		if (np != NULL)
+			nextkey = &np->u.leaf_key[0];
+	}
+
+	/* Consolidate with the previous entry if possible. */
+	if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
+	{
+		bool		remove_next = false;
+		Size		result;
+
+		Assert(prevkey->first_page + prevkey->npages == first_page);
+		prevkey->npages = (first_page - prevkey->first_page) + npages;
+
+		/* Check whether we can *also* consolidate with the following entry. */
+		if (nextkey != NULL &&
+			prevkey->first_page + prevkey->npages >= nextkey->first_page)
+		{
+			Assert(prevkey->first_page + prevkey->npages ==
+				   nextkey->first_page);
+			prevkey->npages = (nextkey->first_page - prevkey->first_page)
+				+ nextkey->npages;
+			FreePagePopSpanLeader(fpm, nextkey->first_page);
+			remove_next = true;
+		}
+
+		/* Put the span on the correct freelist and save size. */
+		FreePagePopSpanLeader(fpm, prevkey->first_page);
+		FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
+		result = prevkey->npages;
+
+		/*
+		 * If we consolidated with both the preceding and following entries,
+		 * we must remove the following entry.  We do this last, because
+		 * removing an element from the btree may invalidate pointers we hold
+		 * into the current data structure.
+		 *
+		 * NB: The btree is technically in an invalid state a this point
+		 * because we've already updated prevkey to cover the same key space
+		 * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
+		 */
+		if (remove_next)
+			FreePageBtreeRemove(fpm, np, nindex);
+
+		return result;
+	}
+
+	/* Consolidate with the next entry if possible. */
+	if (nextkey != NULL && first_page + npages >= nextkey->first_page)
+	{
+		Size		newpages;
+
+		/* Compute new size for span. */
+		Assert(first_page + npages == nextkey->first_page);
+		newpages = (nextkey->first_page - first_page) + nextkey->npages;
+
+		/* Put span on correct free list. */
+		FreePagePopSpanLeader(fpm, nextkey->first_page);
+		FreePagePushSpanLeader(fpm, first_page, newpages);
+
+		/* Update key in place. */
+		nextkey->first_page = first_page;
+		nextkey->npages = newpages;
+
+		/* If reducing first key on page, ancestors might need adjustment. */
+		if (nindex == 0)
+			FreePageBtreeAdjustAncestorKeys(fpm, np);
+
+		return nextkey->npages;
+	}
+
+	/* Split leaf page and as many of its ancestors as necessary. */
+	if (result.split_pages > 0)
+	{
+		/*
+		 * NB: We could consider various coping strategies here to avoid a
+		 * split; most obviously, if np != result.page, we could target that
+		 * page instead.   More complicated shuffling strategies could be
+		 * possible as well; basically, unless every single leaf page is 100%
+		 * full, we can jam this key in there if we try hard enough.  It's
+		 * unlikely that trying that hard is worthwhile, but it's possible we
+		 * might need to make more than no effort.  For now, we just do the
+		 * easy thing, which is nothing.
+		 */
+
+		/* If this is a soft insert, it's time to give up. */
+		if (soft)
+			return 0;
+
+		/* Check whether we need to allocate more btree pages to split. */
+		if (result.split_pages > fpm->btree_recycle_count)
+		{
+			Size		pages_needed;
+			Size		recycle_page;
+			Size		i;
+
+			/*
+			 * Allocate the required number of pages and split each one in
+			 * turn.  This should never fail, because if we've got enough
+			 * spans of free pages kicking around that we need additional
+			 * storage space just to remember them all, then we should
+			 * certainly have enough to expand the btree, which should only
+			 * ever use a tiny number of pages compared to the number under
+			 * management.  If it does, something's badly screwed up.
+			 */
+			pages_needed = result.split_pages - fpm->btree_recycle_count;
+			for (i = 0; i < pages_needed; ++i)
+			{
+				if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
+					elog(FATAL, "free page manager btree is corrupt");
+				FreePageBtreeRecycle(fpm, recycle_page);
+			}
+
+			/*
+			 * The act of allocating pages to recycle may have invalidated the
+			 * results of our previous btree research, so repeat it. (We could
+			 * recheck whether any of our split-avoidance strategies that were
+			 * not viable before now are, but it hardly seems worthwhile, so
+			 * we don't bother. Consolidation can't be possible now if it
+			 * wasn't previously.)
+			 */
+			FreePageBtreeSearch(fpm, first_page, &result);
+
+			/*
+			 * The act of allocating pages for use in constructing our btree
+			 * should never cause any page to become more full, so the new
+			 * split depth should be no greater than the old one, and perhaps
+			 * less if we fortuitously allocated a chunk that freed up a slot
+			 * on the page we need to update.
+			 */
+			Assert(result.split_pages <= fpm->btree_recycle_count);
+		}
+
+		/* If we still need to perform a split, do it. */
+		if (result.split_pages > 0)
+		{
+			FreePageBtree *split_target = result.page;
+			FreePageBtree *child = NULL;
+			Size		key = first_page;
+
+			for (;;)
+			{
+				FreePageBtree *newsibling;
+				FreePageBtree *parent;
+
+				/* Identify parent page, which must receive downlink. */
+				parent = relptr_access(base, split_target->hdr.parent);
+
+				/* Split the page - downlink not added yet. */
+				newsibling = FreePageBtreeSplitPage(fpm, split_target);
+
+				/*
+				 * At this point in the loop, we're always carrying a pending
+				 * insertion.  On the first pass, it's the actual key we're
+				 * trying to insert; on subsequent passes, it's the downlink
+				 * that needs to be added as a result of the split performed
+				 * during the previous loop iteration.  Since we've just split
+				 * the page, there's definitely room on one of the two
+				 * resulting pages.
+				 */
+				if (child == NULL)
+				{
+					Size		index;
+					FreePageBtree *insert_into;
+
+					insert_into = key < newsibling->u.leaf_key[0].first_page ?
+						split_target : newsibling;
+					index = FreePageBtreeSearchLeaf(insert_into, key);
+					FreePageBtreeInsertLeaf(insert_into, index, key, npages);
+					if (index == 0 && insert_into == split_target)
+						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+				}
+				else
+				{
+					Size		index;
+					FreePageBtree *insert_into;
+
+					insert_into =
+						key < newsibling->u.internal_key[0].first_page ?
+						split_target : newsibling;
+					index = FreePageBtreeSearchInternal(insert_into, key);
+					FreePageBtreeInsertInternal(base, insert_into, index,
+												key, child);
+					relptr_store(base, child->hdr.parent, insert_into);
+					if (index == 0 && insert_into == split_target)
+						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+				}
+
+				/* If the page we just split has no parent, split the root. */
+				if (parent == NULL)
+				{
+					FreePageBtree *newroot;
+
+					newroot = FreePageBtreeGetRecycled(fpm);
+					newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
+					newroot->hdr.nused = 2;
+					relptr_store(base, newroot->hdr.parent,
+								 (FreePageBtree *) NULL);
+					newroot->u.internal_key[0].first_page =
+						FreePageBtreeFirstKey(split_target);
+					relptr_store(base, newroot->u.internal_key[0].child,
+								 split_target);
+					relptr_store(base, split_target->hdr.parent, newroot);
+					newroot->u.internal_key[1].first_page =
+						FreePageBtreeFirstKey(newsibling);
+					relptr_store(base, newroot->u.internal_key[1].child,
+								 newsibling);
+					relptr_store(base, newsibling->hdr.parent, newroot);
+					relptr_store(base, fpm->btree_root, newroot);
+					fpm->btree_depth++;
+
+					break;
+				}
+
+				/* If the parent page isn't full, insert the downlink. */
+				key = newsibling->u.internal_key[0].first_page;
+				if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
+				{
+					Size		index;
+
+					index = FreePageBtreeSearchInternal(parent, key);
+					FreePageBtreeInsertInternal(base, parent, index,
+												key, newsibling);
+					relptr_store(base, newsibling->hdr.parent, parent);
+					if (index == 0)
+						FreePageBtreeAdjustAncestorKeys(fpm, parent);
+					break;
+				}
+
+				/* The parent also needs to be split, so loop around. */
+				child = newsibling;
+				split_target = parent;
+			}
+
+			/*
+			 * The loop above did the insert, so just need to update the free
+			 * list, and we're done.
+			 */
+			FreePagePushSpanLeader(fpm, first_page, npages);
+
+			return npages;
+		}
+	}
+
+	/* Physically add the key to the page. */
+	Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
+	FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);
+
+	/* If new first key on page, ancestors might need adjustment. */
+	if (result.index == 0)
+		FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+	/* Put it on the free list. */
+	FreePagePushSpanLeader(fpm, first_page, npages);
+
+	return npages;
+}
+
+/*
+ * Unlink the span leader at page 'pageno' from its doubly-linked freelist,
+ * because it is being resized or allocated; repoints the list head if needed.
+ */
+static void
+FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
+{
+	char	   *base = fpm_segment_base(fpm);
+	FreePageSpanLeader *span;
+	FreePageSpanLeader *next;
+	FreePageSpanLeader *prev;
+
+	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+
+	next = relptr_access(base, span->next);
+	prev = relptr_access(base, span->prev);
+	if (next != NULL)
+		relptr_copy(next->prev, span->prev);	/* successor skips back over us */
+	if (prev != NULL)
+		relptr_copy(prev->next, span->next);	/* predecessor skips over us */
+	else
+	{
+		Size		f = Min(span->npages, FPM_NUM_FREELISTS) - 1;
+
+		Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
+		relptr_copy(fpm->freelist[f], span->next);	/* we were the list head */
+	}
+}
+
+/*
+ * Write a new span leader at 'first_page' and make it head of its freelist.
+ */
+static void
+FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
+{
+	char	   *base = fpm_segment_base(fpm);
+	Size		f = Min(npages, FPM_NUM_FREELISTS) - 1;	/* freelist for this size */
+	FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
+	FreePageSpanLeader *span;
+
+	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
+	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+	span->npages = npages;
+	relptr_store(base, span->next, head);
+	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+	if (head != NULL)
+		relptr_store(base, head->prev, span);	/* old head now follows span */
+	relptr_store(base, fpm->freelist[f], span);
+}
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 13955b8a94cd..673f6d3bcea1 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -889,6 +889,7 @@ extractPageInfo(XLogRecord *record)
 			switch (info)
 			{
 				case XLOG_SMGR_CREATE:
+				case XLOG_SMGR_CREATE_PDL:
 					/*
 					 * We can safely ignore these. The local file will be
 					 * removed, if it doesn't exist in remote system. If a
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index e9d5e866cc6c..d96d981030c6 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -108,4 +108,6 @@ extern void getTwoPhasePreparedTransactionData(prepared_transaction_agg_state **
 
 extern void SetupCheckpointPreparedTransactionList(prepared_transaction_agg_state *ptas);
 
+extern bool RemovePendingDeletesForPreparedTransactions(void);
+
 #endif   /* TWOPHASE_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 8246686e7563..732f20739ba8 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -78,6 +78,7 @@ typedef struct CheckPoint
 #define XLOG_FPI						0xA0
 #define XLOG_NEXTRELFILENODE			0xB0
 #define XLOG_OVERWRITE_CONTRECORD		0xC0
+#define XLOG_PENDING_DELETE				0xD0
 
 
 /*
diff --git a/src/include/catalog/storage_pending_deletes.h b/src/include/catalog/storage_pending_deletes.h
new file mode 100644
index 000000000000..587b33125b1d
--- /dev/null
+++ b/src/include/catalog/storage_pending_deletes.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes.h
+ *	  prototypes for functions in backend/catalog/storage_pending_deletes.c
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ * src/include/catalog/storage_pending_deletes.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STORAGE_PENDING_DELETES_H
+#define STORAGE_PENDING_DELETES_H
+
+#include "postgres.h"
+
+#include "storage/relfilenode.h"
+#include "utils/dsa.h"
+
+/* Pending delete node linked to xact which created it */
+typedef struct PendingRelXactDelete
+{
+	RelFileNodePendingDelete relnode;
+	TransactionId xid;
+}	PendingRelXactDelete;
+
+typedef struct PendingRelXactDeleteArray
+{
+	Size		count;			/* number of entries in array[]; see PdlDumpSize */
+	PendingRelXactDelete array[FLEXIBLE_ARRAY_MEMBER];	/* trailing entries */
+}	PendingRelXactDeleteArray;
+
+static inline Size
+PdlDumpSize(Size count)
+{
+	/* Bytes for the header up to the flexible array plus 'count' elements. */
+	return offsetof(PendingRelXactDeleteArray, array) +
+		count * sizeof(PendingRelXactDelete);
+}
+
+extern Size PdlShmemSize(void);
+extern void PdlShmemInit(void);
+extern dsa_pointer PdlShmemAdd(const RelFileNodePendingDelete * relnode,
+			TransactionId xid);
+extern void PdlShmemRemove(dsa_pointer node_ptr);
+extern PendingRelXactDeleteArray *PdlXLogShmemDump(void);
+
+#endif   /* STORAGE_PENDING_DELETES_H */
diff --git a/src/include/catalog/storage_pending_deletes_redo.h b/src/include/catalog/storage_pending_deletes_redo.h
new file mode 100644
index 000000000000..aa3f7b7a4edf
--- /dev/null
+++ b/src/include/catalog/storage_pending_deletes_redo.h
@@ -0,0 +1,31 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_pending_deletes_redo.h
+ *	  prototypes for functions in backend/catalog/storage_pending_deletes_redo.c
+ *
+ * Copyright (c) 2025 Greengage Community
+ *
+ * src/include/catalog/storage_pending_deletes_redo.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STORAGE_PENDING_DELETES_REDO_H
+#define STORAGE_PENDING_DELETES_REDO_H
+
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "catalog/storage_pending_deletes.h"
+
+extern void PdlXLogInsert(void);
+
+extern void PdlRedoAdd(PendingRelXactDelete * pd);
+
+extern void PdlRedoXLogRecord(XLogRecord *record);
+
+extern void PdlRedoRemoveTree(TransactionId xid,
+				  TransactionId *sub_xids, int nsubxacts);
+
+extern void PdlRedoDropFiles(void);
+
+#endif   /* STORAGE_PENDING_DELETES_REDO_H */
diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h
index f3916190ce29..2cf8a3ced7ca 100644
--- a/src/include/catalog/storage_xlog.h
+++ b/src/include/catalog/storage_xlog.h
@@ -26,22 +26,34 @@
  */
 
 /* XLOG gives us high 4 bits */
-#define XLOG_SMGR_CREATE	0x10
-#define XLOG_SMGR_TRUNCATE	0x20
+#define XLOG_SMGR_CREATE		0x10
+#define XLOG_SMGR_TRUNCATE		0x20
+#define XLOG_SMGR_CREATE_PDL	0x30
 
+/*
+ * We do not create `xl_smgr_create` records anymore. We use
+ * `xl_smgr_create_pdl` instead. But we still process `xl_smgr_create` records
+ * for backward compatibility.
+ */
 typedef struct xl_smgr_create
 {
 	RelFileNode rnode;
 	ForkNumber	forkNum;
 } xl_smgr_create;
 
+typedef struct xl_smgr_create_pdl
+{
+	xl_smgr_create createrec;
+	char relstorage;
+} xl_smgr_create_pdl;
+
 typedef struct xl_smgr_truncate
 {
 	BlockNumber blkno;
 	RelFileNode rnode;
 } xl_smgr_truncate;
 
-extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum);
+extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum, char relstorage);
 
 extern void smgr_redo(XLogRecPtr beginLoc, XLogRecPtr lsn, XLogRecord *record);
 extern void smgr_desc(StringInfo buf, XLogRecord *record);
diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h
index 9ff49162c18d..766f94e3d64a 100644
--- a/src/include/storage/dsm.h
+++ b/src/include/storage/dsm.h
@@ -41,6 +41,7 @@ extern void dsm_detach(dsm_segment *seg);
 /* Resource management functions. */
 extern void dsm_pin_mapping(dsm_segment *seg);
 extern void dsm_pin_segment(dsm_segment *seg);
+extern void dsm_unpin_segment(dsm_handle h);
 extern dsm_segment *dsm_find_mapping(dsm_handle h);
 
 /* Informational functions. */
diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h
index 32cfed2ee9c4..37155354a6e9 100644
--- a/src/include/storage/dsm_impl.h
+++ b/src/include/storage/dsm_impl.h
@@ -72,7 +72,7 @@ extern bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
 /* Some implementations cannot resize segments.  Can this one? */
 extern bool dsm_impl_can_resize(void);
 
-/* Implementation-dependent actions required to keep segment until shudown. */
+/* Implementation-dependent actions required to keep segment until shutdown. */
 extern void dsm_impl_pin_segment(dsm_handle handle, void *impl_private);
 
 #endif   /* DSM_IMPL_H */
diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h
new file mode 100644
index 000000000000..4ef5c241c912
--- /dev/null
+++ b/src/include/utils/dsa.h
@@ -0,0 +1,108 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsa.h
+ *	  Dynamic shared memory areas.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/include/utils/dsa.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DSA_H
+#define DSA_H
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/dsm.h"
+
+/* The opaque type used for an area. */
+struct dsa_area;
+typedef struct dsa_area dsa_area;
+
+/*
+ * If this system doesn't support atomic operations on 64 bit values then
+ * we fall back to 32 bit dsa_pointer.  For testing purposes,
+ * USE_SMALL_DSA_POINTER can be defined to force the use of 32 bit
+ * dsa_pointer even on systems that support 64 bit atomics.
+ */
+#ifndef PG_HAVE_ATOMIC_U64_SUPPORT
+#define SIZEOF_DSA_POINTER 4
+#else
+#ifdef USE_SMALL_DSA_POINTER
+#define SIZEOF_DSA_POINTER 4
+#else
+#define SIZEOF_DSA_POINTER 8
+#endif
+#endif
+
+/*
+ * The type of 'relative pointers' to memory allocated by a dynamic shared
+ * area.  dsa_pointer values can be shared with other processes, but must be
+ * converted to backend-local pointers before they can be dereferenced.  See
+ * dsa_get_address.  SIZEOF_DSA_POINTER (chosen above) selects the width, the
+ * matching atomic type, and the appropriately sized atomic operations.
+ */
+#if SIZEOF_DSA_POINTER == 4
+typedef uint32 dsa_pointer;
+typedef pg_atomic_uint32 dsa_pointer_atomic;
+#define dsa_pointer_atomic_init pg_atomic_init_u32
+#define dsa_pointer_atomic_read pg_atomic_read_u32
+#define dsa_pointer_atomic_write pg_atomic_write_u32
+#define dsa_pointer_atomic_fetch_add pg_atomic_fetch_add_u32
+#define dsa_pointer_atomic_compare_exchange pg_atomic_compare_exchange_u32
+#else
+typedef uint64 dsa_pointer;
+typedef pg_atomic_uint64 dsa_pointer_atomic;
+#define dsa_pointer_atomic_init pg_atomic_init_u64
+#define dsa_pointer_atomic_read pg_atomic_read_u64
+#define dsa_pointer_atomic_write pg_atomic_write_u64
+#define dsa_pointer_atomic_fetch_add pg_atomic_fetch_add_u64
+#define dsa_pointer_atomic_compare_exchange pg_atomic_compare_exchange_u64
+#endif
+
+/* A sentinel value for dsa_pointer used to indicate failure to allocate. */
+#define InvalidDsaPointer ((dsa_pointer) 0)
+
+/* Check if a dsa_pointer value is valid. */
+#define DsaPointerIsValid(x) ((x) != InvalidDsaPointer)
+
+/*
+ * The type used for dsa_area handles.  dsa_handle values can be shared with
+ * other processes, so that they can attach to them.  This provides a way to
+ * share allocated storage with other processes.
+ *
+ * The handle for a dsa_area is currently implemented as the dsm_handle
+ * for the first DSM segment backing this dynamic storage area, but client
+ * code shouldn't assume that is true.
+ */
+typedef dsm_handle dsa_handle;
+
+extern void dsa_startup(void);
+
+extern dsa_area *dsa_create(int tranche_id, const char *tranche_name);
+extern dsa_area *dsa_create_in_place(void *place, Size size,
+					int tranche_id, const char *tranche_name,
+					dsm_segment *segment);
+extern dsa_area *dsa_attach(dsa_handle handle);
+extern dsa_area *dsa_attach_in_place(void *place, dsm_segment *segment);
+extern void dsa_release_in_place(void *place);
+extern void dsa_on_dsm_detach_release_in_place(dsm_segment *, Datum);
+extern void dsa_on_shmem_exit_release_in_place(int, Datum);
+extern void dsa_pin_mapping(dsa_area *area);
+extern void dsa_detach(dsa_area *area);
+extern void dsa_pin(dsa_area *area);
+extern void dsa_unpin(dsa_area *area);
+extern void dsa_set_size_limit(dsa_area *area, Size limit);
+extern Size dsa_minimum_size(void);
+extern dsa_handle dsa_get_handle(dsa_area *area);
+extern dsa_pointer dsa_allocate(dsa_area *area, Size size);
+extern void dsa_free(dsa_area *area, dsa_pointer dp);
+extern void *dsa_get_address(dsa_area *area, dsa_pointer dp);
+extern void dsa_trim(dsa_area *area);
+extern void dsa_dump(dsa_area *area);
+
+#endif   /* DSA_H */
diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h
new file mode 100644
index 000000000000..5e1305bc25cd
--- /dev/null
+++ b/src/include/utils/freepage.h
@@ -0,0 +1,99 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.h
+ *	  Management of page-organized free memory.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/freepage.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef FREEPAGE_H
+#define FREEPAGE_H
+
+#include "storage/lwlock.h"
+#include "utils/relptr.h"
+
+/* Forward declarations. */
+typedef struct FreePageSpanLeader FreePageSpanLeader;
+typedef struct FreePageBtree FreePageBtree;
+typedef struct FreePageManager FreePageManager;
+
+/*
+ * PostgreSQL normally uses 8kB pages for most things, but many common
+ * architecture/operating system pairings use a 4kB page size for memory
+ * allocation, so we do that here also.
+ */
+#define FPM_PAGE_SIZE			4096
+
+/*
+ * Each freelist except for the last contains only spans of one particular
+ * size.  Everything larger goes on the last one.  In some sense this seems
+ * like a waste since most allocations are in a few common sizes, but it
+ * means that small allocations can simply pop the head of the relevant list
+ * without needing to worry about whether the object we find there is of
+ * precisely the correct size (because we know it must be).
+ */
+#define FPM_NUM_FREELISTS		129
+
+/* Define relative pointer types. */
+relptr_declare(FreePageBtree, RelptrFreePageBtree);
+relptr_declare(FreePageManager, RelptrFreePageManager);
+relptr_declare(FreePageSpanLeader, RelptrFreePageSpanLeader);
+
+/* Everything we need in order to manage free pages (see freepage.c) */
+struct FreePageManager
+{
+	RelptrFreePageManager self;	/* our own offset from the segment base */
+	RelptrFreePageBtree btree_root;	/* root page of the free-span btree */
+	RelptrFreePageSpanLeader btree_recycle;
+	unsigned	btree_depth;	/* grows by one each time the root is split */
+	unsigned	btree_recycle_count;	/* presumably pages on btree_recycle */
+	Size		singleton_first_page;
+	Size		singleton_npages;
+	Size		contiguous_pages;	/* largest run of pages; see fpm_largest */
+	bool		contiguous_pages_dirty;
+	RelptrFreePageSpanLeader freelist[FPM_NUM_FREELISTS];	/* heads, by size */
+#ifdef FPM_EXTRA_ASSERTS
+	/* For debugging only, pages put minus pages gotten. */
+	Size		free_pages;
+#endif
+};
+
+/* Macros to convert between page numbers (expressed as Size) and pointers. */
+#define fpm_page_to_pointer(base, page) \
+	(AssertVariableIsOfTypeMacro(page, Size), \
+	 (base) + FPM_PAGE_SIZE * (page))
+#define fpm_pointer_to_page(base, ptr)		\
+	(((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE)
+
+/* Macro to convert an allocation size to a number of pages. */
+#define fpm_size_to_pages(sz) \
+	(((sz) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE)
+
+/* Macros to check alignment of absolute and relative pointers. */
+#define fpm_pointer_is_page_aligned(base, ptr)		\
+	(((Size) (((char *) (ptr)) - (base))) % FPM_PAGE_SIZE == 0)
+#define fpm_relptr_is_page_aligned(base, relptr)		\
+	((relptr).relptr_off % FPM_PAGE_SIZE == 0)
+
+/* Macro to find base address of the segment containing a FreePageManager. */
+#define fpm_segment_base(fpm)	\
+	(((char *) fpm) - fpm->self.relptr_off)
+
+/* Macro to access a FreePageManager's largest consecutive run of pages. */
+#define fpm_largest(fpm) \
+	(fpm->contiguous_pages)
+
+/* Functions to manipulate the free page map. */
+extern void FreePageManagerInitialize(FreePageManager *fpm, char *base);
+extern bool FreePageManagerGet(FreePageManager *fpm, Size npages,
+				   Size *first_page);
+extern void FreePageManagerPut(FreePageManager *fpm, Size first_page,
+				   Size npages);
+extern char *FreePageManagerDump(FreePageManager *fpm);
+
+#endif   /* FREEPAGE_H */
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 6c8995f8004e..712fd7bc6557 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -602,6 +602,8 @@ extern bool gp_log_endpoints;
 
 extern bool gp_allow_date_field_width_5digits;
 
+extern bool gp_track_pending_delete;
+
 typedef enum
 {
 	INDEX_CHECK_NONE,
diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h
new file mode 100644
index 000000000000..f01924a1edf5
--- /dev/null
+++ b/src/include/utils/relptr.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * relptr.h
+ *	  This file contains basic declarations for relative pointers.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/relptr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef RELPTR_H
+#define RELPTR_H
+
+/*
+ * Relative pointers are intended to be used when storing an address that may
+ * be relative either to the base of the process's address space or some
+ * dynamic shared memory segment mapped therein.
+ *
+ * The idea here is that you declare a relative pointer as relptr(type)
+ * and then use relptr_access to dereference it and relptr_store to change
+ * it.  The use of a union here is a hack, because what's stored in the
+ * relptr is always a Size, never an actual pointer.  But including a pointer
+ * in the union allows us to use stupid macro tricks to provide some measure
+ * of type-safety.
+ */
+#define relptr(type)	 union { type *relptr_type; Size relptr_off; }
+
+/*
+ * pgindent gets confused by declarations of the type relptr(type), so it's
+ * useful to give them a name that doesn't include parentheses.
+ */
+#define relptr_declare(type, name) \
+	typedef union { type *relptr_type; Size relptr_off; } name;
+
+#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P
+#define relptr_access(base, rp) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \
+		(base + (rp).relptr_off)))
+#else
+/*
+ * If we don't have __builtin_types_compatible_p, assume we might not have
+ * __typeof__ either.
+ */
+#define relptr_access(base, rp) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 (void *) ((rp).relptr_off == 0 ? NULL : (base + (rp).relptr_off)))
+#endif
+
+#define relptr_is_null(rp) \
+	((rp).relptr_off == 0)
+
+#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P
+#define relptr_store(base, rp, val) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \
+	 (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base)))
+#else
+/*
+ * If we don't have __builtin_types_compatible_p, assume we might not have
+ * __typeof__ either.
+ */
+#define relptr_store(base, rp, val) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base)))
+#endif
+
+#define relptr_copy(rp1, rp2) \
+	((rp1).relptr_off = (rp2).relptr_off)
+
+#endif   /* RELPTR_H */
diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h
index f41f3320cef9..1dcad86fcc01 100644
--- a/src/include/utils/unsync_guc_name.h
+++ b/src/include/utils/unsync_guc_name.h
@@ -565,3 +565,4 @@
 		"xmlbinary",
 		"xmloption",
 		"zero_damaged_pages",
+		"gp_track_pending_delete",
diff --git a/src/test/isolation2/expected/gp_orphaned_files.out b/src/test/isolation2/expected/gp_orphaned_files.out
new file mode 100644
index 000000000000..39195e2e4381
--- /dev/null
+++ b/src/test/isolation2/expected/gp_orphaned_files.out
@@ -0,0 +1,352 @@
+-- start_ignore
+-- end_ignore
+
+
+-- Test case 1
+-- Check that orphaned files are not left on the coordinator and the standby
+-- when the files are created before checkpoint
+
+-- Create tables of different access methods and return command to check their
+-- files existence on the coordinator and the standby
+1: create or replace function createTables(n text) returns text as $$ declare cmd text; /**/ begin execute 'create table t_orphaned_h'||n||'(i int) distributed by (i)'; /**/ 
+execute 'create table t_orphaned_r'||n||'(i int) with (appendonly=true, orientation=row) distributed by (i)'; /**/ -- Create index to create block directory table execute 'create index t_orphaned_r'||n||'_i on t_orphaned_r'||n||'(i)'; /**/ 
+execute 'create table t_orphaned_c'||n||'(i int) with (appendonly=true, orientation=column) distributed by (i)'; /**/ /* Create index to create block directory table */ execute 'create index t_orphaned_c'||n||'_i on t_orphaned_c'||n||'(i)'; /**/ 
+/* Ensure that the mirrors have applied the filesystem changes */ perform force_mirrors_to_catch_up(); /**/ 
+/* The command do not output PGDATA directories to make it possible to run the test without docker */ select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) into cmd from ( select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ') || ' 2>/dev/null | wc -l' lswc from ( select unnest(array[('t_orphaned_h'||n)::regclass, ('t_orphaned_r'||n)::regclass, ('t_orphaned_r'||n||'_i')::regclass, ('t_orphaned_c'||n)::regclass, ('t_orphaned_c'||n||'_i')::regclass]) union all select unnest(array[segrelid, blkdirrelid, blkdiridxid, visimaprelid, visimapidxid]) from pg_catalog.pg_appendonly where relid in (('t_orphaned_r'||n)::regclass, ('t_orphaned_c'||n)::regclass) ) a ) f, (select datadir from gp_segment_configuration where content = -1) d; /**/ 
+return cmd; /**/ end $$ language plpgsql;
+CREATE
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+BEGIN
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : select createTables('1') check_files;
+
+2: begin;
+BEGIN
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : select createTables('2') check_files;
+
+1: checkpoint;
+CHECKPOINT
+
+-- Make sure that the tables files exist on the coordinator and the standby
+1: ! sh /tmp/gp_orphaned_files.sh;
+15
+15
+15
+15
+
+
+-- Get segfault on the coordinator and reconnect after its restart
+1: select gp_inject_fault('exec_simple_query_start', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+-- The error message can be different, so ignore it
+1: @post_run 'echo ""' : select 1;
+
+-- Wait for the coordinator to be recovered
+! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done;
+
+1q: ... <quitting>
+2q: ... <quitting>
+
+1: select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+                           
+(1 row)
+
+-- Check that the tables files don't exist on the coordinator and the standby
+! sh /tmp/gp_orphaned_files.sh;
+0
+0
+0
+0
+
+
+-- Cleanup
+! rm /tmp/gp_orphaned_files.sh;
+
+1: drop function createTables(n text);
+DROP
+
+
+-- Test case 2
+-- Check that orphaned files are not left on segments when the files are created
+-- before checkpoint
+
+1: create or replace function getTableSegFiles (t regclass, out gp_contentid smallint, out filepath text) as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' language sql execute on all segments;
+CREATE
+
+1: create or replace function createTables(n text) returns text as $$ declare cmd text; /**/ begin /* Minimal fillfactor to minimize rows number for creating second main fork file */ execute 'create table t_orphaned_h'||n||'(i int) with (fillfactor=10) distributed by (i)'; /**/ /* Create the .1 file. Separate insert to create FSM. */ execute 'insert into t_orphaned_h'||n||' select generate_series(1,9000000)'; /**/ 
+execute 'create table t_orphaned_r'||n||'(i int) with (appendonly=true, orientation=row) distributed by (i)'; /**/ /* Create the .1 file */ execute 'insert into t_orphaned_r'||n||' select generate_series(1,100)'; /**/ 
+/* Create the .128 file */ execute 'create table t_orphaned_c'||n||' with (appendonly=true, orientation=column) as select i as i, i*2 as j from generate_series(1,100) i distributed by (i)'; /**/ /* Create the .1 and .129 files */ execute 'insert into t_orphaned_c'||n||' select i as i, i*2 as j from generate_series(1,100) i'; /**/ 
+/* Ensure that the mirrors have applied the filesystem changes */ perform force_mirrors_to_catch_up(); /**/ 
+/* The command do not output PGDATA directories to make it possible to run the test without docker */ select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) into cmd from ( select gp_contentid, 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc from ( select gp_contentid, filepath || suf f from getTableSegFiles('t_orphaned_h'||n), (values(''), ('.1'), ('_fsm')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_r'||n), (values(''), ('.1')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_c'||n), (values(''), ('.1'), ('.128'), ('.129')) v(suf) ) a group by gp_contentid ) f, (select content, datadir from gp_segment_configuration where content > -1) d where f.gp_contentid = d.content; /**/ 
+return cmd; /**/ end $$ language plpgsql;
+CREATE
+
+-- Test case 2.1
+-- Segfault on all segments
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+BEGIN
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : select createTables('1') check_files;
+
+2: begin;
+BEGIN
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : select createTables('2') check_files;
+
+1: checkpoint;
+CHECKPOINT
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files.sh;
+9
+9
+9
+9
+9
+9
+9
+9
+9
+9
+9
+9
+
+
+-- Get segfault on all segments
+1: select gp_inject_fault('qe_exec_finished', 'segv', dbid) from gp_segment_configuration where role = 'p' and content != -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+ Success:        
+ Success:        
+(3 rows)
+
+-- The error message can be different, so ignore it
+1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id');
+
+
+-- Rollback the transaction to make it possible to run queries after the error
+1: rollback;
+ROLLBACK
+2: rollback;
+ROLLBACK
+
+1: select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+                           
+(1 row)
+
+-- Check that the tables files don't exist on the segments
+! sh /tmp/gp_orphaned_files.sh;
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+
+
+
+-- Test case 2.2
+-- Segfault on one segment
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+BEGIN
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' : select createTables('1') check_files;
+
+2: begin;
+BEGIN
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' : select createTables('2') check_files;
+
+1: checkpoint;
+CHECKPOINT
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files.sh;
+9
+9
+9
+9
+9
+9
+9
+9
+9
+9
+9
+9
+
+
+-- Get segfault on a segment
+1: select gp_inject_fault('qe_exec_finished', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+-- The error message can be different, so ignore it
+1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id');
+
+
+-- Rollback the transaction to make it possible to run queries after the error
+1: rollback;
+ROLLBACK
+2: rollback;
+ROLLBACK
+
+1: select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+                           
+(1 row)
+
+-- Make a checkpoint to remove orphaned files from segments where segfault did
+-- not happen
+1: select gp_inject_fault_infinite('checkpoint', 'reset', dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+ Success:                 
+ Success:                 
+(3 rows)
+1: checkpoint;
+CHECKPOINT
+
+-- Check that the tables files don't exist on the segments
+! sh /tmp/gp_orphaned_files.sh;
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+
+
+
+-- Cleanup
+! rm /tmp/gp_orphaned_files.sh;
+
+1: drop function createTables(n text);
+DROP
+1: drop function getTableSegFiles (t regclass, out gp_contentid smallint, out filepath text);
+DROP
+
+
+-- Test case 3
+-- Check that table files are not deleted in the case of prepared transaction
+
+-- Don't create checkpoints on the segment number 1
+1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+-- Stop after `MyPgXact->delayChkpt = false` and before `PostPrepare_smgr()`
+-- Stop at the beginning of the checkpointer loop
+1: select gp_inject_fault_infinite('end_prepare_two_phase', 'suspend', dbid), gp_inject_fault_infinite('ckpt_loop_begin', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite | gp_inject_fault_infinite 
+--------------------------+--------------------------
+ Success:                 | Success:                 
+(1 row)
+
+1&: select gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role = 'p' and content = 1;  <waiting ...>
+
+2&: create table t(i int) distributed by (i);  <waiting ...>
+1<:  <... completed>
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+1&: select gp_wait_until_triggered_fault('ckpt_loop_begin', 1, dbid) from gp_segment_configuration where role = 'p' and content = 1;  <waiting ...>
+
+-- Create a checkpoint and the XLOG_PENDING_DELETE WAL record with RelFileNode
+-- of the created table. No more creating checkpoint
+3: select gp_inject_fault_infinite('checkpoint', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+3&: checkpoint;  <waiting ...>
+1<:  <... completed>
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+1: select gp_inject_fault_infinite('ckpt_loop_end', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+1: select gp_inject_fault_infinite('ckpt_loop_begin', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+1: select gp_wait_until_triggered_fault('ckpt_loop_end', 1, dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+3<:  <... completed>
+CHECKPOINT
+3q: ... <quitting>
+1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+1: select gp_inject_fault_infinite('ckpt_loop_end', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+-- Get a segfault on the segment number 1 at the beginning of the prepared
+-- transaction commit
+1: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+1: select gp_inject_fault_infinite('end_prepare_two_phase', 'resume', dbid) from gp_segment_configuration where role = 'p' and content = 1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+1q: ... <quitting>
+2<:  <... completed>
+CREATE
+2q: ... <quitting>
+
+-- Check that the table files are not removed
+1: select * from t;
+ i 
+---
+(0 rows)
+
+-- Cleanup
+1: drop table t;
+DROP
diff --git a/src/test/isolation2/expected/gp_orphaned_files_fts_promote.out b/src/test/isolation2/expected/gp_orphaned_files_fts_promote.out
new file mode 100644
index 000000000000..66a045587581
--- /dev/null
+++ b/src/test/isolation2/expected/gp_orphaned_files_fts_promote.out
@@ -0,0 +1,356 @@
+
+include: helpers/server_helpers.sql;
+CREATE
+
+1: create or replace function getTableSegFiles (t regclass, out gp_contentid smallint, out filepath text) as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)' language sql execute on all segments;
+CREATE
+
+1: create or replace function createTables(n text, mirror_catch_up bool default true) returns text as $$ declare cmd text; /**/ begin execute 'create table t_orphaned_h'||n||'(i int) distributed by (i)'; /**/ execute 'insert into t_orphaned_h'||n||' select generate_series(1,100)'; /**/ 
+execute 'create table t_orphaned_r'||n||'(i int) with (appendonly=true, orientation=row) distributed by (i)'; /**/ /* Create the .1 file */ execute 'insert into t_orphaned_r'||n||' select generate_series(1,100)'; /**/ 
+/* Create the .128 file */ execute 'create table t_orphaned_c'||n||' with (appendonly=true, orientation=column) as select i as i, i*2 as j from generate_series(1,100) i distributed by (i)'; /**/ /* Create the .1 and .129 files */ execute 'insert into t_orphaned_c'||n||' select i as i, i*2 as j from generate_series(1,100) i'; /**/ 
+if mirror_catch_up then /* Ensure that the mirrors have applied the filesystem changes */ perform force_mirrors_to_catch_up(); /**/ end if; /**/ 
+/* The command do not output PGDATA directories to make it possible to run the test without docker */ select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) into cmd from ( select gp_contentid, 'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc from ( select gp_contentid, filepath || suf f from getTableSegFiles('t_orphaned_h'||n), (values(''), ('.1'), ('_fsm')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_r'||n), (values(''), ('.1')) v(suf) union all select gp_contentid, filepath || suf from getTableSegFiles('t_orphaned_c'||n), (values(''), ('.1'), ('.128'), ('.129')) v(suf) ) a group by gp_contentid ) f, (select content, datadir from gp_segment_configuration where content > -1) d where f.gp_contentid = d.content; /**/ 
+return cmd; /**/ end $$ language plpgsql;
+CREATE
+
+-- A copy of standard 'force_mirrors_to_catch_up()', but it forces all mirrors
+-- except the one specified by the argument
+-- (should be used in case one of mirrors is currently down).
+1: create or replace function force_mirrors_to_catch_up_with_exception(excluded_content int) returns void as $$ begin perform pg_switch_xlog(); /**/ perform pg_switch_xlog() from gp_dist_random('gp_id'); /**/ perform gp_inject_fault('after_xlog_redo_noop', 'sleep', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ perform insert_noop_xlog_record(); /**/ perform insert_noop_xlog_record() from gp_dist_random('gp_id'); /**/ perform gp_wait_until_triggered_fault('after_xlog_redo_noop', 1, dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ perform gp_inject_fault('after_xlog_redo_noop', 'reset', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/ end $$ language plpgsql;
+CREATE
+
+-- Test case 1
+-- Check removal of orphaned files together with mirror promotion
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+BEGIN
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1');
+
+-- Let 2nd transaction to commit
+2: begin;
+BEGIN
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' : select createTables('_tx2');
+2: commit;
+COMMIT
+1: checkpoint;
+CHECKPOINT
+
+-- Create another bunch of tables after savepoint
+1: savepoint sp1;
+SAVEPOINT
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1_sp1');
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files_tx1.sh;
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where c.role='p' and c.content=0), 'stop', 'immediate');
+ pg_ctl 
+--------
+ OK     
+(1 row)
+select gp_request_fts_probe_scan();
+ gp_request_fts_probe_scan 
+---------------------------
+ t                         
+(1 row)
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+ role | preferred_role | status 
+------+----------------+--------
+ m    | p              | d      
+ p    | m              | u      
+(2 rows)
+
+-- Rollback the transaction to make it possible to run queries after the error
+1: rollback;
+ROLLBACK
+
+-- Make a checkpoint to remove orphaned files from segments that are still up
+1: checkpoint;
+CHECKPOINT
+
+1: select force_mirrors_to_catch_up_with_exception(0);
+ force_mirrors_to_catch_up_with_exception 
+------------------------------------------
+                                          
+(1 row)
+
+-- Check that the tables files don't exist on the segments (except ex-primary 0, which is yet down)
+! sh /tmp/gp_orphaned_files_tx1.sh;
+7
+0
+0
+0
+0
+0
+7
+0
+0
+0
+0
+0
+
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+(exited with code 0)
+select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized 
+---------------------------------
+ OK                              
+(1 row)
+
+-- Check that the tables files don't exist on all segments now
+! sh /tmp/gp_orphaned_files_tx1.sh;
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+
+
+!\retcode gprecoverseg -ar;
+(exited with code 0)
+select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized 
+---------------------------------
+ OK                              
+(1 row)
+
+-- verify the first segment is recovered to the original state.
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+ role | preferred_role | status 
+------+----------------+--------
+ p    | p              | u      
+ m    | m              | u      
+(2 rows)
+
+-- Check that the tables from the committed transaction still exist
+! sh /tmp/gp_orphaned_files_tx2.sh;
+7
+7
+7
+7
+7
+7
+
+
+drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2;
+DROP
+
+-- Test case 2
+-- Check that orphaned files are not removed after prepare is done
+-- together with mirror promotion
+-- and with orphaned files created (and later cleaned up) when the mirror is down.
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+BEGIN
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1');
+
+-- Let 2nd transaction to commit
+2: begin;
+BEGIN
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' : select createTables('_tx2');
+2: commit;
+COMMIT
+1: checkpoint;
+CHECKPOINT
+
+-- Create another bunch of tables after savepoint
+1: savepoint sp1;
+SAVEPOINT
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' : select createTables('_tx1_sp1');
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files_tx1.sh;
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+
+
+-- Suspend commit after prepare
+select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+1&: commit;  <waiting ...>
+select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where c.role='p' and c.content=0), 'stop', 'immediate');
+ pg_ctl 
+--------
+ OK     
+(1 row)
+select gp_request_fts_probe_scan();
+ gp_request_fts_probe_scan 
+---------------------------
+ t                         
+(1 row)
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+ role | preferred_role | status 
+------+----------------+--------
+ m    | p              | d      
+ p    | m              | u      
+(2 rows)
+
+3: begin;
+BEGIN
+3: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx3.sh' : select createTables('_tx3', false);
+
+-- Get segfault on a segment
+3: select gp_inject_fault('qe_exec_finished', 'segv', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+-- The error message can be different, so ignore it
+3: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id');
+
+
+3: rollback;
+ROLLBACK
+3: checkpoint;
+CHECKPOINT
+
+3: select force_mirrors_to_catch_up_with_exception(0);
+ force_mirrors_to_catch_up_with_exception 
+------------------------------------------
+                                          
+(1 row)
+
+! sh /tmp/gp_orphaned_files_tx3.sh;
+0
+0
+0
+0
+0
+0
+
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+(exited with code 0)
+select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized 
+---------------------------------
+ OK                              
+(1 row)
+
+!\retcode gprecoverseg -ar;
+(exited with code 0)
+select wait_until_segment_synchronized(0);
+ wait_until_segment_synchronized 
+---------------------------------
+ OK                              
+(1 row)
+
+-- verify the first segment is recovered to the original state.
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+ role | preferred_role | status 
+------+----------------+--------
+ p    | p              | u      
+ m    | m              | u      
+(2 rows)
+
+select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+1<:  <... completed>
+COMMIT
+
+-- Check that the tables from the committed transactions still exist
+! sh /tmp/gp_orphaned_files_tx1.sh;
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+7
+
+! sh /tmp/gp_orphaned_files_tx2.sh;
+7
+7
+7
+7
+7
+7
+
+
+-- Check that the tables from the not committed transaction don't exist
+! sh /tmp/gp_orphaned_files_tx3.sh;
+0
+0
+0
+0
+0
+0
+
+
+-- Cleanup
+drop table t_orphaned_h_tx1, t_orphaned_r_tx1, t_orphaned_c_tx1;
+DROP
+drop table t_orphaned_h_tx1_sp1, t_orphaned_r_tx1_sp1, t_orphaned_c_tx1_sp1;
+DROP
+drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2;
+DROP
+
+drop function force_mirrors_to_catch_up_with_exception(excluded_content int);
+DROP
+drop function createTables(n text, mirror_catch_up bool);
+DROP
+drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text);
+DROP
+
+! rm /tmp/gp_orphaned_files_tx1.sh;
+
+! rm /tmp/gp_orphaned_files_tx2.sh;
+
+! rm /tmp/gp_orphaned_files_tx3.sh;
+
diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule
index d5673be13995..48e6fa37fd0e 100644
--- a/src/test/isolation2/isolation2_schedule
+++ b/src/test/isolation2/isolation2_schedule
@@ -358,4 +358,8 @@ test: copy_interrupt
 
 # test pg_locks view and pg_lock_status() function
 test: lock_status
+
 test: dependency
+
+test: gp_orphaned_files
+test: gp_orphaned_files_fts_promote
diff --git a/src/test/isolation2/sql/gp_orphaned_files.sql b/src/test/isolation2/sql/gp_orphaned_files.sql
new file mode 100644
index 000000000000..1678f2ce7d87
--- /dev/null
+++ b/src/test/isolation2/sql/gp_orphaned_files.sql
@@ -0,0 +1,325 @@
+-- start_ignore
+1: create extension if not exists gp_inject_fault;
+1: drop index if exists t_orphaned_r1_i, t_orphaned_c1_i,
+                        t_orphaned_r2_i, t_orphaned_c2_i;
+1: drop table if exists t_orphaned_h1, t_orphaned_r1, t_orphaned_c1,
+                        t_orphaned_h2, t_orphaned_r2, t_orphaned_c2, t;
+-- end_ignore
+
+
+-- Test case 1
+-- Check that orphaned files are not left on the coordinator and the standby
+-- when the files are created before checkpoint
+
+-- Create tables of different access methods and return command to check their
+-- files existence on the coordinator and the standby
+1: create or replace function createTables(n text) returns text as
+$$
+declare
+  cmd text; /**/
+begin
+  execute 'create table t_orphaned_h'||n||'(i int) distributed by (i)'; /**/
+
+  execute 'create table t_orphaned_r'||n||'(i int)
+           with (appendonly=true, orientation=row)
+           distributed by (i)'; /**/
+  -- Create index to create block directory table
+  execute 'create index t_orphaned_r'||n||'_i on t_orphaned_r'||n||'(i)'; /**/
+
+  execute 'create table t_orphaned_c'||n||'(i int)
+           with (appendonly=true, orientation=column)
+           distributed by (i)'; /**/
+  /* Create index to create block directory table */
+  execute 'create index t_orphaned_c'||n||'_i on t_orphaned_c'||n||'(i)'; /**/
+
+  /* Ensure that the mirrors have applied the filesystem changes */
+  perform force_mirrors_to_catch_up(); /**/
+
+  /* The command do not output PGDATA directories to make it possible to run
+     the test without docker */
+  select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ')
+                 || ' 2>/dev/null | wc -l' lswc
+    from (
+      select unnest(array[('t_orphaned_h'||n)::regclass,
+                          ('t_orphaned_r'||n)::regclass,
+                          ('t_orphaned_r'||n||'_i')::regclass,
+                          ('t_orphaned_c'||n)::regclass,
+                          ('t_orphaned_c'||n||'_i')::regclass])
+      union all
+      select unnest(array[segrelid,
+                          blkdirrelid, blkdiridxid,
+                          visimaprelid, visimapidxid])
+        from pg_catalog.pg_appendonly
+       where relid in (('t_orphaned_r'||n)::regclass,
+                       ('t_orphaned_c'||n)::regclass)
+    ) a
+  ) f,
+  (select datadir from gp_segment_configuration where content = -1) d; /**/
+
+  return cmd; /**/
+end
+$$ language plpgsql;
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' :
+             select createTables('1') check_files;
+
+2: begin;
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' :
+             select createTables('2') check_files;
+
+1: checkpoint;
+
+-- Make sure that the tables files exist on the coordinator and the standby
+1: ! sh /tmp/gp_orphaned_files.sh;
+
+-- Get segfault on the coordinator and reconnect after its restart
+1: select gp_inject_fault('exec_simple_query_start', 'segv', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = -1;
+
+-- The error message can be different, so ignore it
+1: @post_run 'echo ""' : select 1;
+-- Wait for the coordinator to be recovered
+! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done;
+1q:
+2q:
+
+1: select force_mirrors_to_catch_up();
+
+-- Check that the tables files don't exist on the coordinator and the standby
+! sh /tmp/gp_orphaned_files.sh;
+
+-- Cleanup
+! rm /tmp/gp_orphaned_files.sh;
+1: drop function createTables(n text);
+
+
+-- Test case 2
+-- Check that orphaned files are not left on segments when the files are created
+-- before checkpoint
+
+1: create or replace function getTableSegFiles
+(t regclass, out gp_contentid smallint, out filepath text)
+as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)'
+language sql
+execute on all segments;
+
+1: create or replace function createTables(n text) returns text as
+$$
+declare
+  cmd text; /**/
+begin
+  /* Minimal fillfactor to minimize rows number for creating second main fork
+     file */
+  execute 'create table t_orphaned_h'||n||'(i int)
+           with (fillfactor=10)
+           distributed by (i)'; /**/
+  /* Create the .1 file. Separate insert to create FSM. */
+  execute 'insert into t_orphaned_h'||n||'
+           select generate_series(1,9000000)'; /**/
+
+  execute 'create table t_orphaned_r'||n||'(i int)
+           with (appendonly=true, orientation=row)
+           distributed by (i)'; /**/
+  /* Create the .1 file */
+  execute 'insert into t_orphaned_r'||n||'
+           select generate_series(1,100)'; /**/
+
+  /* Create the .128 file */
+  execute 'create table t_orphaned_c'||n||'
+           with (appendonly=true, orientation=column) as
+           select i as i, i*2 as j from generate_series(1,100) i
+           distributed by (i)'; /**/
+  /* Create the .1 and .129 files */
+  execute 'insert into t_orphaned_c'||n||'
+           select i as i, i*2 as j from generate_series(1,100) i'; /**/
+
+  /* Ensure that the mirrors have applied the filesystem changes */
+  perform force_mirrors_to_catch_up(); /**/
+
+  /* The command do not output PGDATA directories to make it possible to run
+     the test without docker */
+  select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select gp_contentid,
+           'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc
+    from (
+      select gp_contentid, filepath || suf f
+        from getTableSegFiles('t_orphaned_h'||n),
+             (values(''), ('.1'), ('_fsm')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_r'||n),
+             (values(''), ('.1')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_c'||n),
+             (values(''), ('.1'), ('.128'), ('.129')) v(suf)
+    ) a
+    group by gp_contentid
+  ) f,
+  (select content, datadir from gp_segment_configuration where content > -1) d
+  where f.gp_contentid = d.content; /**/
+
+  return cmd; /**/
+end
+$$ language plpgsql;
+
+-- Test case 2.1
+-- Segfault on all segments
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' :
+             select createTables('1') check_files;
+
+2: begin;
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' :
+             select createTables('2') check_files;
+
+1: checkpoint;
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files.sh;
+
+-- Get segfault on all segments
+1: select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content != -1;
+
+-- The error message can be different, so ignore it
+1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id');
+
+-- Rollback the transaction to make it possible to run queries after the error
+1: rollback;
+2: rollback;
+
+1: select force_mirrors_to_catch_up();
+
+-- Check that the tables files don't exist on the segments
+! sh /tmp/gp_orphaned_files.sh;
+
+
+-- Test case 2.2
+-- Segfault on one segment
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files.sh' :
+             select createTables('1') check_files;
+
+2: begin;
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files.sh' :
+             select createTables('2') check_files;
+
+1: checkpoint;
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files.sh;
+
+-- Get segfault on a segment
+1: select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+
+-- The error message can be different, so ignore it
+1: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id');
+
+-- Rollback the transaction to make it possible to run queries after the error
+1: rollback;
+2: rollback;
+
+1: select force_mirrors_to_catch_up();
+
+-- Make a checkpoint to remove orphaned files from segments where segfault did
+-- not happen
+1: select gp_inject_fault_infinite('checkpoint', 'reset', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content > -1;
+1: checkpoint;
+
+-- Check that the tables files don't exist on the segments
+! sh /tmp/gp_orphaned_files.sh;
+
+
+-- Cleanup
+! rm /tmp/gp_orphaned_files.sh;
+1: drop function createTables(n text);
+1: drop function getTableSegFiles
+   (t regclass, out gp_contentid smallint, out filepath text);
+
+
+-- Test case 3
+-- Check that table files are not deleted in the case of prepared transaction
+
+-- Don't create checkpoints on the segment number 1
+1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+
+-- Stop after `MyPgXact->delayChkpt = false` and before `PostPrepare_smgr()`
+-- Stop at the beginning of the checkpointer loop
+1: select gp_inject_fault_infinite('end_prepare_two_phase', 'suspend', dbid),
+          gp_inject_fault_infinite('ckpt_loop_begin', 'suspend', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+
+1&: select gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+
+2&: create table t(i int) distributed by (i);
+1<:
+
+1&: select gp_wait_until_triggered_fault('ckpt_loop_begin', 1, dbid)
+      from gp_segment_configuration
+     where role = 'p' and content = 1;
+
+-- Create a checkpoint and the XLOG_PENDING_DELETE WAL record with RelFileNode
+-- of the created table. No more creating checkpoint
+3: select gp_inject_fault_infinite('checkpoint', 'reset', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+3&: checkpoint;
+1<:
+1: select gp_inject_fault_infinite('ckpt_loop_end', 'suspend', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+1: select gp_inject_fault_infinite('ckpt_loop_begin', 'reset', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+1: select gp_wait_until_triggered_fault('ckpt_loop_end', 1, dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+3<:
+3q:
+1: select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+1: select gp_inject_fault_infinite('ckpt_loop_end', 'reset', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+
+-- Get a segfault on the segment number 1 at the beginning of the prepared
+-- transaction commit
+1: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'segv', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+1: select gp_inject_fault_infinite('end_prepare_two_phase', 'resume', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 1;
+1q:
+2<:
+2q:
+
+-- Check that the table files are not removed
+1: select * from t;
+
+-- Cleanup
+1: drop table t;
diff --git a/src/test/isolation2/sql/gp_orphaned_files_fts_promote.sql b/src/test/isolation2/sql/gp_orphaned_files_fts_promote.sql
new file mode 100644
index 000000000000..12257b9f528b
--- /dev/null
+++ b/src/test/isolation2/sql/gp_orphaned_files_fts_promote.sql
@@ -0,0 +1,248 @@
+-- start_ignore
+-- Increase the number of connection attempts to a segment to 120, reduce
+-- the interval between attempts to 1 second. So the segments will have 120
+-- seconds to recover after segfault.
+! gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
+! gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
+! gpstop -u;
+1: create extension if not exists gp_inject_fault;
+-- end_ignore
+
+include: helpers/server_helpers.sql;
+
+1: create or replace function getTableSegFiles
+(t regclass, out gp_contentid smallint, out filepath text)
+as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)'
+language sql
+execute on all segments;
+
+1: create or replace function createTables(n text, mirror_catch_up bool default true) returns text as
+$$
+declare
+  cmd text; /**/
+begin
+  execute 'create table t_orphaned_h'||n||'(i int)
+           distributed by (i)'; /**/
+  execute 'insert into t_orphaned_h'||n||'
+           select generate_series(1,100)'; /**/
+
+  execute 'create table t_orphaned_r'||n||'(i int)
+           with (appendonly=true, orientation=row)
+           distributed by (i)'; /**/
+  /* Create the .1 file */
+  execute 'insert into t_orphaned_r'||n||'
+           select generate_series(1,100)'; /**/
+
+  /* Create the .128 file */
+  execute 'create table t_orphaned_c'||n||'
+           with (appendonly=true, orientation=column) as
+           select i as i, i*2 as j from generate_series(1,100) i
+           distributed by (i)'; /**/
+  /* Create the .1 and .129 files */
+  execute 'insert into t_orphaned_c'||n||'
+           select i as i, i*2 as j from generate_series(1,100) i'; /**/
+
+  if mirror_catch_up then
+	/* Ensure that the mirrors have applied the filesystem changes */
+	perform force_mirrors_to_catch_up(); /**/
+  end if; /**/
+
+  /* The command do not output PGDATA directories to make it possible to run
+     the test without docker */
+  select string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select gp_contentid,
+           'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc
+    from (
+      select gp_contentid, filepath || suf f
+        from getTableSegFiles('t_orphaned_h'||n),
+             (values(''), ('.1'), ('_fsm')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_r'||n),
+             (values(''), ('.1')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_c'||n),
+             (values(''), ('.1'), ('.128'), ('.129')) v(suf)
+    ) a
+    group by gp_contentid
+  ) f,
+  (select content, datadir from gp_segment_configuration where content > -1) d
+  where f.gp_contentid = d.content; /**/
+
+  return cmd; /**/
+end
+$$ language plpgsql;
+
+-- A copy of standard 'force_mirrors_to_catch_up()', but it forces all mirrors
+-- except the one specified by the argument
+-- (should be used in case one of mirrors is currently down).
+1: create or replace function force_mirrors_to_catch_up_with_exception(excluded_content int) returns void as
+$$
+begin
+	perform pg_switch_xlog(); /**/
+	perform pg_switch_xlog() from gp_dist_random('gp_id'); /**/
+	perform gp_inject_fault('after_xlog_redo_noop', 'sleep', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/
+	perform insert_noop_xlog_record(); /**/
+	perform insert_noop_xlog_record() from gp_dist_random('gp_id'); /**/
+	perform gp_wait_until_triggered_fault('after_xlog_redo_noop', 1, dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/
+	perform gp_inject_fault('after_xlog_redo_noop', 'reset', dbid) from gp_segment_configuration where role='m' and content <> excluded_content; /**/
+end
+$$ language plpgsql;
+
+-- Test case 1
+-- Check removal of orphaned files together with mirror promotion
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' :
+             select createTables('_tx1');
+
+-- Let 2nd transaction to commit
+2: begin;
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' :
+             select createTables('_tx2');
+2: commit;
+1: checkpoint;
+
+-- Create another bunch of tables after savepoint
+1: savepoint sp1;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' :
+             select createTables('_tx1_sp1');
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files_tx1.sh;
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c
+  where c.role='p' and c.content=0), 'stop', 'immediate');
+select gp_request_fts_probe_scan();
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+
+-- Rollback the transaction to make it possible to run queries after the error
+1: rollback;
+
+-- Make a checkpoint to remove orphaned files from segments that are still up
+1: checkpoint;
+
+1: select force_mirrors_to_catch_up_with_exception(0);
+
+-- Check that the tables files don't exist on the segments (except ex-primary 0, which is yet down)
+! sh /tmp/gp_orphaned_files_tx1.sh;
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+select wait_until_segment_synchronized(0);
+
+-- Check that the tables files don't exist on all segments now
+! sh /tmp/gp_orphaned_files_tx1.sh;
+
+!\retcode gprecoverseg -ar;
+select wait_until_segment_synchronized(0);
+
+-- verify the first segment is recovered to the original state.
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+
+-- Check that the tables from the committed transaction still exist
+! sh /tmp/gp_orphaned_files_tx2.sh;
+
+drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2;
+
+-- Test case 2
+-- Check that orphaned files are not removed after prepare is done
+-- together with mirror promotion
+-- and with orphaned files created (and later cleaned up) when the mirror is down.
+
+-- Start transaction and create tables in it before checkpoint
+1: begin;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx1.sh' :
+             select createTables('_tx1');
+
+-- Let 2nd transaction to commit
+2: begin;
+2: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx2.sh' :
+             select createTables('_tx2');
+2: commit;
+1: checkpoint;
+
+-- Create another bunch of tables after savepoint
+1: savepoint sp1;
+1: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' >> /tmp/gp_orphaned_files_tx1.sh' :
+             select createTables('_tx1_sp1');
+
+-- Make sure that all the tables files exist on the segments
+1: ! sh /tmp/gp_orphaned_files_tx1.sh;
+
+-- Suspend commit after prepare
+select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+
+1&: commit;
+select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+
+-- shutdown primary and make sure the segment is down
+-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c
+  where c.role='p' and c.content=0), 'stop', 'immediate');
+select gp_request_fts_probe_scan();
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+
+3: begin;
+3: @post_run 'echo "${RAW_STR}" | awk \'NR==3\' > /tmp/gp_orphaned_files_tx3.sh' :
+             select createTables('_tx3', false);
+
+-- Get segfault on a segment
+3: select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+     from gp_segment_configuration
+    where role = 'p' and content = 0;
+
+-- The error message can be different, so ignore it
+3: @post_run 'echo ""' : select 1 from gp_dist_random('gp_id');
+
+3: rollback;
+3: checkpoint;
+
+3: select force_mirrors_to_catch_up_with_exception(0);
+
+! sh /tmp/gp_orphaned_files_tx3.sh;
+
+-- recovery the nodes
+!\retcode gprecoverseg -a;
+select wait_until_segment_synchronized(0);
+
+!\retcode gprecoverseg -ar;
+select wait_until_segment_synchronized(0);
+
+-- verify the first segment is recovered to the original state.
+select role, preferred_role, status from gp_segment_configuration where content = 0;
+
+select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid)
+  from gp_segment_configuration where role = 'p' and content = -1;
+1<:
+
+-- Check that the tables from the committed transactions still exist
+! sh /tmp/gp_orphaned_files_tx1.sh;
+! sh /tmp/gp_orphaned_files_tx2.sh;
+
+-- Check that the tables from the not committed transaction don't exist
+! sh /tmp/gp_orphaned_files_tx3.sh;
+
+-- Cleanup
+drop table t_orphaned_h_tx1, t_orphaned_r_tx1, t_orphaned_c_tx1;
+drop table t_orphaned_h_tx1_sp1, t_orphaned_r_tx1_sp1, t_orphaned_c_tx1_sp1;
+drop table t_orphaned_h_tx2, t_orphaned_r_tx2, t_orphaned_c_tx2;
+
+drop function force_mirrors_to_catch_up_with_exception(excluded_content int);
+drop function createTables(n text, mirror_catch_up bool);
+drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text);
+
+! rm /tmp/gp_orphaned_files_tx1.sh;
+! rm /tmp/gp_orphaned_files_tx2.sh;
+! rm /tmp/gp_orphaned_files_tx3.sh;
+-- start_ignore
+! gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly;
+! gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly;
+! gpstop -u;
+-- end_ignore
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 2a940fe40fcb..255c443b1d80 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,5 +7,6 @@ include $(top_builddir)/src/Makefile.global
 SUBDIRS = test_planner
 SUBDIRS += connection 
 SUBDIRS += test_extensions 
+SUBDIRS += test_dsa
 
 $(recurse)
diff --git a/src/test/modules/test_dsa/Makefile b/src/test/modules/test_dsa/Makefile
new file mode 100644
index 000000000000..bcddb84b0618
--- /dev/null
+++ b/src/test/modules/test_dsa/Makefile
@@ -0,0 +1,28 @@
+# src/test/modules/test_dsa/Makefile
+
+MODULE_big = test_dsa
+OBJS = \
+	$(WIN32RES) \
+	test_dsa.o
+PGFILEDESC = "test_dsa - test code for dynamic shared memory areas"
+
+EXTENSION = test_dsa
+DATA = test_dsa--1.0.sql
+
+REGRESS = test_dsa
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_dsa
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+installcheck: install
+
+test: clean all install
+	psql postgres -f sql/test_dsa.sql 2>&1
diff --git a/src/test/modules/test_dsa/expected/test_dsa.out b/src/test/modules/test_dsa/expected/test_dsa.out
new file mode 100644
index 000000000000..266010e77fe9
--- /dev/null
+++ b/src/test/modules/test_dsa/expected/test_dsa.out
@@ -0,0 +1,13 @@
+CREATE EXTENSION test_dsa;
+SELECT test_dsa_basic();
+ test_dsa_basic 
+----------------
+ 
+(1 row)
+
+SELECT test_dsa_resowners();
+ test_dsa_resowners 
+--------------------
+ 
+(1 row)
+
diff --git a/src/test/modules/test_dsa/meson.build b/src/test/modules/test_dsa/meson.build
new file mode 100644
index 000000000000..21738290ad58
--- /dev/null
+++ b/src/test/modules/test_dsa/meson.build
@@ -0,0 +1,33 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+test_dsa_sources = files(
+  'test_dsa.c',
+)
+
+if host_system == 'windows'
+  test_dsa_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_dsa',
+    '--FILEDESC', 'test_dsa - test code for dynamic shared memory areas',])
+endif
+
+test_dsa = shared_module('test_dsa',
+  test_dsa_sources,
+  kwargs: pg_test_mod_args,
+)
+test_install_libs += test_dsa
+
+test_install_data += files(
+  'test_dsa.control',
+  'test_dsa--1.0.sql',
+)
+
+tests += {
+  'name': 'test_dsa',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'test_dsa',
+    ],
+  },
+}
diff --git a/src/test/modules/test_dsa/sql/test_dsa.sql b/src/test/modules/test_dsa/sql/test_dsa.sql
new file mode 100644
index 000000000000..c3d8db943720
--- /dev/null
+++ b/src/test/modules/test_dsa/sql/test_dsa.sql
@@ -0,0 +1,4 @@
+CREATE EXTENSION test_dsa;
+
+SELECT test_dsa_basic();
+SELECT test_dsa_resowners();
diff --git a/src/test/modules/test_dsa/test_dsa--1.0.sql b/src/test/modules/test_dsa/test_dsa--1.0.sql
new file mode 100644
index 000000000000..2904cb23525e
--- /dev/null
+++ b/src/test/modules/test_dsa/test_dsa--1.0.sql
@@ -0,0 +1,12 @@
+/* src/test/modules/test_dsa/test_dsa--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_dsa" to load this file. \quit
+
+CREATE FUNCTION test_dsa_basic()
+	RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION test_dsa_resowners()
+	RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c
new file mode 100644
index 000000000000..5ad4c405d79f
--- /dev/null
+++ b/src/test/modules/test_dsa/test_dsa.c
@@ -0,0 +1,111 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_dsa.c
+ *		Test dynamic shared memory areas (DSAs)
+ *
+ * Copyright (c) 2022-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/test/modules/test_dsa/test_dsa.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "utils/dsa.h"
+#include "storage/lwlock.h"
+#include "utils/resowner.h"
+
+PG_MODULE_MAGIC;
+
+/* Smoke test: allocate, fill, verify and free 100 chunks in a new DSA area */
+PG_FUNCTION_INFO_V1(test_dsa_basic);
+Datum
+test_dsa_basic(PG_FUNCTION_ARGS)
+{
+	dsa_pointer chunks[100];
+	dsa_area   *area;
+	int			tranche;
+	int			idx;
+
+	/* XXX: this tranche id is never released, i.e. it is leaked */
+	tranche = LWLockNewTrancheId();
+	area = dsa_create(tranche, "test_dsa");
+
+	/* Fill every chunk with a string that encodes its own index */
+	for (idx = 0; idx < 100; idx++)
+	{
+		chunks[idx] = dsa_allocate(area, 1000);
+		snprintf(dsa_get_address(area, chunks[idx]), 1000, "foobar%d", idx);
+	}
+
+	for (idx = 0; idx < 100; idx++)
+	{
+		char		expected[100];
+		char	   *stored = dsa_get_address(area, chunks[idx]);
+
+		snprintf(expected, sizeof(expected), "foobar%d", idx);
+		if (strcmp(stored, expected) != 0)
+			elog(ERROR, "no match");
+	}
+
+	for (idx = 0; idx < 100; idx++)
+		dsa_free(area, chunks[idx]);
+
+	dsa_detach(area);
+	PG_RETURN_VOID();
+}
+
+/* Exercise a DSA area under a child resource owner that is released first */
+PG_FUNCTION_INFO_V1(test_dsa_resowners);
+Datum
+test_dsa_resowners(PG_FUNCTION_ARGS)
+{
+	ResourceOwner parent;
+	ResourceOwner child;
+	dsa_pointer chunks[10000];
+	dsa_area   *area;
+	int			tranche;
+	int			idx;
+
+	/* XXX: this tranche id is never released, i.e. it is leaked */
+	tranche = LWLockNewTrancheId();
+
+	/* The area itself is created while the parent owner is still current */
+	area = dsa_create(tranche, "test_dsa");
+
+	/*
+	 * Make a fresh child of the current owner and switch to it, so that the
+	 * allocations below happen while the child is CurrentResourceOwner.
+	 */
+	parent = CurrentResourceOwner;
+	child = ResourceOwnerCreate(parent, "test_dsa temp owner");
+	CurrentResourceOwner = child;
+
+	for (idx = 0; idx < 10000; idx++)
+	{
+		chunks[idx] = dsa_allocate(area, 1000);
+		snprintf(dsa_get_address(area, chunks[idx]), 1000, "foobar%d", idx);
+	}
+
+	/* Free the first 500 chunks so that dsa_free() is exercised as well */
+	for (idx = 0; idx < 500; idx++)
+		dsa_free(area, chunks[idx]);
+
+	/*
+	 * Restore the parent owner, then run the child through all three
+	 * release phases in order before deleting it.
+	 */
+	CurrentResourceOwner = parent;
+
+	ResourceOwnerRelease(child, RESOURCE_RELEASE_BEFORE_LOCKS, true, false);
+	ResourceOwnerRelease(child, RESOURCE_RELEASE_LOCKS, true, false);
+	ResourceOwnerRelease(child, RESOURCE_RELEASE_AFTER_LOCKS, true, false);
+	ResourceOwnerDelete(child);
+
+	/* The area was created under the parent owner; detach it explicitly */
+	dsa_detach(area);
+
+	PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_dsa/test_dsa.control b/src/test/modules/test_dsa/test_dsa.control
new file mode 100644
index 000000000000..ac9674b2193d
--- /dev/null
+++ b/src/test/modules/test_dsa/test_dsa.control
@@ -0,0 +1,4 @@
+comment = 'Test code for dynamic shared memory areas'
+default_version = '1.0'
+module_pathname = '$libdir/test_dsa'
+relocatable = true
diff --git a/src/test/regress/expected/gp_orphaned_files.out b/src/test/regress/expected/gp_orphaned_files.out
new file mode 100644
index 000000000000..c8cc49b76b8f
--- /dev/null
+++ b/src/test/regress/expected/gp_orphaned_files.out
@@ -0,0 +1,437 @@
+-- start_ignore
+-- end_ignore
+-- start_matchsubs
+-- m/ERROR:  Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/
+-- s/ERROR:  Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/ERROR:  Error on receive from segX sliceX X.X.X.X:X pid=X: server closed the connection unexpectedly/
+-- end_matchsubs
+-- Test case 1
+-- Check that orphaned files are not left on the coordinator and the standby
+-- when the files are created after checkpoint
+-- Create tables of different access methods and return command to check their
+-- files existence on the coordinator and the standby
+create or replace function createTables() returns text as
+$$
+declare
+  cmd text;
+begin
+  create table t_orphaned_h(i int)
+  distributed by (i);
+
+  create table t_orphaned_r(i int)
+  with (appendonly=true, orientation=row)
+  distributed by (i);
+  -- Create index to create block directory table
+  create index t_orphaned_r_i on t_orphaned_r(i);
+
+  create table t_orphaned_c(i int)
+  with (appendonly=true, orientation=column)
+  distributed by (i);
+  -- Create index to create block directory table
+  create index t_orphaned_c_i on t_orphaned_c(i);
+
+  -- Ensure that the mirrors have applied the filesystem changes
+  perform force_mirrors_to_catch_up();
+
+  -- The command do not output PGDATA directories to make it possible to run
+  -- the test without docker
+  select '\! ' ||
+         string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ')
+                 || ' 2>/dev/null | wc -l' lswc
+    from (
+      select unnest(array['t_orphaned_h'::regclass,
+                          't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass,
+                          't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass])
+      union all
+      select unnest(array[segrelid,
+                          blkdirrelid, blkdiridxid,
+                          visimaprelid, visimapidxid])
+        from pg_catalog.pg_appendonly
+       where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass)
+    ) a
+  ) f,
+  (select datadir from gp_segment_configuration where content = -1) d;
+
+  return cmd;
+end
+$$ language plpgsql;
+checkpoint;
+-- Skip checkpoints on the coordinator
+select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = -1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:
+(1 row)
+
+-- Create tables in subtransactions
+begin;
+create table t_top(i int) distributed by (i);
+savepoint sp1;
+create table t_sub1(i int) distributed by (i);
+savepoint sp2;
+create table t_sub2(i int) distributed by (i);
+commit;
+-- Start transaction and create tables in it
+begin;
+select createTables() check_files
+\gset
+ 
+-- Make sure that the tables files exist on the coordinator and the standby
+:check_files
+15
+15
+-- Get segfault on the coordinator and reconnect after its restart
+select gp_inject_fault('exec_simple_query_start', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+-- The error message from psql can be different, so ignore it
+\! psql postgres -c "select 1" 2> /dev/null
+-- Wait for the coordinator to be recovered
+\! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done;
+\c regression
+-- All the inject faults have been reset after the coordinator restart
+select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+ 
+(1 row)
+
+-- Check that the tables files don't exist on the coordinator and the standby
+:check_files
+0
+0
+-- Check that the coordinator recovery didn't remove files of the tables which
+-- were created in subtransactions
+table t_sub1;
+ i 
+---
+(0 rows)
+
+table t_sub2;
+ i 
+---
+(0 rows)
+
+-- Clean up
+drop table t_top, t_sub1, t_sub2;
+\unset check_files
+-- Test case 2
+-- Check that files are left untouched on the coordinator and the standby
+-- when the corresponding distributed commit record exists in WAL
+select gp_inject_fault('dtm_xlog_distributed_commit', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = -1;
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+-- Create tables in a transaction. Get segfault right after the distributed
+-- commit record is flushed
+\! psql regression -c "begin; select createTables(); commit;"
+server closed the connection unexpectedly
+	This probably means the server terminated abnormally
+	before or while processing the request.
+connection to server was lost
+-- Wait for the coordinator to be recovered
+\! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done;
+\c regression
+select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+ 
+(1 row)
+
+-- Check that all the tables and its indexes files exist
+select '\! ' ||
+       string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) lswc
+  from (
+    select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ')
+                 || ' 2>/dev/null | wc -l' lswc
+    from (
+      select unnest(array['t_orphaned_h'::regclass,
+                          't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass,
+                          't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass])
+      union all
+      select unnest(array[segrelid,
+                          blkdirrelid, blkdiridxid,
+                          visimaprelid, visimapidxid])
+        from pg_catalog.pg_appendonly
+       where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass)
+    ) a
+  ) f,
+  (select datadir from gp_segment_configuration where content = -1) d
+\gset
+:lswc
+15
+15
+-- Check that we can read data from the tables
+table t_orphaned_h;
+ i 
+---
+(0 rows)
+
+table t_orphaned_r;
+ i 
+---
+(0 rows)
+
+table t_orphaned_c;
+ i 
+---
+(0 rows)
+
+-- Clean up
+drop table t_orphaned_h, t_orphaned_r, t_orphaned_c;
+drop function createTables();
+-- Test case 3
+-- Check that orphaned files are not left on segments when the files are
+-- created after checkpoint
+create or replace function getTableSegFiles
+(t regclass, out gp_contentid smallint, out filepath text)
+as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)'
+language sql
+execute on all segments;
+-- Get list of the tables file names on each segment
+create or replace function createTables() returns text as
+$$
+declare
+  cmd text;
+begin
+  -- Minimal fillfactor to minimize rows number for creating second main fork file
+  create table t_orphaned_h(i int)
+  with (fillfactor=10)
+  distributed by (i);
+  -- Create the .1 file. Separate insert to create FSM. 
+  insert into t_orphaned_h select generate_series(1,9000000);
+
+  create table t_orphaned_r(i int)
+  with (appendonly=true, orientation=row)
+  distributed by (i);
+  -- Create the .1 file
+  insert into t_orphaned_r select generate_series(1,100);
+
+  -- Create the .128 file
+  create table t_orphaned_c
+  with (appendonly=true, orientation=column) as
+  select i as i, i*2 as j from generate_series(1,100) i
+  distributed by (i);
+  -- Create the .1 and .129 files
+  insert into t_orphaned_c
+  select i as i, i*2 as j from generate_series(1,100) i;
+
+  -- Ensure that the mirrors have applied the filesystem changes
+  perform force_mirrors_to_catch_up();
+
+  -- The command do not output PGDATA directories to make it possible to run
+  -- the test without docker
+  select '\! ' ||
+         string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select gp_contentid,
+           'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc
+    from (
+      select gp_contentid, filepath || suf f
+        from getTableSegFiles('t_orphaned_h'),
+             (values(''), ('.1'), ('_fsm')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_r'),
+             (values(''), ('.1')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_c'),
+             (values(''), ('.1'), ('.128'), ('.129')) v(suf)
+    ) a
+    group by gp_contentid
+  ) f,
+  (select content, datadir from gp_segment_configuration where content > -1) d
+  where f.gp_contentid = d.content;
+
+  return cmd;
+end
+$$ language plpgsql;
+-- Test case 3.1
+-- Segfault on all segments
+checkpoint;
+-- Skip checkpoints
+select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content > -1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+
+-- Create tables in subtransactions
+begin;
+create table t_top(i int) distributed by (i);
+savepoint sp1;
+create table t_sub1(i int) distributed by (i);
+savepoint sp2;
+create table t_sub2(i int) distributed by (i);
+commit;
+-- Start transaction and create tables in it
+begin;
+select createTables() check_files
+\gset
+-- Make sure that all the tables files exist on the segments
+:check_files
+9
+9
+9
+9
+9
+9
+-- Get segfault on all segments
+select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content != -1;
+ gp_inject_fault 
+-----------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+
+select 1 from gp_dist_random('gp_id');
+ERROR:  Error on receive from seg0 slice1 127.0.0.1:6002 pid=45456: server closed the connection unexpectedly
+	This probably means the server terminated abnormally
+	before or while processing the request.
+-- Rollback the transaction to make it possible to run queries after the error
+rollback;
+select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+ 
+(1 row)
+
+-- Check that the tables files don't exist on the segments
+:check_files
+0
+0
+0
+0
+0
+0
+-- Check that the segments recovery didn't remove files of the tables which
+-- were created in subtransactions
+table t_sub1;
+ i 
+---
+(0 rows)
+
+table t_sub2;
+ i 
+---
+(0 rows)
+
+-- Clean up
+drop table t_top, t_sub1, t_sub2;
+-- Test case 3.2
+-- Segfault on one segment
+checkpoint;
+-- Skip checkpoints
+select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content > -1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+
+-- Create tables in subtransactions
+begin;
+create table t_top(i int) distributed by (i);
+savepoint sp1;
+create table t_sub1(i int) distributed by (i);
+savepoint sp2;
+create table t_sub2(i int) distributed by (i);
+commit;
+-- Start transaction and create tables in it
+begin;
+select createTables() check_files
+\gset
+-- Make sure that all the tables files exist on the segments
+:check_files
+9
+9
+9
+9
+9
+9
+-- Get segfault on a segment
+select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = 1;
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+select 1 from gp_dist_random('gp_id');
+ERROR:  Error on receive from seg1 slice1 127.0.0.1:6003 pid=64482: server closed the connection unexpectedly
+	This probably means the server terminated abnormally
+	before or while processing the request.
+-- Rollback the transaction to make it possible to run queries after the error
+rollback;
+select force_mirrors_to_catch_up();
+ force_mirrors_to_catch_up 
+---------------------------
+ 
+(1 row)
+
+-- Make a checkpoint to remove orphaned files from segments where segfault did
+-- not happen
+select gp_inject_fault_infinite('checkpoint', 'reset', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content > -1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+
+checkpoint;
+-- Check that the tables files don't exist on the segments
+:check_files
+0
+0
+0
+0
+0
+0
+-- Check that the segment recovery didn't remove files of the tables which
+-- were created in subtransactions
+table t_sub1;
+ i 
+---
+(0 rows)
+
+table t_sub2;
+ i 
+---
+(0 rows)
+
+-- Clean up
+\unset check_files
+drop table t_top, t_sub1, t_sub2;
+drop function createTables();
+drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text);
+-- start_ignore
+-- end_ignore
diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule
index 42216d195377..bb84028de92a 100755
--- a/src/test/regress/greenplum_schedule
+++ b/src/test/regress/greenplum_schedule
@@ -313,4 +313,6 @@ test: gp_check_files
 
 test: gp_query_id
 
+test: gp_orphaned_files
+
 # end of tests
diff --git a/src/test/regress/sql/gp_orphaned_files.sql b/src/test/regress/sql/gp_orphaned_files.sql
new file mode 100644
index 000000000000..7af4dd17fc30
--- /dev/null
+++ b/src/test/regress/sql/gp_orphaned_files.sql
@@ -0,0 +1,360 @@
+-- start_ignore
+create extension if not exists gp_inject_fault;
+drop index if exists t_orphaned_r_i, t_orphaned_c_i;
+drop table if exists t_orphaned_h, t_orphaned_r, t_orphaned_c,
+                     t_top, t_sub1, t_sub2;
+-- Increase the number of connection attempts to a segment to 120 and reduce
+-- the interval between attempts to 1 second, so the segments have 120
+-- seconds to recover after a segfault. The demo cluster doesn't fail over to
+-- a mirror if 120 seconds is enough for recovery.
+\! gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly
+\! gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly
+\! gpstop -u
+-- end_ignore
+
+-- start_matchsubs
+-- m/ERROR:  Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/
+-- s/ERROR:  Error on receive from seg\d+ slice\d+ \d+.\d+.\d+.\d+:\d+ pid=\d+: server closed the connection unexpectedly/ERROR:  Error on receive from segX sliceX X.X.X.X:X pid=X: server closed the connection unexpectedly/
+-- end_matchsubs
+
+-- Test case 1
+-- Check that orphaned files are not left on the coordinator and the standby
+-- when the files are created after checkpoint
+
+-- Create tables of different access methods and return command to check their
+-- files existence on the coordinator and the standby
+create or replace function createTables() returns text as
+$$
+declare
+  cmd text;
+begin
+  create table t_orphaned_h(i int)
+  distributed by (i);
+
+  create table t_orphaned_r(i int)
+  with (appendonly=true, orientation=row)
+  distributed by (i);
+  -- Create index to create block directory table
+  create index t_orphaned_r_i on t_orphaned_r(i);
+
+  create table t_orphaned_c(i int)
+  with (appendonly=true, orientation=column)
+  distributed by (i);
+  -- Create index to create block directory table
+  create index t_orphaned_c_i on t_orphaned_c(i);
+
+  -- Ensure that the mirrors have applied the filesystem changes
+  perform force_mirrors_to_catch_up();
+
+  -- The command do not output PGDATA directories to make it possible to run
+  -- the test without docker
+  select '\! ' ||
+         string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ')
+                 || ' 2>/dev/null | wc -l' lswc
+    from (
+      select unnest(array['t_orphaned_h'::regclass,
+                          't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass,
+                          't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass])
+      union all
+      select unnest(array[segrelid,
+                          blkdirrelid, blkdiridxid,
+                          visimaprelid, visimapidxid])
+        from pg_catalog.pg_appendonly
+       where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass)
+    ) a
+  ) f,
+  (select datadir from gp_segment_configuration where content = -1) d;
+
+  return cmd;
+end
+$$ language plpgsql;
+
+checkpoint;
+
+-- Skip checkpoints on the coordinator
+select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = -1;
+
+-- Create tables in subtransactions
+begin;
+create table t_top(i int) distributed by (i);
+savepoint sp1;
+create table t_sub1(i int) distributed by (i);
+savepoint sp2;
+create table t_sub2(i int) distributed by (i);
+commit;
+
+-- Start transaction and create tables in it
+begin;
+select createTables() check_files
+\gset
+ 
+-- Make sure that the tables files exist on the coordinator and the standby
+:check_files
+
+-- Get segfault on the coordinator and reconnect after its restart
+select gp_inject_fault('exec_simple_query_start', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = -1;
+
+-- The error message from psql can be different, so ignore it
+\! psql postgres -c "select 1" 2> /dev/null
+-- Wait for the coordinator to be recovered
+\! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done;
+\c regression
+
+-- All the inject faults have been reset after the coordinator restart
+
+select force_mirrors_to_catch_up();
+
+-- Check that the tables files don't exist on the coordinator and the standby
+:check_files
+
+-- Check that the coordinator recovery didn't remove files of the tables which
+-- were created in subtransactions
+table t_sub1;
+table t_sub2;
+
+-- Clean up
+drop table t_top, t_sub1, t_sub2;
+\unset check_files
+
+
+-- Test case 2
+-- Check that files are left untouched on the coordinator and the standby
+-- when the corresponding distributed commit record exists in WAL
+select gp_inject_fault('dtm_xlog_distributed_commit', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = -1;
+
+-- Create tables in a transaction. Get segfault right after the distributed
+-- commit record is flushed
+\! psql regression -c "begin; select createTables(); commit;"
+-- Wait for the coordinator to be recovered
+\! while [ `psql -tc "select 1;" postgres 2>/dev/null | wc -l` != '2' ]; do sleep 1; done;
+\c regression
+
+select force_mirrors_to_catch_up();
+
+-- Check that all the tables and its indexes files exist
+select '\! ' ||
+       string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir) lswc
+  from (
+    select 'ls ' || string_agg(pg_relation_filepath(a.unnest), ' ')
+                 || ' 2>/dev/null | wc -l' lswc
+    from (
+      select unnest(array['t_orphaned_h'::regclass,
+                          't_orphaned_r'::regclass, 't_orphaned_r_i'::regclass,
+                          't_orphaned_c'::regclass, 't_orphaned_c_i'::regclass])
+      union all
+      select unnest(array[segrelid,
+                          blkdirrelid, blkdiridxid,
+                          visimaprelid, visimapidxid])
+        from pg_catalog.pg_appendonly
+       where relid in ('t_orphaned_r'::regclass, 't_orphaned_c'::regclass)
+    ) a
+  ) f,
+  (select datadir from gp_segment_configuration where content = -1) d
+\gset
+
+:lswc
+
+-- Check that we can read data from the tables
+table t_orphaned_h;
+table t_orphaned_r;
+table t_orphaned_c;
+
+-- Clean up
+drop table t_orphaned_h, t_orphaned_r, t_orphaned_c;
+drop function createTables();
+
+
+-- Test case 3
+-- Check that orphaned files are not left on segments when the files are
+-- created after checkpoint
+
+create or replace function getTableSegFiles
+(t regclass, out gp_contentid smallint, out filepath text)
+as 'select current_setting(''gp_contentid'')::smallint, pg_relation_filepath(t)'
+language sql
+execute on all segments;
+
+-- Get list of the tables file names on each segment
+create or replace function createTables() returns text as
+$$
+declare
+  cmd text;
+begin
+  -- Minimal fillfactor to minimize rows number for creating second main fork file
+  create table t_orphaned_h(i int)
+  with (fillfactor=10)
+  distributed by (i);
+  -- Create the .1 file. Separate insert to create FSM. 
+  insert into t_orphaned_h select generate_series(1,9000000);
+
+  create table t_orphaned_r(i int)
+  with (appendonly=true, orientation=row)
+  distributed by (i);
+  -- Create the .1 file
+  insert into t_orphaned_r select generate_series(1,100);
+
+  -- Create the .128 file
+  create table t_orphaned_c
+  with (appendonly=true, orientation=column) as
+  select i as i, i*2 as j from generate_series(1,100) i
+  distributed by (i);
+  -- Create the .1 and .129 files
+  insert into t_orphaned_c
+  select i as i, i*2 as j from generate_series(1,100) i;
+
+  -- Ensure that the mirrors have applied the filesystem changes
+  perform force_mirrors_to_catch_up();
+
+  -- The command do not output PGDATA directories to make it possible to run
+  -- the test without docker
+  select '\! ' ||
+         string_agg('cd ' || datadir || '&&' || lswc, ';' order by datadir)
+  into cmd
+  from (
+    select gp_contentid,
+           'ls ' || string_agg(f, ' ') || ' 2>/dev/null | wc -l' lswc
+    from (
+      select gp_contentid, filepath || suf f
+        from getTableSegFiles('t_orphaned_h'),
+             (values(''), ('.1'), ('_fsm')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_r'),
+             (values(''), ('.1')) v(suf)
+      union all
+      select gp_contentid, filepath || suf
+        from getTableSegFiles('t_orphaned_c'),
+             (values(''), ('.1'), ('.128'), ('.129')) v(suf)
+    ) a
+    group by gp_contentid
+  ) f,
+  (select content, datadir from gp_segment_configuration where content > -1) d
+  where f.gp_contentid = d.content;
+
+  return cmd;
+end
+$$ language plpgsql;
+
+-- Test case 3.1
+-- Segfault on all segments
+checkpoint;
+
+-- Skip checkpoints
+select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content > -1;
+
+-- Create tables in subtransactions
+begin;
+create table t_top(i int) distributed by (i);
+savepoint sp1;
+create table t_sub1(i int) distributed by (i);
+savepoint sp2;
+create table t_sub2(i int) distributed by (i);
+commit;
+
+-- Start transaction and create tables in it
+begin;
+select createTables() check_files
+\gset
+
+-- Make sure that all the tables files exist on the segments
+:check_files
+
+-- Get segfault on all segments
+select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content != -1;
+
+select 1 from gp_dist_random('gp_id');
+
+-- Rollback the transaction to make it possible to run queries after the error
+rollback;
+
+select force_mirrors_to_catch_up();
+
+-- Check that the tables files don't exist on the segments
+:check_files
+
+-- Check that the segments recovery didn't remove files of the tables which
+-- were created in subtransactions
+table t_sub1;
+table t_sub2;
+
+-- Clean up
+drop table t_top, t_sub1, t_sub2;
+
+
+-- Test case 3.2
+-- Segfault on one segment
+checkpoint;
+
+-- Skip checkpoints
+select gp_inject_fault_infinite('checkpoint', 'skip', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content > -1;
+
+-- Create tables in subtransactions
+begin;
+create table t_top(i int) distributed by (i);
+savepoint sp1;
+create table t_sub1(i int) distributed by (i);
+savepoint sp2;
+create table t_sub2(i int) distributed by (i);
+commit;
+
+-- Start transaction and create tables in it
+begin;
+select createTables() check_files
+\gset
+
+-- Make sure that all the tables files exist on the segments
+:check_files
+
+-- Get segfault on a segment
+select gp_inject_fault('qe_exec_finished', 'segv', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content = 1;
+
+select 1 from gp_dist_random('gp_id');
+
+-- Rollback the transaction to make it possible to run queries after the error
+rollback;
+
+select force_mirrors_to_catch_up();
+
+-- Make a checkpoint to remove orphaned files from segments where segfault did
+-- not happen
+select gp_inject_fault_infinite('checkpoint', 'reset', dbid)
+  from gp_segment_configuration
+ where role = 'p' and content > -1;
+checkpoint;
+
+-- Check that the tables files don't exist on the segments
+:check_files
+
+-- Check that the segment recovery didn't remove files of the tables which
+-- were created in subtransactions
+table t_sub1;
+table t_sub2;
+
+
+-- Clean up
+\unset check_files
+drop table t_top, t_sub1, t_sub2;
+drop function createTables();
+drop function getTableSegFiles(t regclass, out gp_contentid smallint, out filepath text);
+-- start_ignore
+\! gpconfig -r gp_gang_creation_retry_timer --skipvalidation --masteronly
+\! gpconfig -r gp_gang_creation_retry_count --skipvalidation --masteronly
+\! gpstop -u
+-- end_ignore
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 760e74516117..684d8092aceb 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -637,6 +637,13 @@ Form_pg_ts_template
 Form_pg_type
 Form_pg_user_mapping
 FormatNode
+FreePageBtree
+FreePageBtreeHeader
+FreePageBtreeInternalKey
+FreePageBtreeLeafKey
+FreePageBtreeSearchResult
+FreePageManager
+FreePageSpanLeader
 FromCharDateMode
 FromExpr
 FuncCall
@@ -2132,6 +2139,12 @@ dlist_iter
 dlist_mutable_iter
 dlist_node
 ds_state
+dsa_area
+dsa_area_control
+dsa_area_pool
+dsa_area_span
+dsa_segment_header
+dsa_segment_map
 dsm_control_header
 dsm_control_item
 dsm_handle