path: root/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr
author    vvvv <vvvv@yandex-team.com>  2024-11-07 12:29:36 +0300
committer vvvv <vvvv@yandex-team.com>  2024-11-07 13:49:47 +0300
commit    d4c258e9431675bab6745c8638df6e3dfd4dca6b (patch)
tree      b5efcfa11351152a4c872fccaea35749141c0b11 /yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr
parent    13a4f274caef5cfdaf0263b24e4d6bdd5521472b (diff)
download  ydb-d4c258e9431675bab6745c8638df6e3dfd4dca6b.tar.gz
Moved other yql/essentials libs YQL-19206
init commit_hash:7d4c435602078407bbf20dd3c32f9c90d2bbcbc0
Diffstat (limited to 'yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr')
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/alignedalloc.c   154
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/aset.c          1659
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/dsa.c           2331
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/freepage.c      1886
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/generation.c    1134
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/mcxt.c          1732
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/memdebug.c        93
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/portalmem.c     1291
-rw-r--r--  yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/slab.c          1097
9 files changed, 11377 insertions, 0 deletions
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/alignedalloc.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/alignedalloc.c
new file mode 100644
index 00000000000..627e988852b
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/alignedalloc.c
@@ -0,0 +1,154 @@
+/*-------------------------------------------------------------------------
+ *
+ * alignedalloc.c
+ * Allocator functions to implement palloc_aligned
+ *
+ * This is not a fully-fledged MemoryContext type as there is no means to
+ * create a MemoryContext of this type. The code here only serves to allow
+ * operations such as pfree() and repalloc() to work correctly on a memory
+ * chunk that was allocated by palloc_aligned().
+ *
+ * Portions Copyright (c) 2022-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/alignedalloc.c
+ *
+ *-------------------------------------------------------------------------
+ */
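/*
 * Illustrative layout sketch (an annotation, not part of the file itself):
 * the functions below assume the arrangement that MemoryContextAllocAligned()
 * produces, roughly
 *
 *    unaligned start of the underlying allocation
 *    [ padding, fewer than 'alignto' bytes              ]
 *    [ redirection MemoryChunk: value = alignto,        ]
 *    [   block = pointer to the unaligned allocation    ]
 *    [ aligned user data  <- result of palloc_aligned() ]
 *
 * The field usage shown here is inferred from the code below; mcxt.c is the
 * authoritative source for the actual layout.
 */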
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils_memorychunk.h"
+
+/*
+ * AlignedAllocFree
+ * Frees allocated memory; memory is removed from its owning context.
+ */
+void
+AlignedAllocFree(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ void *unaligned;
+
+ VALGRIND_MAKE_MEM_DEFINED(chunk, sizeof(MemoryChunk));
+
+ Assert(!MemoryChunkIsExternal(chunk));
+
+ /* obtain the original (unaligned) allocated pointer */
+ unaligned = MemoryChunkGetBlock(chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ GetMemoryChunkContext(unaligned)->name, chunk);
+#endif
+
+ pfree(unaligned);
+}
+
+/*
+ * AlignedAllocRealloc
+ * Change the allocated size of a chunk and return possibly a different
+ * pointer to a memory address aligned to the same boundary as the
+ * originally requested alignment. The contents of 'pointer' will be
+ * copied into the returned pointer up until 'size'. Any additional
+ * memory will be uninitialized.
+ */
+void *
+AlignedAllocRealloc(void *pointer, Size size)
+{
+ MemoryChunk *redirchunk = PointerGetMemoryChunk(pointer);
+ Size alignto;
+ void *unaligned;
+ MemoryContext ctx;
+ Size old_size;
+ void *newptr;
+
+ VALGRIND_MAKE_MEM_DEFINED(redirchunk, sizeof(MemoryChunk));
+
+ alignto = MemoryChunkGetValue(redirchunk);
+ unaligned = MemoryChunkGetBlock(redirchunk);
+
+ /* sanity check this is a power of 2 value */
+ Assert((alignto & (alignto - 1)) == 0);
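	/*
	 * Illustrative example of the bit trick above (not an upstream comment):
	 * for alignto = 64, 0b1000000 & 0b0111111 == 0, so the Assert holds;
	 * for a non-power-of-2 such as 48, 0b110000 & 0b101111 == 0b100000 != 0.
	 */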
+
+ /*
+ * Determine the size of the original allocation. We can't determine this
+ * exactly as GetMemoryChunkSpace() returns the total space used for the
+ * allocation, which for contexts like aset includes rounding up to the
+ * next power of 2. However, this value is just used to memcpy() the old
+ * data into the new allocation, so we only need to concern ourselves with
+ * not reading beyond the end of the original allocation's memory. The
+ * drawback here is that we may copy more bytes than we need to, which
+ * only amounts to wasted effort. We can safely subtract the extra bytes
+ * that we requested to allow us to align the pointer. We must also
+ * subtract the space for the unaligned pointer's MemoryChunk since
+ * GetMemoryChunkSpace should have included that. This does assume that
+ * all context types use MemoryChunk as a chunk header.
+ */
+ old_size = GetMemoryChunkSpace(unaligned) -
+ PallocAlignedExtraBytes(alignto) - sizeof(MemoryChunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* check that GetMemoryChunkSpace returned something realistic */
+ Assert(old_size >= redirchunk->requested_size);
+#endif
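	/*
	 * Hypothetical walk-through of the computation above: for a
	 * palloc_aligned(100, 64) request served by an aset context, the
	 * underlying chunk was sized for 100 bytes plus the alignment padding
	 * and redirection header, and GetMemoryChunkSpace() reports that total
	 * rounded up to the context's chunk geometry.  Subtracting
	 * PallocAlignedExtraBytes(64) and sizeof(MemoryChunk) therefore leaves
	 * old_size at least as large as the 100 requested bytes, which is all
	 * the memcpy() below needs.
	 */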
+
+ ctx = GetMemoryChunkContext(unaligned);
+ newptr = MemoryContextAllocAligned(ctx, size, alignto, 0);
+
+ /*
+ * We may memcpy beyond the end of the original allocation request size,
+ * so we must mark the entire allocation as defined.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(pointer, old_size);
+ memcpy(newptr, pointer, Min(size, old_size));
+ pfree(unaligned);
+
+ return newptr;
+}
+
+/*
+ * AlignedAllocGetChunkContext
+ * Return the MemoryContext that 'pointer' belongs to.
+ */
+MemoryContext
+AlignedAllocGetChunkContext(void *pointer)
+{
+ MemoryChunk *redirchunk = PointerGetMemoryChunk(pointer);
+ MemoryContext cxt;
+
+ VALGRIND_MAKE_MEM_DEFINED(redirchunk, sizeof(MemoryChunk));
+
+ Assert(!MemoryChunkIsExternal(redirchunk));
+
+ cxt = GetMemoryChunkContext(MemoryChunkGetBlock(redirchunk));
+
+ VALGRIND_MAKE_MEM_NOACCESS(redirchunk, sizeof(MemoryChunk));
+
+ return cxt;
+}
+
+/*
+ * AlignedAllocGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+Size
+AlignedAllocGetChunkSpace(void *pointer)
+{
+ MemoryChunk *redirchunk = PointerGetMemoryChunk(pointer);
+ void *unaligned;
+ Size space;
+
+ VALGRIND_MAKE_MEM_DEFINED(redirchunk, sizeof(MemoryChunk));
+
+ unaligned = MemoryChunkGetBlock(redirchunk);
+ space = GetMemoryChunkSpace(unaligned);
+
+ VALGRIND_MAKE_MEM_NOACCESS(redirchunk, sizeof(MemoryChunk));
+
+ return space;
+}
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/aset.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/aset.c
new file mode 100644
index 00000000000..fa39038a388
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/aset.c
@@ -0,0 +1,1659 @@
+/*-------------------------------------------------------------------------
+ *
+ * aset.c
+ * Allocation set definitions.
+ *
+ * AllocSet is our standard implementation of the abstract MemoryContext
+ * type.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/aset.c
+ *
+ * NOTE:
+ * This is a new (Feb. 05, 1999) implementation of the allocation set
+ * routines. AllocSet...() does not use OrderedSet...() any more.
+ * Instead it manages allocations in a block pool by itself, combining
+ * many small allocations in a few bigger blocks. AllocSetFree() normally
+ * doesn't free() memory really. It just adds the free'd area to some
+ * list for later reuse by AllocSetAlloc(). All memory blocks are free()'d
+ * at once on AllocSetReset(), which happens when the memory context gets
+ * destroyed.
+ * Jan Wieck
+ *
+ * Performance improvement from Tom Lane, 8/99: for extremely large request
+ * sizes, we do want to be able to give the memory back to free() as soon
+ * as it is pfree()'d. Otherwise we risk tying up a lot of memory in
+ * freelist entries that might never be usable. This is specially needed
+ * when the caller is repeatedly repalloc()'ing a block bigger and bigger;
+ * the previous instances of the block were guaranteed to be wasted until
+ * AllocSetReset() under the old way.
+ *
+ * Further improvement 12/00: as the code stood, request sizes in the
+ * midrange between "small" and "large" were handled very inefficiently,
+ * because any sufficiently large free chunk would be used to satisfy a
+ * request, even if it was much larger than necessary. This led to more
+ * and more wasted space in allocated chunks over time. To fix, get rid
+ * of the midrange behavior: we now handle only "small" power-of-2-size
+ * chunks as chunks. Anything "large" is passed off to malloc(). Change
+ * the number of freelists to change the small/large boundary.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/pg_bitutils.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/memutils_memorychunk.h"
+#include "utils/memutils_internal.h"
+
+/*--------------------
+ * Chunk freelist k holds chunks of size 1 << (k + ALLOC_MINBITS),
+ * for k = 0 .. ALLOCSET_NUM_FREELISTS-1.
+ *
+ * Note that all chunks in the freelists have power-of-2 sizes. This
+ * improves recyclability: we may waste some space, but the wasted space
+ * should stay pretty constant as requests are made and released.
+ *
+ * A request too large for the last freelist is handled by allocating a
+ * dedicated block from malloc(). The block still has a block header and
+ * chunk header, but when the chunk is freed we'll return the whole block
+ * to malloc(), not put it on our freelists.
+ *
+ * CAUTION: ALLOC_MINBITS must be large enough so that
+ * 1<<ALLOC_MINBITS is at least MAXALIGN,
+ * or we may fail to align the smallest chunks adequately.
+ * 8-byte alignment is enough on all currently known machines. This 8-byte
+ * minimum also allows us to store a pointer to the next freelist item within
+ * the chunk of memory itself.
+ *
+ * With the current parameters, request sizes up to 8K are treated as chunks,
+ * larger requests go into dedicated blocks. Change ALLOCSET_NUM_FREELISTS
+ * to adjust the boundary point; and adjust ALLOCSET_SEPARATE_THRESHOLD in
+ * memutils.h to agree. (Note: in contexts with small maxBlockSize, we may
+ * set the allocChunkLimit to less than 8K, so as to avoid space wastage.)
+ *--------------------
+ */
+
+#define ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */
+#define ALLOCSET_NUM_FREELISTS 11
+#define ALLOC_CHUNK_LIMIT (1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS))
+/* Size of largest chunk that we use a fixed size for */
+#define ALLOC_CHUNK_FRACTION 4
+/* We allow chunks to be at most 1/4 of maxBlockSize (less overhead) */
+
+/*--------------------
+ * The first block allocated for an allocset has size initBlockSize.
+ * Each time we have to allocate another block, we double the block size
+ * (if possible, and without exceeding maxBlockSize), so as to reduce
+ * the bookkeeping load on malloc().
+ *
+ * Blocks allocated to hold oversize chunks do not follow this rule, however;
+ * they are just however big they need to be to hold that single chunk.
+ *
+ * Also, if a minContextSize is specified, the first block has that size,
+ * and then initBlockSize is used for the next one.
+ *--------------------
+ */
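/*
 * For instance (illustrative numbers, not from the upstream comments): a
 * context created with ALLOCSET_DEFAULT_SIZES starts with an 8 kB block and
 * doubles through 16 kB, 32 kB, ... until it reaches the 8 MB maxBlockSize,
 * after which every further regular block is 8 MB.
 */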
+
+#define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData))
+#define ALLOC_CHUNKHDRSZ sizeof(MemoryChunk)
+
+typedef struct AllocBlockData *AllocBlock; /* forward reference */
+
+/*
+ * AllocPointer
+ * Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *AllocPointer;
+
+/*
+ * AllocFreeListLink
+ * When pfreeing memory, if we maintain a freelist for the given chunk's
+ * size then we use a AllocFreeListLink to point to the current item in
+ * the AllocSetContext's freelist and then set the given freelist element
+ * to point to the chunk being freed.
+ */
+typedef struct AllocFreeListLink
+{
+ MemoryChunk *next;
+} AllocFreeListLink;
+
+/*
+ * Obtain a AllocFreeListLink for the given chunk. Allocation sizes are
+ * always at least sizeof(AllocFreeListLink), so we reuse the pointer's memory
+ * itself to store the freelist link.
+ */
+#define GetFreeListLink(chkptr) \
+ (AllocFreeListLink *) ((char *) (chkptr) + ALLOC_CHUNKHDRSZ)
+
+/* Validate a freelist index retrieved from a chunk header */
+#define FreeListIdxIsValid(fidx) \
+ ((fidx) >= 0 && (fidx) < ALLOCSET_NUM_FREELISTS)
+
+/* Determine the size of the chunk based on the freelist index */
+#define GetChunkSizeFromFreeListIdx(fidx) \
+ ((((Size) 1) << ALLOC_MINBITS) << (fidx))
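/*
 * Mapping implied by the two macros above, with ALLOC_MINBITS = 3 and
 * ALLOCSET_NUM_FREELISTS = 11 (shown here for illustration only):
 *
 *   fidx:        0   1   2   3    4    5    6     7     8     9    10
 *   chunk size:  8  16  32  64  128  256  512  1024  2048  4096  8192
 *
 * so the largest freelist-managed chunk is ALLOC_CHUNK_LIMIT, 8192 bytes.
 */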
+
+/*
+ * AllocSetContext is our standard implementation of MemoryContext.
+ *
+ * Note: header.isReset means there is nothing for AllocSetReset to do.
+ * This is different from the aset being physically empty (empty blocks list)
+ * because we will still have a keeper block. It's also different from the set
+ * being logically empty, because we don't attempt to detect pfree'ing the
+ * last active chunk.
+ */
+typedef struct AllocSetContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+ /* Info about storage allocated in this context: */
+ AllocBlock blocks; /* head of list of blocks in this set */
+ MemoryChunk *freelist[ALLOCSET_NUM_FREELISTS]; /* free chunk lists */
+ /* Allocation parameters for this context: */
+ Size initBlockSize; /* initial block size */
+ Size maxBlockSize; /* maximum block size */
+ Size nextBlockSize; /* next block size to allocate */
+ Size allocChunkLimit; /* effective chunk size limit */
+ AllocBlock keeper; /* keep this block over resets */
+ /* freelist this context could be put in, or -1 if not a candidate: */
+ int freeListIndex; /* index in context_freelists[], or -1 */
+} AllocSetContext;
+
+typedef AllocSetContext *AllocSet;
+
+/*
+ * AllocBlock
+ * An AllocBlock is the unit of memory that is obtained by aset.c
+ * from malloc(). It contains one or more MemoryChunks, which are
+ * the units requested by palloc() and freed by pfree(). MemoryChunks
+ * cannot be returned to malloc() individually, instead they are put
+ * on freelists by pfree() and re-used by the next palloc() that has
+ * a matching request size.
+ *
+ * AllocBlockData is the header data for a block --- the usable space
+ * within the block begins at the next alignment boundary.
+ */
+typedef struct AllocBlockData
+{
+ AllocSet aset; /* aset that owns this block */
+ AllocBlock prev; /* prev block in aset's blocks list, if any */
+ AllocBlock next; /* next block in aset's blocks list, if any */
+ char *freeptr; /* start of free space in this block */
+ char *endptr; /* end of space in this block */
+} AllocBlockData;
+
+/*
+ * AllocPointerIsValid
+ * True iff pointer is valid allocation pointer.
+ */
+#define AllocPointerIsValid(pointer) PointerIsValid(pointer)
+
+/*
+ * AllocSetIsValid
+ * True iff set is valid allocation set.
+ */
+#define AllocSetIsValid(set) \
+ (PointerIsValid(set) && IsA(set, AllocSetContext))
+
+/*
+ * AllocBlockIsValid
+ * True iff block is valid block of allocation set.
+ */
+#define AllocBlockIsValid(block) \
+ (PointerIsValid(block) && AllocSetIsValid((block)->aset))
+
+/*
+ * We always store external chunks on a dedicated block. This makes fetching
+ * the block from an external chunk easy since it's always the first and only
+ * chunk on the block.
+ */
+#define ExternalChunkGetBlock(chunk) \
+ (AllocBlock) ((char *) chunk - ALLOC_BLOCKHDRSZ)
+
+/*
+ * Rather than repeatedly creating and deleting memory contexts, we keep some
+ * freed contexts in freelists so that we can hand them out again with little
+ * work. Before putting a context in a freelist, we reset it so that it has
+ * only its initial malloc chunk and no others. To be a candidate for a
+ * freelist, a context must have the same minContextSize/initBlockSize as
+ * other contexts in the list; but its maxBlockSize is irrelevant since that
+ * doesn't affect the size of the initial chunk.
+ *
+ * We currently provide one freelist for ALLOCSET_DEFAULT_SIZES contexts
+ * and one for ALLOCSET_SMALL_SIZES contexts; the latter works for
+ * ALLOCSET_START_SMALL_SIZES too, since only the maxBlockSize differs.
+ *
+ * Ordinarily, we re-use freelist contexts in last-in-first-out order, in
+ * hopes of improving locality of reference. But if there get to be too
+ * many contexts in the list, we'd prefer to drop the most-recently-created
+ * contexts in hopes of keeping the process memory map compact.
+ * We approximate that by simply deleting all existing entries when the list
+ * overflows, on the assumption that queries that allocate a lot of contexts
+ * will probably free them in more or less reverse order of allocation.
+ *
+ * Contexts in a freelist are chained via their nextchild pointers.
+ */
+#define MAX_FREE_CONTEXTS 100 /* arbitrary limit on freelist length */
+
+typedef struct AllocSetFreeList
+{
+ int num_free; /* current list length */
+ AllocSetContext *first_free; /* list header */
+} AllocSetFreeList;
+
+/* context_freelists[0] is for default params, [1] for small params */
+static __thread AllocSetFreeList context_freelists[2] =
+{
+ {
+ 0, NULL
+ },
+ {
+ 0, NULL
+ }
+};
+
+
+/* ----------
+ * AllocSetFreeIndex -
+ *
+ * Depending on the size of an allocation compute which freechunk
+ * list of the alloc set it belongs to. Caller must have verified
+ * that size <= ALLOC_CHUNK_LIMIT.
+ * ----------
+ */
+static inline int
+AllocSetFreeIndex(Size size)
+{
+ int idx;
+
+ if (size > (1 << ALLOC_MINBITS))
+ {
+ /*----------
+ * At this point we must compute ceil(log2(size >> ALLOC_MINBITS)).
+ * This is the same as
+ * pg_leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1
+ * or equivalently
+ * pg_leftmost_one_pos32(size - 1) - ALLOC_MINBITS + 1
+ *
+ * However, for platforms without intrinsic support, we duplicate the
+ * logic here, allowing an additional optimization. It's reasonable
+ * to assume that ALLOC_CHUNK_LIMIT fits in 16 bits, so we can unroll
+ * the byte-at-a-time loop in pg_leftmost_one_pos32 and just handle
+ * the last two bytes.
+ *
+ * Yes, this function is enough of a hot-spot to make it worth this
+ * much trouble.
+ *----------
+ */
+#ifdef HAVE_BITSCAN_REVERSE
+ idx = pg_leftmost_one_pos32((uint32) size - 1) - ALLOC_MINBITS + 1;
+#else
+ uint32 t,
+ tsize;
+
+ /* Statically assert that we only have a 16-bit input value. */
+ StaticAssertDecl(ALLOC_CHUNK_LIMIT < (1 << 16),
+ "ALLOC_CHUNK_LIMIT must be less than 64kB");
+
+ tsize = size - 1;
+ t = tsize >> 8;
+ idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize];
+ idx -= ALLOC_MINBITS - 1;
+#endif
+
+ Assert(idx < ALLOCSET_NUM_FREELISTS);
+ }
+ else
+ idx = 0;
+
+ return idx;
+}
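/*
 * Worked example (an annotation, not part of the upstream file): for
 * size = 24, size - 1 = 23 = 0b10111, whose leftmost set bit is at
 * position 4, so idx = 4 - ALLOC_MINBITS + 1 = 2 and the request is
 * served from the 32-byte freelist.  Any size <= 8 maps to idx = 0.
 * A runnable standalone sketch of this arithmetic follows the end of
 * this file's diff.
 */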
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * AllocSetContextCreateInternal
+ * Create a new AllocSet context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (must be statically allocated)
+ * minContextSize: minimum context size
+ * initBlockSize: initial allocation block size
+ * maxBlockSize: maximum allocation block size
+ *
+ * Most callers should abstract the context size parameters using a macro
+ * such as ALLOCSET_DEFAULT_SIZES.
+ *
+ * Note: don't call this directly; go through the wrapper macro
+ * AllocSetContextCreate.
+ */
+MemoryContext
+AllocSetContextCreateInternal(MemoryContext parent,
+ const char *name,
+ Size minContextSize,
+ Size initBlockSize,
+ Size maxBlockSize)
+{
+ int freeListIndex;
+ Size firstBlockSize;
+ AllocSet set;
+ AllocBlock block;
+
+ /* ensure MemoryChunk's size is properly maxaligned */
+ StaticAssertDecl(ALLOC_CHUNKHDRSZ == MAXALIGN(ALLOC_CHUNKHDRSZ),
+ "sizeof(MemoryChunk) is not maxaligned");
+ /* check we have enough space to store the freelist link */
+ StaticAssertDecl(sizeof(AllocFreeListLink) <= (1 << ALLOC_MINBITS),
+ "sizeof(AllocFreeListLink) larger than minimum allocation size");
+
+ /*
+ * First, validate allocation parameters. Once these were regular runtime
+ * tests and elog's, but in practice Asserts seem sufficient because
+ * nobody varies their parameters at runtime. We somewhat arbitrarily
+ * enforce a minimum 1K block size. We restrict the maximum block size to
+ * MEMORYCHUNK_MAX_BLOCKOFFSET as MemoryChunks are limited to this in
+ * regards to addressing the offset between the chunk and the block that
+ * the chunk is stored on. We would be unable to store the offset between
+ * the chunk and block for any chunks that were beyond
+ * MEMORYCHUNK_MAX_BLOCKOFFSET bytes into the block if the block was to be
+ * larger than this.
+ */
+ Assert(initBlockSize == MAXALIGN(initBlockSize) &&
+ initBlockSize >= 1024);
+ Assert(maxBlockSize == MAXALIGN(maxBlockSize) &&
+ maxBlockSize >= initBlockSize &&
+ AllocHugeSizeIsValid(maxBlockSize)); /* must be safe to double */
+ Assert(minContextSize == 0 ||
+ (minContextSize == MAXALIGN(minContextSize) &&
+ minContextSize >= 1024 &&
+ minContextSize <= maxBlockSize));
+ Assert(maxBlockSize <= MEMORYCHUNK_MAX_BLOCKOFFSET);
+
+ /*
+ * Check whether the parameters match either available freelist. We do
+ * not need to demand a match of maxBlockSize.
+ */
+ if (minContextSize == ALLOCSET_DEFAULT_MINSIZE &&
+ initBlockSize == ALLOCSET_DEFAULT_INITSIZE)
+ freeListIndex = 0;
+ else if (minContextSize == ALLOCSET_SMALL_MINSIZE &&
+ initBlockSize == ALLOCSET_SMALL_INITSIZE)
+ freeListIndex = 1;
+ else
+ freeListIndex = -1;
+
+ freeListIndex = -1;
+
+ /*
+ * If a suitable freelist entry exists, just recycle that context.
+ */
+ if (freeListIndex >= 0)
+ {
+ AllocSetFreeList *freelist = &context_freelists[freeListIndex];
+
+ if (freelist->first_free != NULL)
+ {
+ /* Remove entry from freelist */
+ set = freelist->first_free;
+ freelist->first_free = (AllocSet) set->header.nextchild;
+ freelist->num_free--;
+
+ /* Update its maxBlockSize; everything else should be OK */
+ set->maxBlockSize = maxBlockSize;
+
+ /* Reinitialize its header, installing correct name and parent */
+ MemoryContextCreate((MemoryContext) set,
+ T_AllocSetContext,
+ MCTX_ASET_ID,
+ parent,
+ name);
+
+ ((MemoryContext) set)->mem_allocated =
+ set->keeper->endptr - ((char *) set);
+
+ return (MemoryContext) set;
+ }
+ }
+
+ /* Determine size of initial block */
+ firstBlockSize = MAXALIGN(sizeof(AllocSetContext)) +
+ ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ if (minContextSize != 0)
+ firstBlockSize = Max(firstBlockSize, minContextSize);
+ else
+ firstBlockSize = Max(firstBlockSize, initBlockSize);
+
+ /*
+ * Allocate the initial block. Unlike other aset.c blocks, it starts with
+ * the context header and its block header follows that.
+ */
+ set = (AllocSet) malloc(firstBlockSize);
+ if (set == NULL)
+ {
+ if (TopMemoryContext)
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while creating memory context \"%s\".",
+ name)));
+ }
+
+ /*
+ * Avoid writing code that can fail between here and MemoryContextCreate;
+ * we'd leak the header/initial block if we ereport in this stretch.
+ */
+
+ /* Fill in the initial block's block header */
+ block = (AllocBlock) (((char *) set) + MAXALIGN(sizeof(AllocSetContext)));
+ block->aset = set;
+ block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ block->endptr = ((char *) set) + firstBlockSize;
+ block->prev = NULL;
+ block->next = NULL;
+
+ /* Mark unallocated space NOACCESS; leave the block header alone. */
+ VALGRIND_MAKE_MEM_NOACCESS(block->freeptr, block->endptr - block->freeptr);
+
+ /* Remember block as part of block list */
+ set->blocks = block;
+ /* Mark block as not to be released at reset time */
+ set->keeper = block;
+
+ /* Finish filling in aset-specific parts of the context header */
+ MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+
+ set->initBlockSize = initBlockSize;
+ set->maxBlockSize = maxBlockSize;
+ set->nextBlockSize = initBlockSize;
+ set->freeListIndex = freeListIndex;
+
+ /*
+ * Compute the allocation chunk size limit for this context. It can't be
+ * more than ALLOC_CHUNK_LIMIT because of the fixed number of freelists.
+ * If maxBlockSize is small then requests exceeding the maxBlockSize, or
+ * even a significant fraction of it, should be treated as large chunks
+ * too. For the typical case of maxBlockSize a power of 2, the chunk size
+ * limit will be at most 1/8th maxBlockSize, so that given a stream of
+ * requests that are all the maximum chunk size we will waste at most
+ * 1/8th of the allocated space.
+ *
+ * Also, allocChunkLimit must not exceed ALLOCSET_SEPARATE_THRESHOLD.
+ */
+ StaticAssertStmt(ALLOC_CHUNK_LIMIT == ALLOCSET_SEPARATE_THRESHOLD,
+ "ALLOC_CHUNK_LIMIT != ALLOCSET_SEPARATE_THRESHOLD");
+
+ /*
+ * Determine the maximum size that a chunk can be before we allocate an
+ * entire AllocBlock dedicated for that chunk. We set the absolute limit
+ * of that size as ALLOC_CHUNK_LIMIT but we reduce it further so that we
+ * can fit about ALLOC_CHUNK_FRACTION chunks this size on a maximally
+ * sized block. (We opt to keep allocChunkLimit a power-of-2 value
+ * primarily for legacy reasons rather than calculating it so that exactly
+ * ALLOC_CHUNK_FRACTION chunks fit on a maximally sized block.)
+ */
+ set->allocChunkLimit = ALLOC_CHUNK_LIMIT;
+ while ((Size) (set->allocChunkLimit + ALLOC_CHUNKHDRSZ) >
+ (Size) ((maxBlockSize - ALLOC_BLOCKHDRSZ) / ALLOC_CHUNK_FRACTION))
+ set->allocChunkLimit >>= 1;
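	/*
	 * Illustrative outcome of the loop above: with the default 8 MB
	 * maxBlockSize, 8192 + ALLOC_CHUNKHDRSZ is far below a quarter of a
	 * block, so allocChunkLimit stays at ALLOC_CHUNK_LIMIT (8 kB).  With
	 * an 8 kB maxBlockSize, as used for small contexts, the limit is
	 * halved repeatedly and ends up at roughly 1 kB, so that about four
	 * such chunks still fit on one maximally sized block.
	 */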
+
+ /* Finally, do the type-independent part of context creation */
+ MemoryContextCreate((MemoryContext) set,
+ T_AllocSetContext,
+ MCTX_ASET_ID,
+ parent,
+ name);
+
+ ((MemoryContext) set)->mem_allocated = firstBlockSize;
+
+ return (MemoryContext) set;
+}
+
+/*
+ * AllocSetReset
+ * Frees all memory which is allocated in the given set.
+ *
+ * Actually, this routine has some discretion about what to do.
+ * It should mark all allocated chunks freed, but it need not necessarily
+ * give back all the resources the set owns. Our actual implementation is
+ * that we give back all but the "keeper" block (which we must keep, since
+ * it shares a malloc chunk with the context header). In this way, we don't
+ * thrash malloc() when a context is repeatedly reset after small allocations,
+ * which is typical behavior for per-tuple contexts.
+ */
+void
+AllocSetReset(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block;
+ Size keepersize PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(AllocSetIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ AllocSetCheck(context);
+#endif
+
+ /* Remember keeper block size for Assert below */
+ keepersize = set->keeper->endptr - ((char *) set);
+
+ /* Clear chunk freelists */
+ MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+
+ block = set->blocks;
+
+ /* New blocks list will be just the keeper block */
+ set->blocks = set->keeper;
+
+ while (block != NULL)
+ {
+ AllocBlock next = block->next;
+
+ if (block == set->keeper)
+ {
+ /* Reset the block, but don't return it to malloc */
+ char *datastart = ((char *) block) + ALLOC_BLOCKHDRSZ;
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(datastart, block->freeptr - datastart);
+#else
+ /* wipe_mem() would have done this */
+ VALGRIND_MAKE_MEM_NOACCESS(datastart, block->freeptr - datastart);
+#endif
+ block->freeptr = datastart;
+ block->prev = NULL;
+ block->next = NULL;
+ }
+ else
+ {
+ /* Normal case, release the block */
+ context->mem_allocated -= block->endptr - ((char *) block);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->freeptr - ((char *) block));
+#endif
+ free(block);
+ }
+ block = next;
+ }
+
+ Assert(context->mem_allocated == keepersize);
+
+ /* Reset block size allocation sequence, too */
+ set->nextBlockSize = set->initBlockSize;
+}
+
+/*
+ * AllocSetDelete
+ * Frees all memory which is allocated in the given set,
+ * in preparation for deletion of the set.
+ *
+ * Unlike AllocSetReset, this *must* free all resources of the set.
+ */
+void
+AllocSetDelete(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block = set->blocks;
+ Size keepersize PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(AllocSetIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ AllocSetCheck(context);
+#endif
+
+ /* Remember keeper block size for Assert below */
+ keepersize = set->keeper->endptr - ((char *) set);
+
+ /*
+ * If the context is a candidate for a freelist, put it into that freelist
+ * instead of destroying it.
+ */
+ if (set->freeListIndex >= 0)
+ {
+ AllocSetFreeList *freelist = &context_freelists[set->freeListIndex];
+
+ /*
+ * Reset the context, if it needs it, so that we aren't hanging on to
+ * more than the initial malloc chunk.
+ */
+ if (!context->isReset)
+ MemoryContextResetOnly(context);
+
+ /*
+ * If the freelist is full, just discard what's already in it. See
+ * comments with context_freelists[].
+ */
+ if (freelist->num_free >= MAX_FREE_CONTEXTS)
+ {
+ while (freelist->first_free != NULL)
+ {
+ AllocSetContext *oldset = freelist->first_free;
+
+ freelist->first_free = (AllocSetContext *) oldset->header.nextchild;
+ freelist->num_free--;
+
+ /* All that remains is to free the header/initial block */
+ free(oldset);
+ }
+ Assert(freelist->num_free == 0);
+ }
+
+ /* Now add the just-deleted context to the freelist. */
+ set->header.nextchild = (MemoryContext) freelist->first_free;
+ freelist->first_free = set;
+ freelist->num_free++;
+
+ return;
+ }
+
+ /* Free all blocks, except the keeper which is part of context header */
+ while (block != NULL)
+ {
+ AllocBlock next = block->next;
+
+ if (block != set->keeper)
+ context->mem_allocated -= block->endptr - ((char *) block);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->freeptr - ((char *) block));
+#endif
+
+ if (block != set->keeper)
+ free(block);
+
+ block = next;
+ }
+
+ Assert(context->mem_allocated == keepersize);
+
+ /* Finally, free the context header, including the keeper block */
+ free(set);
+}
+
+/*
+ * AllocSetAlloc
+ * Returns pointer to allocated memory of given size or NULL if
+ * request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ * MAXALIGN_DOWN(SIZE_MAX) - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ *
+ * Note: when using valgrind, it doesn't matter how the returned allocation
+ * is marked, as mcxt.c will set it to UNDEFINED. In some paths we will
+ * return space that is marked NOACCESS - AllocSetRealloc has to beware!
+ */
+void *
+AllocSetAlloc(MemoryContext context, Size size)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block;
+ MemoryChunk *chunk;
+ int fidx;
+ Size chunk_size;
+ Size blksize;
+
+ Assert(AllocSetIsValid(set));
+
+ /*
+ * If requested size exceeds maximum for chunks, allocate an entire block
+ * for this request.
+ */
+ if (size > set->allocChunkLimit)
+ {
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* ensure there's always space for the sentinel byte */
+ chunk_size = MAXALIGN(size + 1);
+#else
+ chunk_size = MAXALIGN(size);
+#endif
+
+ blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ block = (AllocBlock) malloc(blksize);
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ block->aset = set;
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ chunk = (MemoryChunk *) (((char *) block) + ALLOC_BLOCKHDRSZ);
+
+ /* mark the MemoryChunk as externally managed */
+ MemoryChunkSetHdrMaskExternal(chunk, MCTX_ASET_ID);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ Assert(size < chunk_size);
+ set_sentinel(MemoryChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
+#endif
+
+ /*
+ * Stick the new block underneath the active allocation block, if any,
+ * so that we don't lose the use of the space remaining therein.
+ */
+ if (set->blocks != NULL)
+ {
+ block->prev = set->blocks;
+ block->next = set->blocks->next;
+ if (block->next)
+ block->next->prev = block;
+ set->blocks->next = block;
+ }
+ else
+ {
+ block->prev = NULL;
+ block->next = NULL;
+ set->blocks = block;
+ }
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) MemoryChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ return MemoryChunkGetPointer(chunk);
+ }
+
+ /*
+ * Request is small enough to be treated as a chunk. Look in the
+ * corresponding free list to see if there is a free chunk we could reuse.
+ * If one is found, remove it from the free list, make it again a member
+ * of the alloc set and return its data address.
+ *
+ * Note that we don't attempt to ensure there's space for the sentinel
+ * byte here. We expect a large proportion of allocations to be for sizes
+ * which are already a power of 2. If we were to always make space for a
+ * sentinel byte in MEMORY_CONTEXT_CHECKING builds, then we'd end up
+ * doubling the memory requirements for such allocations.
+ */
+ fidx = AllocSetFreeIndex(size);
+ chunk = set->freelist[fidx];
+ if (chunk != NULL)
+ {
+ AllocFreeListLink *link = GetFreeListLink(chunk);
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ Assert(fidx == MemoryChunkGetValue(chunk));
+
+ /* pop this chunk off the freelist */
+ VALGRIND_MAKE_MEM_DEFINED(link, sizeof(AllocFreeListLink));
+ set->freelist[fidx] = link->next;
+ VALGRIND_MAKE_MEM_NOACCESS(link, sizeof(AllocFreeListLink));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < GetChunkSizeFromFreeListIdx(fidx))
+ set_sentinel(MemoryChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) MemoryChunkGetPointer(chunk) + size,
+ GetChunkSizeFromFreeListIdx(fidx) - size);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ return MemoryChunkGetPointer(chunk);
+ }
+
+ /*
+ * Choose the actual chunk size to allocate.
+ */
+ chunk_size = GetChunkSizeFromFreeListIdx(fidx);
+ Assert(chunk_size >= size);
+
+ /*
+ * If there is enough room in the active allocation block, we will put the
+ * chunk into that block. Else must start a new one.
+ */
+ if ((block = set->blocks) != NULL)
+ {
+ Size availspace = block->endptr - block->freeptr;
+
+ if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ))
+ {
+ /*
+ * The existing active (top) block does not have enough room for
+ * the requested allocation, but it might still have a useful
+ * amount of space in it. Once we push it down in the block list,
+ * we'll never try to allocate more space from it. So, before we
+ * do that, carve up its free space into chunks that we can put on
+ * the set's freelists.
+ *
+ * Because we can only get here when there's less than
+ * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate
+ * more than ALLOCSET_NUM_FREELISTS-1 times.
+ */
+ while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ))
+ {
+ AllocFreeListLink *link;
+ Size availchunk = availspace - ALLOC_CHUNKHDRSZ;
+ int a_fidx = AllocSetFreeIndex(availchunk);
+
+ /*
+ * In most cases, we'll get back the index of the next larger
+ * freelist than the one we need to put this chunk on. The
+ * exception is when availchunk is exactly a power of 2.
+ */
+ if (availchunk != GetChunkSizeFromFreeListIdx(a_fidx))
+ {
+ a_fidx--;
+ Assert(a_fidx >= 0);
+ availchunk = GetChunkSizeFromFreeListIdx(a_fidx);
+ }
+
+ chunk = (MemoryChunk *) (block->freeptr);
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+ block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
+ availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
+
+ /* store the freelist index in the value field */
+ MemoryChunkSetHdrMask(chunk, block, a_fidx, MCTX_ASET_ID);
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = InvalidAllocSize; /* mark it free */
+#endif
+ /* push this chunk onto the free list */
+ link = GetFreeListLink(chunk);
+
+ VALGRIND_MAKE_MEM_DEFINED(link, sizeof(AllocFreeListLink));
+ link->next = set->freelist[a_fidx];
+ VALGRIND_MAKE_MEM_NOACCESS(link, sizeof(AllocFreeListLink));
+
+ set->freelist[a_fidx] = chunk;
+ }
+ /* Mark that we need to create a new block */
+ block = NULL;
+ }
+ }
+
+ /*
+ * Time to create a new regular (multi-chunk) block?
+ */
+ if (block == NULL)
+ {
+ Size required_size;
+
+ /*
+ * The first such block has size initBlockSize, and we double the
+ * space in each succeeding block, but not more than maxBlockSize.
+ */
+ blksize = set->nextBlockSize;
+ set->nextBlockSize <<= 1;
+ if (set->nextBlockSize > set->maxBlockSize)
+ set->nextBlockSize = set->maxBlockSize;
+
+ /*
+ * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
+ * space... but try to keep it a power of 2.
+ */
+ required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ while (blksize < required_size)
+ blksize <<= 1;
+
+ /* Try to allocate it */
+ block = (AllocBlock) malloc(blksize);
+
+ /*
+ * We could be asking for pretty big blocks here, so cope if malloc
+ * fails. But give up if there's less than 1 MB or so available...
+ */
+ while (block == NULL && blksize > 1024 * 1024)
+ {
+ blksize >>= 1;
+ if (blksize < required_size)
+ break;
+ block = (AllocBlock) malloc(blksize);
+ }
+
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ block->aset = set;
+ block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ block->endptr = ((char *) block) + blksize;
+
+ /* Mark unallocated space NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+ blksize - ALLOC_BLOCKHDRSZ);
+
+ block->prev = NULL;
+ block->next = set->blocks;
+ if (block->next)
+ block->next->prev = block;
+ set->blocks = block;
+ }
+
+ /*
+ * OK, do the allocation
+ */
+ chunk = (MemoryChunk *) (block->freeptr);
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
+ Assert(block->freeptr <= block->endptr);
+
+ /* store the free list index in the value field */
+ MemoryChunkSetHdrMask(chunk, block, fidx, MCTX_ASET_ID);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk_size)
+ set_sentinel(MemoryChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) MemoryChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ return MemoryChunkGetPointer(chunk);
+}
+
+/*
+ * AllocSetFree
+ * Frees allocated memory; memory is removed from the set.
+ */
+void
+AllocSetFree(void *pointer)
+{
+ AllocSet set;
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ /* Release single-chunk block. */
+ AllocBlock block = ExternalChunkGetBlock(chunk);
+
+ /*
+ * Try to verify that we have a sane block pointer: the block header
+ * should reference an aset and the freeptr should match the endptr.
+ */
+ if (!AllocBlockIsValid(block) || block->freeptr != block->endptr)
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+
+ set = block->aset;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ {
+ /* Test for someone scribbling on unused space in chunk */
+ Assert(chunk->requested_size < (block->endptr - (char *) pointer));
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+ }
+#endif
+
+ /* OK, remove block from aset's list and free it */
+ if (block->prev)
+ block->prev->next = block->next;
+ else
+ set->blocks = block->next;
+ if (block->next)
+ block->next->prev = block->prev;
+
+ set->header.mem_allocated -= block->endptr - ((char *) block);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->freeptr - ((char *) block));
+#endif
+ free(block);
+ }
+ else
+ {
+ AllocBlock block = MemoryChunkGetBlock(chunk);
+ int fidx;
+ AllocFreeListLink *link;
+
+ /*
+ * In this path, for speed reasons we just Assert that the referenced
+ * block is good. We can also Assert that the value field is sane.
+ * Future field experience may show that these Asserts had better
+ * become regular runtime test-and-elog checks.
+ */
+ Assert(AllocBlockIsValid(block));
+ set = block->aset;
+
+ fidx = MemoryChunkGetValue(chunk);
+ Assert(FreeListIdxIsValid(fidx));
+ link = GetFreeListLink(chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < GetChunkSizeFromFreeListIdx(fidx))
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(pointer, GetChunkSizeFromFreeListIdx(fidx));
+#endif
+ /* push this chunk onto the top of the free list */
+ VALGRIND_MAKE_MEM_DEFINED(link, sizeof(AllocFreeListLink));
+ link->next = set->freelist[fidx];
+ VALGRIND_MAKE_MEM_NOACCESS(link, sizeof(AllocFreeListLink));
+ set->freelist[fidx] = chunk;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+ /*
+ * Reset requested_size to InvalidAllocSize in chunks that are on free
+ * list.
+ */
+ chunk->requested_size = InvalidAllocSize;
+#endif
+ }
+}
+
+/*
+ * AllocSetRealloc
+ * Returns new pointer to allocated memory of given size or NULL if
+ * request could not be completed; this memory is added to the set.
+ * Memory associated with given pointer is copied into the new memory,
+ * and the old memory is freed.
+ *
+ * Without MEMORY_CONTEXT_CHECKING, we don't know the old request size. This
+ * makes our Valgrind client requests less-precise, hazarding false negatives.
+ * (In principle, we could use VALGRIND_GET_VBITS() to rediscover the old
+ * request size.)
+ */
+void *
+AllocSetRealloc(void *pointer, Size size)
+{
+ AllocBlock block;
+ AllocSet set;
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ Size oldchksize;
+ int fidx;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ /*
+ * The chunk must have been allocated as a single-chunk block. Use
+ * realloc() to make the containing block bigger, or smaller, with
+ * minimum space wastage.
+ */
+ Size chksize;
+ Size blksize;
+ Size oldblksize;
+
+ block = ExternalChunkGetBlock(chunk);
+
+ /*
+ * Try to verify that we have a sane block pointer: the block header
+ * should reference an aset and the freeptr should match the endptr.
+ */
+ if (!AllocBlockIsValid(block) || block->freeptr != block->endptr)
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+
+ set = block->aset;
+
+ oldchksize = block->endptr - (char *) pointer;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ Assert(chunk->requested_size < oldchksize);
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* ensure there's always space for the sentinel byte */
+ chksize = MAXALIGN(size + 1);
+#else
+ chksize = MAXALIGN(size);
+#endif
+
+ /* Do the realloc */
+ blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ oldblksize = block->endptr - ((char *) block);
+
+ block = (AllocBlock) realloc(block, blksize);
+ if (block == NULL)
+ {
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+ return NULL;
+ }
+
+ /* updated separately, not to underflow when (oldblksize > blksize) */
+ set->header.mem_allocated -= oldblksize;
+ set->header.mem_allocated += blksize;
+
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ /* Update pointers since block has likely been moved */
+ chunk = (MemoryChunk *) (((char *) block) + ALLOC_BLOCKHDRSZ);
+ pointer = MemoryChunkGetPointer(chunk);
+ if (block->prev)
+ block->prev->next = block;
+ else
+ set->blocks = block;
+ if (block->next)
+ block->next->prev = block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+ /*
+ * We can only randomize the extra space if we know the prior request.
+ * When using Valgrind, randomize_mem() also marks memory UNDEFINED.
+ */
+ if (size > chunk->requested_size)
+ randomize_mem((char *) pointer + chunk->requested_size,
+ size - chunk->requested_size);
+#else
+
+ /*
+ * If this is an increase, realloc() will have marked any
+ * newly-allocated part (from oldchksize to chksize) UNDEFINED, but we
+ * also need to adjust trailing bytes from the old allocation (from
+ * chunk->requested_size to oldchksize) as they are marked NOACCESS.
+ * Make sure not to mark too many bytes in case chunk->requested_size
+ * < size < oldchksize.
+ */
+#ifdef USE_VALGRIND
+ if (Min(size, oldchksize) > chunk->requested_size)
+ VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + chunk->requested_size,
+ Min(size, oldchksize) - chunk->requested_size);
+#endif
+#endif
+
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ Assert(size < chksize);
+ set_sentinel(pointer, size);
+#else /* !MEMORY_CONTEXT_CHECKING */
+
+ /*
+ * We may need to adjust marking of bytes from the old allocation as
+ * some of them may be marked NOACCESS. We don't know how much of the
+ * old chunk size was the requested size; it could have been as small
+ * as one byte. We have to be conservative and just mark the entire
+ * old portion DEFINED. Make sure not to mark memory beyond the new
+ * allocation in case it's smaller than the old one.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(pointer, Min(size, oldchksize));
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ return pointer;
+ }
+
+ block = MemoryChunkGetBlock(chunk);
+
+ /*
+ * In this path, for speed reasons we just Assert that the referenced
+ * block is good. We can also Assert that the value field is sane. Future
+ * field experience may show that these Asserts had better become regular
+ * runtime test-and-elog checks.
+ */
+ Assert(AllocBlockIsValid(block));
+ set = block->aset;
+
+ fidx = MemoryChunkGetValue(chunk);
+ Assert(FreeListIdxIsValid(fidx));
+ oldchksize = GetChunkSizeFromFreeListIdx(fidx);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < oldchksize)
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+ /*
+ * Chunk sizes are aligned to power of 2 in AllocSetAlloc(). Maybe the
+ * allocated area already is >= the new size. (In particular, we will
+ * fall out here if the requested size is a decrease.)
+ */
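	/*
	 * Illustrative case for the branch below: a chunk handed out from the
	 * 128-byte freelist for a 100-byte request can absorb a repalloc() to
	 * 120 bytes in place, because oldchksize (128) already covers the new
	 * size; only the bookkeeping and the valgrind markings change.
	 */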
+ if (oldchksize >= size)
+ {
+#ifdef MEMORY_CONTEXT_CHECKING
+ Size oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ if (size > oldrequest)
+ randomize_mem((char *) pointer + oldrequest,
+ size - oldrequest);
+#endif
+
+ chunk->requested_size = size;
+
+ /*
+ * If this is an increase, mark any newly-available part UNDEFINED.
+ * Otherwise, mark the obsolete part NOACCESS.
+ */
+ if (size > oldrequest)
+ VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+ size - oldrequest);
+ else
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+ oldchksize - size);
+
+ /* set mark to catch clobber of "unused" space */
+ if (size < oldchksize)
+ set_sentinel(pointer, size);
+#else /* !MEMORY_CONTEXT_CHECKING */
+
+ /*
+ * We don't have the information to determine whether we're growing
+ * the old request or shrinking it, so we conservatively mark the
+ * entire new allocation DEFINED.
+ */
+ VALGRIND_MAKE_MEM_NOACCESS(pointer, oldchksize);
+ VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ return pointer;
+ }
+ else
+ {
+ /*
+ * Enlarge-a-small-chunk case. We just do this by brute force, ie,
+ * allocate a new chunk and copy the data. Since we know the existing
+ * data isn't huge, this won't involve any great memcpy expense, so
+ * it's not worth being smarter. (At one time we tried to avoid
+ * memcpy when it was possible to enlarge the chunk in-place, but that
+ * turns out to misbehave unpleasantly for repeated cycles of
+ * palloc/repalloc/pfree: the eventually freed chunks go into the
+ * wrong freelist for the next initial palloc request, and so we leak
+ * memory indefinitely. See pgsql-hackers archives for 2007-08-11.)
+ */
+ AllocPointer newPointer;
+ Size oldsize;
+
+ /* allocate new chunk */
+ newPointer = AllocSetAlloc((MemoryContext) set, size);
+
+ /* leave immediately if request was not completed */
+ if (newPointer == NULL)
+ {
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+ return NULL;
+ }
+
+ /*
+ * AllocSetAlloc() may have returned a region that is still NOACCESS.
+ * Change it to UNDEFINED for the moment; memcpy() will then transfer
+ * definedness from the old allocation to the new. If we know the old
+ * allocation, copy just that much. Otherwise, make the entire old
+ * chunk defined to avoid errors as we copy the currently-NOACCESS
+ * trailing bytes.
+ */
+ VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+ oldsize = chunk->requested_size;
+#else
+ oldsize = oldchksize;
+ VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+ /* transfer existing data (certain to fit) */
+ memcpy(newPointer, pointer, oldsize);
+
+ /* free old chunk */
+ AllocSetFree(pointer);
+
+ return newPointer;
+ }
+}
+
+/*
+ * AllocSetGetChunkContext
+ * Return the MemoryContext that 'pointer' belongs to.
+ */
+MemoryContext
+AllocSetGetChunkContext(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ AllocBlock block;
+ AllocSet set;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ block = ExternalChunkGetBlock(chunk);
+ else
+ block = (AllocBlock) MemoryChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ Assert(AllocBlockIsValid(block));
+ set = block->aset;
+
+ return &set->header;
+}
+
+/*
+ * AllocSetGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+Size
+AllocSetGetChunkSpace(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ int fidx;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ AllocBlock block = ExternalChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ Assert(AllocBlockIsValid(block));
+
+ return block->endptr - (char *) chunk;
+ }
+
+ fidx = MemoryChunkGetValue(chunk);
+ Assert(FreeListIdxIsValid(fidx));
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ return GetChunkSizeFromFreeListIdx(fidx) + ALLOC_CHUNKHDRSZ;
+}
+
+/*
+ * AllocSetIsEmpty
+ * Is an allocset empty of any allocated space?
+ */
+bool
+AllocSetIsEmpty(MemoryContext context)
+{
+ Assert(AllocSetIsValid(context));
+
+ /*
+ * For now, we say "empty" only if the context is new or just reset. We
+ * could examine the freelists to determine if all space has been freed,
+ * but it's not really worth the trouble for present uses of this
+ * functionality.
+ */
+ if (context->isReset)
+ return true;
+ return false;
+}
+
+/*
+ * AllocSetStats
+ * Compute stats about memory consumption of an allocset.
+ *
+ * printfunc: if not NULL, pass a human-readable stats string to this.
+ * passthru: pass this pointer through to printfunc.
+ * totals: if not NULL, add stats about this context into *totals.
+ * print_to_stderr: print stats to stderr if true, elog otherwise.
+ */
+void
+AllocSetStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals, bool print_to_stderr)
+{
+ AllocSet set = (AllocSet) context;
+ Size nblocks = 0;
+ Size freechunks = 0;
+ Size totalspace;
+ Size freespace = 0;
+ AllocBlock block;
+ int fidx;
+
+ Assert(AllocSetIsValid(set));
+
+ /* Include context header in totalspace */
+ totalspace = MAXALIGN(sizeof(AllocSetContext));
+
+ for (block = set->blocks; block != NULL; block = block->next)
+ {
+ nblocks++;
+ totalspace += block->endptr - ((char *) block);
+ freespace += block->endptr - block->freeptr;
+ }
+ for (fidx = 0; fidx < ALLOCSET_NUM_FREELISTS; fidx++)
+ {
+ Size chksz = GetChunkSizeFromFreeListIdx(fidx);
+ MemoryChunk *chunk = set->freelist[fidx];
+
+ while (chunk != NULL)
+ {
+ AllocFreeListLink *link = GetFreeListLink(chunk);
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+ Assert(MemoryChunkGetValue(chunk) == fidx);
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ freechunks++;
+ freespace += chksz + ALLOC_CHUNKHDRSZ;
+
+ VALGRIND_MAKE_MEM_DEFINED(link, sizeof(AllocFreeListLink));
+ chunk = link->next;
+ VALGRIND_MAKE_MEM_NOACCESS(link, sizeof(AllocFreeListLink));
+ }
+ }
+
+ if (printfunc)
+ {
+ char stats_string[200];
+
+ snprintf(stats_string, sizeof(stats_string),
+ "%zu total in %zu blocks; %zu free (%zu chunks); %zu used",
+ totalspace, nblocks, freespace, freechunks,
+ totalspace - freespace);
+ printfunc(context, passthru, stats_string, print_to_stderr);
+ }
+
+ if (totals)
+ {
+ totals->nblocks += nblocks;
+ totals->freechunks += freechunks;
+ totals->totalspace += totalspace;
+ totals->freespace += freespace;
+ }
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * AllocSetCheck
+ * Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+void
+AllocSetCheck(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ const char *name = set->header.name;
+ AllocBlock prevblock;
+ AllocBlock block;
+ Size total_allocated = 0;
+
+ for (prevblock = NULL, block = set->blocks;
+ block != NULL;
+ prevblock = block, block = block->next)
+ {
+ char *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ long blk_used = block->freeptr - bpoz;
+ long blk_data = 0;
+ long nchunks = 0;
+ bool has_external_chunk = false;
+
+ if (set->keeper == block)
+ total_allocated += block->endptr - ((char *) set);
+ else
+ total_allocated += block->endptr - ((char *) block);
+
+ /*
+ * Empty block - empty can be keeper-block only
+ */
+ if (!blk_used)
+ {
+ if (set->keeper != block)
+ elog(WARNING, "problem in alloc set %s: empty block %p",
+ name, block);
+ }
+
+ /*
+ * Check block header fields
+ */
+ if (block->aset != set ||
+ block->prev != prevblock ||
+ block->freeptr < bpoz ||
+ block->freeptr > block->endptr)
+ elog(WARNING, "problem in alloc set %s: corrupt header in block %p",
+ name, block);
+
+ /*
+ * Chunk walker
+ */
+ while (bpoz < block->freeptr)
+ {
+ MemoryChunk *chunk = (MemoryChunk *) bpoz;
+ Size chsize,
+ dsize;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ chsize = block->endptr - (char *) MemoryChunkGetPointer(chunk); /* aligned chunk size */
+ has_external_chunk = true;
+
+ /* make sure this chunk consumes the entire block */
+ if (chsize + ALLOC_CHUNKHDRSZ != blk_used)
+ elog(WARNING, "problem in alloc set %s: bad single-chunk %p in block %p",
+ name, chunk, block);
+ }
+ else
+ {
+ int fidx = MemoryChunkGetValue(chunk);
+
+ if (!FreeListIdxIsValid(fidx))
+ elog(WARNING, "problem in alloc set %s: bad chunk size for chunk %p in block %p",
+ name, chunk, block);
+
+ chsize = GetChunkSizeFromFreeListIdx(fidx); /* aligned chunk size */
+
+ /*
+ * Check the stored block offset correctly references this
+ * block.
+ */
+ if (block != MemoryChunkGetBlock(chunk))
+ elog(WARNING, "problem in alloc set %s: bad block offset for chunk %p in block %p",
+ name, chunk, block);
+ }
+ dsize = chunk->requested_size; /* real data */
+
+ /* an allocated chunk's requested size must be <= the chsize */
+ if (dsize != InvalidAllocSize && dsize > chsize)
+ elog(WARNING, "problem in alloc set %s: req size > alloc size for chunk %p in block %p",
+ name, chunk, block);
+
+ /* chsize must not be smaller than the first freelist's size */
+ if (chsize < (1 << ALLOC_MINBITS))
+ elog(WARNING, "problem in alloc set %s: bad size %zu for chunk %p in block %p",
+ name, chsize, chunk, block);
+
+ /*
+ * Check for overwrite of padding space in an allocated chunk.
+ */
+ if (dsize != InvalidAllocSize && dsize < chsize &&
+ !sentinel_ok(chunk, ALLOC_CHUNKHDRSZ + dsize))
+ elog(WARNING, "problem in alloc set %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+
+ /* if chunk is allocated, disallow access to the chunk header */
+ if (dsize != InvalidAllocSize)
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ);
+
+ blk_data += chsize;
+ nchunks++;
+
+ bpoz += ALLOC_CHUNKHDRSZ + chsize;
+ }
+
+ if ((blk_data + (nchunks * ALLOC_CHUNKHDRSZ)) != blk_used)
+ elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p",
+ name, block);
+
+ if (has_external_chunk && nchunks > 1)
+ elog(WARNING, "problem in alloc set %s: external chunk on non-dedicated block %p",
+ name, block);
+ }
+
+ Assert(total_allocated == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/dsa.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/dsa.c
new file mode 100644
index 00000000000..2739169165e
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/dsa.c
@@ -0,0 +1,2331 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsa.c
+ * Dynamic shared memory areas.
+ *
+ * This module provides dynamic shared memory areas which are built on top of
+ * DSM segments. While dsm.c allows segments of shared memory to be
+ * created and shared between backends, it isn't designed to deal with small
+ * objects. A DSA area is a shared memory heap usually backed by one or more
+ * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
+ * Alternatively, it can be created in pre-existing shared memory, including a
+ * DSM segment, and then create extra DSM segments as required. Unlike the
+ * regular system heap, it deals in pseudo-pointers which must be converted to
+ * backend-local pointers before they are dereferenced. These pseudo-pointers
+ * can however be shared with other backends, and can be used to construct
+ * shared data structures.
+ *
+ * Each DSA area manages a set of DSM segments, adding new segments as
+ * required and detaching them when they are no longer needed. Each segment
+ * contains a number of 4KB pages, a free page manager for tracking
+ * consecutive runs of free pages, and a page map for tracking the source of
+ * objects allocated on each page. Allocation requests above 8KB are handled
+ * by choosing a segment and finding consecutive free pages in its free page
+ * manager. Allocation requests for smaller sizes are handled using pools of
+ * objects of a selection of sizes. Each pool consists of a number of 16 page
+ * (64KB) superblocks allocated in the same way as large objects. Allocation
+ * of large objects and new superblocks is serialized by a single LWLock, but
+ * allocation of small objects from pre-existing superblocks uses one LWLock
+ * per pool. Currently there is one pool, and therefore one lock, per size
+ * class. Per-core pools to increase concurrency and strategies for reducing
+ * the resulting fragmentation are areas for future research. Each superblock
+ * is managed with a 'span', which tracks the superblock's freelist. Free
+ * requests are handled by looking in the page map to find which span an
+ * address was allocated from, so that small objects can be returned to the
+ * appropriate free list, and large object pages can be returned directly to
+ * the free page map. When allocating, simple heuristics for selecting
+ * segments and superblocks try to encourage occupied memory to be
+ * concentrated, increasing the likelihood that whole superblocks can become
+ * empty and be returned to the free page manager, and whole segments can
+ * become empty and be returned to the operating system.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/dsa.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "port/pg_bitutils.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/dsa.h"
+#include "utils/freepage.h"
+#include "utils/memutils.h"
+
+/*
+ * The size of the initial DSM segment that backs a dsa_area created by
+ * dsa_create. After creating some number of segments of this size we'll
+ * double this size, and so on. Larger segments may be created if necessary
+ * to satisfy large requests.
+ */
+#define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
+
+/*
+ * How many segments to create before we double the segment size. If this is
+ * low, then there is likely to be a lot of wasted space in the largest
+ * segment. If it is high, then we risk running out of segment slots (see
+ * dsm.c's limits on total number of segments), or limiting the total size
+ * an area can manage when using small pointers.
+ */
+#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
+
+/*
+ * The number of bits used to represent the offset part of a dsa_pointer.
+ * This controls the maximum size of a segment, the maximum possible
+ * allocation size and also the maximum number of segments per area.
+ */
+#if SIZEOF_DSA_POINTER == 4
+#define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
+#else
+#define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
+#endif
+
+/*
+ * The maximum number of DSM segments that an area can own, determined by
+ * the number of bits remaining (but capped at 1024).
+ */
+#define DSA_MAX_SEGMENTS \
+ Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
+
+/* The bitmask for extracting the offset from a dsa_pointer. */
+#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
+
+/* The maximum size of a DSM segment. */
+#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
+
+/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
+#define DSA_PAGES_PER_SUPERBLOCK 16
+
+/*
+ * A magic number used as a sanity check for following DSM segments belonging
+ * to a DSA area (this number will be XORed with the area handle and
+ * the segment index).
+ */
+#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
+
+/* Build a dsa_pointer given a segment number and offset. */
+#define DSA_MAKE_POINTER(segment_number, offset) \
+ (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
+
+/* Extract the segment number from a dsa_pointer. */
+#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
+
+/* Extract the offset from a dsa_pointer. */
+#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
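+
+/*
+ * For illustration, the macros above compose and decompose losslessly: a
+ * dsa_pointer is just the segment number in the high bits and a byte offset
+ * in the low DSA_OFFSET_WIDTH bits (values below chosen arbitrarily):
+ *
+ *    dsa_pointer dp = DSA_MAKE_POINTER(3, (dsa_pointer) 8192);
+ *
+ *    Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
+ *    Assert(DSA_EXTRACT_OFFSET(dp) == 8192);
+ */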
+
+/* The type used for segment indexes (zero based). */
+typedef size_t dsa_segment_index;
+
+/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
+#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
+
+/*
+ * How many bins of segments do we have? The bins are used to categorize
+ * segments by their largest contiguous run of free pages.
+ */
+#define DSA_NUM_SEGMENT_BINS 16
+
+/*
+ * What is the lowest bin that holds segments that *might* have n contiguous
+ * free pages? There is no point in looking in segments in lower bins; they
+ * definitely can't service a request for n free pages.
+ */
+static inline size_t
+contiguous_pages_to_segment_bin(size_t n)
+{
+ size_t bin;
+
+ if (n == 0)
+ bin = 0;
+ else
+ bin = pg_leftmost_one_pos_size_t(n) + 1;
+
+ return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
+}
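+
+/*
+ * For illustration, the bin number is 1 + floor(log2(n)) for n > 0, capped
+ * at DSA_NUM_SEGMENT_BINS - 1:
+ *
+ *    contiguous_pages_to_segment_bin(0)  == 0
+ *    contiguous_pages_to_segment_bin(1)  == 1
+ *    contiguous_pages_to_segment_bin(3)  == 2
+ *    contiguous_pages_to_segment_bin(16) == 5
+ *
+ * so a search for n contiguous pages can start at bin
+ * contiguous_pages_to_segment_bin(n); segments in lower bins definitely
+ * cannot satisfy the request.
+ */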
+
+/* Macros for access to locks. */
+#define DSA_AREA_LOCK(area) (&area->control->lock)
+#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
+
+/*
+ * The header for an individual segment. This lives at the start of each DSM
+ * segment owned by a DSA area including the first segment (where it appears
+ * as part of the dsa_area_control struct).
+ */
+typedef struct
+{
+ /* Sanity check magic value. */
+ uint32 magic;
+ /* Total number of pages in this segment (excluding metadata area). */
+ size_t usable_pages;
+ /* Total size of this segment in bytes. */
+ size_t size;
+
+ /*
+ * Index of the segment that precedes this one in the same segment bin, or
+ * DSA_SEGMENT_INDEX_NONE if this is the first one.
+ */
+ dsa_segment_index prev;
+
+ /*
+ * Index of the segment that follows this one in the same segment bin, or
+ * DSA_SEGMENT_INDEX_NONE if this is the last one.
+ */
+ dsa_segment_index next;
+ /* The index of the bin that contains this segment. */
+ size_t bin;
+
+ /*
+ * A flag raised to indicate that this segment is being returned to the
+ * operating system and has been unpinned.
+ */
+ bool freed;
+} dsa_segment_header;
+
+/*
+ * Metadata for one superblock.
+ *
+ * For most blocks, span objects are stored out-of-line; that is, the span
+ * object is not stored within the block itself. But, as an exception, for a
+ * "span of spans", the span object is stored "inline". The allocation is
+ * always exactly one page, and the dsa_area_span object is located at
+ * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
+ * and the remaining fields are used just as they would be in an ordinary
+ * block. We can't allocate spans out of ordinary superblocks because
+ * creating an ordinary superblock requires us to be able to allocate a span
+ * *first*. Doing it this way avoids that circularity.
+ */
+typedef struct
+{
+ dsa_pointer pool; /* Containing pool. */
+ dsa_pointer prevspan; /* Previous span. */
+ dsa_pointer nextspan; /* Next span. */
+ dsa_pointer start; /* Starting address. */
+ size_t npages; /* Length of span in pages. */
+ uint16 size_class; /* Size class. */
+ uint16 ninitialized; /* Maximum number of objects ever allocated. */
+ uint16 nallocatable; /* Number of objects currently allocatable. */
+ uint16 firstfree; /* First object on free list. */
+ uint16 nmax; /* Maximum number of objects ever possible. */
+ uint16 fclass; /* Current fullness class. */
+} dsa_area_span;
+
+/*
+ * Given a pointer to an object in a span, access the index of the next free
+ * object in the same span (ie in the span's freelist) as an L-value.
+ */
+#define NextFreeObjectIndex(object) (* (uint16 *) (object))
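+
+/*
+ * For illustration (simplified from dsa_free() and alloc_object() below),
+ * each free object stores the index of the next free object in its first
+ * two bytes, so pushing onto and popping from a span's freelist look like:
+ *
+ *    NextFreeObjectIndex(object) = span->firstfree;      (push)
+ *    span->firstfree = (object - superblock) / size;
+ *
+ *    result = span->start + span->firstfree * size;      (pop)
+ *    object = dsa_get_address(area, result);
+ *    span->firstfree = NextFreeObjectIndex(object);
+ */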
+
+/*
+ * Small allocations are handled by dividing a single block of memory into
+ * many small objects of equal size. The possible allocation sizes are
+ * defined by the following array. Larger size classes are spaced more widely
+ * than smaller size classes. We fudge the spacing for size classes >1kB to
+ * avoid space wastage: based on the knowledge that we plan to allocate 64kB
+ * blocks, we bump the maximum object size up to the largest multiple of
+ * 8 bytes that still lets us fit the same number of objects into one block.
+ *
+ * NB: Because of this fudging, if we were ever to use differently-sized blocks
+ * for small allocations, these size classes would need to be reworked to be
+ * optimal for the new size.
+ *
+ * NB: The optimal spacing for size classes, as well as the size of the blocks
+ * out of which small objects are allocated, is not a question that has one
+ * right answer. Some allocators (such as tcmalloc) use more closely-spaced
+ * size classes than we do here, while others (like aset.c) use more
+ * widely-spaced classes. Spacing the classes more closely avoids wasting
+ * memory within individual chunks, but also means a larger number of
+ * potentially-unfilled blocks.
+ */
+static const uint16 dsa_size_classes[] = {
+ sizeof(dsa_area_span), 0, /* special size classes */
+ 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
+ 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
+ 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
+ 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
+ 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
+ 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
+ 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
+ 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
+};
+#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
+
+/* Special size classes. */
+#define DSA_SCLASS_BLOCK_OF_SPANS 0
+#define DSA_SCLASS_SPAN_LARGE 1
+
+/*
+ * The following lookup table is used to map the size of small objects
+ * (less than 1kB) onto the corresponding size class. To use this table,
+ * round the size of the object up to the next multiple of 8 bytes, and then
+ * index into this array.
+ */
+static const uint8 dsa_size_class_map[] = {
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
+ 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
+};
+#define DSA_SIZE_CLASS_MAP_QUANTUM 8
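+
+/*
+ * For illustration, dsa_allocate_extended() below uses the table as
+ *
+ *    mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
+ *              DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
+ *    size_class = dsa_size_class_map[mapidx];
+ *
+ * so, for example, a 100-byte request rounds up to 104 bytes, lands on map
+ * index 12 and size class 12, and is served from 112-byte objects, while a
+ * 24-byte request maps to size class 4 and fits exactly.
+ */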
+
+/*
+ * Superblocks are binned by how full they are. Generally, each fullness
+ * class corresponds to one quartile, but the block being used for
+ * allocations is always at the head of the list for fullness class 1,
+ * regardless of how full it really is.
+ */
+#define DSA_FULLNESS_CLASSES 4
+
+/*
+ * A dsa_area_pool represents a set of objects of a given size class.
+ *
+ * Perhaps there should be multiple pools for the same size class for
+ * contention avoidance, but for now there is just one!
+ */
+typedef struct
+{
+ /* A lock protecting access to this pool. */
+ LWLock lock;
+ /* A set of linked lists of spans, arranged by fullness. */
+ dsa_pointer spans[DSA_FULLNESS_CLASSES];
+ /* Should we pad this out to a cacheline boundary? */
+} dsa_area_pool;
+
+/*
+ * The control block for an area. This lives in shared memory, at the start of
+ * the first DSM segment controlled by this area.
+ */
+typedef struct
+{
+ /* The segment header for the first segment. */
+ dsa_segment_header segment_header;
+ /* The handle for this area. */
+ dsa_handle handle;
+ /* The handles of the segments owned by this area. */
+ dsm_handle segment_handles[DSA_MAX_SEGMENTS];
+ /* Lists of segments, binned by maximum contiguous run of free pages. */
+ dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
+ /* The object pools for each size class. */
+ dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
+ /* The total size of all active segments. */
+ size_t total_segment_size;
+ /* The maximum total size of backing storage we are allowed. */
+ size_t max_total_segment_size;
+ /* Highest used segment index in the history of this area. */
+ dsa_segment_index high_segment_index;
+ /* The reference count for this area. */
+ int refcnt;
+ /* A flag indicating that this area has been pinned. */
+ bool pinned;
+ /* The number of times that segments have been freed. */
+ size_t freed_segment_counter;
+ /* The LWLock tranche ID. */
+ int lwlock_tranche_id;
+ /* The general lock (protects everything except object pools). */
+ LWLock lock;
+} dsa_area_control;
+
+/* Given a pointer to a pool, find a dsa_pointer. */
+#define DsaAreaPoolToDsaPointer(area, p) \
+ DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
+
+/*
+ * A dsa_segment_map is stored within the backend-private memory of each
+ * individual backend. It holds the base address of the segment within that
+ * backend, plus the addresses of key objects within the segment. Those
+ * could instead be derived from the base address but it's handy to have them
+ * around.
+ */
+typedef struct
+{
+ dsm_segment *segment; /* DSM segment */
+ char *mapped_address; /* Address at which segment is mapped */
+ dsa_segment_header *header; /* Header (same as mapped_address) */
+ FreePageManager *fpm; /* Free page manager within segment. */
+ dsa_pointer *pagemap; /* Page map within segment. */
+} dsa_segment_map;
+
+/*
+ * Per-backend state for a storage area. Backends obtain one of these by
+ * creating an area or attaching to an existing one using a handle. Each
+ * process that needs to use an area uses its own object to track where the
+ * segments are mapped.
+ */
+struct dsa_area
+{
+ /* Pointer to the control object in shared memory. */
+ dsa_area_control *control;
+
+ /* Has the mapping been pinned? */
+ bool mapping_pinned;
+
+ /*
+ * This backend's array of segment maps, ordered by segment index
+ * corresponding to control->segment_handles. Some of the area's segments
+ * may not be mapped in this backend yet, and some slots may have been
+ * freed and need to be detached; these operations happen on demand.
+ */
+ dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
+
+ /* The highest segment index this backend has ever mapped. */
+ dsa_segment_index high_segment_index;
+
+ /* The last observed freed_segment_counter. */
+ size_t freed_segment_counter;
+};
+
+#define DSA_SPAN_NOTHING_FREE ((uint16) -1)
+#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
+
+/* Given a pointer to a segment_map, obtain a segment index number. */
+#define get_segment_index(area, segment_map_ptr) \
+ (segment_map_ptr - &area->segment_maps[0])
+
+static void init_span(dsa_area *area, dsa_pointer span_pointer,
+ dsa_area_pool *pool, dsa_pointer start, size_t npages,
+ uint16 size_class);
+static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
+ int fromclass, int toclass);
+static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
+static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
+ int size_class);
+static dsa_segment_map *get_segment_by_index(dsa_area *area,
+ dsa_segment_index index);
+static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
+static void unlink_span(dsa_area *area, dsa_area_span *span);
+static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
+ dsa_pointer span_pointer, int fclass);
+static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
+static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
+static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
+static dsa_area *create_internal(void *place, size_t size,
+ int tranche_id,
+ dsm_handle control_handle,
+ dsm_segment *control_segment);
+static dsa_area *attach_internal(void *place, dsm_segment *segment,
+ dsa_handle handle);
+static void check_for_freed_segments(dsa_area *area);
+static void check_for_freed_segments_locked(dsa_area *area);
+static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
+
+/*
+ * Create a new shared area in a new DSM segment. Further DSM segments will
+ * be allocated as required to extend the available space.
+ *
+ * We can't allocate a LWLock tranche_id within this function, because tranche
+ * IDs are a scarce resource; there are only 64k available, using low numbers
+ * when possible matters, and we have no provision for recycling them. So,
+ * we require the caller to provide one.
+ */
+dsa_area *
+dsa_create(int tranche_id)
+{
+ dsm_segment *segment;
+ dsa_area *area;
+
+ /*
+ * Create the DSM segment that will hold the shared control object and the
+ * first segment of usable space.
+ */
+ segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
+
+ /*
+ * All segments backing this area are pinned, so that DSA can explicitly
+ * control their lifetime (otherwise a newly created segment belonging to
+ * this area might be freed when the only backend that happens to have it
+ * mapped in ends, corrupting the area).
+ */
+ dsm_pin_segment(segment);
+
+ /* Create a new DSA area with the control object in this segment. */
+ area = create_internal(dsm_segment_address(segment),
+ DSA_INITIAL_SEGMENT_SIZE,
+ tranche_id,
+ dsm_segment_handle(segment), segment);
+
+ /* Clean up when the control segment detaches. */
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(dsm_segment_address(segment)));
+
+ return area;
+}
+
+/*
+ * Create a new shared area in an existing shared memory space, which may be
+ * either DSM or Postmaster-initialized memory. DSM segments will be
+ * allocated as required to extend the available space, though that can be
+ * prevented with dsa_set_size_limit(area, size) using the same size provided
+ * to dsa_create_in_place.
+ *
+ * Areas created in-place must eventually be released by the backend that
+ * created them and all backends that attach to them. This can be done
+ * explicitly with dsa_release_in_place, or, in the special case that 'place'
+ * happens to be in a pre-existing DSM segment, by passing in a pointer to the
+ * segment so that a detach hook can be registered with the containing DSM
+ * segment.
+ *
+ * See dsa_create() for a note about the tranche arguments.
+ */
+dsa_area *
+dsa_create_in_place(void *place, size_t size,
+ int tranche_id, dsm_segment *segment)
+{
+ dsa_area *area;
+
+ area = create_internal(place, size, tranche_id,
+ DSM_HANDLE_INVALID, NULL);
+
+ /*
+ * Clean up when the control segment detaches, if a containing DSM segment
+ * was provided.
+ */
+ if (segment != NULL)
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(place));
+
+ return area;
+}
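+
+/*
+ * For illustration, a caller that already owns a DSM segment might carve a
+ * DSA area out of it roughly like this ('my_segment', 'tranche_id' and
+ * 'size', the number of bytes set aside at 'place', are assumed to be
+ * supplied by the caller):
+ *
+ *    void *place = dsm_segment_address(my_segment);
+ *    dsa_area *area = dsa_create_in_place(place, size, tranche_id,
+ *                                         my_segment);
+ *
+ *    dsa_set_size_limit(area, size);    (keep the area inside the space)
+ */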
+
+/*
+ * Obtain a handle that can be passed to other processes so that they can
+ * attach to the given area. Cannot be called for areas created with
+ * dsa_create_in_place.
+ */
+dsa_handle
+dsa_get_handle(dsa_area *area)
+{
+ Assert(area->control->handle != DSA_HANDLE_INVALID);
+ return area->control->handle;
+}
+
+/*
+ * Attach to an area given a handle generated (possibly in another process) by
+ * dsa_get_handle. The area must have been created with dsa_create (not
+ * dsa_create_in_place).
+ */
+dsa_area *
+dsa_attach(dsa_handle handle)
+{
+ dsm_segment *segment;
+ dsa_area *area;
+
+ /*
+ * An area handle is really a DSM segment handle for the first segment, so
+ * we go ahead and attach to that.
+ */
+ segment = dsm_attach(handle);
+ if (segment == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to dynamic shared area")));
+
+ area = attach_internal(dsm_segment_address(segment), segment, handle);
+
+ /* Clean up when the control segment detaches. */
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(dsm_segment_address(segment)));
+
+ return area;
+}
+
+/*
+ * Attach to an area that was created with dsa_create_in_place. The caller
+ * must somehow know the location in memory that was used when the area was
+ * created, though it may be mapped at a different virtual address in this
+ * process.
+ *
+ * See dsa_create_in_place for note about releasing in-place areas, and the
+ * optional 'segment' argument which can be provided to allow automatic
+ * release if the containing memory happens to be a DSM segment.
+ */
+dsa_area *
+dsa_attach_in_place(void *place, dsm_segment *segment)
+{
+ dsa_area *area;
+
+ area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
+
+ /*
+ * Clean up when the control segment detaches, if a containing DSM segment
+ * was provided.
+ */
+ if (segment != NULL)
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(place));
+
+ return area;
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place. The 'segment' argument is ignored but provides an
+ * interface suitable for on_dsm_detach, for the convenience of users who want
+ * to create a DSA segment inside an existing DSM segment and have it
+ * automatically released when the containing DSM segment is detached.
+ * 'place' should be the address of the place where the area was created.
+ *
+ * This callback is automatically registered for the DSM segment containing
+ * the control object of in-place areas when a segment is provided to
+ * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
+ * with dsa_create.
+ */
+void
+dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
+{
+ dsa_release_in_place(DatumGetPointer(place));
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place. The 'code' argument is ignored but provides an
+ * interface suitable for on_shmem_exit or before_shmem_exit, for the
+ * convenience of users who want to create a DSA segment inside shared memory
+ * other than a DSM segment and have it automatically release at backend exit.
+ * 'place' should be the address of the place where the area was created.
+ */
+void
+dsa_on_shmem_exit_release_in_place(int code, Datum place)
+{
+ dsa_release_in_place(DatumGetPointer(place));
+}
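+
+/*
+ * For illustration, an area created in plain (non-DSM) shared memory can
+ * arrange its own cleanup at backend exit with:
+ *
+ *    before_shmem_exit(dsa_on_shmem_exit_release_in_place,
+ *                      PointerGetDatum(place));
+ */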
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
+ * callbacks so that this is managed automatically, because failure to release
+ * an area created in-place leaks its segments permanently.
+ *
+ * This is also called automatically for areas produced by dsa_create or
+ * dsa_attach as an implementation detail.
+ */
+void
+dsa_release_in_place(void *place)
+{
+ dsa_area_control *control = (dsa_area_control *) place;
+ int i;
+
+ LWLockAcquire(&control->lock, LW_EXCLUSIVE);
+ Assert(control->segment_header.magic ==
+ (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
+ Assert(control->refcnt > 0);
+ if (--control->refcnt == 0)
+ {
+ for (i = 0; i <= control->high_segment_index; ++i)
+ {
+ dsm_handle handle;
+
+ handle = control->segment_handles[i];
+ if (handle != DSM_HANDLE_INVALID)
+ dsm_unpin_segment(handle);
+ }
+ }
+ LWLockRelease(&control->lock);
+}
+
+/*
+ * Keep a DSA area attached until end of session or explicit detach.
+ *
+ * By default, areas are owned by the current resource owner, which means they
+ * are detached automatically when that scope ends.
+ */
+void
+dsa_pin_mapping(dsa_area *area)
+{
+ int i;
+
+ Assert(!area->mapping_pinned);
+ area->mapping_pinned = true;
+
+ for (i = 0; i <= area->high_segment_index; ++i)
+ if (area->segment_maps[i].segment != NULL)
+ dsm_pin_mapping(area->segment_maps[i].segment);
+}
+
+/*
+ * Allocate memory in this storage area. The return value is a dsa_pointer
+ * that can be passed to other processes, and converted to a local pointer
+ * with dsa_get_address. 'flags' is a bitmap which should be constructed
+ * from the following values:
+ *
+ * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
+ * will result in an ERROR.
+ *
+ * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
+ * no memory is available or a size limit established by dsa_set_size_limit
+ * would be exceeded. Otherwise, such allocations will result in an ERROR.
+ *
+ * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
+ * contents of newly-allocated memory are indeterminate.
+ *
+ * These flags correspond to similarly named flags used by
+ * MemoryContextAllocExtended(). See also the macros dsa_allocate and
+ * dsa_allocate0 which expand to a call to this function with commonly used
+ * flags.
+ */
+dsa_pointer
+dsa_allocate_extended(dsa_area *area, size_t size, int flags)
+{
+ uint16 size_class;
+ dsa_pointer start_pointer;
+ dsa_segment_map *segment_map;
+ dsa_pointer result;
+
+ Assert(size > 0);
+
+ /* Sanity check on huge individual allocation size. */
+ if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
+ ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
+ elog(ERROR, "invalid DSA memory alloc request size %zu", size);
+
+ /*
+ * If bigger than the largest size class, just grab a run of pages from
+ * the free page manager, instead of allocating an object from a pool.
+ * There will still be a span, but it's a special class of span that
+ * manages this whole allocation and simply gives all pages back to the
+ * free page manager when dsa_free is called.
+ */
+ if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
+ {
+ size_t npages = fpm_size_to_pages(size);
+ size_t first_page;
+ dsa_pointer span_pointer;
+ dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
+
+ /* Obtain a span object. */
+ span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
+ if (!DsaPointerIsValid(span_pointer))
+ {
+ /* Raise error unless asked not to. */
+ if ((flags & DSA_ALLOC_NO_OOM) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.",
+ size)));
+ return InvalidDsaPointer;
+ }
+
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+
+ /* Find a segment from which to allocate. */
+ segment_map = get_best_segment(area, npages);
+ if (segment_map == NULL)
+ segment_map = make_new_segment(area, npages);
+ if (segment_map == NULL)
+ {
+ /* Can't make any more segments: game over. */
+ LWLockRelease(DSA_AREA_LOCK(area));
+ dsa_free(area, span_pointer);
+
+ /* Raise error unless asked not to. */
+ if ((flags & DSA_ALLOC_NO_OOM) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.",
+ size)));
+ return InvalidDsaPointer;
+ }
+
+ /*
+ * Ask the free page manager for a run of pages. This should always
+ * succeed, since both get_best_segment and make_new_segment should
+ * only return a non-NULL pointer if it actually contains enough
+ * contiguous freespace. If it does fail, something in our backend
+ * private state is out of whack, so use FATAL to kill the process.
+ */
+ if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
+ elog(FATAL,
+ "dsa_allocate could not find %zu free pages", npages);
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
+ first_page * FPM_PAGE_SIZE);
+
+ /* Initialize span and pagemap. */
+ LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
+ LW_EXCLUSIVE);
+ init_span(area, span_pointer, pool, start_pointer, npages,
+ DSA_SCLASS_SPAN_LARGE);
+ segment_map->pagemap[first_page] = span_pointer;
+ LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
+
+ /* Zero-initialize the memory if requested. */
+ if ((flags & DSA_ALLOC_ZERO) != 0)
+ memset(dsa_get_address(area, start_pointer), 0, size);
+
+ return start_pointer;
+ }
+
+ /* Map allocation to a size class. */
+ if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
+ {
+ int mapidx;
+
+ /* For smaller sizes we have a lookup table... */
+ mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
+ DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
+ size_class = dsa_size_class_map[mapidx];
+ }
+ else
+ {
+ uint16 min;
+ uint16 max;
+
+ /* ... and for the rest we search by binary chop. */
+ min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
+ max = lengthof(dsa_size_classes) - 1;
+
+ while (min < max)
+ {
+ uint16 mid = (min + max) / 2;
+ uint16 class_size = dsa_size_classes[mid];
+
+ if (class_size < size)
+ min = mid + 1;
+ else
+ max = mid;
+ }
+
+ size_class = min;
+ }
+ Assert(size <= dsa_size_classes[size_class]);
+ Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
+
+ /* Attempt to allocate an object from the appropriate pool. */
+ result = alloc_object(area, size_class);
+
+ /* Check for failure to allocate. */
+ if (!DsaPointerIsValid(result))
+ {
+ /* Raise error unless asked not to. */
+ if ((flags & DSA_ALLOC_NO_OOM) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.", size)));
+ return InvalidDsaPointer;
+ }
+
+ /* Zero-initialize the memory if requested. */
+ if ((flags & DSA_ALLOC_ZERO) != 0)
+ memset(dsa_get_address(area, result), 0, size);
+
+ return result;
+}
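+
+/*
+ * For illustration, a caller that wants zeroed memory and graceful handling
+ * of allocation failure might write ('area' is assumed to exist already):
+ *
+ *    dsa_pointer dp = dsa_allocate_extended(area, 1024,
+ *                                           DSA_ALLOC_ZERO |
+ *                                           DSA_ALLOC_NO_OOM);
+ *
+ *    if (!DsaPointerIsValid(dp))
+ *        ... fall back or report out of memory ...
+ *    else
+ *    {
+ *        char *p = dsa_get_address(area, dp);    (backend-local pointer)
+ *        ... use p, possibly hand dp to other backends ...
+ *        dsa_free(area, dp);
+ *    }
+ */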
+
+/*
+ * Free memory obtained with dsa_allocate.
+ */
+void
+dsa_free(dsa_area *area, dsa_pointer dp)
+{
+ dsa_segment_map *segment_map;
+ int pageno;
+ dsa_pointer span_pointer;
+ dsa_area_span *span;
+ char *superblock;
+ char *object;
+ size_t size;
+ int size_class;
+
+ /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
+ check_for_freed_segments(area);
+
+ /* Locate the object, span and pool. */
+ segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
+ pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
+ span_pointer = segment_map->pagemap[pageno];
+ span = dsa_get_address(area, span_pointer);
+ superblock = dsa_get_address(area, span->start);
+ object = dsa_get_address(area, dp);
+ size_class = span->size_class;
+ if (size_class >= lengthof(dsa_size_classes))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid span size"),
+ errdetail("Invalid span->size_class value %zu, but dsa_size_classes size is %zu.", size_class, lengthof(dsa_size_classes))));
+ }
+ size = dsa_size_classes[size_class];
+
+ /*
+ * Special case for large objects that live in a special span: we return
+ * those pages directly to the free page manager and free the span.
+ */
+ if (span->size_class == DSA_SCLASS_SPAN_LARGE)
+ {
+
+#ifdef CLOBBER_FREED_MEMORY
+ memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
+#endif
+
+ /* Give pages back to free page manager. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ FreePageManagerPut(segment_map->fpm,
+ DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
+ span->npages);
+
+ /* Move segment to appropriate bin if necessary. */
+ rebin_segment(area, segment_map);
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ /* Unlink span. */
+ LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
+ LW_EXCLUSIVE);
+ unlink_span(area, span);
+ LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
+ /* Free the span object so it can be reused. */
+ dsa_free(area, span_pointer);
+ return;
+ }
+
+#ifdef CLOBBER_FREED_MEMORY
+ memset(object, 0x7f, size);
+#endif
+
+ LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+ /* Put the object on the span's freelist. */
+ Assert(object >= superblock);
+ Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
+ Assert((object - superblock) % size == 0);
+ NextFreeObjectIndex(object) = span->firstfree;
+ span->firstfree = (object - superblock) / size;
+ ++span->nallocatable;
+
+ /*
+ * See if the span needs to be moved to a different fullness class, or be
+ * freed so its pages can be given back to the segment.
+ */
+ if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
+ {
+ /*
+ * The block was completely full and is located in the
+ * highest-numbered fullness class, which is never scanned for free
+ * chunks. We must move it to the next-lower fullness class.
+ */
+ unlink_span(area, span);
+ add_span_to_fullness_class(area, span, span_pointer,
+ DSA_FULLNESS_CLASSES - 2);
+
+ /*
+ * If this is the only span, and there is no active span, then we
+ * should probably move this span to fullness class 1. (Otherwise if
+ * you allocate exactly all the objects in the only span, it moves to
+ * class 3, then you free them all, it moves to 2, and then is given
+ * back, leaving no active span).
+ */
+ }
+ else if (span->nallocatable == span->nmax &&
+ (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
+ {
+ /*
+ * This entire block is free, and it's not the active block for this
+ * size class. Return the memory to the free page manager. We don't
+ * do this for the active block to prevent hysteresis: if we
+ * repeatedly allocate and free the only chunk in the active block, it
+ * will be very inefficient if we deallocate and reallocate the block
+ * every time.
+ */
+ destroy_superblock(area, span_pointer);
+ }
+
+ LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+}
+
+/*
+ * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
+ * memory allocated by the given area (possibly in another process) that
+ * hasn't yet been freed. This may cause a segment to be mapped into the
+ * current process if required, and may cause freed segments to be unmapped.
+ */
+void *
+dsa_get_address(dsa_area *area, dsa_pointer dp)
+{
+ dsa_segment_index index;
+ size_t offset;
+
+ /* Convert InvalidDsaPointer to NULL. */
+ if (!DsaPointerIsValid(dp))
+ return NULL;
+
+ /* Process any requests to detach from freed segments. */
+ check_for_freed_segments(area);
+
+ /* Break the dsa_pointer into its components. */
+ index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
+ offset = DSA_EXTRACT_OFFSET(dp);
+ Assert(index < DSA_MAX_SEGMENTS);
+
+ /* Check if we need to cause this segment to be mapped in. */
+ if (unlikely(area->segment_maps[index].mapped_address == NULL))
+ {
+ /* Call for effect (we don't need the result). */
+ get_segment_by_index(area, index);
+ }
+
+ return area->segment_maps[index].mapped_address + offset;
+}
+
+/*
+ * Pin this area, so that it will continue to exist even if all backends
+ * detach from it. In that case, the area can still be reattached to if a
+ * handle has been recorded somewhere.
+ */
+void
+dsa_pin(dsa_area *area)
+{
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ if (area->control->pinned)
+ {
+ LWLockRelease(DSA_AREA_LOCK(area));
+ elog(ERROR, "dsa_area already pinned");
+ }
+ area->control->pinned = true;
+ ++area->control->refcnt;
+ LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Undo the effects of dsa_pin, so that the given area can be freed when no
+ * backends are attached to it. May be called only if dsa_pin has been
+ * called.
+ */
+void
+dsa_unpin(dsa_area *area)
+{
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ Assert(area->control->refcnt > 1);
+ if (!area->control->pinned)
+ {
+ LWLockRelease(DSA_AREA_LOCK(area));
+ elog(ERROR, "dsa_area not pinned");
+ }
+ area->control->pinned = false;
+ --area->control->refcnt;
+ LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Set the total size limit for this area. This limit is checked whenever new
+ * segments need to be allocated from the operating system. If the new size
+ * limit is already exceeded, this has no immediate effect.
+ *
+ * Note that the total virtual memory usage may be temporarily larger than
+ * this limit when segments have been freed, but not yet detached by all
+ * backends that have attached to them.
+ */
+void
+dsa_set_size_limit(dsa_area *area, size_t limit)
+{
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ area->control->max_total_segment_size = limit;
+ LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Aggressively free all spare memory in the hope of returning DSM segments to
+ * the operating system.
+ */
+void
+dsa_trim(dsa_area *area)
+{
+ int size_class;
+
+ /*
+ * Trim in reverse pool order so we get to the spans-of-spans last, just
+ * in case any become entirely free while processing all the other pools.
+ */
+ for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
+ {
+ dsa_area_pool *pool = &area->control->pools[size_class];
+ dsa_pointer span_pointer;
+
+ if (size_class == DSA_SCLASS_SPAN_LARGE)
+ {
+ /* Large object frees give back segments aggressively already. */
+ continue;
+ }
+
+ /*
+ * Search fullness class 1 only. That is where we expect to find an
+ * entirely empty superblock (entirely empty superblocks in other
+ * fullness classes are returned to the free page map by dsa_free).
+ */
+ LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+ span_pointer = pool->spans[1];
+ while (DsaPointerIsValid(span_pointer))
+ {
+ dsa_area_span *span = dsa_get_address(area, span_pointer);
+ dsa_pointer next = span->nextspan;
+
+ if (span->nallocatable == span->nmax)
+ destroy_superblock(area, span_pointer);
+
+ span_pointer = next;
+ }
+ LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+ }
+}
+
+/*
+ * Print out debugging information about the internal state of the shared
+ * memory area.
+ */
+void
+dsa_dump(dsa_area *area)
+{
+ size_t i,
+ j;
+
+ /*
+ * Note: This gives an inconsistent snapshot as it acquires and releases
+ * individual locks as it goes...
+ */
+
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ check_for_freed_segments_locked(area);
+ fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
+ fprintf(stderr, " max_total_segment_size: %zu\n",
+ area->control->max_total_segment_size);
+ fprintf(stderr, " total_segment_size: %zu\n",
+ area->control->total_segment_size);
+ fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
+ fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
+ fprintf(stderr, " segment bins:\n");
+ for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
+ {
+ if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_index segment_index;
+
+ if (i == 0)
+ fprintf(stderr,
+ " segment bin %zu (no contiguous free pages):\n", i);
+ else
+ fprintf(stderr,
+ " segment bin %zu (at least %d contiguous pages free):\n",
+ i, 1 << (i - 1));
+ segment_index = area->control->segment_bins[i];
+ while (segment_index != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *segment_map;
+
+ segment_map =
+ get_segment_by_index(area, segment_index);
+
+ fprintf(stderr,
+ " segment index %zu, usable_pages = %zu, "
+ "contiguous_pages = %zu, mapped at %p\n",
+ segment_index,
+ segment_map->header->usable_pages,
+ fpm_largest(segment_map->fpm),
+ segment_map->mapped_address);
+ segment_index = segment_map->header->next;
+ }
+ }
+ }
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ fprintf(stderr, " pools:\n");
+ for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
+ {
+ bool found = false;
+
+ LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
+ for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
+ if (DsaPointerIsValid(area->control->pools[i].spans[j]))
+ found = true;
+ if (found)
+ {
+ if (i == DSA_SCLASS_BLOCK_OF_SPANS)
+ fprintf(stderr, " pool for blocks of span objects:\n");
+ else if (i == DSA_SCLASS_SPAN_LARGE)
+ fprintf(stderr, " pool for large object spans:\n");
+ else
+ fprintf(stderr,
+ " pool for size class %zu (object size %hu bytes):\n",
+ i, dsa_size_classes[i]);
+ for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
+ {
+ if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
+ fprintf(stderr, " fullness class %zu is empty\n", j);
+ else
+ {
+ dsa_pointer span_pointer = area->control->pools[i].spans[j];
+
+ fprintf(stderr, " fullness class %zu:\n", j);
+ while (DsaPointerIsValid(span_pointer))
+ {
+ dsa_area_span *span;
+
+ span = dsa_get_address(area, span_pointer);
+ fprintf(stderr,
+ " span descriptor at "
+ DSA_POINTER_FORMAT ", superblock at "
+ DSA_POINTER_FORMAT
+ ", pages = %zu, objects free = %hu/%hu\n",
+ span_pointer, span->start, span->npages,
+ span->nallocatable, span->nmax);
+ span_pointer = span->nextspan;
+ }
+ }
+ }
+ }
+ LWLockRelease(DSA_SCLASS_LOCK(area, i));
+ }
+}
+
+/*
+ * Return the smallest size that you can successfully provide to
+ * dsa_create_in_place.
+ */
+size_t
+dsa_minimum_size(void)
+{
+ size_t size;
+ int pages = 0;
+
+ size = MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager));
+
+ /* Figure out how many pages we need, including the page map... */
+ while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
+ {
+ ++pages;
+ size += sizeof(dsa_pointer);
+ }
+
+ return pages * FPM_PAGE_SIZE;
+}
+
+/*
+ * Workhorse function for dsa_create and dsa_create_in_place.
+ */
+static dsa_area *
+create_internal(void *place, size_t size,
+ int tranche_id,
+ dsm_handle control_handle,
+ dsm_segment *control_segment)
+{
+ dsa_area_control *control;
+ dsa_area *area;
+ dsa_segment_map *segment_map;
+ size_t usable_pages;
+ size_t total_pages;
+ size_t metadata_bytes;
+ int i;
+
+ /* Sanity check on the space we have to work in. */
+ if (size < dsa_minimum_size())
+ elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
+ dsa_minimum_size(), size);
+
+ /* Now figure out how much space is usable */
+ total_pages = size / FPM_PAGE_SIZE;
+ metadata_bytes =
+ MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager)) +
+ total_pages * sizeof(dsa_pointer);
+ /* Add padding up to next page boundary. */
+ if (metadata_bytes % FPM_PAGE_SIZE != 0)
+ metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+ Assert(metadata_bytes <= size);
+ usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
+
+ /*
+ * Initialize the dsa_area_control object located at the start of the
+ * space.
+ */
+ control = (dsa_area_control *) place;
+ memset(place, 0, sizeof(*control));
+ control->segment_header.magic =
+ DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
+ control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
+ control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
+ control->segment_header.usable_pages = usable_pages;
+ control->segment_header.freed = false;
+ control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
+ control->handle = control_handle;
+ control->max_total_segment_size = (size_t) -1;
+ control->total_segment_size = size;
+ control->segment_handles[0] = control_handle;
+ for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
+ control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
+ control->refcnt = 1;
+ control->lwlock_tranche_id = tranche_id;
+
+ /*
+ * Create the dsa_area object that this backend will use to access the
+ * area. Other backends will need to obtain their own dsa_area object by
+ * attaching.
+ */
+ area = palloc(sizeof(dsa_area));
+ area->control = control;
+ area->mapping_pinned = false;
+ memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
+ area->high_segment_index = 0;
+ area->freed_segment_counter = 0;
+ LWLockInitialize(&control->lock, control->lwlock_tranche_id);
+ for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
+ LWLockInitialize(DSA_SCLASS_LOCK(area, i),
+ control->lwlock_tranche_id);
+
+ /* Set up the segment map for this process's mapping. */
+ segment_map = &area->segment_maps[0];
+ segment_map->segment = control_segment;
+ segment_map->mapped_address = place;
+ segment_map->header = (dsa_segment_header *) place;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_area_control)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Set up the free page map. */
+ FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+ /* There can be 0 usable pages if size is dsa_minimum_size(). */
+
+ if (usable_pages > 0)
+ FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+ usable_pages);
+
+ /* Put this segment into the appropriate bin. */
+ control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
+ segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+
+ return area;
+}
+
+/*
+ * Workhorse function for dsa_attach and dsa_attach_in_place.
+ */
+static dsa_area *
+attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
+{
+ dsa_area_control *control;
+ dsa_area *area;
+ dsa_segment_map *segment_map;
+
+ control = (dsa_area_control *) place;
+ Assert(control->handle == handle);
+ Assert(control->segment_handles[0] == handle);
+ Assert(control->segment_header.magic ==
+ (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
+
+ /* Build the backend-local area object. */
+ area = palloc(sizeof(dsa_area));
+ area->control = control;
+ area->mapping_pinned = false;
+ memset(&area->segment_maps[0], 0,
+ sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
+ area->high_segment_index = 0;
+
+ /* Set up the segment map for this process's mapping. */
+ segment_map = &area->segment_maps[0];
+ segment_map->segment = segment; /* NULL for in-place */
+ segment_map->mapped_address = place;
+ segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Bump the reference count. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ if (control->refcnt == 0)
+ {
+ /* We can't attach to a DSA area that has already been destroyed. */
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to dynamic shared area")));
+ }
+ ++control->refcnt;
+ area->freed_segment_counter = area->control->freed_segment_counter;
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ return area;
+}
+
+/*
+ * Add a new span to fullness class 1 of the indicated pool.
+ */
+static void
+init_span(dsa_area *area,
+ dsa_pointer span_pointer,
+ dsa_area_pool *pool, dsa_pointer start, size_t npages,
+ uint16 size_class)
+{
+ dsa_area_span *span = dsa_get_address(area, span_pointer);
+ size_t obsize = dsa_size_classes[size_class];
+
+ /*
+ * The per-pool lock must be held because we manipulate the span list for
+ * this pool.
+ */
+ Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+ /* Push this span onto the front of the span list for fullness class 1. */
+ if (DsaPointerIsValid(pool->spans[1]))
+ {
+ dsa_area_span *head = (dsa_area_span *)
+ dsa_get_address(area, pool->spans[1]);
+
+ head->prevspan = span_pointer;
+ }
+ span->pool = DsaAreaPoolToDsaPointer(area, pool);
+ span->nextspan = pool->spans[1];
+ span->prevspan = InvalidDsaPointer;
+ pool->spans[1] = span_pointer;
+
+ span->start = start;
+ span->npages = npages;
+ span->size_class = size_class;
+ span->ninitialized = 0;
+ if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+ {
+ /*
+ * A block-of-spans contains its own descriptor, so mark one object as
+ * initialized and reduce the count of allocatable objects by one.
+ * Doing this here has the side effect of also reducing nmax by one,
+ * which is important to make sure we free this object at the correct
+ * time.
+ */
+ span->ninitialized = 1;
+ span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
+ }
+ else if (size_class != DSA_SCLASS_SPAN_LARGE)
+ span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
+ span->firstfree = DSA_SPAN_NOTHING_FREE;
+ span->nmax = span->nallocatable;
+ span->fclass = 1;
+}
+
+/*
+ * Transfer the first span in one fullness class to the head of another
+ * fullness class.
+ */
+static bool
+transfer_first_span(dsa_area *area,
+ dsa_area_pool *pool, int fromclass, int toclass)
+{
+ dsa_pointer span_pointer;
+ dsa_area_span *span;
+ dsa_area_span *nextspan;
+
+ /* Can't do it if source list is empty. */
+ span_pointer = pool->spans[fromclass];
+ if (!DsaPointerIsValid(span_pointer))
+ return false;
+
+ /* Remove span from head of source list. */
+ span = dsa_get_address(area, span_pointer);
+ pool->spans[fromclass] = span->nextspan;
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ nextspan->prevspan = InvalidDsaPointer;
+ }
+
+ /* Add span to head of target list. */
+ span->nextspan = pool->spans[toclass];
+ pool->spans[toclass] = span_pointer;
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ nextspan->prevspan = span_pointer;
+ }
+ span->fclass = toclass;
+
+ return true;
+}
+
+/*
+ * Allocate one object of the requested size class from the given area.
+ */
+static inline dsa_pointer
+alloc_object(dsa_area *area, int size_class)
+{
+ dsa_area_pool *pool = &area->control->pools[size_class];
+ dsa_area_span *span;
+ dsa_pointer block;
+ dsa_pointer result;
+ char *object;
+ size_t size;
+
+ /*
+ * Even though ensure_active_superblock can in turn call alloc_object if
+ * it needs to allocate a new span, that's always from a different pool,
+ * and the order of lock acquisition is always the same, so it's OK that
+ * we hold this lock for the duration of this function.
+ */
+ Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+ LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+ /*
+ * If there's no active superblock, we must successfully obtain one or
+ * fail the request.
+ */
+ if (!DsaPointerIsValid(pool->spans[1]) &&
+ !ensure_active_superblock(area, pool, size_class))
+ {
+ result = InvalidDsaPointer;
+ }
+ else
+ {
+ /*
+ * There should be a block in fullness class 1 at this point, and it
+ * should never be completely full. Thus we can either pop an object
+ * from the free list or, failing that, initialize a new object.
+ */
+ Assert(DsaPointerIsValid(pool->spans[1]));
+ span = (dsa_area_span *)
+ dsa_get_address(area, pool->spans[1]);
+ Assert(span->nallocatable > 0);
+ block = span->start;
+ Assert(size_class < DSA_NUM_SIZE_CLASSES);
+ size = dsa_size_classes[size_class];
+ if (span->firstfree != DSA_SPAN_NOTHING_FREE)
+ {
+ result = block + span->firstfree * size;
+ object = dsa_get_address(area, result);
+ span->firstfree = NextFreeObjectIndex(object);
+ }
+ else
+ {
+ result = block + span->ninitialized * size;
+ ++span->ninitialized;
+ }
+ --span->nallocatable;
+
+ /* If it's now full, move it to the highest-numbered fullness class. */
+ if (span->nallocatable == 0)
+ transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
+ }
+
+ Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+ LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+
+ return result;
+}
+
+/*
+ * Ensure an active (i.e. fullness class 1) superblock, unless all existing
+ * superblocks are completely full and no more can be allocated.
+ *
+ * Fullness classes K of 0..N are loosely intended to represent blocks whose
+ * utilization percentage is at least K/N, but we only enforce this rigorously
+ * for the highest-numbered fullness class, which always contains exactly
+ * those blocks that are completely full. It's otherwise acceptable for a
+ * block to be in a higher-numbered fullness class than the one to which it
+ * logically belongs. In addition, the active block, which is always the
+ * first block in fullness class 1, is permitted to have a higher allocation
+ * percentage than would normally be allowable for that fullness class; we
+ * don't move it until it's completely full, and then it goes to the
+ * highest-numbered fullness class.
+ *
+ * It might seem odd that the active block is the head of fullness class 1
+ * rather than fullness class 0, but experience with other allocators has
+ * shown that it's usually better to allocate from a block that's moderately
+ * full rather than one that's nearly empty. Insofar as is reasonably
+ * possible, we want to avoid performing new allocations in a block that would
+ * otherwise become empty soon.
+ */
+static bool
+ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
+ int size_class)
+{
+ dsa_pointer span_pointer;
+ dsa_pointer start_pointer;
+ size_t obsize = dsa_size_classes[size_class];
+ size_t nmax;
+ int fclass;
+ size_t npages = 1;
+ size_t first_page;
+ size_t i;
+ dsa_segment_map *segment_map;
+
+ Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+ /*
+ * Compute the number of objects that will fit in a block of this size
+ * class. Span-of-spans blocks are just a single page, and the first
+ * object isn't available for use because it describes the block-of-spans
+ * itself.
+ */
+ if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+ nmax = FPM_PAGE_SIZE / obsize - 1;
+ else
+ nmax = DSA_SUPERBLOCK_SIZE / obsize;
+
+ /*
+ * If fullness class 1 is empty, try to find a span to put in it by
+ * scanning higher-numbered fullness classes (excluding the last one,
+ * whose blocks are certain to all be completely full).
+ */
+ for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
+ {
+ span_pointer = pool->spans[fclass];
+
+ while (DsaPointerIsValid(span_pointer))
+ {
+ int tfclass;
+ dsa_area_span *span;
+ dsa_area_span *nextspan;
+ dsa_area_span *prevspan;
+ dsa_pointer next_span_pointer;
+
+ span = (dsa_area_span *)
+ dsa_get_address(area, span_pointer);
+ next_span_pointer = span->nextspan;
+
+ /* Figure out what fullness class should contain this span. */
+ tfclass = (nmax - span->nallocatable)
+ * (DSA_FULLNESS_CLASSES - 1) / nmax;
+
+ /* Look up next span. */
+ if (DsaPointerIsValid(span->nextspan))
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ else
+ nextspan = NULL;
+
+ /*
+ * If utilization has dropped enough that this now belongs in some
+ * other fullness class, move it there.
+ */
+ if (tfclass < fclass)
+ {
+ /* Remove from the current fullness class list. */
+ if (pool->spans[fclass] == span_pointer)
+ {
+ /* It was the head; remove it. */
+ Assert(!DsaPointerIsValid(span->prevspan));
+ pool->spans[fclass] = span->nextspan;
+ if (nextspan != NULL)
+ nextspan->prevspan = InvalidDsaPointer;
+ }
+ else
+ {
+ /* It was not the head. */
+ Assert(DsaPointerIsValid(span->prevspan));
+ prevspan = (dsa_area_span *)
+ dsa_get_address(area, span->prevspan);
+ prevspan->nextspan = span->nextspan;
+ }
+ if (nextspan != NULL)
+ nextspan->prevspan = span->prevspan;
+
+ /* Push onto the head of the new fullness class list. */
+ span->nextspan = pool->spans[tfclass];
+ pool->spans[tfclass] = span_pointer;
+ span->prevspan = InvalidDsaPointer;
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ nextspan->prevspan = span_pointer;
+ }
+ span->fclass = tfclass;
+ }
+
+ /* Advance to next span on list. */
+ span_pointer = next_span_pointer;
+ }
+
+ /* Stop now if we found a suitable block. */
+ if (DsaPointerIsValid(pool->spans[1]))
+ return true;
+ }
+
+ /*
+ * If there are no blocks that properly belong in fullness class 1, pick
+ * one from some other fullness class and move it there anyway, so that we
+ * have an allocation target. Our last choice is to transfer a block
+ * that's almost empty (and might become completely empty soon if left
+ * alone), but even that is better than failing, which is what we must do
+ * if there are no blocks at all with freespace.
+ */
+ Assert(!DsaPointerIsValid(pool->spans[1]));
+ for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
+ if (transfer_first_span(area, pool, fclass, 1))
+ return true;
+ if (!DsaPointerIsValid(pool->spans[1]) &&
+ transfer_first_span(area, pool, 0, 1))
+ return true;
+
+ /*
+ * We failed to find an existing span with free objects, so we need to
+ * allocate a new superblock and construct a new span to manage it.
+ *
+ * First, get a dsa_area_span object to describe the new superblock
+ * ... unless this allocation is for a dsa_area_span object, in which case
+ * that's surely not going to work. We handle that case by storing the
+ * span describing a block-of-spans inline.
+ */
+ if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+ {
+ span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
+ if (!DsaPointerIsValid(span_pointer))
+ return false;
+ npages = DSA_PAGES_PER_SUPERBLOCK;
+ }
+
+ /* Find or create a segment and allocate the superblock. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ segment_map = get_best_segment(area, npages);
+ if (segment_map == NULL)
+ {
+ segment_map = make_new_segment(area, npages);
+ if (segment_map == NULL)
+ {
+ LWLockRelease(DSA_AREA_LOCK(area));
+ return false;
+ }
+ }
+
+ /*
+ * This shouldn't happen: get_best_segment() or make_new_segment()
+ * promised that we can successfully allocate npages.
+ */
+ if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
+ elog(FATAL,
+ "dsa_allocate could not find %zu free pages for superblock",
+ npages);
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ /* Compute the start of the superblock. */
+ start_pointer =
+ DSA_MAKE_POINTER(get_segment_index(area, segment_map),
+ first_page * FPM_PAGE_SIZE);
+
+ /*
+ * If this is a block-of-spans, carve the descriptor right out of the
+ * allocated space.
+ */
+ if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+ {
+ /*
+ * We have a pointer into the segment. We need to build a dsa_pointer
+ * from the segment index and offset into the segment.
+ */
+ span_pointer = start_pointer;
+ }
+
+ /* Initialize span and pagemap. */
+ init_span(area, span_pointer, pool, start_pointer, npages, size_class);
+ for (i = 0; i < npages; ++i)
+ segment_map->pagemap[first_page + i] = span_pointer;
+
+ return true;
+}
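Editorial aside on the fullness-class arithmetic above: the target class for a span is (nmax - span->nallocatable) * (DSA_FULLNESS_CLASSES - 1) / nmax, i.e. the span's utilization scaled to the classes below the completely-full one, truncated to an integer. The standalone sketch below reproduces that formula with made-up values (512 objects per superblock, 4 fullness classes); the real constants live elsewhere in dsa.c/dsa.h and may differ.

/*
 * Illustrative sketch only: same integer arithmetic as the tfclass
 * computation above, with assumed constants.
 */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_FULLNESS_CLASSES 4   /* assumed stand-in for DSA_FULLNESS_CLASSES */

static int
sketch_target_fclass(size_t nmax, size_t nallocatable)
{
    return (int) ((nmax - nallocatable) * (SKETCH_FULLNESS_CLASSES - 1) / nmax);
}

int
main(void)
{
    size_t nmax = 512;          /* assumed objects per superblock */
    size_t samples[] = {512, 400, 256, 100, 0};
    int i;

    for (i = 0; i < 5; ++i)
        printf("nallocatable=%zu -> fullness class %d\n",
               samples[i], sketch_target_fclass(nmax, samples[i]));
    /* Prints classes 0, 0, 1, 2 and 3 respectively. */
    return 0;
}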
+
+/*
+ * Return the segment map corresponding to a given segment index, mapping the
+ * segment in if necessary. For internal segment book-keeping, this is called
+ * with the area lock held. It is also called by dsa_free and dsa_get_address
+ * without any locking, relying on the fact they have a known live segment
+ * index and they always call check_for_freed_segments to ensure that any
+ * freed segment occupying the same slot is detached first.
+ */
+static dsa_segment_map *
+get_segment_by_index(dsa_area *area, dsa_segment_index index)
+{
+ if (unlikely(area->segment_maps[index].mapped_address == NULL))
+ {
+ dsm_handle handle;
+ dsm_segment *segment;
+ dsa_segment_map *segment_map;
+
+ /*
+ * If we are reached by dsa_free or dsa_get_address, there must be at
+ * least one object allocated in the referenced segment. Otherwise,
+ * their caller has a double-free or access-after-free bug, which we
+ * have no hope of detecting. So we know it's safe to access this
+ * array slot without holding a lock; it won't change underneath us.
+ * Furthermore, we know that we can see the latest contents of the
+ * slot, as explained in check_for_freed_segments, which those
+ * functions call before arriving here.
+ */
+ handle = area->control->segment_handles[index];
+
+ /* It's an error to try to access an unused slot. */
+ if (handle == DSM_HANDLE_INVALID)
+ elog(ERROR,
+ "dsa_area could not attach to a segment that has been freed");
+
+ segment = dsm_attach(handle);
+ if (segment == NULL)
+ elog(ERROR, "dsa_area could not attach to segment");
+ if (area->mapping_pinned)
+ dsm_pin_mapping(segment);
+ segment_map = &area->segment_maps[index];
+ segment_map->segment = segment;
+ segment_map->mapped_address = dsm_segment_address(segment);
+ segment_map->header =
+ (dsa_segment_header *) segment_map->mapped_address;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Remember the highest index this backend has ever mapped. */
+ if (area->high_segment_index < index)
+ area->high_segment_index = index;
+
+ Assert(segment_map->header->magic ==
+ (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
+ }
+
+ /*
+ * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
+ * but it's a bug in the calling code and undefined behavior if the
+ * address is not live (ie if the segment might possibly have been freed,
+ * they're trying to use a dangling pointer).
+ *
+ * For dsa.c code that holds the area lock to manipulate segment_bins
+ * lists, it would be a bug if we ever reach a freed segment here. After
+ * it's marked as freed, the only thing any backend should do with it is
+ * unmap it, and it should always have done that in
+ * check_for_freed_segments_locked() before arriving here to resolve an
+ * index to a segment_map.
+ *
+ * Either way we can assert that we aren't returning a freed segment.
+ */
+ Assert(!area->segment_maps[index].header->freed);
+
+ return &area->segment_maps[index];
+}
+
+/*
+ * Return a superblock to the free page manager. If the underlying segment
+ * has become entirely free, then return it to the operating system.
+ *
+ * The appropriate pool lock must be held.
+ */
+static void
+destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
+{
+ dsa_area_span *span = dsa_get_address(area, span_pointer);
+ int size_class = span->size_class;
+ dsa_segment_map *segment_map;
+
+
+ /* Remove it from its fullness class list. */
+ unlink_span(area, span);
+
+ /*
+ * Note: Here we acquire the area lock while we already hold a per-pool
+ * lock. We never hold the area lock and then take a pool lock, or we
+ * could deadlock.
+ */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ check_for_freed_segments_locked(area);
+ segment_map =
+ get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
+ FreePageManagerPut(segment_map->fpm,
+ DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
+ span->npages);
+ /* Check if the segment is now entirely free. */
+ if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
+ {
+ dsa_segment_index index = get_segment_index(area, segment_map);
+
+ /* If it's not the segment with extra control data, free it. */
+ if (index != 0)
+ {
+ /*
+ * Give it back to the OS, and allow other backends to detect that
+ * they need to detach.
+ */
+ unlink_segment(area, segment_map);
+ segment_map->header->freed = true;
+ Assert(area->control->total_segment_size >=
+ segment_map->header->size);
+ area->control->total_segment_size -=
+ segment_map->header->size;
+ dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
+ dsm_detach(segment_map->segment);
+ area->control->segment_handles[index] = DSM_HANDLE_INVALID;
+ ++area->control->freed_segment_counter;
+ segment_map->segment = NULL;
+ segment_map->header = NULL;
+ segment_map->mapped_address = NULL;
+ }
+ }
+
+ /* Move segment to appropriate bin if necessary. */
+ if (segment_map->header != NULL)
+ rebin_segment(area, segment_map);
+
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ /*
+ * Span-of-spans blocks store the span which describes them within the
+ * block itself, so freeing the storage implicitly frees the descriptor
+ * also. If this is a block of any other type, we need to separately free
+ * the span object also. This recursive call to dsa_free will acquire the
+ * span pool's lock. We can't deadlock because the acquisition order is
+ * always some other pool and then the span pool.
+ */
+ if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+ dsa_free(area, span_pointer);
+}
+
+static void
+unlink_span(dsa_area *area, dsa_area_span *span)
+{
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ dsa_area_span *next = dsa_get_address(area, span->nextspan);
+
+ next->prevspan = span->prevspan;
+ }
+ if (DsaPointerIsValid(span->prevspan))
+ {
+ dsa_area_span *prev = dsa_get_address(area, span->prevspan);
+
+ prev->nextspan = span->nextspan;
+ }
+ else
+ {
+ dsa_area_pool *pool = dsa_get_address(area, span->pool);
+
+ pool->spans[span->fclass] = span->nextspan;
+ }
+}
+
+static void
+add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
+ dsa_pointer span_pointer,
+ int fclass)
+{
+ dsa_area_pool *pool = dsa_get_address(area, span->pool);
+
+ if (DsaPointerIsValid(pool->spans[fclass]))
+ {
+ dsa_area_span *head = dsa_get_address(area,
+ pool->spans[fclass]);
+
+ head->prevspan = span_pointer;
+ }
+ span->prevspan = InvalidDsaPointer;
+ span->nextspan = pool->spans[fclass];
+ pool->spans[fclass] = span_pointer;
+ span->fclass = fclass;
+}
+
+/*
+ * Detach from an area that was either created or attached to by this process.
+ */
+void
+dsa_detach(dsa_area *area)
+{
+ int i;
+
+ /* Detach from all segments. */
+ for (i = 0; i <= area->high_segment_index; ++i)
+ if (area->segment_maps[i].segment != NULL)
+ dsm_detach(area->segment_maps[i].segment);
+
+ /*
+ * Note that 'detaching' (= detaching from DSM segments) doesn't include
+ * 'releasing' (= adjusting the reference count). It would be nice to
+ * combine these operations, but client code might never get around to
+ * calling dsa_detach because of an error path, and a detach hook on any
+ * particular segment is too late to detach other segments in the area
+ * without risking a 'leak' warning in the non-error path.
+ */
+
+ /* Free the backend-local area object. */
+ pfree(area);
+}
+
+/*
+ * Unlink a segment from the bin that contains it.
+ */
+static void
+unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
+{
+ if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *prev;
+
+ prev = get_segment_by_index(area, segment_map->header->prev);
+ prev->header->next = segment_map->header->next;
+ }
+ else
+ {
+ Assert(area->control->segment_bins[segment_map->header->bin] ==
+ get_segment_index(area, segment_map));
+ area->control->segment_bins[segment_map->header->bin] =
+ segment_map->header->next;
+ }
+ if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *next;
+
+ next = get_segment_by_index(area, segment_map->header->next);
+ next->header->prev = segment_map->header->prev;
+ }
+}
+
+/*
+ * Find a segment that could satisfy a request for 'npages' of contiguous
+ * memory, or return NULL if none can be found. This may involve attaching to
+ * segments that weren't previously attached so that we can query their free
+ * pages map.
+ */
+static dsa_segment_map *
+get_best_segment(dsa_area *area, size_t npages)
+{
+ size_t bin;
+
+ Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+ check_for_freed_segments_locked(area);
+
+ /*
+ * Start searching from the first bin that *might* have enough contiguous
+ * pages.
+ */
+ for (bin = contiguous_pages_to_segment_bin(npages);
+ bin < DSA_NUM_SEGMENT_BINS;
+ ++bin)
+ {
+ /*
+ * The minimum contiguous size that any segment in this bin should
+ * have. We'll re-bin if we see segments with fewer.
+ */
+ size_t threshold = (size_t) 1 << (bin - 1);
+ dsa_segment_index segment_index;
+
+ /* Search this bin for a segment with enough contiguous space. */
+ segment_index = area->control->segment_bins[bin];
+ while (segment_index != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *segment_map;
+ dsa_segment_index next_segment_index;
+ size_t contiguous_pages;
+
+ segment_map = get_segment_by_index(area, segment_index);
+ next_segment_index = segment_map->header->next;
+ contiguous_pages = fpm_largest(segment_map->fpm);
+
+ /* Not enough for the request, still enough for this bin. */
+ if (contiguous_pages >= threshold && contiguous_pages < npages)
+ {
+ segment_index = next_segment_index;
+ continue;
+ }
+
+ /* Re-bin it if it's no longer in the appropriate bin. */
+ if (contiguous_pages < threshold)
+ {
+ rebin_segment(area, segment_map);
+
+ /*
+ * But fall through to see if it's enough to satisfy this
+ * request anyway....
+ */
+ }
+
+ /* Check if we are done. */
+ if (contiguous_pages >= npages)
+ return segment_map;
+
+ /* Continue searching the same bin. */
+ segment_index = next_segment_index;
+ }
+ }
+
+ /* Not found. */
+ return NULL;
+}
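Editorial aside: the threshold (size_t) 1 << (bin - 1) used above implies that bin k (for k > 0) is expected to hold segments whose largest contiguous free run is at least 2^(k-1) pages. A minimal sketch of such a mapping follows; the bit-scan loop and the bin cap are illustrative assumptions, not the actual contiguous_pages_to_segment_bin() helper used by dsa.c.

/*
 * Sketch only: map a largest-contiguous-run size to a segment bin so that
 * bin k (k > 0) holds runs of at least 2^(k-1) pages.
 */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_NUM_SEGMENT_BINS 16  /* assumed stand-in for DSA_NUM_SEGMENT_BINS */

static size_t
sketch_contiguous_pages_to_bin(size_t npages)
{
    size_t bin = 0;

    /* Position of the highest set bit, plus one (0 pages -> bin 0). */
    while (npages > 0)
    {
        ++bin;
        npages >>= 1;
    }

    /* Clamp so very large runs still land in the last bin. */
    if (bin >= SKETCH_NUM_SEGMENT_BINS)
        bin = SKETCH_NUM_SEGMENT_BINS - 1;
    return bin;
}

int
main(void)
{
    size_t n;

    for (n = 1; n <= 16; n *= 2)
        printf("largest run %zu pages -> bin %zu\n",
               n, sketch_contiguous_pages_to_bin(n));
    /* Prints bins 1, 2, 3, 4, 5 for runs of 1, 2, 4, 8, 16 pages. */
    return 0;
}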
+
+/*
+ * Create a new segment that can handle at least requested_pages. Returns
+ * NULL if the requested total size limit or maximum allowed number of
+ * segments would be exceeded.
+ */
+static dsa_segment_map *
+make_new_segment(dsa_area *area, size_t requested_pages)
+{
+ dsa_segment_index new_index;
+ size_t metadata_bytes;
+ size_t total_size;
+ size_t total_pages;
+ size_t usable_pages;
+ dsa_segment_map *segment_map;
+ dsm_segment *segment;
+
+ Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+
+ /* Find a segment slot that is not in use (linearly for now). */
+ for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
+ {
+ if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
+ break;
+ }
+ if (new_index == DSA_MAX_SEGMENTS)
+ return NULL;
+
+ /*
+ * If the total size limit is already exceeded, then we exit early and
+ * avoid arithmetic wraparound in the unsigned expressions below.
+ */
+ if (area->control->total_segment_size >=
+ area->control->max_total_segment_size)
+ return NULL;
+
+ /*
+ * The size should be at least as big as requested, and at least big
+ * enough to follow a geometric series that approximately doubles the
+ * total storage each time we create a new segment. We use geometric
+ * growth because the underlying DSM system isn't designed for large
+ * numbers of segments (otherwise we might even consider just using one
+ * DSM segment for each large allocation and for each superblock, and then
+ * we wouldn't need to use FreePageManager).
+ *
+ * We decide on a total segment size first, so that we produce tidy
+ * power-of-two sized segments. This is a good property to have if we
+ * move to huge pages in the future. Then we work back to the number of
+ * pages we can fit.
+ */
+ total_size = DSA_INITIAL_SEGMENT_SIZE *
+ ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
+ total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
+ total_size = Min(total_size,
+ area->control->max_total_segment_size -
+ area->control->total_segment_size);
+
+ total_pages = total_size / FPM_PAGE_SIZE;
+ metadata_bytes =
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)) +
+ sizeof(dsa_pointer) * total_pages;
+
+ /* Add padding up to next page boundary. */
+ if (metadata_bytes % FPM_PAGE_SIZE != 0)
+ metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+ if (total_size <= metadata_bytes)
+ return NULL;
+ usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
+ Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
+
+ /* See if that is enough... */
+ if (requested_pages > usable_pages)
+ {
+ /*
+ * We'll make an odd-sized segment, working forward from the requested
+ * number of pages.
+ */
+ usable_pages = requested_pages;
+ metadata_bytes =
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)) +
+ usable_pages * sizeof(dsa_pointer);
+
+ /* Add padding up to next page boundary. */
+ if (metadata_bytes % FPM_PAGE_SIZE != 0)
+ metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+ total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
+
+ /* Is that too large for dsa_pointer's addressing scheme? */
+ if (total_size > DSA_MAX_SEGMENT_SIZE)
+ return NULL;
+
+ /* Would that exceed the limit? */
+ if (total_size > area->control->max_total_segment_size -
+ area->control->total_segment_size)
+ return NULL;
+ }
+
+ /* Create the segment. */
+ segment = dsm_create(total_size, 0);
+ if (segment == NULL)
+ return NULL;
+ dsm_pin_segment(segment);
+ if (area->mapping_pinned)
+ dsm_pin_mapping(segment);
+
+ /* Store the handle in shared memory to be found by index. */
+ area->control->segment_handles[new_index] =
+ dsm_segment_handle(segment);
+ /* Track the highest segment index in the history of the area. */
+ if (area->control->high_segment_index < new_index)
+ area->control->high_segment_index = new_index;
+ /* Track the highest segment index this backend has ever mapped. */
+ if (area->high_segment_index < new_index)
+ area->high_segment_index = new_index;
+ /* Track total size of all segments. */
+ area->control->total_segment_size += total_size;
+ Assert(area->control->total_segment_size <=
+ area->control->max_total_segment_size);
+
+ /* Build a segment map for this segment in this backend. */
+ segment_map = &area->segment_maps[new_index];
+ segment_map->segment = segment;
+ segment_map->mapped_address = dsm_segment_address(segment);
+ segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Set up the free page map. */
+ FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+ FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+ usable_pages);
+
+ /* Set up the segment header and put it in the appropriate bin. */
+ segment_map->header->magic =
+ DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
+ segment_map->header->usable_pages = usable_pages;
+ segment_map->header->size = total_size;
+ segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+ segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+ segment_map->header->next =
+ area->control->segment_bins[segment_map->header->bin];
+ segment_map->header->freed = false;
+ area->control->segment_bins[segment_map->header->bin] = new_index;
+ if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *next =
+ get_segment_by_index(area, segment_map->header->next);
+
+ Assert(next->header->bin == segment_map->header->bin);
+ next->header->prev = new_index;
+ }
+
+ return segment_map;
+}
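Editorial aside: to make the doubling schedule described in the comment above concrete, the standalone sketch below evaluates the same size expression with assumed constants (1 MB initial segment size, two segments at each size, 1 GB cap). The real values of DSA_INITIAL_SEGMENT_SIZE, DSA_NUM_SEGMENTS_AT_EACH_SIZE and DSA_MAX_SEGMENT_SIZE are defined elsewhere and may differ.

/*
 * Sketch only: the intended geometric-growth schedule for new segment sizes.
 * All three constants are assumptions for illustration.
 */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_INITIAL_SEGMENT_SIZE  ((size_t) 1024 * 1024)         /* assumed 1 MB */
#define SKETCH_MAX_SEGMENT_SIZE      ((size_t) 1024 * 1024 * 1024)  /* assumed 1 GB */
#define SKETCH_SEGMENTS_AT_EACH_SIZE 2                              /* assumed */

int
main(void)
{
    size_t new_index;

    for (new_index = 1; new_index <= 8; ++new_index)
    {
        size_t total_size = SKETCH_INITIAL_SEGMENT_SIZE *
            ((size_t) 1 << (new_index / SKETCH_SEGMENTS_AT_EACH_SIZE));

        if (total_size > SKETCH_MAX_SEGMENT_SIZE)
            total_size = SKETCH_MAX_SEGMENT_SIZE;
        printf("segment slot %zu -> %zu MB\n",
               new_index, total_size / (1024 * 1024));
    }
    /* Prints 1, 2, 2, 4, 4, 8, 8, 16 MB for slots 1..8. */
    return 0;
}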
+
+/*
+ * Check if any segments have been freed by destroy_superblock, so we can
+ * detach from them in this backend. This function is called by
+ * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
+ * received can be resolved to the correct segment.
+ *
+ * The danger we want to defend against is that there could be an old segment
+ * mapped into a given slot in this backend, and the dsa_pointer they have
+ * might refer to some new segment in the same slot. So those functions must
+ * be sure to process all instructions to detach from a freed segment that had
+ * been generated by the time this process received the dsa_pointer, before
+ * they call get_segment_by_index.
+ */
+static void
+check_for_freed_segments(dsa_area *area)
+{
+ size_t freed_segment_counter;
+
+ /*
+ * Any other process that has freed a segment has incremented
+ * freed_segment_counter while holding an LWLock, and that must precede
+ * any backend creating a new segment in the same slot while holding an
+ * LWLock, and that must precede the creation of any dsa_pointer pointing
+ * into the new segment which might reach us here, and the caller must
+ * have sent the dsa_pointer to this process using appropriate memory
+ * synchronization (some kind of locking or atomic primitive or system
+ * call). So all we need to do on the reading side is ask for the load of
+ * freed_segment_counter to follow the caller's load of the dsa_pointer it
+ * has, and we can be sure to detect any segments that had been freed as
+ * of the time that the dsa_pointer reached this process.
+ */
+ pg_read_barrier();
+ freed_segment_counter = area->control->freed_segment_counter;
+ if (unlikely(area->freed_segment_counter != freed_segment_counter))
+ {
+ /* Check all currently mapped segments to find what's been freed. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ check_for_freed_segments_locked(area);
+ LWLockRelease(DSA_AREA_LOCK(area));
+ }
+}
+
+/*
+ * Workhorse for check_for_freed_segments(), and also used directly in paths
+ * where the area lock is already held. This should be called after acquiring
+ * the lock but before looking up any segment by index number, to make sure we
+ * unmap any stale segments that might have previously had the same index as a
+ * current segment.
+ */
+static void
+check_for_freed_segments_locked(dsa_area *area)
+{
+ size_t freed_segment_counter;
+ int i;
+
+ Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+ freed_segment_counter = area->control->freed_segment_counter;
+ if (unlikely(area->freed_segment_counter != freed_segment_counter))
+ {
+ for (i = 0; i <= area->high_segment_index; ++i)
+ {
+ if (area->segment_maps[i].header != NULL &&
+ area->segment_maps[i].header->freed)
+ {
+ dsm_detach(area->segment_maps[i].segment);
+ area->segment_maps[i].segment = NULL;
+ area->segment_maps[i].header = NULL;
+ area->segment_maps[i].mapped_address = NULL;
+ }
+ }
+ area->freed_segment_counter = freed_segment_counter;
+ }
+}
+
+/*
+ * Re-bin segment if it's no longer in the appropriate bin.
+ */
+static void
+rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
+{
+ size_t new_bin;
+ dsa_segment_index segment_index;
+
+ new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
+ if (segment_map->header->bin == new_bin)
+ return;
+
+ /* Remove it from its current bin. */
+ unlink_segment(area, segment_map);
+
+ /* Push it onto the front of its new bin. */
+ segment_index = get_segment_index(area, segment_map);
+ segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+ segment_map->header->next = area->control->segment_bins[new_bin];
+ segment_map->header->bin = new_bin;
+ area->control->segment_bins[new_bin] = segment_index;
+ if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *next;
+
+ next = get_segment_by_index(area, segment_map->header->next);
+ Assert(next->header->bin == new_bin);
+ next->header->prev = segment_index;
+ }
+}
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/freepage.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/freepage.c
new file mode 100644
index 00000000000..8f9ea090faa
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/freepage.c
@@ -0,0 +1,1886 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.c
+ * Management of free memory pages.
+ *
+ * The intention of this code is to provide infrastructure for memory
+ * allocators written specifically for PostgreSQL. At least in the case
+ * of dynamic shared memory, we can't simply use malloc() or even
+ * relatively thin wrappers like palloc() which sit on top of it, because
+ * no allocator built into the operating system will deal with relative
+ * pointers. In the future, we may find other cases in which greater
+ * control over our own memory management seems desirable.
+ *
+ * A FreePageManager keeps track of which 4kB pages of memory are currently
+ * unused from the point of view of some higher-level memory allocator.
+ * Unlike a user-facing allocator such as palloc(), a FreePageManager can
+ * only allocate and free in units of whole pages, and freeing an
+ * allocation can only be done given knowledge of its length in pages.
+ *
+ * Since a free page manager has only a fixed amount of dedicated memory,
+ * and since there is no underlying allocator, it uses the free pages
+ * it is given to manage to store its bookkeeping data. It keeps multiple
+ * freelists of runs of pages, sorted by the size of the run; the head of
+ * each freelist is stored in the FreePageManager itself, and the first
+ * page of each run contains a relative pointer to the next run. See
+ * FreePageManagerGetInternal for more details on how the freelists are
+ * managed.
+ *
+ * To avoid memory fragmentation, it's important to consolidate adjacent
+ * spans of pages whenever possible; otherwise, large allocation requests
+ * might not be satisfied even when sufficient contiguous space is
+ * available. Therefore, in addition to the freelists, we maintain an
+ * in-memory btree of free page ranges ordered by page number. If a
+ * range being freed precedes or follows a range that is already free,
+ * the existing range is extended; if it exactly bridges the gap between
+ * free ranges, then the two existing ranges are consolidated with the
+ * newly-freed range to form one great big range of free pages.
+ *
+ * When there is only one range of free pages, the btree is trivial and
+ * is stored within the FreePageManager proper; otherwise, pages are
+ * allocated from the area under management as needed. Even in cases
+ * where memory fragmentation is very severe, only a tiny fraction of
+ * the pages under management are consumed by this btree.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/freepage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+
+#include "utils/freepage.h"
+#include "utils/relptr.h"
+
+
+/* Magic numbers to identify various page types */
+#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0
+#define FREE_PAGE_LEAF_MAGIC 0x98eae728
+#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9
+
+/* Doubly linked list of spans of free pages; stored in first page of span. */
+struct FreePageSpanLeader
+{
+ int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */
+ Size npages; /* number of pages in span */
+ RelptrFreePageSpanLeader prev;
+ RelptrFreePageSpanLeader next;
+};
+
+/* Common header for btree leaf and internal pages. */
+typedef struct FreePageBtreeHeader
+{
+ int magic; /* FREE_PAGE_LEAF_MAGIC or
+ * FREE_PAGE_INTERNAL_MAGIC */
+ Size nused; /* number of items used */
+ RelptrFreePageBtree parent; /* uplink */
+} FreePageBtreeHeader;
+
+/* Internal key; points to next level of btree. */
+typedef struct FreePageBtreeInternalKey
+{
+ Size first_page; /* low bound for keys on child page */
+ RelptrFreePageBtree child; /* downlink */
+} FreePageBtreeInternalKey;
+
+/* Leaf key; no payload data. */
+typedef struct FreePageBtreeLeafKey
+{
+ Size first_page; /* first page in span */
+ Size npages; /* number of pages in span */
+} FreePageBtreeLeafKey;
+
+/* Work out how many keys will fit on a page. */
+#define FPM_ITEMS_PER_INTERNAL_PAGE \
+ ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+ sizeof(FreePageBtreeInternalKey))
+#define FPM_ITEMS_PER_LEAF_PAGE \
+ ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+ sizeof(FreePageBtreeLeafKey))
+
+/* A btree page of either sort */
+struct FreePageBtree
+{
+ FreePageBtreeHeader hdr;
+ union
+ {
+ FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
+ FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
+ } u;
+};
+
+/* Results of a btree search */
+typedef struct FreePageBtreeSearchResult
+{
+ FreePageBtree *page;
+ Size index;
+ bool found;
+ unsigned split_pages;
+} FreePageBtreeSearchResult;
+
+/* Helper functions */
+static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
+ FreePageBtree *btp);
+static Size FreePageBtreeCleanup(FreePageManager *fpm);
+static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
+ FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
+ FreePageBtree *btp);
+static Size FreePageBtreeFirstKey(FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
+static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
+ Size index, Size first_page, FreePageBtree *child);
+static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
+ Size first_page, Size npages);
+static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
+static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
+ Size index);
+static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
+static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+ FreePageBtreeSearchResult *result);
+static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
+static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
+static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
+ FreePageBtree *btp);
+static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
+static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+ FreePageBtree *parent, int level, StringInfo buf);
+static void FreePageManagerDumpSpans(FreePageManager *fpm,
+ FreePageSpanLeader *span, Size expected_pages,
+ StringInfo buf);
+static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
+ Size *first_page);
+static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
+ Size npages, bool soft);
+static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
+static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
+ Size npages);
+static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
+static void FreePageManagerUpdateLargest(FreePageManager *fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+static Size sum_free_pages(FreePageManager *fpm);
+#endif
+
+/*
+ * Initialize a new, empty free page manager.
+ *
+ * 'fpm' should reference caller-provided memory large enough to contain a
+ * FreePageManager. We'll initialize it here.
+ *
+ * 'base' is the address to which all pointers are relative. When managing
+ * a dynamic shared memory segment, it should normally be the base of the
+ * segment. When managing backend-private memory, it can be either NULL or,
+ * if managing a single contiguous extent of memory, the start of that extent.
+ */
+void
+FreePageManagerInitialize(FreePageManager *fpm, char *base)
+{
+ Size f;
+
+ relptr_store(base, fpm->self, fpm);
+ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+ relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
+ fpm->btree_depth = 0;
+ fpm->btree_recycle_count = 0;
+ fpm->singleton_first_page = 0;
+ fpm->singleton_npages = 0;
+ fpm->contiguous_pages = 0;
+ fpm->contiguous_pages_dirty = true;
+#ifdef FPM_EXTRA_ASSERTS
+ fpm->free_pages = 0;
+#endif
+
+ for (f = 0; f < FPM_NUM_FREELISTS; f++)
+ relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
+}
+
+/*
+ * Allocate a run of pages of the given length from the free page manager.
+ * The return value indicates whether we were able to satisfy the request;
+ * if true, the first page of the allocation is stored in *first_page.
+ */
+bool
+FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
+{
+ bool result;
+ Size contiguous_pages;
+
+ result = FreePageManagerGetInternal(fpm, npages, first_page);
+
+ /*
+ * It's a bit counterintuitive, but allocating pages can actually create
+ * opportunities for cleanup that create larger ranges. We might pull a
+ * key out of the btree that enables the item at the head of the btree
+ * recycle list to be inserted; and then if there are more items behind it
+ * one of those might cause two currently-separated ranges to merge,
+ * creating a single range of contiguous pages larger than any that
+ * existed previously. It might be worth trying to improve the cleanup
+ * algorithm to avoid such corner cases, but for now we just notice the
+ * condition and do the appropriate reporting.
+ */
+ contiguous_pages = FreePageBtreeCleanup(fpm);
+ if (fpm->contiguous_pages < contiguous_pages)
+ fpm->contiguous_pages = contiguous_pages;
+
+ /*
+ * FreePageManagerGetInternal may have set contiguous_pages_dirty.
+ * Recompute contiguous_pages if so.
+ */
+ FreePageManagerUpdateLargest(fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+ if (result)
+ {
+ Assert(fpm->free_pages >= npages);
+ fpm->free_pages -= npages;
+ }
+ Assert(fpm->free_pages == sum_free_pages(fpm));
+ Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
+#endif
+ return result;
+}
+
+#ifdef FPM_EXTRA_ASSERTS
+static void
+sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
+{
+ char *base = fpm_segment_base(fpm);
+
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
+ btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ ++*sum;
+ if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ {
+ Size index;
+
+
+ for (index = 0; index < btp->hdr.nused; ++index)
+ {
+ FreePageBtree *child;
+
+ child = relptr_access(base, btp->u.internal_key[index].child);
+ sum_free_pages_recurse(fpm, child, sum);
+ }
+ }
+}
+static Size
+sum_free_pages(FreePageManager *fpm)
+{
+ FreePageSpanLeader *recycle;
+ char *base = fpm_segment_base(fpm);
+ Size sum = 0;
+ int list;
+
+ /* Count the spans by scanning the freelists. */
+ for (list = 0; list < FPM_NUM_FREELISTS; ++list)
+ {
+
+ if (!relptr_is_null(fpm->freelist[list]))
+ {
+ FreePageSpanLeader *candidate =
+ relptr_access(base, fpm->freelist[list]);
+
+ do
+ {
+ sum += candidate->npages;
+ candidate = relptr_access(base, candidate->next);
+ } while (candidate != NULL);
+ }
+ }
+
+ /* Count btree internal pages. */
+ if (fpm->btree_depth > 0)
+ {
+ FreePageBtree *root = relptr_access(base, fpm->btree_root);
+
+ sum_free_pages_recurse(fpm, root, &sum);
+ }
+
+ /* Count the recycle list. */
+ for (recycle = relptr_access(base, fpm->btree_recycle);
+ recycle != NULL;
+ recycle = relptr_access(base, recycle->next))
+ {
+ Assert(recycle->npages == 1);
+ ++sum;
+ }
+
+ return sum;
+}
+#endif
+
+/*
+ * Compute the size of the largest run of pages that the user could
+ * successfully get.
+ */
+static Size
+FreePageManagerLargestContiguous(FreePageManager *fpm)
+{
+ char *base;
+ Size largest;
+
+ base = fpm_segment_base(fpm);
+ largest = 0;
+ if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
+ {
+ FreePageSpanLeader *candidate;
+
+ candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
+ do
+ {
+ if (candidate->npages > largest)
+ largest = candidate->npages;
+ candidate = relptr_access(base, candidate->next);
+ } while (candidate != NULL);
+ }
+ else
+ {
+ Size f = FPM_NUM_FREELISTS - 1;
+
+ do
+ {
+ --f;
+ if (!relptr_is_null(fpm->freelist[f]))
+ {
+ largest = f + 1;
+ break;
+ }
+ } while (f > 0);
+ }
+
+ return largest;
+}
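Editorial aside: the else branch above (largest = f + 1) relies on the freelist indexing convention used throughout this file: freelist[f] holds spans of exactly f + 1 pages, except for the final list, which collects all larger spans (hence the explicit scan for the maximum there). A tiny sketch of that index calculation, stated as an assumption rather than a quotation of the actual helper:

/*
 * Sketch only: the freelist index a span of 'npages' pages would be kept on,
 * assuming freelist[f] holds spans of f + 1 pages and the last list holds
 * everything larger.
 */
#include <stdio.h>
#include <stddef.h>

static size_t
sketch_freelist_index(size_t npages, size_t num_freelists)
{
    return (npages < num_freelists ? npages : num_freelists) - 1;
}

int
main(void)
{
    printf("%zu %zu %zu\n",
           sketch_freelist_index(1, 8),     /* 0 */
           sketch_freelist_index(8, 8),     /* 7 */
           sketch_freelist_index(100, 8));  /* 7: oversized runs share the last list */
    return 0;
}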
+
+/*
+ * Recompute the size of the largest run of pages that the user could
+ * successfully get, if it has been marked dirty.
+ */
+static void
+FreePageManagerUpdateLargest(FreePageManager *fpm)
+{
+ if (!fpm->contiguous_pages_dirty)
+ return;
+
+ fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
+ fpm->contiguous_pages_dirty = false;
+}
+
+/*
+ * Transfer a run of pages to the free page manager.
+ */
+void
+FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
+{
+ Size contiguous_pages;
+
+ Assert(npages > 0);
+
+ /* Record the new pages. */
+ contiguous_pages =
+ FreePageManagerPutInternal(fpm, first_page, npages, false);
+
+ /*
+ * If the new range we inserted into the page manager was contiguous with
+ * an existing range, it may have opened up cleanup opportunities.
+ */
+ if (contiguous_pages > npages)
+ {
+ Size cleanup_contiguous_pages;
+
+ cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
+ if (cleanup_contiguous_pages > contiguous_pages)
+ contiguous_pages = cleanup_contiguous_pages;
+ }
+
+ /* See if we now have a new largest chunk. */
+ if (fpm->contiguous_pages < contiguous_pages)
+ fpm->contiguous_pages = contiguous_pages;
+
+ /*
+ * The earlier call to FreePageManagerPutInternal may have set
+ * contiguous_pages_dirty if it needed to allocate internal pages, so
+ * recompute contiguous_pages if necessary.
+ */
+ FreePageManagerUpdateLargest(fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+ fpm->free_pages += npages;
+ Assert(fpm->free_pages == sum_free_pages(fpm));
+ Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
+#endif
+}
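Editorial aside: for orientation, a short usage sketch of the public interface defined in this file (FreePageManagerInitialize, FreePageManagerPut, FreePageManagerGet) is given below. It is illustrative only and not part of the vendored file: the region layout and sizes are assumptions, and in the backend these calls are normally made by dsa.c against a DSM segment.

/*
 * Usage sketch only.  The layout (a hypothetical 64-byte header followed by
 * the FreePageManager, metadata rounded up to a page boundary) is made up,
 * but mirrors the pattern dsa.c uses for its segments.
 */
#include "postgres.h"

#include "utils/freepage.h"

static void
freepage_usage_sketch(char *base, Size total_pages)
{
    Size metadata_bytes = MAXALIGN(64) + MAXALIGN(sizeof(FreePageManager));
    FreePageManager *fpm = (FreePageManager *) (base + MAXALIGN(64));
    Size first_usable_page;
    Size first_page;

    /* Round the metadata up to a page boundary; manage everything after it. */
    if (metadata_bytes % FPM_PAGE_SIZE != 0)
        metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
    first_usable_page = metadata_bytes / FPM_PAGE_SIZE;

    FreePageManagerInitialize(fpm, base);
    FreePageManagerPut(fpm, first_usable_page, total_pages - first_usable_page);

    /* Allocate an 8-page run, then free it as two adjacent 4-page runs. */
    if (FreePageManagerGet(fpm, 8, &first_page))
    {
        FreePageManagerPut(fpm, first_page, 4);
        FreePageManagerPut(fpm, first_page + 4, 4);
        /* The adjacent runs are consolidated, so 8 contiguous pages are free again. */
    }
}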
+
+/*
+ * Produce a debugging dump of the state of a free page manager.
+ */
+char *
+FreePageManagerDump(FreePageManager *fpm)
+{
+ char *base = fpm_segment_base(fpm);
+ StringInfoData buf;
+ FreePageSpanLeader *recycle;
+ bool dumped_any_freelist = false;
+ Size f;
+
+ /* Initialize output buffer. */
+ initStringInfo(&buf);
+
+ /* Dump general stuff. */
+ appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
+ relptr_offset(fpm->self), fpm->contiguous_pages);
+
+ /* Dump btree. */
+ if (fpm->btree_depth > 0)
+ {
+ FreePageBtree *root;
+
+ appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
+ root = relptr_access(base, fpm->btree_root);
+ FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
+ }
+ else if (fpm->singleton_npages > 0)
+ {
+ appendStringInfo(&buf, "singleton: %zu(%zu)\n",
+ fpm->singleton_first_page, fpm->singleton_npages);
+ }
+
+ /* Dump btree recycle list. */
+ recycle = relptr_access(base, fpm->btree_recycle);
+ if (recycle != NULL)
+ {
+ appendStringInfoString(&buf, "btree recycle:");
+ FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
+ }
+
+ /* Dump free lists. */
+ for (f = 0; f < FPM_NUM_FREELISTS; ++f)
+ {
+ FreePageSpanLeader *span;
+
+ if (relptr_is_null(fpm->freelist[f]))
+ continue;
+ if (!dumped_any_freelist)
+ {
+ appendStringInfoString(&buf, "freelists:\n");
+ dumped_any_freelist = true;
+ }
+ appendStringInfo(&buf, " %zu:", f + 1);
+ span = relptr_access(base, fpm->freelist[f]);
+ FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
+ }
+
+ /* And return result to caller. */
+ return buf.data;
+}
+
+
+/*
+ * The first_page value stored at index zero in any non-root page must match
+ * the first_page value stored in its parent at the index which points to that
+ * page. So when the value stored at index zero in a btree page changes, we've
+ * got to walk up the tree adjusting ancestor keys until we reach an ancestor
+ * where that key isn't index zero. This function should be called after
+ * updating the first key on the target page; it will propagate the change
+ * upward as far as needed.
+ *
+ * We assume here that the first key on the page has not changed enough to
+ * require changes in the ordering of keys on its ancestor pages. Thus,
+ * if we search the parent page for the first key greater than or equal to
+ * the first key on the current page, the downlink to this page will be either
+ * the exact index returned by the search (if the first key decreased)
+ * or one less (if the first key increased).
+ */
+static void
+FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
+{
+ char *base = fpm_segment_base(fpm);
+ Size first_page;
+ FreePageBtree *parent;
+ FreePageBtree *child;
+
+ /* This might be either a leaf or an internal page. */
+ Assert(btp->hdr.nused > 0);
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+ first_page = btp->u.leaf_key[0].first_page;
+ }
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+ first_page = btp->u.internal_key[0].first_page;
+ }
+ child = btp;
+
+ /* Loop until we find an ancestor that does not require adjustment. */
+ for (;;)
+ {
+ Size s;
+
+ parent = relptr_access(base, child->hdr.parent);
+ if (parent == NULL)
+ break;
+ s = FreePageBtreeSearchInternal(parent, first_page);
+
+ /* Key is either at index s or index s-1; figure out which. */
+ if (s >= parent->hdr.nused)
+ {
+ Assert(s == parent->hdr.nused);
+ --s;
+ }
+ else
+ {
+ FreePageBtree *check;
+
+ check = relptr_access(base, parent->u.internal_key[s].child);
+ if (check != child)
+ {
+ Assert(s > 0);
+ --s;
+ }
+ }
+
+#ifdef USE_ASSERT_CHECKING
+ /* Debugging double-check. */
+ {
+ FreePageBtree *check;
+
+ check = relptr_access(base, parent->u.internal_key[s].child);
+ Assert(s < parent->hdr.nused);
+ Assert(child == check);
+ }
+#endif
+
+ /* Update the parent key. */
+ parent->u.internal_key[s].first_page = first_page;
+
+ /*
+ * If this is the first key in the parent, go up another level; else
+ * done.
+ */
+ if (s > 0)
+ break;
+ child = parent;
+ }
+}
+
+/*
+ * Attempt to reclaim space from the free-page btree. The return value is
+ * the largest range of contiguous pages created by the cleanup operation.
+ */
+static Size
+FreePageBtreeCleanup(FreePageManager *fpm)
+{
+ char *base = fpm_segment_base(fpm);
+ Size max_contiguous_pages = 0;
+
+ /* Attempt to shrink the depth of the btree. */
+ while (!relptr_is_null(fpm->btree_root))
+ {
+ FreePageBtree *root = relptr_access(base, fpm->btree_root);
+
+ /* If the root contains only one key, reduce depth by one. */
+ if (root->hdr.nused == 1)
+ {
+ /* Shrink depth of tree by one. */
+ Assert(fpm->btree_depth > 0);
+ --fpm->btree_depth;
+ if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ /* If root is a leaf, convert only entry to singleton range. */
+ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+ fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+ fpm->singleton_npages = root->u.leaf_key[0].npages;
+ }
+ else
+ {
+ FreePageBtree *newroot;
+
+ /* If root is an internal page, make only child the root. */
+ Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
+ newroot = relptr_access(base, fpm->btree_root);
+ relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
+ }
+ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
+ }
+ else if (root->hdr.nused == 2 &&
+ root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ Size end_of_first;
+ Size start_of_second;
+
+ end_of_first = root->u.leaf_key[0].first_page +
+ root->u.leaf_key[0].npages;
+ start_of_second = root->u.leaf_key[1].first_page;
+
+ if (end_of_first + 1 == start_of_second)
+ {
+ Size root_page = fpm_pointer_to_page(base, root);
+
+ if (end_of_first == root_page)
+ {
+ FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
+ FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
+ fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+ fpm->singleton_npages = root->u.leaf_key[0].npages +
+ root->u.leaf_key[1].npages + 1;
+ fpm->btree_depth = 0;
+ relptr_store(base, fpm->btree_root,
+ (FreePageBtree *) NULL);
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ Assert(max_contiguous_pages == 0);
+ max_contiguous_pages = fpm->singleton_npages;
+ }
+ }
+
+ /* Whether it worked or not, it's time to stop. */
+ break;
+ }
+ else
+ {
+ /* Nothing more to do. Stop. */
+ break;
+ }
+ }
+
+ /*
+ * Attempt to free recycled btree pages. We skip this if releasing the
+ * recycled page would require a btree page split, because the page we're
+ * trying to recycle would be consumed by the split, which would be
+ * counterproductive.
+ *
+ * We also currently only ever attempt to recycle the first page on the
+ * list; that could be made more aggressive, but it's not clear that the
+ * complexity would be worthwhile.
+ */
+ while (fpm->btree_recycle_count > 0)
+ {
+ FreePageBtree *btp;
+ Size first_page;
+ Size contiguous_pages;
+
+ btp = FreePageBtreeGetRecycled(fpm);
+ first_page = fpm_pointer_to_page(base, btp);
+ contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
+ if (contiguous_pages == 0)
+ {
+ FreePageBtreeRecycle(fpm, first_page);
+ break;
+ }
+ else
+ {
+ if (contiguous_pages > max_contiguous_pages)
+ max_contiguous_pages = contiguous_pages;
+ }
+ }
+
+ return max_contiguous_pages;
+}
+
+/*
+ * Consider consolidating the given page with its left or right sibling,
+ * if it's fairly empty.
+ */
+static void
+FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtree *np;
+ Size max;
+
+ /*
+ * We only try to consolidate pages that are less than a third full. We
+ * could be more aggressive about this, but that might risk performing
+ * consolidation only to end up splitting again shortly thereafter. Since
+ * the btree should be very small compared to the space under management,
+ * our goal isn't so much to ensure that it always occupies the absolutely
+ * smallest possible number of pages as to reclaim pages before things get
+ * too egregiously out of hand.
+ */
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ max = FPM_ITEMS_PER_LEAF_PAGE;
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ max = FPM_ITEMS_PER_INTERNAL_PAGE;
+ }
+ if (btp->hdr.nused >= max / 3)
+ return;
+
+ /*
+ * If we can fit our right sibling's keys onto this page, consolidate.
+ */
+ np = FreePageBtreeFindRightSibling(base, btp);
+ if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+ {
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
+ sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
+ btp->hdr.nused += np->hdr.nused;
+ }
+ else
+ {
+ memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
+ sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
+ btp->hdr.nused += np->hdr.nused;
+ FreePageBtreeUpdateParentPointers(base, btp);
+ }
+ FreePageBtreeRemovePage(fpm, np);
+ return;
+ }
+
+ /*
+ * If we can fit our keys onto our left sibling's page, consolidate. In
+ * this case, we move our keys onto the other page rather than vice versa,
+ * to avoid having to adjust ancestor keys.
+ */
+ np = FreePageBtreeFindLeftSibling(base, btp);
+ if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+ {
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
+ sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
+ np->hdr.nused += btp->hdr.nused;
+ }
+ else
+ {
+ memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
+ sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
+ np->hdr.nused += btp->hdr.nused;
+ FreePageBtreeUpdateParentPointers(base, np);
+ }
+ FreePageBtreeRemovePage(fpm, btp);
+ return;
+ }
+}
+
+/*
+ * Find the passed page's left sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately precedes ours.
+ */
+static FreePageBtree *
+FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
+{
+ FreePageBtree *p = btp;
+ int levels = 0;
+
+ /* Move up until we can move left. */
+ for (;;)
+ {
+ Size first_page;
+ Size index;
+
+ first_page = FreePageBtreeFirstKey(p);
+ p = relptr_access(base, p->hdr.parent);
+
+ if (p == NULL)
+ return NULL; /* we were passed the leftmost page */
+
+ index = FreePageBtreeSearchInternal(p, first_page);
+ if (index > 0)
+ {
+ Assert(p->u.internal_key[index].first_page == first_page);
+ p = relptr_access(base, p->u.internal_key[index - 1].child);
+ break;
+ }
+ Assert(index == 0);
+ ++levels;
+ }
+
+ /* Descend right. */
+ while (levels > 0)
+ {
+ Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
+ --levels;
+ }
+ Assert(p->hdr.magic == btp->hdr.magic);
+
+ return p;
+}
+
+/*
+ * Find the passed page's right sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately follows ours.
+ */
+static FreePageBtree *
+FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
+{
+ FreePageBtree *p = btp;
+ int levels = 0;
+
+ /* Move up until we can move right. */
+ for (;;)
+ {
+ Size first_page;
+ Size index;
+
+ first_page = FreePageBtreeFirstKey(p);
+ p = relptr_access(base, p->hdr.parent);
+
+ if (p == NULL)
+ return NULL; /* we were passed the rightmost page */
+
+ index = FreePageBtreeSearchInternal(p, first_page);
+ if (index < p->hdr.nused - 1)
+ {
+ Assert(p->u.internal_key[index].first_page == first_page);
+ p = relptr_access(base, p->u.internal_key[index + 1].child);
+ break;
+ }
+ Assert(index == p->hdr.nused - 1);
+ ++levels;
+ }
+
+ /* Descend left. */
+ while (levels > 0)
+ {
+ Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ p = relptr_access(base, p->u.internal_key[0].child);
+ --levels;
+ }
+ Assert(p->hdr.magic == btp->hdr.magic);
+
+ return p;
+}
+
+/*
+ * Get the first key on a btree page.
+ */
+static Size
+FreePageBtreeFirstKey(FreePageBtree *btp)
+{
+ Assert(btp->hdr.nused > 0);
+
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ return btp->u.leaf_key[0].first_page;
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ return btp->u.internal_key[0].first_page;
+ }
+}
+
+/*
+ * Get a page from the btree recycle list for use as a btree page.
+ */
+static FreePageBtree *
+FreePageBtreeGetRecycled(FreePageManager *fpm)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
+ FreePageSpanLeader *newhead;
+
+ Assert(victim != NULL);
+ newhead = relptr_access(base, victim->next);
+ if (newhead != NULL)
+ relptr_copy(newhead->prev, victim->prev);
+ relptr_store(base, fpm->btree_recycle, newhead);
+ Assert(fpm_pointer_is_page_aligned(base, victim));
+ fpm->btree_recycle_count--;
+ return (FreePageBtree *) victim;
+}
+
+/*
+ * Insert an item into an internal page.
+ */
+static void
+FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
+ Size first_page, FreePageBtree *child)
+{
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+ Assert(index <= btp->hdr.nused);
+ memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
+ sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
+ btp->u.internal_key[index].first_page = first_page;
+ relptr_store(base, btp->u.internal_key[index].child, child);
+ ++btp->hdr.nused;
+}
+
+/*
+ * Insert an item into a leaf page.
+ */
+static void
+FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
+ Size npages)
+{
+ Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+ Assert(index <= btp->hdr.nused);
+ memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
+ sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+ btp->u.leaf_key[index].first_page = first_page;
+ btp->u.leaf_key[index].npages = npages;
+ ++btp->hdr.nused;
+}
+
+/*
+ * Put a page on the btree recycle list.
+ */
+static void
+FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
+ FreePageSpanLeader *span;
+
+ span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+ span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+ span->npages = 1;
+ relptr_store(base, span->next, head);
+ relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+ if (head != NULL)
+ relptr_store(base, head->prev, span);
+ relptr_store(base, fpm->btree_recycle, span);
+ fpm->btree_recycle_count++;
+}
+
+/*
+ * Remove an item from the btree at the given position on the given page.
+ */
+static void
+FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
+{
+ Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ Assert(index < btp->hdr.nused);
+
+ /* When last item is removed, extirpate entire page from btree. */
+ if (btp->hdr.nused == 1)
+ {
+ FreePageBtreeRemovePage(fpm, btp);
+ return;
+ }
+
+ /* Physically remove the key from the page. */
+ --btp->hdr.nused;
+ if (index < btp->hdr.nused)
+ memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
+ sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+
+ /* If we just removed the first key, adjust ancestor keys. */
+ if (index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, btp);
+
+ /* Consider whether to consolidate this page with a sibling. */
+ FreePageBtreeConsolidate(fpm, btp);
+}
+
+/*
+ * Remove a page from the btree. Caller is responsible for having relocated
+ * any keys from this page that are still wanted. The page is placed on the
+ * recycled list.
+ */
+static void
+FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtree *parent;
+ Size index;
+ Size first_page;
+
+ for (;;)
+ {
+ /* Find parent page. */
+ parent = relptr_access(base, btp->hdr.parent);
+ if (parent == NULL)
+ {
+ /* We are removing the root page. */
+ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+ fpm->btree_depth = 0;
+ Assert(fpm->singleton_first_page == 0);
+ Assert(fpm->singleton_npages == 0);
+ return;
+ }
+
+ /*
+ * If the parent contains only one item, we need to remove it as well.
+ */
+ if (parent->hdr.nused > 1)
+ break;
+ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+ btp = parent;
+ }
+
+ /* Find and remove the downlink. */
+ first_page = FreePageBtreeFirstKey(btp);
+ if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ index = FreePageBtreeSearchLeaf(parent, first_page);
+ Assert(index < parent->hdr.nused);
+ if (index < parent->hdr.nused - 1)
+ memmove(&parent->u.leaf_key[index],
+ &parent->u.leaf_key[index + 1],
+ sizeof(FreePageBtreeLeafKey)
+ * (parent->hdr.nused - index - 1));
+ }
+ else
+ {
+ index = FreePageBtreeSearchInternal(parent, first_page);
+ Assert(index < parent->hdr.nused);
+ if (index < parent->hdr.nused - 1)
+ memmove(&parent->u.internal_key[index],
+ &parent->u.internal_key[index + 1],
+ sizeof(FreePageBtreeInternalKey)
+ * (parent->hdr.nused - index - 1));
+ }
+ parent->hdr.nused--;
+ Assert(parent->hdr.nused > 0);
+
+ /* Recycle the page. */
+ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+
+ /* Adjust ancestor keys if needed. */
+ if (index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, parent);
+
+ /* Consider whether to consolidate the parent with a sibling. */
+ FreePageBtreeConsolidate(fpm, parent);
+}
+
+/*
+ * Search the btree for an entry for the given first page and initialize
+ * *result with the results of the search. result->page and result->index
+ * indicate either the position of an exact match or the position at which
+ * the new key should be inserted. result->found is true for an exact match,
+ * otherwise false. result->split_pages will contain the number of additional
+ * btree pages that will be needed when performing a split to insert a key.
+ * Except as described above, the contents of fields in the result object are
+ * undefined on return.
+ */
+static void
+FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+ FreePageBtreeSearchResult *result)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtree *btp = relptr_access(base, fpm->btree_root);
+ Size index;
+
+ result->split_pages = 1;
+
+ /* If the btree is empty, there's nothing to find. */
+ if (btp == NULL)
+ {
+ result->page = NULL;
+ result->found = false;
+ return;
+ }
+
+ /* Descend until we hit a leaf. */
+ while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ {
+ FreePageBtree *child;
+ bool found_exact;
+
+ index = FreePageBtreeSearchInternal(btp, first_page);
+ found_exact = index < btp->hdr.nused &&
+ btp->u.internal_key[index].first_page == first_page;
+
+ /*
+ * If we found an exact match we descend directly. Otherwise, we
+ * descend into the child to the left if possible so that we can find
+ * the insertion point at that child's high end.
+ */
+ if (!found_exact && index > 0)
+ --index;
+
+ /* Track required split depth for leaf insert. */
+ if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
+ {
+ Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
+ result->split_pages++;
+ }
+ else
+ result->split_pages = 0;
+
+ /* Descend to appropriate child page. */
+ Assert(index < btp->hdr.nused);
+ child = relptr_access(base, btp->u.internal_key[index].child);
+ Assert(relptr_access(base, child->hdr.parent) == btp);
+ btp = child;
+ }
+
+ /* Track required split depth for leaf insert. */
+ if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
+ {
+ Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
+ result->split_pages++;
+ }
+ else
+ result->split_pages = 0;
+
+ /* Search leaf page. */
+ index = FreePageBtreeSearchLeaf(btp, first_page);
+
+ /* Assemble results. */
+ result->page = btp;
+ result->index = index;
+ result->found = index < btp->hdr.nused &&
+ first_page == btp->u.leaf_key[index].first_page;
+}
+
+/*
+ * Search an internal page for the first key greater than or equal to a given
+ * page number. Returns the index of that key, or one greater than the number
+ * of keys on the page if none.
+ */
+static Size
+FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
+{
+ Size low = 0;
+ Size high = btp->hdr.nused;
+
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);
+
+ while (low < high)
+ {
+ Size mid = (low + high) / 2;
+ Size val = btp->u.internal_key[mid].first_page;
+
+ if (first_page == val)
+ return mid;
+ else if (first_page < val)
+ high = mid;
+ else
+ low = mid + 1;
+ }
+
+ return low;
+}
+
+/*
+ * Search a leaf page for the first key greater than or equal to a given
+ * page number. Returns the index of that key, or one greater than the number
+ * of keys on the page if none.
+ */
+static Size
+FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
+{
+ Size low = 0;
+ Size high = btp->hdr.nused;
+
+ Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);
+
+ while (low < high)
+ {
+ Size mid = (low + high) / 2;
+ Size val = btp->u.leaf_key[mid].first_page;
+
+ if (first_page == val)
+ return mid;
+ else if (first_page < val)
+ high = mid;
+ else
+ low = mid + 1;
+ }
+
+ return low;
+}
+
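+/*
+ * Editor's note: the two search helpers above use the classic "lower bound"
+ * binary search, returning the index of the first key greater than or equal
+ * to the probe, or nused when no such key exists. The guarded sketch below
+ * (not part of the upstream file; names invented for illustration) restates
+ * the technique on a plain sorted array with a concrete trace.
+ */
+#if 0
+static Size
+lower_bound_example(const Size *keys, Size nkeys, Size probe)
+{
+ Size low = 0;
+ Size high = nkeys;
+
+ while (low < high)
+ {
+ Size mid = (low + high) / 2;
+
+ if (keys[mid] < probe)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ /* keys = {10, 20, 30}: probe 20 -> 1, probe 25 -> 2, probe 40 -> 3 */
+ return low;
+}
+#endif
+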
+/*
+ * Allocate a new btree page and move half the keys from the provided page
+ * to the new page. Caller is responsible for making sure that there's a
+ * page available from fpm->btree_recycle. Returns a pointer to the new page,
+ * to which caller must add a downlink.
+ */
+static FreePageBtree *
+FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
+{
+ FreePageBtree *newsibling;
+
+ newsibling = FreePageBtreeGetRecycled(fpm);
+ newsibling->hdr.magic = btp->hdr.magic;
+ newsibling->hdr.nused = btp->hdr.nused / 2;
+ relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
+ btp->hdr.nused -= newsibling->hdr.nused;
+
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ memcpy(&newsibling->u.leaf_key,
+ &btp->u.leaf_key[btp->hdr.nused],
+ sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ memcpy(&newsibling->u.internal_key,
+ &btp->u.internal_key[btp->hdr.nused],
+ sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
+ FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
+ }
+
+ return newsibling;
+}
+
+/*
+ * When internal pages are split or merged, the parent pointers of their
+ * children must be updated.
+ */
+static void
+FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
+{
+ Size i;
+
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ for (i = 0; i < btp->hdr.nused; ++i)
+ {
+ FreePageBtree *child;
+
+ child = relptr_access(base, btp->u.internal_key[i].child);
+ relptr_store(base, child->hdr.parent, btp);
+ }
+}
+
+/*
+ * Debugging dump of btree data.
+ */
+static void
+FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+ FreePageBtree *parent, int level, StringInfo buf)
+{
+ char *base = fpm_segment_base(fpm);
+ Size pageno = fpm_pointer_to_page(base, btp);
+ Size index;
+ FreePageBtree *check_parent;
+
+ check_stack_depth();
+ check_parent = relptr_access(base, btp->hdr.parent);
+ appendStringInfo(buf, " %zu@%d %c", pageno, level,
+ btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
+ if (parent != check_parent)
+ appendStringInfo(buf, " [actual parent %zu, expected %zu]",
+ fpm_pointer_to_page(base, check_parent),
+ fpm_pointer_to_page(base, parent));
+ appendStringInfoChar(buf, ':');
+ for (index = 0; index < btp->hdr.nused; ++index)
+ {
+ if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ appendStringInfo(buf, " %zu->%zu",
+ btp->u.internal_key[index].first_page,
+ relptr_offset(btp->u.internal_key[index].child) / FPM_PAGE_SIZE);
+ else
+ appendStringInfo(buf, " %zu(%zu)",
+ btp->u.leaf_key[index].first_page,
+ btp->u.leaf_key[index].npages);
+ }
+ appendStringInfoChar(buf, '\n');
+
+ if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ {
+ for (index = 0; index < btp->hdr.nused; ++index)
+ {
+ FreePageBtree *child;
+
+ child = relptr_access(base, btp->u.internal_key[index].child);
+ FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
+ }
+ }
+}
+
+/*
+ * Debugging dump of free-span data.
+ */
+static void
+FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
+ Size expected_pages, StringInfo buf)
+{
+ char *base = fpm_segment_base(fpm);
+
+ while (span != NULL)
+ {
+ if (span->npages != expected_pages)
+ appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
+ span->npages);
+ else
+ appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
+ span = relptr_access(base, span->next);
+ }
+
+ appendStringInfoChar(buf, '\n');
+}
+
+/*
+ * This function allocates a run of pages of the given length from the free
+ * page manager.
+ */
+static bool
+FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *victim = NULL;
+ FreePageSpanLeader *prev;
+ FreePageSpanLeader *next;
+ FreePageBtreeSearchResult result;
+ Size victim_page = 0; /* placate compiler */
+ Size f;
+
+ /*
+ * Search for a free span.
+ *
+ * Right now, we use a simple best-fit policy here, but it's possible for
+ * this to result in memory fragmentation if we're repeatedly asked to
+ * allocate chunks just a little smaller than what we have available.
+ * Hopefully, this is unlikely, because we expect most requests to be
+ * single pages or superblock-sized chunks -- but no policy can be optimal
+ * under all circumstances unless it has knowledge of future allocation
+ * patterns.
+ */
+ for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
+ {
+ /* Skip empty freelists. */
+ if (relptr_is_null(fpm->freelist[f]))
+ continue;
+
+ /*
+ * All of the freelists except the last one contain only items of a
+ * single size, so we just take the first one. But the final free
+ * list contains everything too big for any of the other lists, so we
+ * need to search the list.
+ */
+ if (f < FPM_NUM_FREELISTS - 1)
+ victim = relptr_access(base, fpm->freelist[f]);
+ else
+ {
+ FreePageSpanLeader *candidate;
+
+ candidate = relptr_access(base, fpm->freelist[f]);
+ do
+ {
+ if (candidate->npages >= npages && (victim == NULL ||
+ victim->npages > candidate->npages))
+ {
+ victim = candidate;
+ if (victim->npages == npages)
+ break;
+ }
+ candidate = relptr_access(base, candidate->next);
+ } while (candidate != NULL);
+ }
+ break;
+ }
+
+ /* If we didn't find an allocatable span, return failure. */
+ if (victim == NULL)
+ return false;
+
+ /* Remove span from free list. */
+ Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
+ prev = relptr_access(base, victim->prev);
+ next = relptr_access(base, victim->next);
+ if (prev != NULL)
+ relptr_copy(prev->next, victim->next);
+ else
+ relptr_copy(fpm->freelist[f], victim->next);
+ if (next != NULL)
+ relptr_copy(next->prev, victim->prev);
+ victim_page = fpm_pointer_to_page(base, victim);
+
+ /* Decide whether we might be invalidating contiguous_pages. */
+ if (f == FPM_NUM_FREELISTS - 1 &&
+ victim->npages == fpm->contiguous_pages)
+ {
+ /*
+ * The victim span came from the oversized freelist, and had the same
+ * size as the longest span. There may or may not be another one of
+ * the same size, so contiguous_pages must be recomputed just to be
+ * safe.
+ */
+ fpm->contiguous_pages_dirty = true;
+ }
+ else if (f + 1 == fpm->contiguous_pages &&
+ relptr_is_null(fpm->freelist[f]))
+ {
+ /*
+ * The victim span came from a fixed sized freelist, and it was the
+ * list for spans of the same size as the current longest span, and
+ * the list is now empty after removing the victim. So
+ * contiguous_pages must be recomputed without a doubt.
+ */
+ fpm->contiguous_pages_dirty = true;
+ }
+
+ /*
+ * If we haven't initialized the btree yet, the victim must be the single
+ * span stored within the FreePageManager itself. Otherwise, we need to
+ * update the btree.
+ */
+ if (relptr_is_null(fpm->btree_root))
+ {
+ Assert(victim_page == fpm->singleton_first_page);
+ Assert(victim->npages == fpm->singleton_npages);
+ Assert(victim->npages >= npages);
+ fpm->singleton_first_page += npages;
+ fpm->singleton_npages -= npages;
+ if (fpm->singleton_npages > 0)
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ }
+ else
+ {
+ /*
+ * If the span we found is exactly the right size, remove it from the
+ * btree completely. Otherwise, adjust the btree entry to reflect the
+ * still-unallocated portion of the span, and put that portion on the
+ * appropriate free list.
+ */
+ FreePageBtreeSearch(fpm, victim_page, &result);
+ Assert(result.found);
+ if (victim->npages == npages)
+ FreePageBtreeRemove(fpm, result.page, result.index);
+ else
+ {
+ FreePageBtreeLeafKey *key;
+
+ /* Adjust btree to reflect remaining pages. */
+ Assert(victim->npages > npages);
+ key = &result.page->u.leaf_key[result.index];
+ Assert(key->npages == victim->npages);
+ key->first_page += npages;
+ key->npages -= npages;
+ if (result.index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+ /* Put the unallocated pages back on the appropriate free list. */
+ FreePagePushSpanLeader(fpm, victim_page + npages,
+ victim->npages - npages);
+ }
+ }
+
+ /* Return results to caller. */
+ *first_page = fpm_pointer_to_page(base, victim);
+ return true;
+}
+
+/*
+ * Put a range of pages into the btree and freelists, consolidating it with
+ * existing free spans just before and/or after it. If 'soft' is true,
+ * only perform the insertion if it can be done without allocating new btree
+ * pages; if false, do it always. Returns 0 if the soft flag caused the
+ * insertion to be skipped, or otherwise the size of the contiguous span
+ * created by the insertion. This may be larger than npages if we're able
+ * to consolidate with an adjacent range.
+ */
+static Size
+FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
+ bool soft)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtreeSearchResult result;
+ FreePageBtreeLeafKey *prevkey = NULL;
+ FreePageBtreeLeafKey *nextkey = NULL;
+ FreePageBtree *np;
+ Size nindex;
+
+ Assert(npages > 0);
+
+ /* We can store a single free span without initializing the btree. */
+ if (fpm->btree_depth == 0)
+ {
+ if (fpm->singleton_npages == 0)
+ {
+ /* Don't have a span yet; store this one. */
+ fpm->singleton_first_page = first_page;
+ fpm->singleton_npages = npages;
+ FreePagePushSpanLeader(fpm, first_page, npages);
+ return fpm->singleton_npages;
+ }
+ else if (fpm->singleton_first_page + fpm->singleton_npages ==
+ first_page)
+ {
+ /* New span immediately follows sole existing span. */
+ fpm->singleton_npages += npages;
+ FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ return fpm->singleton_npages;
+ }
+ else if (first_page + npages == fpm->singleton_first_page)
+ {
+ /* New span immediately precedes sole existing span. */
+ FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+ fpm->singleton_first_page = first_page;
+ fpm->singleton_npages += npages;
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ return fpm->singleton_npages;
+ }
+ else
+ {
+ /* Not contiguous; we need to initialize the btree. */
+ Size root_page;
+ FreePageBtree *root;
+
+ if (!relptr_is_null(fpm->btree_recycle))
+ root = FreePageBtreeGetRecycled(fpm);
+ else if (soft)
+ return 0; /* Should not allocate if soft. */
+ else if (FreePageManagerGetInternal(fpm, 1, &root_page))
+ root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
+ else
+ {
+ /* We'd better be able to get a page from the existing range. */
+ elog(FATAL, "free page manager btree is corrupt");
+ }
+
+ /* Create the btree and move the preexisting range into it. */
+ root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
+ root->hdr.nused = 1;
+ relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
+ root->u.leaf_key[0].first_page = fpm->singleton_first_page;
+ root->u.leaf_key[0].npages = fpm->singleton_npages;
+ relptr_store(base, fpm->btree_root, root);
+ fpm->singleton_first_page = 0;
+ fpm->singleton_npages = 0;
+ fpm->btree_depth = 1;
+
+ /*
+ * Corner case: it may be that the btree root took the very last
+ * free page. In that case, the sole btree entry covers a zero
+ * page run, which is invalid. Overwrite it with the entry we're
+ * trying to insert and get out.
+ */
+ if (root->u.leaf_key[0].npages == 0)
+ {
+ root->u.leaf_key[0].first_page = first_page;
+ root->u.leaf_key[0].npages = npages;
+ FreePagePushSpanLeader(fpm, first_page, npages);
+ return npages;
+ }
+
+ /* Fall through to insert the new key. */
+ }
+ }
+
+ /* Search the btree. */
+ FreePageBtreeSearch(fpm, first_page, &result);
+ Assert(!result.found);
+ if (result.index > 0)
+ prevkey = &result.page->u.leaf_key[result.index - 1];
+ if (result.index < result.page->hdr.nused)
+ {
+ np = result.page;
+ nindex = result.index;
+ nextkey = &result.page->u.leaf_key[result.index];
+ }
+ else
+ {
+ np = FreePageBtreeFindRightSibling(base, result.page);
+ nindex = 0;
+ if (np != NULL)
+ nextkey = &np->u.leaf_key[0];
+ }
+
+ /* Consolidate with the previous entry if possible. */
+ if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
+ {
+ bool remove_next = false;
+ Size result;
+
+ Assert(prevkey->first_page + prevkey->npages == first_page);
+ prevkey->npages = (first_page - prevkey->first_page) + npages;
+
+ /* Check whether we can *also* consolidate with the following entry. */
+ if (nextkey != NULL &&
+ prevkey->first_page + prevkey->npages >= nextkey->first_page)
+ {
+ Assert(prevkey->first_page + prevkey->npages ==
+ nextkey->first_page);
+ prevkey->npages = (nextkey->first_page - prevkey->first_page)
+ + nextkey->npages;
+ FreePagePopSpanLeader(fpm, nextkey->first_page);
+ remove_next = true;
+ }
+
+ /* Put the span on the correct freelist and save size. */
+ FreePagePopSpanLeader(fpm, prevkey->first_page);
+ FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
+ result = prevkey->npages;
+
+ /*
+ * If we consolidated with both the preceding and following entries,
+ * we must remove the following entry. We do this last, because
+ * removing an element from the btree may invalidate pointers we hold
+ * into the current data structure.
+ *
+ * NB: The btree is technically in an invalid state at this point
+ * because we've already updated prevkey to cover the same key space
+ * as nextkey. FreePageBtreeRemove() shouldn't notice that, though.
+ */
+ if (remove_next)
+ FreePageBtreeRemove(fpm, np, nindex);
+
+ return result;
+ }
+
+ /* Consolidate with the next entry if possible. */
+ if (nextkey != NULL && first_page + npages >= nextkey->first_page)
+ {
+ Size newpages;
+
+ /* Compute new size for span. */
+ Assert(first_page + npages == nextkey->first_page);
+ newpages = (nextkey->first_page - first_page) + nextkey->npages;
+
+ /* Put span on correct free list. */
+ FreePagePopSpanLeader(fpm, nextkey->first_page);
+ FreePagePushSpanLeader(fpm, first_page, newpages);
+
+ /* Update key in place. */
+ nextkey->first_page = first_page;
+ nextkey->npages = newpages;
+
+ /* If reducing first key on page, ancestors might need adjustment. */
+ if (nindex == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, np);
+
+ return nextkey->npages;
+ }
+
+ /* Split leaf page and as many of its ancestors as necessary. */
+ if (result.split_pages > 0)
+ {
+ /*
+ * NB: We could consider various coping strategies here to avoid a
+ * split; most obviously, if np != result.page, we could target that
+ * page instead. More complicated shuffling strategies could be
+ * possible as well; basically, unless every single leaf page is 100%
+ * full, we can jam this key in there if we try hard enough. It's
+ * unlikely that trying that hard is worthwhile, but it's possible we
+ * might need to make more than no effort. For now, we just do the
+ * easy thing, which is nothing.
+ */
+
+ /* If this is a soft insert, it's time to give up. */
+ if (soft)
+ return 0;
+
+ /* Check whether we need to allocate more btree pages to split. */
+ if (result.split_pages > fpm->btree_recycle_count)
+ {
+ Size pages_needed;
+ Size recycle_page;
+ Size i;
+
+ /*
+ * Allocate the required number of pages and split each one in
+ * turn. This should never fail, because if we've got enough
+ * spans of free pages kicking around that we need additional
+ * storage space just to remember them all, then we should
+ * certainly have enough to expand the btree, which should only
+ * ever use a tiny number of pages compared to the number under
+ * management. If it does, something's badly screwed up.
+ */
+ pages_needed = result.split_pages - fpm->btree_recycle_count;
+ for (i = 0; i < pages_needed; ++i)
+ {
+ if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
+ elog(FATAL, "free page manager btree is corrupt");
+ FreePageBtreeRecycle(fpm, recycle_page);
+ }
+
+ /*
+ * The act of allocating pages to recycle may have invalidated the
+ * results of our previous btree research, so repeat it. (We could
+ * recheck whether any of our split-avoidance strategies that were
+ * not viable before now are, but it hardly seems worthwhile, so
+ * we don't bother. Consolidation can't be possible now if it
+ * wasn't previously.)
+ */
+ FreePageBtreeSearch(fpm, first_page, &result);
+
+ /*
+ * The act of allocating pages for use in constructing our btree
+ * should never cause any page to become more full, so the new
+ * split depth should be no greater than the old one, and perhaps
+ * less if we fortuitously allocated a chunk that freed up a slot
+ * on the page we need to update.
+ */
+ Assert(result.split_pages <= fpm->btree_recycle_count);
+ }
+
+ /* If we still need to perform a split, do it. */
+ if (result.split_pages > 0)
+ {
+ FreePageBtree *split_target = result.page;
+ FreePageBtree *child = NULL;
+ Size key = first_page;
+
+ for (;;)
+ {
+ FreePageBtree *newsibling;
+ FreePageBtree *parent;
+
+ /* Identify parent page, which must receive downlink. */
+ parent = relptr_access(base, split_target->hdr.parent);
+
+ /* Split the page - downlink not added yet. */
+ newsibling = FreePageBtreeSplitPage(fpm, split_target);
+
+ /*
+ * At this point in the loop, we're always carrying a pending
+ * insertion. On the first pass, it's the actual key we're
+ * trying to insert; on subsequent passes, it's the downlink
+ * that needs to be added as a result of the split performed
+ * during the previous loop iteration. Since we've just split
+ * the page, there's definitely room on one of the two
+ * resulting pages.
+ */
+ if (child == NULL)
+ {
+ Size index;
+ FreePageBtree *insert_into;
+
+ insert_into = key < newsibling->u.leaf_key[0].first_page ?
+ split_target : newsibling;
+ index = FreePageBtreeSearchLeaf(insert_into, key);
+ FreePageBtreeInsertLeaf(insert_into, index, key, npages);
+ if (index == 0 && insert_into == split_target)
+ FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+ }
+ else
+ {
+ Size index;
+ FreePageBtree *insert_into;
+
+ insert_into =
+ key < newsibling->u.internal_key[0].first_page ?
+ split_target : newsibling;
+ index = FreePageBtreeSearchInternal(insert_into, key);
+ FreePageBtreeInsertInternal(base, insert_into, index,
+ key, child);
+ relptr_store(base, child->hdr.parent, insert_into);
+ if (index == 0 && insert_into == split_target)
+ FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+ }
+
+ /* If the page we just split has no parent, split the root. */
+ if (parent == NULL)
+ {
+ FreePageBtree *newroot;
+
+ newroot = FreePageBtreeGetRecycled(fpm);
+ newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
+ newroot->hdr.nused = 2;
+ relptr_store(base, newroot->hdr.parent,
+ (FreePageBtree *) NULL);
+ newroot->u.internal_key[0].first_page =
+ FreePageBtreeFirstKey(split_target);
+ relptr_store(base, newroot->u.internal_key[0].child,
+ split_target);
+ relptr_store(base, split_target->hdr.parent, newroot);
+ newroot->u.internal_key[1].first_page =
+ FreePageBtreeFirstKey(newsibling);
+ relptr_store(base, newroot->u.internal_key[1].child,
+ newsibling);
+ relptr_store(base, newsibling->hdr.parent, newroot);
+ relptr_store(base, fpm->btree_root, newroot);
+ fpm->btree_depth++;
+
+ break;
+ }
+
+ /* If the parent page isn't full, insert the downlink. */
+ key = newsibling->u.internal_key[0].first_page;
+ if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
+ {
+ Size index;
+
+ index = FreePageBtreeSearchInternal(parent, key);
+ FreePageBtreeInsertInternal(base, parent, index,
+ key, newsibling);
+ relptr_store(base, newsibling->hdr.parent, parent);
+ if (index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, parent);
+ break;
+ }
+
+ /* The parent also needs to be split, so loop around. */
+ child = newsibling;
+ split_target = parent;
+ }
+
+ /*
+ * The loop above did the insert, so just need to update the free
+ * list, and we're done.
+ */
+ FreePagePushSpanLeader(fpm, first_page, npages);
+
+ return npages;
+ }
+ }
+
+ /* Physically add the key to the page. */
+ Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
+ FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);
+
+ /* If new first key on page, ancestors might need adjustment. */
+ if (result.index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+ /* Put it on the free list. */
+ FreePagePushSpanLeader(fpm, first_page, npages);
+
+ return npages;
+}
+
+/*
+ * Remove a FreePageSpanLeader from the linked-list that contains it, either
+ * because we're changing the size of the span, or because we're allocating it.
+ */
+static void
+FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *span;
+ FreePageSpanLeader *next;
+ FreePageSpanLeader *prev;
+
+ span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+
+ next = relptr_access(base, span->next);
+ prev = relptr_access(base, span->prev);
+ if (next != NULL)
+ relptr_copy(next->prev, span->prev);
+ if (prev != NULL)
+ relptr_copy(prev->next, span->next);
+ else
+ {
+ Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1;
+
+ Assert(relptr_offset(fpm->freelist[f]) == pageno * FPM_PAGE_SIZE);
+ relptr_copy(fpm->freelist[f], span->next);
+ }
+}
+
+/*
+ * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
+ */
+static void
+FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
+{
+ char *base = fpm_segment_base(fpm);
+ Size f = Min(npages, FPM_NUM_FREELISTS) - 1;
+ FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
+ FreePageSpanLeader *span;
+
+ span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
+ span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+ span->npages = npages;
+ relptr_store(base, span->next, head);
+ relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+ if (head != NULL)
+ relptr_store(base, head->prev, span);
+ relptr_store(base, fpm->freelist[f], span);
+}
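+
+/*
+ * Editor's note: free spans are bucketed by length using
+ * f = Min(npages, FPM_NUM_FREELISTS) - 1, so spans of 1 through
+ * FPM_NUM_FREELISTS - 1 pages each get a dedicated list and anything longer
+ * shares the final "oversized" list, which is the only list that
+ * FreePageManagerGetInternal() ever has to walk. The guarded sketch below
+ * (not part of the upstream file; the function name is invented) merely
+ * restates that mapping.
+ */
+#if 0
+static Size
+freelist_index_example(Size npages)
+{
+ /* 1 page -> list 0, 2 pages -> list 1, ...,
+ * npages >= FPM_NUM_FREELISTS -> the shared oversized list */
+ return Min(npages, FPM_NUM_FREELISTS) - 1;
+}
+#endif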
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/generation.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/generation.c
new file mode 100644
index 00000000000..4fb8663cd6b
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,1134 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ * Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/generation.c
+ *
+ *
+ * This memory context is based on the assumption that the chunks are freed
+ * roughly in the same order as they were allocated (FIFO), or in groups with
+ * similar lifespan (generations - hence the name of the context). This is
+ * typical for various queue-like use cases, i.e. when tuples are constructed,
+ * processed and then thrown away.
+ *
+ * The memory context uses a very simple approach to free space management.
+ * Instead of a complex global freelist, each block tracks a number
+ * of allocated and freed chunks. The block is classed as empty when the
+ * number of free chunks is equal to the number of allocated chunks. When
+ * this occurs, instead of freeing the block, we try to "recycle" it, i.e.
+ * reuse it for new allocations. This is done by setting the block in the
+ * context's 'freeblock' field. If the freeblock field is already occupied
+ * by another free block we simply return the newly empty block to malloc.
+ *
+ * This approach to free blocks requires fewer malloc/free calls for truly
+ * first allocated, first free'd allocation patterns.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "port/pg_bitutils.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/memutils_memorychunk.h"
+#include "utils/memutils_internal.h"
+
+
+#define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock))
+#define Generation_CHUNKHDRSZ sizeof(MemoryChunk)
+
+#define Generation_CHUNK_FRACTION 8
+
+typedef struct GenerationBlock GenerationBlock; /* forward reference */
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks,
+ * and freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+
+ /* Generational context parameters */
+ Size initBlockSize; /* initial block size */
+ Size maxBlockSize; /* maximum block size */
+ Size nextBlockSize; /* next block size to allocate */
+ Size allocChunkLimit; /* effective chunk size limit */
+
+ GenerationBlock *block; /* current (most recently allocated) block, or
+ * NULL if we've just freed the most recent
+ * block */
+ GenerationBlock *freeblock; /* pointer to a block that's being recycled,
+ * or NULL if there's no such block. */
+ GenerationBlock *keeper; /* keep this block over resets */
+ dlist_head blocks; /* list of blocks */
+} GenerationContext;
+
+/*
+ * GenerationBlock
+ * GenerationBlock is the unit of memory that is obtained by generation.c
+ * from malloc(). It contains zero or more MemoryChunks, which are the
+ * units requested by palloc() and freed by pfree(). MemoryChunks cannot
+ * be returned to malloc() individually, instead pfree() updates the free
+ * counter of the block and when all chunks in a block are free the whole
+ * block can be returned to malloc().
+ *
+ * GenerationBlock is the header data for a block --- the usable space
+ * within the block begins at the next alignment boundary.
+ */
+struct GenerationBlock
+{
+ dlist_node node; /* doubly-linked list of blocks */
+ GenerationContext *context; /* pointer back to the owning context */
+ Size blksize; /* allocated size of this block */
+ int nchunks; /* number of chunks in the block */
+ int nfree; /* number of free chunks */
+ char *freeptr; /* start of free space in this block */
+ char *endptr; /* end of space in this block */
+};
+
+/*
+ * GenerationIsValid
+ * True iff set is valid generation set.
+ */
+#define GenerationIsValid(set) \
+ (PointerIsValid(set) && IsA(set, GenerationContext))
+
+/*
+ * GenerationBlockIsValid
+ * True iff block is valid block of generation set.
+ */
+#define GenerationBlockIsValid(block) \
+ (PointerIsValid(block) && GenerationIsValid((block)->context))
+
+/*
+ * We always store external chunks on a dedicated block. This makes fetching
+ * the block from an external chunk easy since it's always the first and only
+ * chunk on the block.
+ */
+#define ExternalChunkGetBlock(chunk) \
+ (GenerationBlock *) ((char *) chunk - Generation_BLOCKHDRSZ)
+
+/* Inlined helper functions */
+static inline void GenerationBlockInit(GenerationContext *context,
+ GenerationBlock *block,
+ Size blksize);
+static inline bool GenerationBlockIsEmpty(GenerationBlock *block);
+static inline void GenerationBlockMarkEmpty(GenerationBlock *block);
+static inline Size GenerationBlockFreeBytes(GenerationBlock *block);
+static inline void GenerationBlockFree(GenerationContext *set,
+ GenerationBlock *block);
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ * Create a new Generation context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (must be statically allocated)
+ * minContextSize: minimum context size
+ * initBlockSize: initial allocation block size
+ * maxBlockSize: maximum allocation block size
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+ const char *name,
+ Size minContextSize,
+ Size initBlockSize,
+ Size maxBlockSize)
+{
+ Size firstBlockSize;
+ Size allocSize;
+ GenerationContext *set;
+ GenerationBlock *block;
+
+ /* ensure MemoryChunk's size is properly maxaligned */
+ StaticAssertDecl(Generation_CHUNKHDRSZ == MAXALIGN(Generation_CHUNKHDRSZ),
+ "sizeof(MemoryChunk) is not maxaligned");
+
+ /*
+ * First, validate allocation parameters. Asserts seem sufficient because
+ * nobody varies their parameters at runtime. We somewhat arbitrarily
+ * enforce a minimum 1K block size. We restrict the maximum block size to
+ * MEMORYCHUNK_MAX_BLOCKOFFSET as MemoryChunks are limited to this in
+ * regards to addressing the offset between the chunk and the block that
+ * the chunk is stored on. We would be unable to store the offset between
+ * the chunk and block for any chunks that were beyond
+ * MEMORYCHUNK_MAX_BLOCKOFFSET bytes into the block if the block was to be
+ * larger than this.
+ */
+ Assert(initBlockSize == MAXALIGN(initBlockSize) &&
+ initBlockSize >= 1024);
+ Assert(maxBlockSize == MAXALIGN(maxBlockSize) &&
+ maxBlockSize >= initBlockSize &&
+ AllocHugeSizeIsValid(maxBlockSize)); /* must be safe to double */
+ Assert(minContextSize == 0 ||
+ (minContextSize == MAXALIGN(minContextSize) &&
+ minContextSize >= 1024 &&
+ minContextSize <= maxBlockSize));
+ Assert(maxBlockSize <= MEMORYCHUNK_MAX_BLOCKOFFSET);
+
+ /* Determine size of initial block */
+ allocSize = MAXALIGN(sizeof(GenerationContext)) +
+ Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+ if (minContextSize != 0)
+ allocSize = Max(allocSize, minContextSize);
+ else
+ allocSize = Max(allocSize, initBlockSize);
+
+ /*
+ * Allocate the initial block. Unlike other generation.c blocks, it
+ * starts with the context header and its block header follows that.
+ */
+ set = (GenerationContext *) malloc(allocSize);
+ if (set == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while creating memory context \"%s\".",
+ name)));
+ }
+
+ /*
+ * Avoid writing code that can fail between here and MemoryContextCreate;
+ * we'd leak the header if we ereport in this stretch.
+ */
+ dlist_init(&set->blocks);
+
+ /* Fill in the initial block's block header */
+ block = (GenerationBlock *) (((char *) set) + MAXALIGN(sizeof(GenerationContext)));
+ /* determine the block size and initialize it */
+ firstBlockSize = allocSize - MAXALIGN(sizeof(GenerationContext));
+ GenerationBlockInit(set, block, firstBlockSize);
+
+ /* add it to the doubly-linked list of blocks */
+ dlist_push_head(&set->blocks, &block->node);
+
+ /* use it as the current allocation block */
+ set->block = block;
+
+ /* No free block, yet */
+ set->freeblock = NULL;
+
+ /* Mark block as not to be released at reset time */
+ set->keeper = block;
+
+ /* Fill in GenerationContext-specific header fields */
+ set->initBlockSize = initBlockSize;
+ set->maxBlockSize = maxBlockSize;
+ set->nextBlockSize = initBlockSize;
+
+ /*
+ * Compute the allocation chunk size limit for this context.
+ *
+ * Limit the maximum size a non-dedicated chunk can be so that we can fit
+ * at least Generation_CHUNK_FRACTION of chunks this big onto the maximum
+ * sized block. We must further limit this value so that it's no more
+ * than MEMORYCHUNK_MAX_VALUE. We're unable to have non-external chunks
+ * larger than that value as we store the chunk size in the MemoryChunk
+ * 'value' field in the call to MemoryChunkSetHdrMask().
+ */
+ set->allocChunkLimit = Min(maxBlockSize, MEMORYCHUNK_MAX_VALUE);
+ while ((Size) (set->allocChunkLimit + Generation_CHUNKHDRSZ) >
+ (Size) ((Size) (maxBlockSize - Generation_BLOCKHDRSZ) / Generation_CHUNK_FRACTION))
+ set->allocChunkLimit >>= 1;
+
+ /* Finally, do the type-independent part of context creation */
+ MemoryContextCreate((MemoryContext) set,
+ T_GenerationContext,
+ MCTX_GENERATION_ID,
+ parent,
+ name);
+
+ ((MemoryContext) set)->mem_allocated = firstBlockSize;
+
+ return (MemoryContext) set;
+}
+
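+/*
+ * Editor's note: the guarded sketch below (not part of the upstream file;
+ * the function name and sizes are invented for illustration) shows a minimal
+ * usage pattern for a Generation context via the generic mcxt.c entry points
+ * (MemoryContextSwitchTo, palloc, pfree, MemoryContextDelete), assuming those
+ * are available as usual. The sizes satisfy the asserts above: maxaligned
+ * and at least 1 kB.
+ */
+#if 0
+static void
+generation_usage_example(void)
+{
+ MemoryContext gencxt;
+ MemoryContext oldcxt;
+ char *chunks[100];
+ int i;
+
+ gencxt = GenerationContextCreate(CurrentMemoryContext,
+ "example generation context",
+ 0, /* minContextSize */
+ 8 * 1024, /* initBlockSize */
+ 1024 * 1024); /* maxBlockSize */
+
+ oldcxt = MemoryContextSwitchTo(gencxt);
+
+ /* Allocate, then free in roughly FIFO order -- the intended workload. */
+ for (i = 0; i < 100; i++)
+ chunks[i] = palloc(64);
+ for (i = 0; i < 100; i++)
+ pfree(chunks[i]);
+
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(gencxt);
+}
+#endif
+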
+/*
+ * GenerationReset
+ * Frees all memory which is allocated in the given set.
+ *
+ * The code frees all the blocks in the context except the keeper block,
+ * which is merely marked empty so that the context can be reused without
+ * allocating a new block.
+ */
+void
+GenerationReset(MemoryContext context)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ dlist_mutable_iter miter;
+
+ Assert(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ GenerationCheck(context);
+#endif
+
+ /*
+ * NULLify the free block pointer. We must do this before calling
+ * GenerationBlockFree as that function never expects to free the
+ * freeblock.
+ */
+ set->freeblock = NULL;
+
+ dlist_foreach_modify(miter, &set->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, miter.cur);
+
+ if (block == set->keeper)
+ GenerationBlockMarkEmpty(block);
+ else
+ GenerationBlockFree(set, block);
+ }
+
+ /* set it so new allocations make use of the keeper block */
+ set->block = set->keeper;
+
+ /* Reset block size allocation sequence, too */
+ set->nextBlockSize = set->initBlockSize;
+
+ /* Ensure there is only 1 item in the dlist */
+ Assert(!dlist_is_empty(&set->blocks));
+ Assert(!dlist_has_next(&set->blocks, dlist_head_node(&set->blocks)));
+}
+
+/*
+ * GenerationDelete
+ * Free all memory which is allocated in the given context.
+ */
+void
+GenerationDelete(MemoryContext context)
+{
+ /* Reset to release all releasable GenerationBlocks */
+ GenerationReset(context);
+ /* And free the context header and keeper block */
+ free(context);
+}
+
+/*
+ * GenerationAlloc
+ * Returns pointer to allocated memory of given size or NULL if
+ * request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ * MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ *
+ * Note: when using valgrind, it doesn't matter how the returned allocation
+ * is marked, as mcxt.c will set it to UNDEFINED. In some paths we will
+ * return space that is marked NOACCESS - GenerationRealloc has to beware!
+ */
+void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ GenerationBlock *block;
+ MemoryChunk *chunk;
+ Size chunk_size;
+ Size required_size;
+
+ Assert(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* ensure there's always space for the sentinel byte */
+ chunk_size = MAXALIGN(size + 1);
+#else
+ chunk_size = MAXALIGN(size);
+#endif
+ required_size = chunk_size + Generation_CHUNKHDRSZ;
+
+ /* is it an over-sized chunk? if yes, allocate special block */
+ if (chunk_size > set->allocChunkLimit)
+ {
+ Size blksize = required_size + Generation_BLOCKHDRSZ;
+
+ block = (GenerationBlock *) malloc(blksize);
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ /* block with a single (used) chunk */
+ block->context = set;
+ block->blksize = blksize;
+ block->nchunks = 1;
+ block->nfree = 0;
+
+ /* the block is completely full */
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ chunk = (MemoryChunk *) (((char *) block) + Generation_BLOCKHDRSZ);
+
+ /* mark the MemoryChunk as externally managed */
+ MemoryChunkSetHdrMaskExternal(chunk, MCTX_GENERATION_ID);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ Assert(size < chunk_size);
+ set_sentinel(MemoryChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
+#endif
+
+ /* add the block to the list of allocated blocks */
+ dlist_push_head(&set->blocks, &block->node);
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) MemoryChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+
+ return MemoryChunkGetPointer(chunk);
+ }
+
+ /*
+ * Not an oversized chunk. We try to first make use of the current block,
+ * but if there's not enough space in it, instead of allocating a new
+ * block, we look to see if the freeblock is empty and has enough space.
+ * If not, we'll also try the same using the keeper block. The keeper
+ * block may have become empty and we have no other way to reuse it again
+ * if we don't try to use it explicitly here.
+ *
+ * We don't want to start filling the freeblock before the current block
+ * is full, otherwise we may cause fragmentation in FIFO type workloads.
+ * We only switch to using the freeblock or keeper block if those blocks
+ * are completely empty. If we didn't do that we could end up fragmenting
+ * consecutive allocations over multiple blocks which would be a problem
+ * that would compound over time.
+ */
+ block = set->block;
+
+ if (block == NULL ||
+ GenerationBlockFreeBytes(block) < required_size)
+ {
+ Size blksize;
+ GenerationBlock *freeblock = set->freeblock;
+
+ if (freeblock != NULL &&
+ GenerationBlockIsEmpty(freeblock) &&
+ GenerationBlockFreeBytes(freeblock) >= required_size)
+ {
+ block = freeblock;
+
+ /*
+ * Zero out the freeblock as we'll set this to the current block
+ * below
+ */
+ set->freeblock = NULL;
+ }
+ else if (GenerationBlockIsEmpty(set->keeper) &&
+ GenerationBlockFreeBytes(set->keeper) >= required_size)
+ {
+ block = set->keeper;
+ }
+ else
+ {
+ /*
+ * The first such block has size initBlockSize, and we double the
+ * space in each succeeding block, but not more than maxBlockSize.
+ */
+ blksize = set->nextBlockSize;
+ set->nextBlockSize <<= 1;
+ if (set->nextBlockSize > set->maxBlockSize)
+ set->nextBlockSize = set->maxBlockSize;
+
+ /* we'll need a block hdr too, so add that to the required size */
+ required_size += Generation_BLOCKHDRSZ;
+
+ /* round the size up to the next power of 2 */
+ if (blksize < required_size)
+ blksize = pg_nextpower2_size_t(required_size);
+
+ block = (GenerationBlock *) malloc(blksize);
+
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ /* initialize the new block */
+ GenerationBlockInit(set, block, blksize);
+
+ /* add it to the doubly-linked list of blocks */
+ dlist_push_head(&set->blocks, &block->node);
+
+ /* Zero out the freeblock in case it's become full */
+ set->freeblock = NULL;
+ }
+
+ /* and also use it as the current allocation block */
+ set->block = block;
+ }
+
+ /* we're supposed to have a block with enough free space now */
+ Assert(block != NULL);
+ Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+ chunk = (MemoryChunk *) block->freeptr;
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ block->nchunks += 1;
+ block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+ Assert(block->freeptr <= block->endptr);
+
+ MemoryChunkSetHdrMask(chunk, block, chunk_size, MCTX_GENERATION_ID);
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ Assert(size < chunk_size);
+ set_sentinel(MemoryChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) MemoryChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+
+ return MemoryChunkGetPointer(chunk);
+}
+
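+/*
+ * Editor's note: for ordinary (non-oversized) requests, the blocks chosen in
+ * GenerationAlloc() simply double in size from initBlockSize up to
+ * maxBlockSize; with initBlockSize = 8 kB and maxBlockSize = 1 MB the
+ * sequence is 8k, 16k, 32k, ..., 1M, 1M, ... The guarded sketch below (not
+ * part of the upstream file; the function name is invented) restates that
+ * progression.
+ */
+#if 0
+static void
+generation_block_growth_example(Size initBlockSize, Size maxBlockSize)
+{
+ Size nextBlockSize = initBlockSize;
+ int i;
+
+ for (i = 0; i < 10; i++)
+ {
+ Size blksize = nextBlockSize;
+
+ nextBlockSize <<= 1;
+ if (nextBlockSize > maxBlockSize)
+ nextBlockSize = maxBlockSize;
+
+ elog(DEBUG1, "block %d would be %zu bytes", i, blksize);
+ }
+}
+#endif
+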
+/*
+ * GenerationBlockInit
+ * Initializes 'block' assuming 'blksize'. Does not update the context's
+ * mem_allocated field.
+ */
+static inline void
+GenerationBlockInit(GenerationContext *context, GenerationBlock *block,
+ Size blksize)
+{
+ block->context = context;
+ block->blksize = blksize;
+ block->nchunks = 0;
+ block->nfree = 0;
+
+ block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+ block->endptr = ((char *) block) + blksize;
+
+ /* Mark unallocated space NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+ blksize - Generation_BLOCKHDRSZ);
+}
+
+/*
+ * GenerationBlockIsEmpty
+ * Returns true iff 'block' contains no chunks
+ */
+static inline bool
+GenerationBlockIsEmpty(GenerationBlock *block)
+{
+ return (block->nchunks == 0);
+}
+
+/*
+ * GenerationBlockMarkEmpty
+ * Set a block as empty. Does not free the block.
+ */
+static inline void
+GenerationBlockMarkEmpty(GenerationBlock *block)
+{
+#if defined(USE_VALGRIND) || defined(CLOBBER_FREED_MEMORY)
+ char *datastart = ((char *) block) + Generation_BLOCKHDRSZ;
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(datastart, block->freeptr - datastart);
+#else
+ /* wipe_mem() would have done this */
+ VALGRIND_MAKE_MEM_NOACCESS(datastart, block->freeptr - datastart);
+#endif
+
+ /* Reset the block, but don't return it to malloc */
+ block->nchunks = 0;
+ block->nfree = 0;
+ block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+}
+
+/*
+ * GenerationBlockFreeBytes
+ * Returns the number of bytes free in 'block'
+ */
+static inline Size
+GenerationBlockFreeBytes(GenerationBlock *block)
+{
+ return (block->endptr - block->freeptr);
+}
+
+/*
+ * GenerationBlockFree
+ * Remove 'block' from 'set' and release the memory consumed by it.
+ */
+static inline void
+GenerationBlockFree(GenerationContext *set, GenerationBlock *block)
+{
+ /* Make sure nobody tries to free the keeper block */
+ Assert(block != set->keeper);
+ /* We shouldn't be freeing the freeblock either */
+ Assert(block != set->freeblock);
+
+ /* release the block from the list of blocks */
+ dlist_delete(&block->node);
+
+ ((MemoryContext) set)->mem_allocated -= block->blksize;
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->blksize);
+#endif
+
+ free(block);
+}
+
+/*
+ * GenerationFree
+ * Update number of chunks in the block, and if all chunks in the block
+ * are now free then discard the block.
+ */
+void
+GenerationFree(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ GenerationBlock *block;
+ GenerationContext *set;
+#if (defined(MEMORY_CONTEXT_CHECKING) && defined(USE_ASSERT_CHECKING)) \
+ || defined(CLOBBER_FREED_MEMORY)
+ Size chunksize;
+#endif
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ block = ExternalChunkGetBlock(chunk);
+
+ /*
+ * Try to verify that we have a sane block pointer: the block header
+ * should reference a generation context.
+ */
+ if (!GenerationBlockIsValid(block))
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+
+#if (defined(MEMORY_CONTEXT_CHECKING) && defined(USE_ASSERT_CHECKING)) \
+ || defined(CLOBBER_FREED_MEMORY)
+ chunksize = block->endptr - (char *) pointer;
+#endif
+ }
+ else
+ {
+ block = MemoryChunkGetBlock(chunk);
+
+ /*
+ * In this path, for speed reasons we just Assert that the referenced
+ * block is good. Future field experience may show that this Assert
+ * had better become a regular runtime test-and-elog check.
+ */
+ Assert(GenerationBlockIsValid(block));
+
+#if (defined(MEMORY_CONTEXT_CHECKING) && defined(USE_ASSERT_CHECKING)) \
+ || defined(CLOBBER_FREED_MEMORY)
+ chunksize = MemoryChunkGetValue(chunk);
+#endif
+ }
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ Assert(chunk->requested_size < chunksize);
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ ((MemoryContext) block->context)->name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(pointer, chunksize);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Reset requested_size to InvalidAllocSize in freed chunks */
+ chunk->requested_size = InvalidAllocSize;
+#endif
+
+ block->nfree += 1;
+
+ Assert(block->nchunks > 0);
+ Assert(block->nfree <= block->nchunks);
+
+ /* If there are still allocated chunks in the block, we're done. */
+ if (block->nfree < block->nchunks)
+ return;
+
+ set = block->context;
+
+ /* Don't try to free the keeper block, just mark it empty */
+ if (block == set->keeper)
+ {
+ GenerationBlockMarkEmpty(block);
+ return;
+ }
+
+ /*
+ * If there is no freeblock set or if this is the freeblock then instead
+ * of freeing this memory, we keep it around so that new allocations have
+ * the option of recycling it.
+ */
+ if (set->freeblock == NULL || set->freeblock == block)
+ {
+ /* XXX should we only recycle maxBlockSize sized blocks? */
+ set->freeblock = block;
+ GenerationBlockMarkEmpty(block);
+ return;
+ }
+
+ /* Also make sure the block is not marked as the current block. */
+ if (set->block == block)
+ set->block = NULL;
+
+ /*
+ * The block is empty, so let's get rid of it. First remove it from the
+ * list of blocks, then return it to malloc().
+ */
+ dlist_delete(&block->node);
+
+ set->header.mem_allocated -= block->blksize;
+ free(block);
+}
+
+/*
+ * GenerationRealloc
+ * When handling repalloc, we simply allocate a new chunk, copy the data
+ * and discard the old one. The only exception is when the new size fits
+ * into the old chunk - in that case we just update chunk header.
+ */
+void *
+GenerationRealloc(void *pointer, Size size)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ GenerationContext *set;
+ GenerationBlock *block;
+ GenerationPointer newPointer;
+ Size oldsize;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ block = ExternalChunkGetBlock(chunk);
+
+ /*
+ * Try to verify that we have a sane block pointer: the block header
+ * should reference a generation context.
+ */
+ if (!GenerationBlockIsValid(block))
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+
+ oldsize = block->endptr - (char *) pointer;
+ }
+ else
+ {
+ block = MemoryChunkGetBlock(chunk);
+
+ /*
+ * In this path, for speed reasons we just Assert that the referenced
+ * block is good. Future field experience may show that this Assert
+ * had better become a regular runtime test-and-elog check.
+ */
+ Assert(GenerationBlockIsValid(block));
+
+ oldsize = MemoryChunkGetValue(chunk);
+ }
+
+ set = block->context;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ Assert(chunk->requested_size < oldsize);
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ ((MemoryContext) set)->name, chunk);
+#endif
+
+ /*
+ * Maybe the allocated area already is >= the new size. (In particular,
+ * we always fall out here if the requested size is a decrease.)
+ *
+ * This memory context does not use power-of-2 chunk sizing and instead
+ * carves the chunks to be as small as possible, so most repalloc() calls
+ * will end up in the palloc/memcpy/pfree branch.
+ *
+ * XXX Perhaps we should annotate this condition with unlikely()?
+ */
+ if (oldsize >= size)
+ {
+#ifdef MEMORY_CONTEXT_CHECKING
+ Size oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ if (size > oldrequest)
+ randomize_mem((char *) pointer + oldrequest,
+ size - oldrequest);
+#endif
+
+ chunk->requested_size = size;
+
+ /*
+ * If this is an increase, mark any newly-available part UNDEFINED.
+ * Otherwise, mark the obsolete part NOACCESS.
+ */
+ if (size > oldrequest)
+ VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+ size - oldrequest);
+ else
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+ oldsize - size);
+
+ /* set mark to catch clobber of "unused" space */
+ set_sentinel(pointer, size);
+#else /* !MEMORY_CONTEXT_CHECKING */
+
+ /*
+ * We don't have the information to determine whether we're growing
+ * the old request or shrinking it, so we conservatively mark the
+ * entire new allocation DEFINED.
+ */
+ VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+ VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+
+ return pointer;
+ }
+
+ /* allocate new chunk */
+ newPointer = GenerationAlloc((MemoryContext) set, size);
+
+ /* leave immediately if request was not completed */
+ if (newPointer == NULL)
+ {
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+ return NULL;
+ }
+
+ /*
+ * GenerationAlloc() may have returned a region that is still NOACCESS.
+ * Change it to UNDEFINED for the moment; memcpy() will then transfer
+ * definedness from the old allocation to the new. If we know the old
+ * allocation, copy just that much. Otherwise, make the entire old chunk
+ * defined to avoid errors as we copy the currently-NOACCESS trailing
+ * bytes.
+ */
+ VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+ oldsize = chunk->requested_size;
+#else
+ VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+ /* transfer existing data (certain to fit) */
+ memcpy(newPointer, pointer, oldsize);
+
+ /* free old chunk */
+ GenerationFree(pointer);
+
+ return newPointer;
+}
+
+/*
+ * GenerationGetChunkContext
+ * Return the MemoryContext that 'pointer' belongs to.
+ */
+MemoryContext
+GenerationGetChunkContext(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ GenerationBlock *block;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ block = ExternalChunkGetBlock(chunk);
+ else
+ block = (GenerationBlock *) MemoryChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+
+ Assert(GenerationBlockIsValid(block));
+ return &block->context->header;
+}
+
+/*
+ * GenerationGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+Size
+GenerationGetChunkSpace(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ Size chunksize;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ GenerationBlock *block = ExternalChunkGetBlock(chunk);
+
+ Assert(GenerationBlockIsValid(block));
+ chunksize = block->endptr - (char *) pointer;
+ }
+ else
+ chunksize = MemoryChunkGetValue(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+
+ return Generation_CHUNKHDRSZ + chunksize;
+}
+
+/*
+ * GenerationIsEmpty
+ * Is a GenerationContext empty of any allocated space?
+ */
+bool
+GenerationIsEmpty(MemoryContext context)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ dlist_iter iter;
+
+ Assert(GenerationIsValid(set));
+
+ dlist_foreach(iter, &set->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+ if (block->nchunks > 0)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * GenerationStats
+ * Compute stats about memory consumption of a Generation context.
+ *
+ * printfunc: if not NULL, pass a human-readable stats string to this.
+ * passthru: pass this pointer through to printfunc.
+ * totals: if not NULL, add stats about this context into *totals.
+ * print_to_stderr: print stats to stderr if true, elog otherwise.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+void
+GenerationStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals, bool print_to_stderr)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ Size nblocks = 0;
+ Size nchunks = 0;
+ Size nfreechunks = 0;
+ Size totalspace;
+ Size freespace = 0;
+ dlist_iter iter;
+
+ Assert(GenerationIsValid(set));
+
+ /* Include context header in totalspace */
+ totalspace = MAXALIGN(sizeof(GenerationContext));
+
+ dlist_foreach(iter, &set->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+ nblocks++;
+ nchunks += block->nchunks;
+ nfreechunks += block->nfree;
+ totalspace += block->blksize;
+ freespace += (block->endptr - block->freeptr);
+ }
+
+ if (printfunc)
+ {
+ char stats_string[200];
+
+ snprintf(stats_string, sizeof(stats_string),
+ "%zu total in %zu blocks (%zu chunks); %zu free (%zu chunks); %zu used",
+ totalspace, nblocks, nchunks, freespace,
+ nfreechunks, totalspace - freespace);
+ printfunc(context, passthru, stats_string, print_to_stderr);
+ }
+
+ if (totals)
+ {
+ totals->nblocks += nblocks;
+ totals->freechunks += nfreechunks;
+ totals->totalspace += totalspace;
+ totals->freespace += freespace;
+ }
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ * Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+void
+GenerationCheck(MemoryContext context)
+{
+ GenerationContext *gen = (GenerationContext *) context;
+ const char *name = context->name;
+ dlist_iter iter;
+ Size total_allocated = 0;
+
+ /* walk all blocks in this context */
+ dlist_foreach(iter, &gen->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+ int nfree,
+ nchunks;
+ char *ptr;
+ bool has_external_chunk = false;
+
+ total_allocated += block->blksize;
+
+ /*
+ * nfree > nchunks is surely wrong. Equality is allowed as the block
+ * might be completely empty if it's the freeblock.
+ */
+ if (block->nfree > block->nchunks)
+ elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+ name, block->nfree, block, block->nchunks);
+
+ /* check block belongs to the correct context */
+ if (block->context != gen)
+ elog(WARNING, "problem in Generation %s: bogus context link in block %p",
+ name, block);
+
+ /* Now walk through the chunks and count them. */
+ nfree = 0;
+ nchunks = 0;
+ ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+ while (ptr < block->freeptr)
+ {
+ MemoryChunk *chunk = (MemoryChunk *) ptr;
+ GenerationBlock *chunkblock;
+ Size chunksize;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ if (MemoryChunkIsExternal(chunk))
+ {
+ chunkblock = ExternalChunkGetBlock(chunk);
+ chunksize = block->endptr - (char *) MemoryChunkGetPointer(chunk);
+ has_external_chunk = true;
+ }
+ else
+ {
+ chunkblock = MemoryChunkGetBlock(chunk);
+ chunksize = MemoryChunkGetValue(chunk);
+ }
+
+ /* move to the next chunk */
+ ptr += (chunksize + Generation_CHUNKHDRSZ);
+
+ nchunks += 1;
+
+ /* chunks have both block and context pointers, so check both */
+ if (chunkblock != block)
+ elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+ name, block, chunk);
+
+
+ /* is chunk allocated? */
+ if (chunk->requested_size != InvalidAllocSize)
+ {
+ /* now make sure the chunk size is correct */
+ if (chunksize < chunk->requested_size ||
+ chunksize != MAXALIGN(chunksize))
+ elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+ name, block, chunk);
+
+ /* check sentinel */
+ Assert(chunk->requested_size < chunksize);
+ if (!sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->requested_size))
+ elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+ }
+ else
+ nfree += 1;
+
+ /* if chunk is allocated, disallow access to the chunk header */
+ if (chunk->requested_size != InvalidAllocSize)
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Generation_CHUNKHDRSZ);
+ }
+
+ /*
+ * Make sure we got the expected number of allocated and free chunks
+ * (as tracked in the block header).
+ */
+ if (nchunks != block->nchunks)
+ elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+ name, nchunks, block, block->nchunks);
+
+ if (nfree != block->nfree)
+ elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+ name, nfree, block, block->nfree);
+
+ if (has_external_chunk && nchunks > 1)
+ elog(WARNING, "problem in Generation %s: external chunk on non-dedicated block %p",
+ name, block);
+
+ }
+
+ Assert(total_allocated == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/mcxt.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/mcxt.c
new file mode 100644
index 00000000000..beabfec00f4
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/mcxt.c
@@ -0,0 +1,1732 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcxt.c
+ * POSTGRES memory context management code.
+ *
+ * This module handles context management operations that are independent
+ * of the particular kind of context being operated on. It calls
+ * context-type-specific operations via the function pointers in a
+ * context's MemoryContextMethods struct.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/mcxt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "utils/fmgrprotos.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/memutils_internal.h"
+#include "utils/memutils_memorychunk.h"
+
+
+static void BogusFree(void *pointer);
+static void *BogusRealloc(void *pointer, Size size);
+static MemoryContext BogusGetChunkContext(void *pointer);
+static Size BogusGetChunkSpace(void *pointer);
+
+extern void *ArenaAlloc(MemoryContext context, Size size);
+extern void ArenaFree(void *pointer);
+extern void *ArenaRealloc(void *pointer, Size size);
+extern void ArenaReset(MemoryContext context);
+extern void ArenaDelete(MemoryContext context);
+extern MemoryContext ArenaGetChunkContext(void *pointer);
+extern Size ArenaGetChunkSpace(void *pointer);
+extern bool ArenaIsEmpty(MemoryContext context);
+extern void ArenaStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+#ifdef MEMORY_CONTEXT_CHECKING
+extern void ArenaCheck(MemoryContext context);
+#endif
+
+extern void *MkqlAlloc(MemoryContext context, Size size);
+extern void MkqlFree(void *pointer);
+extern void *MkqlRealloc(void *pointer, Size size);
+extern void MkqlReset(MemoryContext context);
+extern void MkqlDelete(MemoryContext context);
+extern MemoryContext MkqlGetChunkContext(void *pointer);
+extern Size MkqlGetChunkSpace(void *pointer);
+extern bool MkqlIsEmpty(MemoryContext context);
+extern void MkqlStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+#ifdef MEMORY_CONTEXT_CHECKING
+extern void MkqlCheck(MemoryContext context);
+#endif
+
+/*****************************************************************************
+ * GLOBAL MEMORY *
+ *****************************************************************************/
+
+static const MemoryContextMethods mcxt_methods[] = {
+ /* aset.c */
+ [MCTX_ASET_ID].alloc = AllocSetAlloc,
+ [MCTX_ASET_ID].free_p = AllocSetFree,
+ [MCTX_ASET_ID].realloc = AllocSetRealloc,
+ [MCTX_ASET_ID].reset = AllocSetReset,
+ [MCTX_ASET_ID].delete_context = AllocSetDelete,
+ [MCTX_ASET_ID].get_chunk_context = AllocSetGetChunkContext,
+ [MCTX_ASET_ID].get_chunk_space = AllocSetGetChunkSpace,
+ [MCTX_ASET_ID].is_empty = AllocSetIsEmpty,
+ [MCTX_ASET_ID].stats = AllocSetStats,
+#ifdef MEMORY_CONTEXT_CHECKING
+ [MCTX_ASET_ID].check = AllocSetCheck,
+#endif
+
+ /* generation.c */
+ [MCTX_GENERATION_ID].alloc = GenerationAlloc,
+ [MCTX_GENERATION_ID].free_p = GenerationFree,
+ [MCTX_GENERATION_ID].realloc = GenerationRealloc,
+ [MCTX_GENERATION_ID].reset = GenerationReset,
+ [MCTX_GENERATION_ID].delete_context = GenerationDelete,
+ [MCTX_GENERATION_ID].get_chunk_context = GenerationGetChunkContext,
+ [MCTX_GENERATION_ID].get_chunk_space = GenerationGetChunkSpace,
+ [MCTX_GENERATION_ID].is_empty = GenerationIsEmpty,
+ [MCTX_GENERATION_ID].stats = GenerationStats,
+#ifdef MEMORY_CONTEXT_CHECKING
+ [MCTX_GENERATION_ID].check = GenerationCheck,
+#endif
+
+ /* slab.c */
+ [MCTX_SLAB_ID].alloc = SlabAlloc,
+ [MCTX_SLAB_ID].free_p = SlabFree,
+ [MCTX_SLAB_ID].realloc = SlabRealloc,
+ [MCTX_SLAB_ID].reset = SlabReset,
+ [MCTX_SLAB_ID].delete_context = SlabDelete,
+ [MCTX_SLAB_ID].get_chunk_context = SlabGetChunkContext,
+ [MCTX_SLAB_ID].get_chunk_space = SlabGetChunkSpace,
+ [MCTX_SLAB_ID].is_empty = SlabIsEmpty,
+ [MCTX_SLAB_ID].stats = SlabStats,
+#ifdef MEMORY_CONTEXT_CHECKING
+ [MCTX_SLAB_ID].check = SlabCheck,
+#endif
+
+ /* alignedalloc.c */
+ [MCTX_ALIGNED_REDIRECT_ID].alloc = NULL, /* not required */
+ [MCTX_ALIGNED_REDIRECT_ID].free_p = AlignedAllocFree,
+ [MCTX_ALIGNED_REDIRECT_ID].realloc = AlignedAllocRealloc,
+ [MCTX_ALIGNED_REDIRECT_ID].reset = NULL, /* not required */
+ [MCTX_ALIGNED_REDIRECT_ID].delete_context = NULL, /* not required */
+ [MCTX_ALIGNED_REDIRECT_ID].get_chunk_context = AlignedAllocGetChunkContext,
+ [MCTX_ALIGNED_REDIRECT_ID].get_chunk_space = AlignedAllocGetChunkSpace,
+ [MCTX_ALIGNED_REDIRECT_ID].is_empty = NULL, /* not required */
+ [MCTX_ALIGNED_REDIRECT_ID].stats = NULL, /* not required */
+#ifdef MEMORY_CONTEXT_CHECKING
+ [MCTX_ALIGNED_REDIRECT_ID].check = NULL, /* not required */
+#endif
+
+
+ /*
+ * Unused (as yet) IDs should have dummy entries here. This allows us to
+ * fail cleanly if a bogus pointer is passed to pfree or the like. It
+ * seems sufficient to provide routines for the methods that might get
+ * invoked from inspection of a chunk (see MCXT_METHOD calls below).
+ */
+
+ [MCTX_UNUSED1_ID].free_p = BogusFree,
+ [MCTX_UNUSED1_ID].realloc = BogusRealloc,
+ [MCTX_UNUSED1_ID].get_chunk_context = BogusGetChunkContext,
+ [MCTX_UNUSED1_ID].get_chunk_space = BogusGetChunkSpace,
+
+ /* Arena based allocator */
+ [MCTX_UNUSED2_ID].alloc = ArenaAlloc,
+ [MCTX_UNUSED2_ID].free_p = ArenaFree,
+ [MCTX_UNUSED2_ID].realloc = ArenaRealloc,
+ [MCTX_UNUSED2_ID].reset = ArenaReset,
+ [MCTX_UNUSED2_ID].delete_context = ArenaDelete,
+ [MCTX_UNUSED2_ID].get_chunk_context = ArenaGetChunkContext,
+ [MCTX_UNUSED2_ID].get_chunk_space = ArenaGetChunkSpace,
+ [MCTX_UNUSED2_ID].is_empty = ArenaIsEmpty,
+ [MCTX_UNUSED2_ID].stats = ArenaStats,
+#ifdef MEMORY_CONTEXT_CHECKING
+ [MCTX_UNUSED2_ID].check = ArenaCheck,
+#endif
+
+ /* MKQL based allocator */
+ [MCTX_UNUSED3_ID].alloc = MkqlAlloc,
+ [MCTX_UNUSED3_ID].free_p = MkqlFree,
+ [MCTX_UNUSED3_ID].realloc = MkqlRealloc,
+ [MCTX_UNUSED3_ID].reset = MkqlReset,
+ [MCTX_UNUSED3_ID].delete_context = MkqlDelete,
+ [MCTX_UNUSED3_ID].get_chunk_context = MkqlGetChunkContext,
+ [MCTX_UNUSED3_ID].get_chunk_space = MkqlGetChunkSpace,
+ [MCTX_UNUSED3_ID].is_empty = MkqlIsEmpty,
+ [MCTX_UNUSED3_ID].stats = MkqlStats,
+#ifdef MEMORY_CONTEXT_CHECKING
+ [MCTX_UNUSED3_ID].check = MkqlCheck,
+#endif
+
+ [MCTX_UNUSED4_ID].free_p = BogusFree,
+ [MCTX_UNUSED4_ID].realloc = BogusRealloc,
+ [MCTX_UNUSED4_ID].get_chunk_context = BogusGetChunkContext,
+ [MCTX_UNUSED4_ID].get_chunk_space = BogusGetChunkSpace,
+};
+
+/*
+ * CurrentMemoryContext
+ * Default memory context for allocations.
+ */
+__thread MemoryContext CurrentMemoryContext = NULL;
+MemoryContext* ImplPtrCurrentMemoryContext() { return &CurrentMemoryContext; }
+
+/*
+ * Standard top-level contexts. For a description of the purpose of each
+ * of these contexts, refer to src/backend/utils/mmgr/README
+ */
+__thread MemoryContext TopMemoryContext = NULL;
+__thread MemoryContext ErrorContext = NULL;
+__thread MemoryContext PostmasterContext = NULL;
+__thread MemoryContext CacheMemoryContext = NULL;
+MemoryContext* ImplPtrCacheMemoryContext() { return &CacheMemoryContext; }
+__thread MemoryContext MessageContext = NULL;
+__thread MemoryContext TopTransactionContext = NULL;
+__thread MemoryContext CurTransactionContext = NULL;
+
+/* This is a transient link to the active portal's memory context: */
+__thread MemoryContext PortalContext = NULL;
+
+static void MemoryContextCallResetCallbacks(MemoryContext context);
+static void MemoryContextStatsInternal(MemoryContext context, int level,
+ bool print, int max_children,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+static void MemoryContextStatsPrint(MemoryContext context, void *passthru,
+ const char *stats_string,
+ bool print_to_stderr);
+
+/*
+ * You should not do memory allocations within a critical section, because
+ * an out-of-memory error will be escalated to a PANIC. To enforce that
+ * rule, the allocation functions Assert that.
+ */
+#define AssertNotInCriticalSection(context) \
+ Assert(CritSectionCount == 0 || (context)->allowInCritSection)
+
+/*
+ * Call the given function in the MemoryContextMethods for the memory context
+ * type that 'pointer' belongs to.
+ */
+#define MCXT_METHOD(pointer, method) \
+ mcxt_methods[GetMemoryChunkMethodID(pointer)].method
+
+/*
+ * GetMemoryChunkMethodID
+ * Return the MemoryContextMethodID from the uint64 chunk header which
+ * directly precedes 'pointer'.
+ */
+static inline MemoryContextMethodID
+GetMemoryChunkMethodID(const void *pointer)
+{
+ uint64 header;
+
+ /*
+ * Try to detect bogus pointers handed to us, poorly though we can.
+ * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
+ * allocated chunk.
+ */
+ Assert(pointer == (const void *) MAXALIGN(pointer));
+
+ /* Allow access to the uint64 header */
+ VALGRIND_MAKE_MEM_DEFINED((char *) pointer - sizeof(uint64), sizeof(uint64));
+
+ header = *((const uint64 *) ((const char *) pointer - sizeof(uint64)));
+
+ /* Disallow access to the uint64 header */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer - sizeof(uint64), sizeof(uint64));
+
+ return (MemoryContextMethodID) (header & MEMORY_CONTEXT_METHODID_MASK);
+}
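+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (the MCXT_USAGE_EXAMPLES guard is hypothetical
+ * and never defined): it shows how the method ID decoded above selects the
+ * mcxt_methods[] entry that will service pfree()/repalloc() for a chunk.
+ */
+static void
+mcxt_example_decode_method(void *pointer)
+{
+ MemoryContextMethodID id = GetMemoryChunkMethodID(pointer);
+
+ elog(DEBUG1, "chunk %p is serviced by mcxt_methods[%d]", pointer, (int) id);
+}
+#endif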
+
+/*
+ * GetMemoryChunkHeader
+ * Return the uint64 chunk header which directly precedes 'pointer'.
+ *
+ * This is only used after GetMemoryChunkMethodID, so no need for error checks.
+ */
+static inline uint64
+GetMemoryChunkHeader(const void *pointer)
+{
+ uint64 header;
+
+ /* Allow access to the uint64 header */
+ VALGRIND_MAKE_MEM_DEFINED((char *) pointer - sizeof(uint64), sizeof(uint64));
+
+ header = *((const uint64 *) ((const char *) pointer - sizeof(uint64)));
+
+ /* Disallow access to the uint64 header */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer - sizeof(uint64), sizeof(uint64));
+
+ return header;
+}
+
+/*
+ * Support routines to trap use of invalid memory context method IDs
+ * (from calling pfree or the like on a bogus pointer). As a possible
+ * aid in debugging, we report the header word along with the pointer
+ * address (if we got here, there must be an accessible header word).
+ */
+static void
+BogusFree(void *pointer)
+{
+ elog(ERROR, "pfree called with invalid pointer %p (header 0x%016llx)",
+ pointer, (unsigned long long) GetMemoryChunkHeader(pointer));
+}
+
+static void *
+BogusRealloc(void *pointer, Size size)
+{
+ elog(ERROR, "repalloc called with invalid pointer %p (header 0x%016llx)",
+ pointer, (unsigned long long) GetMemoryChunkHeader(pointer));
+ return NULL; /* keep compiler quiet */
+}
+
+static MemoryContext
+BogusGetChunkContext(void *pointer)
+{
+ elog(ERROR, "GetMemoryChunkContext called with invalid pointer %p (header 0x%016llx)",
+ pointer, (unsigned long long) GetMemoryChunkHeader(pointer));
+ return NULL; /* keep compiler quiet */
+}
+
+static Size
+BogusGetChunkSpace(void *pointer)
+{
+ elog(ERROR, "GetMemoryChunkSpace called with invalid pointer %p (header 0x%016llx)",
+ pointer, (unsigned long long) GetMemoryChunkHeader(pointer));
+ return 0; /* keep compiler quiet */
+}
+
+
+/*****************************************************************************
+ * EXPORTED ROUTINES *
+ *****************************************************************************/
+
+
+/*
+ * MemoryContextInit
+ * Start up the memory-context subsystem.
+ *
+ * This must be called before creating contexts or allocating memory in
+ * contexts. TopMemoryContext and ErrorContext are initialized here;
+ * other contexts must be created afterwards.
+ *
+ * In normal multi-backend operation, this is called once during
+ * postmaster startup, and not at all by individual backend startup
+ * (since the backends inherit an already-initialized context subsystem
+ * by virtue of being forked off the postmaster). But in an EXEC_BACKEND
+ * build, each process must do this for itself.
+ *
+ * In a standalone backend this must be called during backend startup.
+ */
+void
+MemoryContextInit(void)
+{
+ Assert(TopMemoryContext == NULL);
+
+ /*
+ * First, initialize TopMemoryContext, which is the parent of all others.
+ */
+ TopMemoryContext = AllocSetContextCreate((MemoryContext) NULL,
+ "TopMemoryContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Not having any other place to point CurrentMemoryContext, make it point
+ * to TopMemoryContext. Caller should change this soon!
+ */
+ CurrentMemoryContext = TopMemoryContext;
+
+ /*
+ * Initialize ErrorContext as an AllocSetContext with slow growth rate ---
+ * we don't really expect much to be allocated in it. More to the point,
+ * require it to contain at least 8K at all times. This is the only case
+ * where retained memory in a context is *essential* --- we want to be
+ * sure ErrorContext still has some memory even if we've run out
+ * elsewhere! Also, allow allocations in ErrorContext within a critical
+ * section. Otherwise a PANIC will cause an assertion failure in the error
+ * reporting code, before printing out the real cause of the failure.
+ *
+ * This should be the last step in this function, as elog.c assumes memory
+ * management works once ErrorContext is non-null.
+ */
+ ErrorContext = AllocSetContextCreate(TopMemoryContext,
+ "ErrorContext",
+ 8 * 1024,
+ 8 * 1024,
+ 8 * 1024);
+ MemoryContextAllowInCriticalSection(ErrorContext, true);
+}
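+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro, never defined): once
+ * the subsystem is up, callers normally create a child context, switch into
+ * it for a batch of allocations, and reclaim everything at once by deleting
+ * or resetting the context rather than pfree()ing individual chunks.
+ */
+static void
+mcxt_example_scratch_work(void)
+{
+ MemoryContext scratch;
+ MemoryContext oldcxt;
+
+ scratch = AllocSetContextCreate(CurrentMemoryContext,
+ "example scratch context",
+ ALLOCSET_DEFAULT_SIZES);
+ oldcxt = MemoryContextSwitchTo(scratch);
+
+ (void) palloc(128); /* released wholesale by the delete below */
+
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(scratch);
+}
+#endif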
+
+/*
+ * MemoryContextReset
+ * Release all space allocated within a context and delete all its
+ * descendant contexts (but not the named context itself).
+ */
+void
+MemoryContextReset(MemoryContext context)
+{
+ Assert(MemoryContextIsValid(context));
+
+ /* save a function call in common case where there are no children */
+ if (context->firstchild != NULL)
+ MemoryContextDeleteChildren(context);
+
+ /* save a function call if no pallocs since startup or last reset */
+ if (!context->isReset)
+ MemoryContextResetOnly(context);
+}
+
+/*
+ * MemoryContextResetOnly
+ * Release all space allocated within a context.
+ * Nothing is done to the context's descendant contexts.
+ */
+void
+MemoryContextResetOnly(MemoryContext context)
+{
+ Assert(MemoryContextIsValid(context));
+
+ /* Nothing to do if no pallocs since startup or last reset */
+ if (!context->isReset)
+ {
+ MemoryContextCallResetCallbacks(context);
+
+ /*
+ * If context->ident points into the context's memory, it will become
+ * a dangling pointer. We could prevent that by setting it to NULL
+ * here, but that would break valid coding patterns that keep the
+ * ident elsewhere, e.g. in a parent context. So for now we assume
+ * the programmer got it right.
+ */
+
+ context->methods->reset(context);
+ context->isReset = true;
+ VALGRIND_DESTROY_MEMPOOL(context);
+ VALGRIND_CREATE_MEMPOOL(context, 0, false);
+ }
+}
+
+/*
+ * MemoryContextResetChildren
+ * Release all space allocated within a context's descendants,
+ * but don't delete the contexts themselves. The named context
+ * itself is not touched.
+ */
+void
+MemoryContextResetChildren(MemoryContext context)
+{
+ MemoryContext child;
+
+ Assert(MemoryContextIsValid(context));
+
+ for (child = context->firstchild; child != NULL; child = child->nextchild)
+ {
+ MemoryContextResetChildren(child);
+ MemoryContextResetOnly(child);
+ }
+}
+
+/*
+ * MemoryContextDelete
+ * Delete a context and its descendants, and release all space
+ * allocated therein.
+ *
+ * The type-specific delete routine removes all storage for the context,
+ * but we have to recurse to handle the children.
+ * We must also delink the context from its parent, if it has one.
+ */
+void
+MemoryContextDelete(MemoryContext context)
+{
+ Assert(MemoryContextIsValid(context));
+ /* We had better not be deleting TopMemoryContext ... */
+ Assert(context != TopMemoryContext);
+ /* And not CurrentMemoryContext, either */
+ Assert(context != CurrentMemoryContext);
+
+ /* save a function call in common case where there are no children */
+ if (context->firstchild != NULL)
+ MemoryContextDeleteChildren(context);
+
+ /*
+ * It's not entirely clear whether 'tis better to do this before or after
+ * delinking the context; but an error in a callback will likely result in
+ * leaking the whole context (if it's not a root context) if we do it
+ * after, so let's do it before.
+ */
+ MemoryContextCallResetCallbacks(context);
+
+ /*
+ * We delink the context from its parent before deleting it, so that if
+ * there's an error we won't have deleted/busted contexts still attached
+ * to the context tree. Better a leak than a crash.
+ */
+ MemoryContextSetParent(context, NULL);
+
+ /*
+ * Also reset the context's ident pointer, in case it points into the
+ * context. This would only matter if someone tries to get stats on the
+ * (already unlinked) context, which is unlikely, but let's be safe.
+ */
+ context->ident = NULL;
+
+ context->methods->delete_context(context);
+
+ VALGRIND_DESTROY_MEMPOOL(context);
+}
+
+/*
+ * MemoryContextDeleteChildren
+ * Delete all the descendants of the named context and release all
+ * space allocated therein. The named context itself is not touched.
+ */
+void
+MemoryContextDeleteChildren(MemoryContext context)
+{
+ Assert(MemoryContextIsValid(context));
+
+ /*
+ * MemoryContextDelete will delink the child from me, so just iterate as
+ * long as there is a child.
+ */
+ while (context->firstchild != NULL)
+ MemoryContextDelete(context->firstchild);
+}
+
+/*
+ * MemoryContextRegisterResetCallback
+ * Register a function to be called before next context reset/delete.
+ * Such callbacks will be called in reverse order of registration.
+ *
+ * The caller is responsible for allocating a MemoryContextCallback struct
+ * to hold the info about this callback request, and for filling in the
+ * "func" and "arg" fields in the struct to show what function to call with
+ * what argument. Typically the callback struct should be allocated within
+ * the specified context, since that means it will automatically be freed
+ * when no longer needed.
+ *
+ * There is no API for deregistering a callback once registered. If you
+ * want it to not do anything anymore, adjust the state pointed to by its
+ * "arg" to indicate that.
+ */
+void
+MemoryContextRegisterResetCallback(MemoryContext context,
+ MemoryContextCallback *cb)
+{
+ Assert(MemoryContextIsValid(context));
+
+ /* Push onto head so this will be called before older registrants. */
+ cb->next = context->reset_cbs;
+ context->reset_cbs = cb;
+ /* Mark the context as non-reset (it probably is already). */
+ context->isReset = false;
+}
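+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro): the usual pattern is
+ * to allocate the callback struct inside the context it watches, so the
+ * struct itself is released by the same reset/delete that fires it.
+ */
+static void
+mcxt_example_on_reset(void *arg)
+{
+ elog(DEBUG1, "context owning %p is being reset or deleted", arg);
+}
+
+static void
+mcxt_example_register_cleanup(MemoryContext cxt, void *resource)
+{
+ MemoryContextCallback *cb;
+
+ cb = MemoryContextAlloc(cxt, sizeof(MemoryContextCallback));
+ cb->func = mcxt_example_on_reset;
+ cb->arg = resource;
+ MemoryContextRegisterResetCallback(cxt, cb);
+}
+#endif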
+
+/*
+ * MemoryContextCallResetCallbacks
+ * Internal function to call all registered callbacks for context.
+ */
+static void
+MemoryContextCallResetCallbacks(MemoryContext context)
+{
+ MemoryContextCallback *cb;
+
+ /*
+ * We pop each callback from the list before calling. That way, if an
+ * error occurs inside the callback, we won't try to call it a second time
+ * in the likely event that we reset or delete the context later.
+ */
+ while ((cb = context->reset_cbs) != NULL)
+ {
+ context->reset_cbs = cb->next;
+ cb->func(cb->arg);
+ }
+}
+
+/*
+ * MemoryContextSetIdentifier
+ * Set the identifier string for a memory context.
+ *
+ * An identifier can be provided to help distinguish among different contexts
+ * of the same kind in memory context stats dumps. The identifier string
+ * must live at least as long as the context it is for; typically it is
+ * allocated inside that context, so that it automatically goes away on
+ * context deletion. Pass id = NULL to forget any old identifier.
+ */
+void
+MemoryContextSetIdentifier(MemoryContext context, const char *id)
+{
+ Assert(MemoryContextIsValid(context));
+ context->ident = id;
+}
+
+/*
+ * MemoryContextSetParent
+ * Change a context to belong to a new parent (or no parent).
+ *
+ * We provide this as an API function because it is sometimes useful to
+ * change a context's lifespan after creation. For example, a context
+ * might be created underneath a transient context, filled with data,
+ * and then reparented underneath CacheMemoryContext to make it long-lived.
+ * In this way no special effort is needed to get rid of the context in case
+ * a failure occurs before its contents are completely set up.
+ *
+ * Callers often assume that this function cannot fail, so don't put any
+ * elog(ERROR) calls in it.
+ *
+ * A possible caller error is to reparent a context under itself, creating
+ * a loop in the context graph. We assert here that context != new_parent,
+ * but checking for multi-level loops seems more trouble than it's worth.
+ */
+void
+MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
+{
+ Assert(MemoryContextIsValid(context));
+ Assert(context != new_parent);
+
+ /* Fast path if it's got correct parent already */
+ if (new_parent == context->parent)
+ return;
+
+ /* Delink from existing parent, if any */
+ if (context->parent)
+ {
+ MemoryContext parent = context->parent;
+
+ if (context->prevchild != NULL)
+ context->prevchild->nextchild = context->nextchild;
+ else
+ {
+ Assert(parent->firstchild == context);
+ parent->firstchild = context->nextchild;
+ }
+
+ if (context->nextchild != NULL)
+ context->nextchild->prevchild = context->prevchild;
+ }
+
+ /* And relink */
+ if (new_parent)
+ {
+ Assert(MemoryContextIsValid(new_parent));
+ context->parent = new_parent;
+ context->prevchild = NULL;
+ context->nextchild = new_parent->firstchild;
+ if (new_parent->firstchild != NULL)
+ new_parent->firstchild->prevchild = context;
+ new_parent->firstchild = context;
+ }
+ else
+ {
+ context->parent = NULL;
+ context->prevchild = NULL;
+ context->nextchild = NULL;
+ }
+}
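+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro): build a long-lived
+ * context under a short-lived parent and reparent it beneath
+ * CacheMemoryContext only once it is fully populated, so a failure halfway
+ * through is cleaned up by the transient parent.  Assumes
+ * CacheMemoryContext has already been created.
+ */
+static MemoryContext
+mcxt_example_build_longlived(MemoryContext transient_parent)
+{
+ MemoryContext cxt;
+
+ cxt = AllocSetContextCreate(transient_parent,
+ "example cache entry",
+ ALLOCSET_SMALL_SIZES);
+
+ /* ... fill the context with data; any error here discards it ... */
+
+ MemoryContextSetParent(cxt, CacheMemoryContext);
+ return cxt;
+}
+#endif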
+
+/*
+ * MemoryContextAllowInCriticalSection
+ * Allow/disallow allocations in this memory context within a critical
+ * section.
+ *
+ * Normally, memory allocations are not allowed within a critical section,
+ * because a failure would lead to PANIC. There are a few exceptions to
+ * that, like allocations related to debugging code that is not supposed to
+ * be enabled in production. This function can be used to exempt specific
+ * memory contexts from the assertion in palloc().
+ */
+void
+MemoryContextAllowInCriticalSection(MemoryContext context, bool allow)
+{
+ Assert(MemoryContextIsValid(context));
+
+ context->allowInCritSection = allow;
+}
+
+/*
+ * GetMemoryChunkContext
+ * Given a currently-allocated chunk, determine the MemoryContext that
+ * the chunk belongs to.
+ */
+MemoryContext
+GetMemoryChunkContext(void *pointer)
+{
+ return MCXT_METHOD(pointer, get_chunk_context) (pointer);
+}
+
+/*
+ * GetMemoryChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ *
+ * This is useful for measuring the total space occupied by a set of
+ * allocated chunks.
+ */
+Size
+GetMemoryChunkSpace(void *pointer)
+{
+ return MCXT_METHOD(pointer, get_chunk_space) (pointer);
+}
+
+/*
+ * MemoryContextGetParent
+ * Get the parent context (if any) of the specified context
+ */
+MemoryContext
+MemoryContextGetParent(MemoryContext context)
+{
+ Assert(MemoryContextIsValid(context));
+
+ return context->parent;
+}
+
+/*
+ * MemoryContextIsEmpty
+ * Is a memory context empty of any allocated space?
+ */
+bool
+MemoryContextIsEmpty(MemoryContext context)
+{
+ Assert(MemoryContextIsValid(context));
+
+ /*
+ * For now, we consider a memory context nonempty if it has any children;
+ * perhaps this should be changed later.
+ */
+ if (context->firstchild != NULL)
+ return false;
+ /* Otherwise use the type-specific inquiry */
+ return context->methods->is_empty(context);
+}
+
+/*
+ * Find the memory allocated to blocks for this memory context. If recurse is
+ * true, also include children.
+ */
+Size
+MemoryContextMemAllocated(MemoryContext context, bool recurse)
+{
+ Size total = context->mem_allocated;
+
+ Assert(MemoryContextIsValid(context));
+
+ if (recurse)
+ {
+ MemoryContext child;
+
+ for (child = context->firstchild;
+ child != NULL;
+ child = child->nextchild)
+ total += MemoryContextMemAllocated(child, true);
+ }
+
+ return total;
+}
+
+/*
+ * MemoryContextStats
+ * Print statistics about the named context and all its descendants.
+ *
+ * This is just a debugging utility, so it's not very fancy. However, we do
+ * make some effort to summarize when the output would otherwise be very long.
+ * The statistics are sent to stderr.
+ */
+void
+MemoryContextStats(MemoryContext context)
+{
+ /* A hard-wired limit on the number of children is usually good enough */
+ MemoryContextStatsDetail(context, 100, true);
+}
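+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro): the dump is usually
+ * requested by hand, e.g. "call MemoryContextStats(TopMemoryContext)" from
+ * a debugger, or wrapped like this to snapshot the whole context tree.
+ */
+static void
+mcxt_example_dump_all_contexts(void)
+{
+ MemoryContextStats(TopMemoryContext);
+}
+#endif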
+
+/*
+ * MemoryContextStatsDetail
+ *
+ * Entry point for use if you want to vary the number of child contexts shown.
+ *
+ * If print_to_stderr is true, print statistics about the memory contexts
+ * with fprintf(stderr), otherwise use ereport().
+ */
+void
+MemoryContextStatsDetail(MemoryContext context, int max_children,
+ bool print_to_stderr)
+{
+ MemoryContextCounters grand_totals;
+
+ memset(&grand_totals, 0, sizeof(grand_totals));
+
+ MemoryContextStatsInternal(context, 0, true, max_children, &grand_totals, print_to_stderr);
+
+ if (print_to_stderr)
+ fprintf(stderr,
+ "Grand total: %zu bytes in %zu blocks; %zu free (%zu chunks); %zu used\n",
+ grand_totals.totalspace, grand_totals.nblocks,
+ grand_totals.freespace, grand_totals.freechunks,
+ grand_totals.totalspace - grand_totals.freespace);
+ else
+
+ /*
+ * Use LOG_SERVER_ONLY to prevent the memory contexts from being sent
+ * to the connected client.
+ *
+ * We don't buffer the information about all memory contexts in a
+ * backend into StringInfo and log it as one message. That would
+ * require the buffer to be enlarged, risking an OOM as there could be
+ * a large number of memory contexts in a backend. Instead, we log
+ * one message per memory context.
+ */
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("Grand total: %zu bytes in %zu blocks; %zu free (%zu chunks); %zu used",
+ grand_totals.totalspace, grand_totals.nblocks,
+ grand_totals.freespace, grand_totals.freechunks,
+ grand_totals.totalspace - grand_totals.freespace)));
+}
+
+/*
+ * MemoryContextStatsInternal
+ * One recursion level for MemoryContextStats
+ *
+ * Print this context if print is true, but in any case accumulate counts into
+ * *totals (if given).
+ */
+static void
+MemoryContextStatsInternal(MemoryContext context, int level,
+ bool print, int max_children,
+ MemoryContextCounters *totals,
+ bool print_to_stderr)
+{
+ MemoryContextCounters local_totals;
+ MemoryContext child;
+ int ichild;
+
+ Assert(MemoryContextIsValid(context));
+
+ /* Examine the context itself */
+ context->methods->stats(context,
+ print ? MemoryContextStatsPrint : NULL,
+ (void *) &level,
+ totals, print_to_stderr);
+
+ /*
+ * Examine children. If there are more than max_children of them, we do
+ * not print the rest explicitly, but just summarize them.
+ */
+ memset(&local_totals, 0, sizeof(local_totals));
+
+ for (child = context->firstchild, ichild = 0;
+ child != NULL;
+ child = child->nextchild, ichild++)
+ {
+ if (ichild < max_children)
+ MemoryContextStatsInternal(child, level + 1,
+ print, max_children,
+ totals,
+ print_to_stderr);
+ else
+ MemoryContextStatsInternal(child, level + 1,
+ false, max_children,
+ &local_totals,
+ print_to_stderr);
+ }
+
+ /* Deal with excess children */
+ if (ichild > max_children)
+ {
+ if (print)
+ {
+ if (print_to_stderr)
+ {
+ int i;
+
+ for (i = 0; i <= level; i++)
+ fprintf(stderr, " ");
+ fprintf(stderr,
+ "%d more child contexts containing %zu total in %zu blocks; %zu free (%zu chunks); %zu used\n",
+ ichild - max_children,
+ local_totals.totalspace,
+ local_totals.nblocks,
+ local_totals.freespace,
+ local_totals.freechunks,
+ local_totals.totalspace - local_totals.freespace);
+ }
+ else
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("level: %d; %d more child contexts containing %zu total in %zu blocks; %zu free (%zu chunks); %zu used",
+ level,
+ ichild - max_children,
+ local_totals.totalspace,
+ local_totals.nblocks,
+ local_totals.freespace,
+ local_totals.freechunks,
+ local_totals.totalspace - local_totals.freespace)));
+ }
+
+ if (totals)
+ {
+ totals->nblocks += local_totals.nblocks;
+ totals->freechunks += local_totals.freechunks;
+ totals->totalspace += local_totals.totalspace;
+ totals->freespace += local_totals.freespace;
+ }
+ }
+}
+
+/*
+ * MemoryContextStatsPrint
+ * Print callback used by MemoryContextStatsInternal
+ *
+ * For now, the passthru pointer just points to "int level"; later we might
+ * make that more complicated.
+ */
+static void
+MemoryContextStatsPrint(MemoryContext context, void *passthru,
+ const char *stats_string,
+ bool print_to_stderr)
+{
+ int level = *(int *) passthru;
+ const char *name = context->name;
+ const char *ident = context->ident;
+ char truncated_ident[110];
+ int i;
+
+ /*
+ * It seems preferable to label dynahash contexts with just the hash table
+ * name. Those are already unique enough, so the "dynahash" part isn't
+ * very helpful, and this way is more consistent with pre-v11 practice.
+ */
+ if (ident && strcmp(name, "dynahash") == 0)
+ {
+ name = ident;
+ ident = NULL;
+ }
+
+ truncated_ident[0] = '\0';
+
+ if (ident)
+ {
+ /*
+ * Some contexts may have very long identifiers (e.g., SQL queries).
+ * Arbitrarily truncate at 100 bytes, but be careful not to break
+ * multibyte characters. Also, replace ASCII control characters, such
+ * as newlines, with spaces.
+ */
+ int idlen = strlen(ident);
+ bool truncated = false;
+
+ strcpy(truncated_ident, ": ");
+ i = strlen(truncated_ident);
+
+ if (idlen > 100)
+ {
+ idlen = pg_mbcliplen(ident, idlen, 100);
+ truncated = true;
+ }
+
+ while (idlen-- > 0)
+ {
+ unsigned char c = *ident++;
+
+ if (c < ' ')
+ c = ' ';
+ truncated_ident[i++] = c;
+ }
+ truncated_ident[i] = '\0';
+
+ if (truncated)
+ strcat(truncated_ident, "...");
+ }
+
+ if (print_to_stderr)
+ {
+ for (i = 0; i < level; i++)
+ fprintf(stderr, " ");
+ fprintf(stderr, "%s: %s%s\n", name, stats_string, truncated_ident);
+ }
+ else
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("level: %d; %s: %s%s",
+ level, name, stats_string, truncated_ident)));
+}
+
+/*
+ * MemoryContextCheck
+ * Check all chunks in the named context.
+ *
+ * This is just a debugging utility, so it's not fancy.
+ */
+#ifdef MEMORY_CONTEXT_CHECKING
+void
+MemoryContextCheck(MemoryContext context)
+{
+ MemoryContext child;
+
+ Assert(MemoryContextIsValid(context));
+
+ context->methods->check(context);
+ for (child = context->firstchild; child != NULL; child = child->nextchild)
+ MemoryContextCheck(child);
+}
+#endif
+
+/*
+ * MemoryContextCreate
+ * Context-type-independent part of context creation.
+ *
+ * This is only intended to be called by context-type-specific
+ * context creation routines, not by the unwashed masses.
+ *
+ * The memory context creation procedure goes like this:
+ * 1. Context-type-specific routine makes some initial space allocation,
+ * including enough space for the context header. If it fails,
+ * it can ereport() with no damage done.
+ * 2. Context-type-specific routine sets up all type-specific fields of
+ * the header (those beyond MemoryContextData proper), as well as any
+ * other management fields it needs to have a fully valid context.
+ * Usually, failure in this step is impossible, but if it's possible
+ * the initial space allocation should be freed before ereport'ing.
+ * 3. Context-type-specific routine calls MemoryContextCreate() to fill in
+ * the generic header fields and link the context into the context tree.
+ * 4. We return to the context-type-specific routine, which finishes
+ * up type-specific initialization. This routine can now do things
+ * that might fail (like allocate more memory), so long as it's
+ * sure the node is left in a state that delete will handle.
+ *
+ * node: the as-yet-uninitialized common part of the context header node.
+ * tag: NodeTag code identifying the memory context type.
+ * method_id: MemoryContextMethodID of the context-type being created.
+ * parent: parent context, or NULL if this will be a top-level context.
+ * name: name of context (must be statically allocated).
+ *
+ * Context routines generally assume that MemoryContextCreate can't fail,
+ * so this can contain Assert but not elog/ereport.
+ */
+void
+MemoryContextCreate(MemoryContext node,
+ NodeTag tag,
+ MemoryContextMethodID method_id,
+ MemoryContext parent,
+ const char *name)
+{
+ /* Creating new memory contexts is not allowed in a critical section */
+ Assert(CritSectionCount == 0);
+
+ /* Initialize all standard fields of memory context header */
+ node->type = tag;
+ node->isReset = true;
+ node->methods = &mcxt_methods[method_id];
+ node->parent = parent;
+ node->firstchild = NULL;
+ node->mem_allocated = 0;
+ node->prevchild = NULL;
+ node->name = name;
+ node->ident = NULL;
+ node->reset_cbs = NULL;
+
+ /* OK to link node into context tree */
+ if (parent)
+ {
+ node->nextchild = parent->firstchild;
+ if (parent->firstchild != NULL)
+ parent->firstchild->prevchild = node;
+ parent->firstchild = node;
+ /* inherit allowInCritSection flag from parent */
+ node->allowInCritSection = parent->allowInCritSection;
+ }
+ else
+ {
+ node->nextchild = NULL;
+ node->allowInCritSection = false;
+ }
+
+ VALGRIND_CREATE_MEMPOOL(node, 0, false);
+}
+
+/*
+ * MemoryContextAlloc
+ * Allocate space within the specified context.
+ *
+ * This could be turned into a macro, but we'd have to import
+ * nodes/memnodes.h into postgres.h which seems a bad idea.
+ */
+void *
+MemoryContextAlloc(MemoryContext context, Size size)
+{
+ void *ret;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+
+ /*
+ * Here, and elsewhere in this module, we show the target context's
+ * "name" but not its "ident" (if any) in user-visible error messages.
+ * The "ident" string might contain security-sensitive data, such as
+ * values in SQL commands.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocZero
+ * Like MemoryContextAlloc, but clears allocated memory
+ *
+ * We could just call MemoryContextAlloc then clear the memory, but this
+ * is a very common combination, so we provide the combined operation.
+ */
+void *
+MemoryContextAllocZero(MemoryContext context, Size size)
+{
+ void *ret;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocZeroAligned
+ * MemoryContextAllocZero where length is suitable for MemSetLoop
+ *
+ * This might seem overly specialized, but it's not, because newNode()
+ * is so often called with compile-time-constant sizes.
+ */
+void *
+MemoryContextAllocZeroAligned(MemoryContext context, Size size)
+{
+ void *ret;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ MemSetLoop(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocExtended
+ * Allocate space within the specified context using the given flags.
+ */
+void *
+MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
+{
+ void *ret;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!((flags & MCXT_ALLOC_HUGE) != 0 ? AllocHugeSizeIsValid(size) :
+ AllocSizeIsValid(size)))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ if ((flags & MCXT_ALLOC_NO_OOM) == 0)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+ return NULL;
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ if ((flags & MCXT_ALLOC_ZERO) != 0)
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
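+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro): with
+ * MCXT_ALLOC_NO_OOM the caller receives NULL instead of an ERROR on
+ * allocation failure and must handle that case itself.
+ */
+static void *
+mcxt_example_try_alloc(MemoryContext cxt, Size size)
+{
+ void *buf = MemoryContextAllocExtended(cxt, size,
+ MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
+
+ if (buf == NULL)
+ elog(DEBUG1, "allocation of %zu bytes did not succeed", size);
+ return buf;
+}
+#endif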
+
+/*
+ * HandleLogMemoryContextInterrupt
+ * Handle receipt of an interrupt indicating logging of memory
+ * contexts.
+ *
+ * All the actual work is deferred to ProcessLogMemoryContextInterrupt(),
+ * because we cannot safely emit a log message inside the signal handler.
+ */
+void
+HandleLogMemoryContextInterrupt(void)
+{
+ InterruptPending = true;
+ LogMemoryContextPending = true;
+ /* latch will be set by procsignal_sigusr1_handler */
+}
+
+/*
+ * ProcessLogMemoryContextInterrupt
+ * Perform logging of memory contexts of this backend process.
+ *
+ * Any backend that participates in ProcSignal signaling must arrange
+ * to call this function if we see LogMemoryContextPending set.
+ * It is called from CHECK_FOR_INTERRUPTS(), which is enough because
+ * the target process for logging of memory contexts is a backend.
+ */
+void
+ProcessLogMemoryContextInterrupt(void)
+{
+ LogMemoryContextPending = false;
+
+ /*
+ * Use LOG_SERVER_ONLY to prevent this message from being sent to the
+ * connected client.
+ */
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg("logging memory contexts of PID %d", MyProcPid)));
+
+ /*
+ * When a backend process is consuming huge memory, logging all its memory
+ * contexts might overrun available disk space. To prevent this, we limit
+ * the number of child contexts to log per parent to 100.
+ *
+ * As with MemoryContextStats(), we suppose that practical cases where the
+ * dump gets long will typically be huge numbers of siblings under the
+ * same parent context; while the additional debugging value from seeing
+ * details about individual siblings beyond 100 will not be large.
+ */
+ MemoryContextStatsDetail(TopMemoryContext, 100, false);
+}
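+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro): any long-running
+ * loop that calls CHECK_FOR_INTERRUPTS() will notice
+ * LogMemoryContextPending and emit the dump requested by another process
+ * (in stock PostgreSQL, typically via pg_log_backend_memory_contexts()).
+ */
+static void
+mcxt_example_interruptible_loop(void)
+{
+ for (;;)
+ {
+ CHECK_FOR_INTERRUPTS(); /* may invoke ProcessLogMemoryContextInterrupt() */
+
+ /* ... perform one slice of work, then leave when finished ... */
+ break;
+ }
+}
+#endif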
+
+void *
+palloc(Size size)
+{
+ /* duplicates MemoryContextAlloc to avoid increased overhead */
+ void *ret;
+ MemoryContext context = CurrentMemoryContext;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ return ret;
+}
+
+void *
+palloc0(Size size)
+{
+ /* duplicates MemoryContextAllocZero to avoid increased overhead */
+ void *ret;
+ MemoryContext context = CurrentMemoryContext;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+void *
+palloc_extended(Size size, int flags)
+{
+ /* duplicates MemoryContextAllocExtended to avoid increased overhead */
+ void *ret;
+ MemoryContext context = CurrentMemoryContext;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!((flags & MCXT_ALLOC_HUGE) != 0 ? AllocHugeSizeIsValid(size) :
+ AllocSizeIsValid(size)))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ if ((flags & MCXT_ALLOC_NO_OOM) == 0)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+ return NULL;
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ if ((flags & MCXT_ALLOC_ZERO) != 0)
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocAligned
+ * Allocate 'size' bytes of memory in 'context' aligned to 'alignto'
+ * bytes.
+ *
+ * Currently, we align addresses by requesting additional bytes from the
+ * MemoryContext's standard allocator function and then aligning the returned
+ * address by the required alignment. This means that the given MemoryContext
+ * must support providing us with a chunk of memory that's larger than 'size'.
+ * For allocators such as Slab, that's not going to work, as slab only allows
+ * chunks of the size that's specified when the context is created.
+ *
+ * 'alignto' must be a power of 2.
+ * 'flags' may be 0 or set the same as MemoryContextAllocExtended().
+ */
+void *
+MemoryContextAllocAligned(MemoryContext context,
+ Size size, Size alignto, int flags)
+{
+ MemoryChunk *alignedchunk;
+ Size alloc_size;
+ void *unaligned;
+ void *aligned;
+
+ /* wouldn't make much sense to waste that much space */
+ Assert(alignto < (128 * 1024 * 1024));
+
+ /* ensure alignto is a power of 2 */
+ Assert((alignto & (alignto - 1)) == 0);
+
+ /*
+ * If the alignment requirements are less than what we already guarantee
+ * then just use the standard allocation function.
+ */
+ if (unlikely(alignto <= MAXIMUM_ALIGNOF))
+ return MemoryContextAllocExtended(context, size, flags);
+
+ /*
+ * We implement aligned pointers by simply allocating enough memory for
+ * the requested size plus the alignment and an additional "redirection"
+ * MemoryChunk. This additional MemoryChunk is required for operations
+ * such as pfree when used on the pointer returned by this function. We
+ * use this redirection MemoryChunk in order to find the pointer to the
+ * memory that was returned by the MemoryContextAllocExtended call below.
+ * We do that by "borrowing" the block offset field and instead of using
+ * that to find the offset into the owning block, we use it to find the
+ * original allocated address.
+ *
+ * Here we must allocate enough extra memory so that we can still align
+ * the pointer returned by MemoryContextAllocExtended and also have enough
+ * space for the redirection MemoryChunk. Since allocations will already
+ * be at least aligned by MAXIMUM_ALIGNOF, we can subtract that amount
+ * from the allocation size to save a little memory.
+ */
+ alloc_size = size + PallocAlignedExtraBytes(alignto);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* ensure there's space for a sentinel byte */
+ alloc_size += 1;
+#endif
+
+ /* perform the actual allocation */
+ unaligned = MemoryContextAllocExtended(context, alloc_size, flags);
+
+ /* set the aligned pointer */
+ aligned = (void *) TYPEALIGN(alignto, (char *) unaligned +
+ sizeof(MemoryChunk));
+
+ alignedchunk = PointerGetMemoryChunk(aligned);
+
+ /*
+ * We set the redirect MemoryChunk so that the block offset calculation is
+ * used to point back to the 'unaligned' allocated chunk. This allows us
+ * to use MemoryChunkGetBlock() to find the unaligned chunk when we need
+ * to perform operations such as pfree() and repalloc().
+ *
+ * We store 'alignto' in the MemoryChunk's 'value' so that we know what
+ * the alignment was set to should we ever be asked to realloc this
+ * pointer.
+ */
+ MemoryChunkSetHdrMask(alignedchunk, unaligned, alignto,
+ MCTX_ALIGNED_REDIRECT_ID);
+
+ /* double check we produced a correctly aligned pointer */
+ Assert((void *) TYPEALIGN(alignto, aligned) == aligned);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ alignedchunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ set_sentinel(aligned, size);
+#endif
+
+ /* Mark the bytes before the redirection header as noaccess */
+ VALGRIND_MAKE_MEM_NOACCESS(unaligned,
+ (char *) alignedchunk - (char *) unaligned);
+
+ /* Disallow access to the redirection chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(alignedchunk, sizeof(MemoryChunk));
+
+ return aligned;
+}
+
+/*
+ * palloc_aligned
+ * Allocate 'size' bytes returning a pointer that's aligned to the
+ * 'alignto' boundary.
+ *
+ * Currently, we align addresses by requesting additional bytes from the
+ * MemoryContext's standard allocator function and then aligning the returned
+ * address by the required alignment. This means that the given MemoryContext
+ * must support providing us with a chunk of memory that's larger than 'size'.
+ * For allocators such as Slab, that's not going to work, as slab only allows
+ * chunks of the size that's specified when the context is created.
+ *
+ * 'alignto' must be a power of 2.
+ * 'flags' may be 0 or set the same as MemoryContextAllocExtended().
+ */
+void *
+palloc_aligned(Size size, Size alignto, int flags)
+{
+ return MemoryContextAllocAligned(CurrentMemoryContext, size, alignto, flags);
+}
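+
+#ifdef MCXT_USAGE_EXAMPLES
+/*
+ * Illustrative sketch only (hypothetical guard macro): the pointer returned
+ * by palloc_aligned() can be passed to pfree()/repalloc() like any other
+ * chunk thanks to the redirection MemoryChunk set up above.  The 64-byte
+ * alignment is just an arbitrary power of two chosen for the example.
+ */
+static void
+mcxt_example_aligned_alloc(void)
+{
+ char *buf = palloc_aligned(4096, 64, MCXT_ALLOC_ZERO);
+
+ Assert(((uintptr_t) buf % 64) == 0);
+ pfree(buf);
+}
+#endif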
+
+/*
+ * pfree
+ * Release an allocated chunk.
+ */
+void
+pfree(void *pointer)
+{
+#ifdef USE_VALGRIND
+ MemoryContextMethodID method = GetMemoryChunkMethodID(pointer);
+ MemoryContext context = GetMemoryChunkContext(pointer);
+#endif
+
+ MCXT_METHOD(pointer, free_p) (pointer);
+
+#ifdef USE_VALGRIND
+ if (method != MCTX_ALIGNED_REDIRECT_ID)
+ VALGRIND_MEMPOOL_FREE(context, pointer);
+#endif
+}
+
+/*
+ * repalloc
+ * Adjust the size of a previously allocated chunk.
+ */
+void *
+repalloc(void *pointer, Size size)
+{
+#ifdef USE_VALGRIND
+ MemoryContextMethodID method = GetMemoryChunkMethodID(pointer);
+#endif
+#if defined(USE_ASSERT_CHECKING) || defined(USE_VALGRIND)
+ MemoryContext context = GetMemoryChunkContext(pointer);
+#endif
+ void *ret;
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ AssertNotInCriticalSection(context);
+
+ /* isReset must be false already */
+ Assert(!context->isReset);
+
+ ret = MCXT_METHOD(pointer, realloc) (pointer, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContext cxt = GetMemoryChunkContext(pointer);
+
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, cxt->name)));
+ }
+
+#ifdef USE_VALGRIND
+ if (method != MCTX_ALIGNED_REDIRECT_ID)
+ VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
+#endif
+
+ return ret;
+}
+
+/*
+ * repalloc_extended
+ * Adjust the size of a previously allocated chunk,
+ * with HUGE and NO_OOM options.
+ */
+void *
+repalloc_extended(void *pointer, Size size, int flags)
+{
+#if defined(USE_ASSERT_CHECKING) || defined(USE_VALGRIND)
+ MemoryContext context = GetMemoryChunkContext(pointer);
+#endif
+ void *ret;
+
+ if (!((flags & MCXT_ALLOC_HUGE) != 0 ? AllocHugeSizeIsValid(size) :
+ AllocSizeIsValid(size)))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ AssertNotInCriticalSection(context);
+
+ /* isReset must be false already */
+ Assert(!context->isReset);
+
+ ret = MCXT_METHOD(pointer, realloc) (pointer, size);
+ if (unlikely(ret == NULL))
+ {
+ if ((flags & MCXT_ALLOC_NO_OOM) == 0)
+ {
+ MemoryContext cxt = GetMemoryChunkContext(pointer);
+
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, cxt->name)));
+ }
+ return NULL;
+ }
+
+ VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
+
+ return ret;
+}
+
+/*
+ * repalloc0
+ * Adjust the size of a previously allocated chunk and zero out the added
+ * space.
+ */
+void *
+repalloc0(void *pointer, Size oldsize, Size size)
+{
+ void *ret;
+
+ /* catch wrong argument order */
+ if (unlikely(oldsize > size))
+ elog(ERROR, "invalid repalloc0 call: oldsize %zu, new size %zu",
+ oldsize, size);
+
+ ret = repalloc(pointer, size);
+ memset((char *) ret + oldsize, 0, (size - oldsize));
+ return ret;
+}
+
+/*
+ * MemoryContextAllocHuge
+ * Allocate (possibly-expansive) space within the specified context.
+ *
+ * See considerations in comment at MaxAllocHugeSize.
+ */
+void *
+MemoryContextAllocHuge(MemoryContext context, Size size)
+{
+ void *ret;
+
+ Assert(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocHugeSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ return ret;
+}
+
+/*
+ * repalloc_huge
+ * Adjust the size of a previously allocated chunk, permitting a large
+ * value. The previous allocation need not have been "huge".
+ */
+void *
+repalloc_huge(void *pointer, Size size)
+{
+ /* this one seems not worth its own implementation */
+ return repalloc_extended(pointer, size, MCXT_ALLOC_HUGE);
+}
+
+/*
+ * MemoryContextStrdup
+ * Like strdup(), but allocate from the specified context
+ */
+char *
+MemoryContextStrdup(MemoryContext context, const char *string)
+{
+ char *nstr;
+ Size len = strlen(string) + 1;
+
+ nstr = (char *) MemoryContextAlloc(context, len);
+
+ memcpy(nstr, string, len);
+
+ return nstr;
+}
+
+char *
+pstrdup(const char *in)
+{
+ return MemoryContextStrdup(CurrentMemoryContext, in);
+}
+
+/*
+ * pnstrdup
+ * Like pstrdup(), but append null byte to a
+ * not-necessarily-null-terminated input string.
+ */
+char *
+pnstrdup(const char *in, Size len)
+{
+ char *out;
+
+ len = strnlen(in, len);
+
+ out = palloc(len + 1);
+ memcpy(out, in, len);
+ out[len] = '\0';
+
+ return out;
+}
+
+/*
+ * Make copy of string with all trailing newline characters removed.
+ */
+char *
+pchomp(const char *in)
+{
+ size_t n;
+
+ n = strlen(in);
+ while (n > 0 && in[n - 1] == '\n')
+ n--;
+ return pnstrdup(in, n);
+}
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/memdebug.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/memdebug.c
new file mode 100644
index 00000000000..ec50a30d5f7
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/memdebug.c
@@ -0,0 +1,93 @@
+/*-------------------------------------------------------------------------
+ *
+ * memdebug.c
+ * Declarations used in memory context implementations, not part of the
+ * public API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/mmgr/memdebug.c
+ *
+ *
+ * About CLOBBER_FREED_MEMORY:
+ *
+ * If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ * This is useful for catching places that reference already-freed memory.
+ *
+ * About MEMORY_CONTEXT_CHECKING:
+ *
+ * Since we usually round request sizes up to the next power of 2, there
+ * is often some unused space immediately after a requested data area.
+ * Thus, if someone makes the common error of writing past what they've
+ * requested, the problem is likely to go unnoticed ... until the day when
+ * there *isn't* any wasted space, perhaps because of different memory
+ * alignment on a new platform, or some other effect. To catch this sort
+ * of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ * the requested space whenever the request is less than the actual chunk
+ * size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ * About USE_VALGRIND and Valgrind client requests:
+ *
+ * Valgrind provides "client request" macros that exchange information with
+ * the host Valgrind (if any). Under !USE_VALGRIND, memdebug.h stubs out
+ * currently-used macros.
+ *
+ * When running under Valgrind, we want a NOACCESS memory region both before
+ * and after the allocation. The chunk header is tempting as the preceding
+ * region, but mcxt.c expects to be able to examine the standard chunk header
+ * fields. Therefore, we use, when available, the requested_size field and
+ * any subsequent padding. requested_size is made NOACCESS before returning
+ * a chunk pointer to a caller. However, to reduce client request traffic,
+ * it is kept DEFINED in chunks on the free list.
+ *
+ * The rounded-up capacity of the chunk usually acts as a post-allocation
+ * NOACCESS region. If the request consumes precisely the entire chunk,
+ * there is no such region; another chunk header may immediately follow. In
+ * that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ * See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data. It's not really
+ * very random, just a repeating sequence with a length that's prime. What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it. Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward. Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary. UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+void
+randomize_mem(char *ptr, size_t size)
+{
+ static int save_ctr = 1;
+ size_t remaining = size;
+ int ctr;
+
+ ctr = save_ctr;
+ VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+ while (remaining-- > 0)
+ {
+ *ptr++ = ctr;
+ if (++ctr > 251)
+ ctr = 1;
+ }
+ VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+ save_ctr = ctr;
+}
+
+#endif /* RANDOMIZE_ALLOCATED_MEMORY */
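+
+/*
+ * Illustrative sketch (editor's note, not part of the upstream source): the
+ * MEMORY_CONTEXT_CHECKING scheme described above, using the set_sentinel()
+ * and sentinel_ok() helpers from memdebug.h. "demo_check_overrun", its
+ * parameters and the PG_MMGR_USAGE_SKETCH guard are assumed names used only
+ * for this example.
+ */
+#if defined(MEMORY_CONTEXT_CHECKING) && defined(PG_MMGR_USAGE_SKETCH)
+static void
+demo_check_overrun(void *chunk_data, Size requested_size)
+{
+ /* the allocator writes 0x7E just past the requested size ... */
+ set_sentinel(chunk_data, requested_size);
+
+ /* ... and verifies it is still intact when the chunk is freed */
+ if (!sentinel_ok(chunk_data, requested_size))
+ elog(WARNING, "detected write past chunk end");
+}
+#endif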
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/portalmem.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/portalmem.c
new file mode 100644
index 00000000000..d756ae80341
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/portalmem.c
@@ -0,0 +1,1291 @@
+/*-------------------------------------------------------------------------
+ *
+ * portalmem.c
+ * backend portal memory management
+ *
+ * Portals are objects representing the execution state of a query.
+ * This module provides memory management services for portals, but it
+ * doesn't actually run the executor for them.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/portalmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "catalog/pg_type.h"
+#include "commands/portalcmds.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+
+/*
+ * Estimate of the maximum number of open portals a user would have,
+ * used in initially sizing the PortalHashTable in EnablePortalManager().
+ * Since the hash table can expand, there's no need to make this overly
+ * generous, and keeping it small avoids unnecessary overhead in the
+ * hash_seq_search() calls executed during transaction end.
+ */
+#define PORTALS_PER_USER 16
+
+
+/* ----------------
+ * Global state
+ * ----------------
+ */
+
+#define MAX_PORTALNAME_LEN NAMEDATALEN
+
+typedef struct portalhashent
+{
+ char portalname[MAX_PORTALNAME_LEN];
+ Portal portal;
+} PortalHashEnt;
+
+static __thread HTAB *PortalHashTable = NULL;
+
+#define PortalHashTableLookup(NAME, PORTAL) \
+do { \
+ PortalHashEnt *hentry; \
+ \
+ hentry = (PortalHashEnt *) hash_search(PortalHashTable, \
+ (NAME), HASH_FIND, NULL); \
+ if (hentry) \
+ PORTAL = hentry->portal; \
+ else \
+ PORTAL = NULL; \
+} while(0)
+
+#define PortalHashTableInsert(PORTAL, NAME) \
+do { \
+ PortalHashEnt *hentry; bool found; \
+ \
+ hentry = (PortalHashEnt *) hash_search(PortalHashTable, \
+ (NAME), HASH_ENTER, &found); \
+ if (found) \
+ elog(ERROR, "duplicate portal name"); \
+ hentry->portal = PORTAL; \
+ /* To avoid duplicate storage, make PORTAL->name point to htab entry */ \
+ PORTAL->name = hentry->portalname; \
+} while(0)
+
+#define PortalHashTableDelete(PORTAL) \
+do { \
+ PortalHashEnt *hentry; \
+ \
+ hentry = (PortalHashEnt *) hash_search(PortalHashTable, \
+ PORTAL->name, HASH_REMOVE, NULL); \
+ if (hentry == NULL) \
+ elog(WARNING, "trying to delete portal name that does not exist"); \
+} while(0)
+
+static __thread MemoryContext TopPortalContext = NULL;
+
+
+/* ----------------------------------------------------------------
+ * public portal interface functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * EnablePortalManager
+ * Enables the portal management module at backend startup.
+ */
+void
+EnablePortalManager(void)
+{
+ HASHCTL ctl;
+
+ Assert(TopPortalContext == NULL);
+
+ TopPortalContext = AllocSetContextCreate(TopMemoryContext,
+ "TopPortalContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ ctl.keysize = MAX_PORTALNAME_LEN;
+ ctl.entrysize = sizeof(PortalHashEnt);
+
+ /*
+ * use PORTALS_PER_USER as a guess of how many hash table entries to
+ * create, initially
+ */
+ PortalHashTable = hash_create("Portal hash", PORTALS_PER_USER,
+ &ctl, HASH_ELEM | HASH_STRINGS);
+}
+
+/*
+ * GetPortalByName
+ * Returns a portal given a portal name, or NULL if name not found.
+ */
+Portal
+GetPortalByName(const char *name)
+{
+ Portal portal;
+
+ if (PointerIsValid(name))
+ PortalHashTableLookup(name, portal);
+ else
+ portal = NULL;
+
+ return portal;
+}
+
+/*
+ * PortalGetPrimaryStmt
+ * Get the "primary" stmt within a portal, ie, the one marked canSetTag.
+ *
+ * Returns NULL if no such stmt. If multiple PlannedStmt structs within the
+ * portal are marked canSetTag, returns the first one. Neither of these
+ * cases should occur in present usages of this function.
+ */
+PlannedStmt *
+PortalGetPrimaryStmt(Portal portal)
+{
+ ListCell *lc;
+
+ foreach(lc, portal->stmts)
+ {
+ PlannedStmt *stmt = lfirst_node(PlannedStmt, lc);
+
+ if (stmt->canSetTag)
+ return stmt;
+ }
+ return NULL;
+}
+
+/*
+ * CreatePortal
+ * Returns a new portal given a name.
+ *
+ * allowDup: if true, automatically drop any pre-existing portal of the
+ * same name (if false, an error is raised).
+ *
+ * dupSilent: if true, don't even emit a WARNING.
+ */
+Portal
+CreatePortal(const char *name, bool allowDup, bool dupSilent)
+{
+ Portal portal;
+
+ Assert(PointerIsValid(name));
+
+ portal = GetPortalByName(name);
+ if (PortalIsValid(portal))
+ {
+ if (!allowDup)
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_CURSOR),
+ errmsg("cursor \"%s\" already exists", name)));
+ if (!dupSilent)
+ ereport(WARNING,
+ (errcode(ERRCODE_DUPLICATE_CURSOR),
+ errmsg("closing existing cursor \"%s\"",
+ name)));
+ PortalDrop(portal, false);
+ }
+
+ /* make new portal structure */
+ portal = (Portal) MemoryContextAllocZero(TopPortalContext, sizeof *portal);
+
+ /* initialize portal context; typically it won't store much */
+ portal->portalContext = AllocSetContextCreate(TopPortalContext,
+ "PortalContext",
+ ALLOCSET_SMALL_SIZES);
+
+ /* create a resource owner for the portal */
+ portal->resowner = ResourceOwnerCreate(CurTransactionResourceOwner,
+ "Portal");
+
+ /* initialize portal fields that don't start off zero */
+ portal->status = PORTAL_NEW;
+ portal->cleanup = PortalCleanup;
+ portal->createSubid = GetCurrentSubTransactionId();
+ portal->activeSubid = portal->createSubid;
+ portal->createLevel = GetCurrentTransactionNestLevel();
+ portal->strategy = PORTAL_MULTI_QUERY;
+ portal->cursorOptions = CURSOR_OPT_NO_SCROLL;
+ portal->atStart = true;
+ portal->atEnd = true; /* disallow fetches until query is set */
+ portal->visible = true;
+ portal->creation_time = GetCurrentStatementStartTimestamp();
+
+ /* put portal in table (sets portal->name) */
+ PortalHashTableInsert(portal, name);
+
+ /* for named portals reuse portal->name copy */
+ MemoryContextSetIdentifier(portal->portalContext, portal->name[0] ? portal->name : "<unnamed>");
+
+ return portal;
+}
+
+/*
+ * CreateNewPortal
+ * Create a new portal, assigning it a random nonconflicting name.
+ */
+Portal
+CreateNewPortal(void)
+{
+ static __thread unsigned int unnamed_portal_count = 0;
+
+ char portalname[MAX_PORTALNAME_LEN];
+
+ /* Select a nonconflicting name */
+ for (;;)
+ {
+ unnamed_portal_count++;
+ sprintf(portalname, "<unnamed portal %u>", unnamed_portal_count);
+ if (GetPortalByName(portalname) == NULL)
+ break;
+ }
+
+ return CreatePortal(portalname, false, false);
+}
+
+/*
+ * PortalDefineQuery
+ * A simple subroutine to establish a portal's query.
+ *
+ * Notes: as of PG 8.4, caller MUST supply a sourceText string; it is not
+ * allowed anymore to pass NULL. (If you really don't have source text,
+ * you can pass a constant string, perhaps "(query not available)".)
+ *
+ * commandTag shall be CMDTAG_UNKNOWN if and only if the original query
+ * string (before rewriting) was an empty string. The tag is a plain enum
+ * value, so there are no copying or lifetime concerns.
+ *
+ * If cplan is provided, then it is a cached plan containing the stmts, and
+ * the caller must have done GetCachedPlan(), causing a refcount increment.
+ * The refcount will be released when the portal is destroyed.
+ *
+ * If cplan is NULL, then it is the caller's responsibility to ensure that
+ * the passed plan trees have adequate lifetime. Typically this is done by
+ * copying them into the portal's context.
+ *
+ * The caller is also responsible for ensuring that the passed prepStmtName
+ * (if not NULL) and sourceText have adequate lifetime.
+ *
+ * NB: this function mustn't do much beyond storing the passed values; in
+ * particular don't do anything that risks elog(ERROR). If that were to
+ * happen here before storing the cplan reference, we'd leak the plancache
+ * refcount that the caller is trying to hand off to us.
+ */
+void
+PortalDefineQuery(Portal portal,
+ const char *prepStmtName,
+ const char *sourceText,
+ CommandTag commandTag,
+ List *stmts,
+ CachedPlan *cplan)
+{
+ Assert(PortalIsValid(portal));
+ Assert(portal->status == PORTAL_NEW);
+
+ Assert(sourceText != NULL);
+ Assert(commandTag != CMDTAG_UNKNOWN || stmts == NIL);
+
+ portal->prepStmtName = prepStmtName;
+ portal->sourceText = sourceText;
+ portal->qc.commandTag = commandTag;
+ portal->qc.nprocessed = 0;
+ portal->commandTag = commandTag;
+ portal->stmts = stmts;
+ portal->cplan = cplan;
+ portal->status = PORTAL_DEFINED;
+}
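+
+/*
+ * Illustrative sketch (editor's note, not part of the upstream source): a
+ * minimal PortalDefineQuery() call, assuming the caller already created the
+ * portal and owns a list of PlannedStmts with adequate lifetime (no cached
+ * plan is handed off). "demo_define_select", "stmts" and the
+ * PG_MMGR_USAGE_SKETCH guard are assumed names used only for this example.
+ */
+#ifdef PG_MMGR_USAGE_SKETCH
+static void
+demo_define_select(Portal portal, const char *query_string, List *stmts)
+{
+ PortalDefineQuery(portal,
+ NULL, /* not from a prepared statement */
+ query_string,
+ CMDTAG_SELECT,
+ stmts,
+ NULL); /* no CachedPlan refcount to hand off */
+}
+#endif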
+
+/*
+ * PortalReleaseCachedPlan
+ * Release a portal's reference to its cached plan, if any.
+ */
+static void
+PortalReleaseCachedPlan(Portal portal)
+{
+ if (portal->cplan)
+ {
+ ReleaseCachedPlan(portal->cplan, NULL);
+ portal->cplan = NULL;
+
+ /*
+ * We must also clear portal->stmts which is now a dangling reference
+ * to the cached plan's plan list. This protects any code that might
+ * try to examine the Portal later.
+ */
+ portal->stmts = NIL;
+ }
+}
+
+/*
+ * PortalCreateHoldStore
+ * Create the tuplestore for a portal.
+ */
+void
+PortalCreateHoldStore(Portal portal)
+{
+ MemoryContext oldcxt;
+
+ Assert(portal->holdContext == NULL);
+ Assert(portal->holdStore == NULL);
+ Assert(portal->holdSnapshot == NULL);
+
+ /*
+ * Create the memory context that is used for storage of the tuple set.
+ * Note this is NOT a child of the portal's portalContext.
+ */
+ portal->holdContext =
+ AllocSetContextCreate(TopPortalContext,
+ "PortalHoldContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Create the tuple store, selecting cross-transaction temp files, and
+ * enabling random access only if cursor requires scrolling.
+ *
+ * XXX: Should maintenance_work_mem be used for the portal size?
+ */
+ oldcxt = MemoryContextSwitchTo(portal->holdContext);
+
+ portal->holdStore =
+ tuplestore_begin_heap(portal->cursorOptions & CURSOR_OPT_SCROLL,
+ true, work_mem);
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * PinPortal
+ * Protect a portal from dropping.
+ *
+ * A pinned portal is still unpinned and dropped at transaction or
+ * subtransaction abort.
+ */
+void
+PinPortal(Portal portal)
+{
+ if (portal->portalPinned)
+ elog(ERROR, "portal already pinned");
+
+ portal->portalPinned = true;
+}
+
+void
+UnpinPortal(Portal portal)
+{
+ if (!portal->portalPinned)
+ elog(ERROR, "portal not pinned");
+
+ portal->portalPinned = false;
+}
+
+/*
+ * MarkPortalActive
+ * Transition a portal from READY to ACTIVE state.
+ *
+ * NOTE: never set portal->status = PORTAL_ACTIVE directly; call this instead.
+ */
+void
+MarkPortalActive(Portal portal)
+{
+ /* For safety, this is a runtime test not just an Assert */
+ if (portal->status != PORTAL_READY)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("portal \"%s\" cannot be run", portal->name)));
+ /* Perform the state transition */
+ portal->status = PORTAL_ACTIVE;
+ portal->activeSubid = GetCurrentSubTransactionId();
+}
+
+/*
+ * MarkPortalDone
+ * Transition a portal from ACTIVE to DONE state.
+ *
+ * NOTE: never set portal->status = PORTAL_DONE directly; call this instead.
+ */
+void
+MarkPortalDone(Portal portal)
+{
+ /* Perform the state transition */
+ Assert(portal->status == PORTAL_ACTIVE);
+ portal->status = PORTAL_DONE;
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about. We might as
+ * well do that now, since the portal can't be executed any more.
+ *
+ * In some cases involving execution of a ROLLBACK command in an already
+ * aborted transaction, this is necessary, or we'd reach AtCleanup_Portals
+ * with the cleanup hook still unexecuted.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+}
+
+/*
+ * MarkPortalFailed
+ * Transition a portal into FAILED state.
+ *
+ * NOTE: never set portal->status = PORTAL_FAILED directly; call this instead.
+ */
+void
+MarkPortalFailed(Portal portal)
+{
+ /* Perform the state transition */
+ Assert(portal->status != PORTAL_DONE);
+ portal->status = PORTAL_FAILED;
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about. We might as
+ * well do that now, since the portal can't be executed any more.
+ *
+ * In some cases involving cleanup of an already aborted transaction, this
+ * is necessary, or we'd reach AtCleanup_Portals with the cleanup hook
+ * still unexecuted.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+}
+
+/*
+ * PortalDrop
+ * Destroy the portal.
+ */
+void
+PortalDrop(Portal portal, bool isTopCommit)
+{
+ Assert(PortalIsValid(portal));
+
+ /*
+ * Don't allow dropping a pinned portal, it's still needed by whoever
+ * pinned it.
+ */
+ if (portal->portalPinned)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cannot drop pinned portal \"%s\"", portal->name)));
+
+ /*
+ * Not sure if the PORTAL_ACTIVE case can validly happen or not...
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cannot drop active portal \"%s\"", portal->name)));
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about, in particular
+ * shutting down the executor if still active. This step potentially runs
+ * user-defined code so failure has to be expected. It's the cleanup
+ * hook's responsibility to not try to do that more than once, in the case
+ * that failure occurs and then we come back to drop the portal again
+ * during transaction abort.
+ *
+ * Note: in most paths of control, this will have been done already in
+ * MarkPortalDone or MarkPortalFailed. We're just making sure.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+
+ /* There shouldn't be an active snapshot anymore, except after error */
+ Assert(portal->portalSnapshot == NULL || !isTopCommit);
+
+ /*
+ * Remove portal from hash table. Because we do this here, we will not
+ * come back to try to remove the portal again if there's any error in the
+ * subsequent steps. Better to leak a little memory than to get into an
+ * infinite error-recovery loop.
+ */
+ PortalHashTableDelete(portal);
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * If portal has a snapshot protecting its data, release that. This needs
+ * a little care since the registration will be attached to the portal's
+ * resowner; if the portal failed, we will already have released the
+ * resowner (and the snapshot) during transaction abort.
+ */
+ if (portal->holdSnapshot)
+ {
+ if (portal->resowner)
+ UnregisterSnapshotFromOwner(portal->holdSnapshot,
+ portal->resowner);
+ portal->holdSnapshot = NULL;
+ }
+
+ /*
+ * Release any resources still attached to the portal. There are several
+ * cases being covered here:
+ *
+ * Top transaction commit (indicated by isTopCommit): normally we should
+ * do nothing here and let the regular end-of-transaction resource
+ * releasing mechanism handle these resources too. However, if we have a
+ * FAILED portal (eg, a cursor that got an error), we'd better clean up
+ * its resources to avoid resource-leakage warning messages.
+ *
+ * Sub transaction commit: never comes here at all, since we don't kill
+ * any portals in AtSubCommit_Portals().
+ *
+ * Main or sub transaction abort: we will do nothing here because
+ * portal->resowner was already set NULL; the resources were already
+ * cleaned up in transaction abort.
+ *
+ * Ordinary portal drop: must release resources. However, if the portal
+ * is not FAILED then we do not release its locks. The locks become the
+ * responsibility of the transaction's ResourceOwner (since it is the
+ * parent of the portal's owner) and will be released when the transaction
+ * eventually ends.
+ */
+ if (portal->resowner &&
+ (!isTopCommit || portal->status == PORTAL_FAILED))
+ {
+ bool isCommit = (portal->status != PORTAL_FAILED);
+
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ isCommit, false);
+ ResourceOwnerDelete(portal->resowner);
+ }
+ portal->resowner = NULL;
+
+ /*
+ * Delete tuplestore if present. We should do this even under error
+ * conditions; since the tuplestore would have been using cross-
+ * transaction storage, its temp files need to be explicitly deleted.
+ */
+ if (portal->holdStore)
+ {
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(portal->holdContext);
+ tuplestore_end(portal->holdStore);
+ MemoryContextSwitchTo(oldcontext);
+ portal->holdStore = NULL;
+ }
+
+ /* delete tuplestore storage, if any */
+ if (portal->holdContext)
+ MemoryContextDelete(portal->holdContext);
+
+ /* release subsidiary storage */
+ MemoryContextDelete(portal->portalContext);
+
+ /* release portal struct (it's in TopPortalContext) */
+ pfree(portal);
+}
+
+/*
+ * Delete all declared cursors.
+ *
+ * Used by commands: CLOSE ALL, DISCARD ALL
+ */
+void
+PortalHashTableDeleteAll(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ if (PortalHashTable == NULL)
+ return;
+
+ hash_seq_init(&status, PortalHashTable);
+ while ((hentry = hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /* Can't close the active portal (the one running the command) */
+ if (portal->status == PORTAL_ACTIVE)
+ continue;
+
+ PortalDrop(portal, false);
+
+ /* Restart the iteration in case that led to other drops */
+ hash_seq_term(&status);
+ hash_seq_init(&status, PortalHashTable);
+ }
+}
+
+/*
+ * "Hold" a portal. Prepare it for access by later transactions.
+ */
+static void
+HoldPortal(Portal portal)
+{
+ /*
+ * Note that PersistHoldablePortal() must release all resources used by
+ * the portal that are local to the creating transaction.
+ */
+ PortalCreateHoldStore(portal);
+ PersistHoldablePortal(portal);
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * Any resources belonging to the portal will be released in the upcoming
+ * transaction-wide cleanup; the portal will no longer have its own
+ * resources.
+ */
+ portal->resowner = NULL;
+
+ /*
+ * Having successfully exported the holdable cursor, mark it as not
+ * belonging to this transaction.
+ */
+ portal->createSubid = InvalidSubTransactionId;
+ portal->activeSubid = InvalidSubTransactionId;
+ portal->createLevel = 0;
+}
+
+/*
+ * Pre-commit processing for portals.
+ *
+ * Holdable cursors created in this transaction need to be converted to
+ * materialized form, since we are going to close down the executor and
+ * release locks. Non-holdable portals created in this transaction are
+ * simply removed. Portals remaining from prior transactions should be
+ * left untouched.
+ *
+ * Returns true if any portals changed state (possibly causing user-defined
+ * code to be run), false if not.
+ */
+bool
+PreCommit_Portals(bool isPrepare)
+{
+ bool result = false;
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /*
+ * There should be no pinned portals anymore. Complain if someone
+ * leaked one. Auto-held portals are allowed; we assume that whoever
+ * pinned them is managing them.
+ */
+ if (portal->portalPinned && !portal->autoHeld)
+ elog(ERROR, "cannot commit while a portal is pinned");
+
+ /*
+ * Do not touch active portals --- this can only happen in the case of
+ * a multi-transaction utility command, such as VACUUM, or a commit in
+ * a procedure.
+ *
+ * Note however that any resource owner attached to such a portal is
+ * still going to go away, so don't leave a dangling pointer. Also
+ * unregister any snapshots held by the portal, mainly to avoid
+ * snapshot leak warnings from ResourceOwnerRelease().
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ {
+ if (portal->holdSnapshot)
+ {
+ if (portal->resowner)
+ UnregisterSnapshotFromOwner(portal->holdSnapshot,
+ portal->resowner);
+ portal->holdSnapshot = NULL;
+ }
+ portal->resowner = NULL;
+ /* Clear portalSnapshot too, for cleanliness */
+ portal->portalSnapshot = NULL;
+ continue;
+ }
+
+ /* Is it a holdable portal created in the current xact? */
+ if ((portal->cursorOptions & CURSOR_OPT_HOLD) &&
+ portal->createSubid != InvalidSubTransactionId &&
+ portal->status == PORTAL_READY)
+ {
+ /*
+ * We are exiting the transaction that created a holdable cursor.
+ * Instead of dropping the portal, prepare it for access by later
+ * transactions.
+ *
+ * However, if this is PREPARE TRANSACTION rather than COMMIT,
+ * refuse PREPARE, because the semantics seem pretty unclear.
+ */
+ if (isPrepare)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE a transaction that has created a cursor WITH HOLD")));
+
+ HoldPortal(portal);
+
+ /* Report we changed state */
+ result = true;
+ }
+ else if (portal->createSubid == InvalidSubTransactionId)
+ {
+ /*
+ * Do nothing to cursors held over from a previous transaction
+ * (including ones we just froze in a previous cycle of this loop)
+ */
+ continue;
+ }
+ else
+ {
+ /* Zap all non-holdable portals */
+ PortalDrop(portal, true);
+
+ /* Report we changed state */
+ result = true;
+ }
+
+ /*
+ * After either freezing or dropping a portal, we have to restart the
+ * iteration, because we could have invoked user-defined code that
+ * caused a drop of the next portal in the hash chain.
+ */
+ hash_seq_term(&status);
+ hash_seq_init(&status, PortalHashTable);
+ }
+
+ return result;
+}
+
+/*
+ * Abort processing for portals.
+ *
+ * At this point we run the cleanup hook if present, but we can't release the
+ * portal's memory until the cleanup call.
+ */
+void
+AtAbort_Portals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /*
+ * When elog(FATAL) is in progress, we need to set the active portal to
+ * failed, so that PortalCleanup() doesn't run the executor shutdown.
+ */
+ if (portal->status == PORTAL_ACTIVE && shmem_exit_inprogress)
+ MarkPortalFailed(portal);
+
+ /*
+ * Do nothing else to cursors held over from a previous transaction.
+ */
+ if (portal->createSubid == InvalidSubTransactionId)
+ continue;
+
+ /*
+ * Do nothing to auto-held cursors. This is similar to the case of a
+ * cursor from a previous transaction, but it could also be that the
+ * cursor was auto-held in this transaction, so it wants to live on.
+ */
+ if (portal->autoHeld)
+ continue;
+
+ /*
+ * If it was created in the current transaction, we can't do normal
+ * shutdown on a READY portal either; it might refer to objects
+ * created in the failed transaction. See comments in
+ * AtSubAbort_Portals.
+ */
+ if (portal->status == PORTAL_READY)
+ MarkPortalFailed(portal);
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about, if we
+ * haven't already.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * Any resources belonging to the portal will be released in the
+ * upcoming transaction-wide cleanup; they will be gone before we run
+ * PortalDrop.
+ */
+ portal->resowner = NULL;
+
+ /*
+ * Although we can't delete the portal data structure proper, we can
+ * release any memory in subsidiary contexts, such as executor state.
+ * The cleanup hook was the last thing that might have needed data
+ * there. But leave active portals alone.
+ */
+ if (portal->status != PORTAL_ACTIVE)
+ MemoryContextDeleteChildren(portal->portalContext);
+ }
+}
+
+/*
+ * Post-abort cleanup for portals.
+ *
+ * Delete all portals not held over from prior transactions.
+ */
+void
+AtCleanup_Portals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /*
+ * Do not touch active portals --- this can only happen in the case of
+ * a multi-transaction command.
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ continue;
+
+ /*
+ * Do nothing to cursors held over from a previous transaction or
+ * auto-held ones.
+ */
+ if (portal->createSubid == InvalidSubTransactionId || portal->autoHeld)
+ {
+ Assert(portal->status != PORTAL_ACTIVE);
+ Assert(portal->resowner == NULL);
+ continue;
+ }
+
+ /*
+ * If a portal is still pinned, forcibly unpin it. PortalDrop will not
+ * let us drop the portal otherwise. Whoever pinned the portal was
+ * interrupted by the abort too and won't try to use it anymore.
+ */
+ if (portal->portalPinned)
+ portal->portalPinned = false;
+
+ /*
+ * We had better not call any user-defined code during cleanup, so if
+ * the cleanup hook hasn't been run yet, too bad; we'll just skip it.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name);
+ portal->cleanup = NULL;
+ }
+
+ /* Zap it. */
+ PortalDrop(portal, false);
+ }
+}
+
+/*
+ * Portal-related cleanup when we return to the main loop on error.
+ *
+ * This is different from the cleanup at transaction abort. Auto-held portals
+ * are cleaned up on error but not on transaction abort.
+ */
+void
+PortalErrorCleanup(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->autoHeld)
+ {
+ portal->portalPinned = false;
+ PortalDrop(portal, false);
+ }
+ }
+}
+
+/*
+ * Pre-subcommit processing for portals.
+ *
+ * Reassign portals created or used in the current subtransaction to the
+ * parent subtransaction.
+ */
+void
+AtSubCommit_Portals(SubTransactionId mySubid,
+ SubTransactionId parentSubid,
+ int parentLevel,
+ ResourceOwner parentXactOwner)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->createSubid == mySubid)
+ {
+ portal->createSubid = parentSubid;
+ portal->createLevel = parentLevel;
+ if (portal->resowner)
+ ResourceOwnerNewParent(portal->resowner, parentXactOwner);
+ }
+ if (portal->activeSubid == mySubid)
+ portal->activeSubid = parentSubid;
+ }
+}
+
+/*
+ * Subtransaction abort handling for portals.
+ *
+ * Deactivate portals created or used during the failed subtransaction.
+ * Note that per AtSubCommit_Portals, this will catch portals created/used
+ * in descendants of the subtransaction too.
+ *
+ * We don't destroy any portals here; that's done in AtSubCleanup_Portals.
+ */
+void
+AtSubAbort_Portals(SubTransactionId mySubid,
+ SubTransactionId parentSubid,
+ ResourceOwner myXactOwner,
+ ResourceOwner parentXactOwner)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /* Was it created in this subtransaction? */
+ if (portal->createSubid != mySubid)
+ {
+ /* No, but maybe it was used in this subtransaction? */
+ if (portal->activeSubid == mySubid)
+ {
+ /* Maintain activeSubid until the portal is removed */
+ portal->activeSubid = parentSubid;
+
+ /*
+ * A MarkPortalActive() caller ran an upper-level portal in
+ * this subtransaction and left the portal ACTIVE. This can't
+ * happen, but force the portal into FAILED state for the same
+ * reasons discussed below.
+ *
+ * We assume we can get away without forcing upper-level READY
+ * portals to fail, even if they were run and then suspended.
+ * In theory a suspended upper-level portal could have
+ * acquired some references to objects that are about to be
+ * destroyed, but there should be sufficient defenses against
+ * such cases: the portal's original query cannot contain such
+ * references, and any references within, say, cached plans of
+ * PL/pgSQL functions are not from active queries and should
+ * be protected by revalidation logic.
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ MarkPortalFailed(portal);
+
+ /*
+ * Also, if we failed it during the current subtransaction
+ * (either just above, or earlier), reattach its resource
+ * owner to the current subtransaction's resource owner, so
+ * that any resources it still holds will be released while
+ * cleaning up this subtransaction. This prevents some corner
+ * cases wherein we might get Asserts or worse while cleaning
+ * up objects created during the current subtransaction
+ * (because they're still referenced within this portal).
+ */
+ if (portal->status == PORTAL_FAILED && portal->resowner)
+ {
+ ResourceOwnerNewParent(portal->resowner, myXactOwner);
+ portal->resowner = NULL;
+ }
+ }
+ /* Done if it wasn't created in this subtransaction */
+ continue;
+ }
+
+ /*
+ * Force any live portals of my own subtransaction into FAILED state.
+ * We have to do this because they might refer to objects created or
+ * changed in the failed subtransaction, leading to crashes within
+ * ExecutorEnd when portalcmds.c tries to close down the portal.
+ * Currently, every MarkPortalActive() caller ensures it updates the
+ * portal status again before relinquishing control, so ACTIVE can't
+ * happen here. If it does happen, dispose the portal like existing
+ * MarkPortalActive() callers would.
+ */
+ if (portal->status == PORTAL_READY ||
+ portal->status == PORTAL_ACTIVE)
+ MarkPortalFailed(portal);
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about, if we
+ * haven't already.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * Any resources belonging to the portal will be released in the
+ * upcoming transaction-wide cleanup; they will be gone before we run
+ * PortalDrop.
+ */
+ portal->resowner = NULL;
+
+ /*
+ * Although we can't delete the portal data structure proper, we can
+ * release any memory in subsidiary contexts, such as executor state.
+ * The cleanup hook was the last thing that might have needed data
+ * there.
+ */
+ MemoryContextDeleteChildren(portal->portalContext);
+ }
+}
+
+/*
+ * Post-subabort cleanup for portals.
+ *
+ * Drop all portals created in the failed subtransaction (but note that
+ * we will not drop any that were reassigned to the parent above).
+ */
+void
+AtSubCleanup_Portals(SubTransactionId mySubid)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->createSubid != mySubid)
+ continue;
+
+ /*
+ * If a portal is still pinned, forcibly unpin it. PortalDrop will not
+ * let us drop the portal otherwise. Whoever pinned the portal was
+ * interrupted by the abort too and won't try to use it anymore.
+ */
+ if (portal->portalPinned)
+ portal->portalPinned = false;
+
+ /*
+ * We had better not call any user-defined code during cleanup, so if
+ * the cleanup hook hasn't been run yet, too bad; we'll just skip it.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name);
+ portal->cleanup = NULL;
+ }
+
+ /* Zap it. */
+ PortalDrop(portal, false);
+ }
+}
+
+/* Find all available cursors */
+Datum
+pg_cursor(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ HASH_SEQ_STATUS hash_seq;
+ PortalHashEnt *hentry;
+
+ /*
+ * We put all the tuples into a tuplestore in one scan of the hashtable.
+ * This avoids any issue of the hashtable possibly changing between calls.
+ */
+ InitMaterializedSRF(fcinfo, 0);
+
+ hash_seq_init(&hash_seq, PortalHashTable);
+ while ((hentry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ Portal portal = hentry->portal;
+ Datum values[6];
+ bool nulls[6] = {0};
+
+ /* report only "visible" entries */
+ if (!portal->visible)
+ continue;
+
+ values[0] = CStringGetTextDatum(portal->name);
+ values[1] = CStringGetTextDatum(portal->sourceText);
+ values[2] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_HOLD);
+ values[3] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_BINARY);
+ values[4] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_SCROLL);
+ values[5] = TimestampTzGetDatum(portal->creation_time);
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ }
+
+ return (Datum) 0;
+}
+
+bool
+ThereAreNoReadyPortals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->status == PORTAL_READY)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Hold all pinned portals.
+ *
+ * When initiating a COMMIT or ROLLBACK inside a procedure, this must be
+ * called to protect internally-generated cursors from being dropped during
+ * the transaction shutdown. Currently, SPI calls this automatically; PLs
+ * that initiate COMMIT or ROLLBACK some other way are on the hook to do it
+ * themselves. (Note that we couldn't do this in, say, AtAbort_Portals
+ * because we need to run user-defined code while persisting a portal.
+ * It's too late to do that once transaction abort has started.)
+ *
+ * We protect such portals by converting them to held cursors. We mark them
+ * as "auto-held" so that exception exit knows to clean them up. (In normal,
+ * non-exception code paths, the PL needs to clean such portals itself, since
+ * transaction end won't do it anymore; but that should be normal practice
+ * anyway.)
+ */
+void
+HoldPinnedPortals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->portalPinned && !portal->autoHeld)
+ {
+ /*
+ * Doing transaction control, especially abort, inside a cursor
+ * loop that is not read-only, for example using UPDATE ...
+ * RETURNING, has weird semantics issues. Also, this
+ * implementation wouldn't work, because such portals cannot be
+ * held. (The core grammar enforces that only SELECT statements
+ * can drive a cursor, but for example PL/pgSQL does not restrict
+ * it.)
+ */
+ if (portal->strategy != PORTAL_ONE_SELECT)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot perform transaction commands inside a cursor loop that is not read-only")));
+
+ /* Verify it's in a suitable state to be held */
+ if (portal->status != PORTAL_READY)
+ elog(ERROR, "pinned portal is not ready to be auto-held");
+
+ HoldPortal(portal);
+ portal->autoHeld = true;
+ }
+ }
+}
+
+/*
+ * Drop the outer active snapshots for all portals, so that no snapshots
+ * remain active.
+ *
+ * Like HoldPinnedPortals, this must be called when initiating a COMMIT or
+ * ROLLBACK inside a procedure. This has to be separate from that since it
+ * should not be run until we're done with steps that are likely to fail.
+ *
+ * It's tempting to fold this into PreCommit_Portals, but to do so, we'd
+ * need to clean up snapshot management in VACUUM and perhaps other places.
+ */
+void
+ForgetPortalSnapshots(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+ int numPortalSnaps = 0;
+ int numActiveSnaps = 0;
+
+ /* First, scan PortalHashTable and clear portalSnapshot fields */
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->portalSnapshot != NULL)
+ {
+ portal->portalSnapshot = NULL;
+ numPortalSnaps++;
+ }
+ /* portal->holdSnapshot will be cleaned up in PreCommit_Portals */
+ }
+
+ /*
+ * Now, pop all the active snapshots, which should be just those that were
+ * portal snapshots. Ideally we'd drive this directly off the portal
+ * scan, but there's no good way to visit the portals in the correct
+ * order. So just cross-check after the fact.
+ */
+ while (ActiveSnapshotSet())
+ {
+ PopActiveSnapshot();
+ numActiveSnaps++;
+ }
+
+ if (numPortalSnaps != numActiveSnaps)
+ elog(ERROR, "portal snapshots (%d) did not account for all active snapshots (%d)",
+ numPortalSnaps, numActiveSnaps);
+}
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/slab.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/slab.c
new file mode 100644
index 00000000000..718dd2ba03c
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,1097 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ * SLAB allocator definitions.
+ *
+ * SLAB is a MemoryContext implementation designed for cases where large
+ * numbers of equally-sized objects can be allocated and freed efficiently
+ * with minimal memory wastage and fragmentation.
+ *
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/slab.c
+ *
+ *
+ * NOTE:
+ * The constant allocation size allows significant simplification and various
+ * optimizations over more general purpose allocators. The blocks are carved
+ * into chunks of exactly the right size, wasting only the space required to
+ * MAXALIGN the allocated chunks.
+ *
+ * Slab can also help reduce memory fragmentation in cases where longer-lived
+ * chunks remain stored on blocks while most of the other chunks have already
+ * been pfree'd. We give priority to putting new allocations into the
+ * "fullest" block. This help avoid having too many sparsely used blocks
+ * around and allows blocks to more easily become completely unused which
+ * allows them to be eventually free'd.
+ *
+ * We identify the "fullest" block to put new allocations on by using a block
+ * from the lowest populated element of the context's "blocklist" array.
+ * This is an array of dlists containing blocks which we partition by the
+ * number of free chunks the block has. Blocks with fewer free chunks are
+ * stored in a lower indexed dlist array slot. Full blocks go on the 0th
+ * element of the blocklist array. So that we don't have to have too many
+ * elements in the array, each dlist in the array is responsible for a range
+ * of free chunks. When a chunk is palloc'd or pfree'd we may need to move
+ * the block onto another dlist if the number of free chunks crosses the
+ * range boundary that the current list is responsible for. Having just a
+ * few blocklist elements reduces the number of times we must move the block
+ * onto another dlist element.
+ *
+ * We keep track of free chunks within each block by using a block-level free
+ * list. We consult this list when we allocate a new chunk in the block.
+ * The free list is a linked list, the head of which is pointed to with
+ * SlabBlock's freehead field. Each subsequent list item is stored in the
+ * free chunk's memory. We ensure chunks are large enough to store this
+ * address.
+ *
+ * When we allocate a new block, technically all chunks are free, however, to
+ * avoid having to write out the entire block to set the linked list for the
+ * free chunks for every chunk in the block, we instead store a pointer to
+ * the next "unused" chunk on the block and keep track of how many of these
+ * unused chunks there are. When a new block is malloc'd, all chunks are
+ * unused. The unused pointer starts with the first chunk on the block and
+ * as chunks are allocated, the unused pointer is incremented. As chunks are
+ * pfree'd, the unused pointer never goes backwards. The unused pointer can
+ * be thought of as a high watermark for the maximum number of chunks in the
+ * block which have been in use concurrently. When a chunk is pfree'd the
+ * chunk is put onto the head of the free list and the unused pointer is not
+ * changed. We only consume more unused chunks if we run out of free chunks
+ * on the free list. This method effectively gives priority to using
+ * previously used chunks over previously unused chunks, which should perform
+ * better due to CPU caching effects.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/memutils_memorychunk.h"
+#include "utils/memutils_internal.h"
+
+#define Slab_BLOCKHDRSZ MAXALIGN(sizeof(SlabBlock))
+
+#ifdef MEMORY_CONTEXT_CHECKING
+/*
+ * Size of the memory required to store the SlabContext.
+ * MEMORY_CONTEXT_CHECKING builds need some extra memory for the isChunkFree
+ * array.
+ */
+#define Slab_CONTEXT_HDRSZ(chunksPerBlock) \
+ (sizeof(SlabContext) + ((chunksPerBlock) * sizeof(bool)))
+#else
+#define Slab_CONTEXT_HDRSZ(chunksPerBlock) sizeof(SlabContext)
+#endif
+
+/*
+ * The number of partitions to divide the blocklist into, based on the number
+ * of free chunks per block. There must be at least 2.
+ */
+#define SLAB_BLOCKLIST_COUNT 3
+
+/* The maximum number of completely empty blocks to keep around for reuse. */
+#define SLAB_MAXIMUM_EMPTY_BLOCKS 10
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+ /* Allocation parameters for this context: */
+ Size chunkSize; /* the requested (non-aligned) chunk size */
+ Size fullChunkSize; /* chunk size with chunk header and alignment */
+ Size blockSize; /* the size to make each block of chunks */
+ int32 chunksPerBlock; /* number of chunks that fit in 1 block */
+ int32 curBlocklistIndex; /* index into blocklist[] of the element
+ * containing the fullest blocks */
+#ifdef MEMORY_CONTEXT_CHECKING
+ bool *isChunkFree; /* array to mark free chunks in a block during
+ * SlabCheck */
+#endif
+
+ int32 blocklist_shift; /* number of bits to shift the nfree count
+ * by to get the index into blocklist[] */
+ dclist_head emptyblocks; /* empty blocks to use up first instead of
+ * mallocing new blocks */
+
+ /*
+ * Blocks with free space, grouped by the number of free chunks they
+ * contain. Completely full blocks are stored in the 0th element.
+ * Completely empty blocks are stored in emptyblocks or free'd if we have
+ * enough empty blocks already.
+ */
+ dlist_head blocklist[SLAB_BLOCKLIST_COUNT];
+} SlabContext;
+
+/*
+ * SlabBlock
+ * Structure of a single slab block.
+ *
+ * slab: pointer back to the owning MemoryContext
+ * nfree: number of chunks on the block which are unallocated
+ * nunused: number of chunks on the block unallocated and not on the block's
+ * freelist.
+ * freehead: linked-list header storing a pointer to the first free chunk on
+ * the block. Subsequent pointers are stored in the chunk's memory. NULL
+ * indicates the end of the list.
+ * unused: pointer to the next chunk which has yet to be used.
+ * node: doubly-linked list node for the context's blocklist
+ */
+typedef struct SlabBlock
+{
+ SlabContext *slab; /* owning context */
+ int32 nfree; /* number of chunks on free + unused chunks */
+ int32 nunused; /* number of unused chunks */
+ MemoryChunk *freehead; /* pointer to the first free chunk */
+ MemoryChunk *unused; /* pointer to the next unused chunk */
+ dlist_node node; /* doubly-linked list for blocklist[] */
+} SlabBlock;
+
+
+#define Slab_CHUNKHDRSZ sizeof(MemoryChunk)
+#define SlabChunkGetPointer(chk) \
+ ((void *) (((char *) (chk)) + sizeof(MemoryChunk)))
+
+/*
+ * SlabBlockGetChunk
+ * Obtain a pointer to the nth (0-based) chunk in the block
+ */
+#define SlabBlockGetChunk(slab, block, n) \
+ ((MemoryChunk *) ((char *) (block) + Slab_BLOCKHDRSZ \
+ + ((n) * (slab)->fullChunkSize)))
+
+#if defined(MEMORY_CONTEXT_CHECKING) || defined(USE_ASSERT_CHECKING)
+
+/*
+ * SlabChunkIndex
+ * Get the 0-based index of how many chunks into the block the given
+ * chunk is.
+ */
+#define SlabChunkIndex(slab, block, chunk) \
+ (((char *) (chunk) - (char *) SlabBlockGetChunk(slab, block, 0)) / \
+ (slab)->fullChunkSize)
+
+/*
+ * SlabChunkMod
+ * A MemoryChunk should always be at an address which is a multiple of
+ * fullChunkSize starting from the 0th chunk position. This will return
+ * non-zero if it's not.
+ */
+#define SlabChunkMod(slab, block, chunk) \
+ (((char *) (chunk) - (char *) SlabBlockGetChunk(slab, block, 0)) % \
+ (slab)->fullChunkSize)
+
+#endif
+
+/*
+ * SlabIsValid
+ * True iff set is a valid slab allocation set.
+ */
+#define SlabIsValid(set) (PointerIsValid(set) && IsA(set, SlabContext))
+
+/*
+ * SlabBlockIsValid
+ * True iff block is a valid block of slab allocation set.
+ */
+#define SlabBlockIsValid(block) \
+ (PointerIsValid(block) && SlabIsValid((block)->slab))
+
+/*
+ * SlabBlocklistIndex
+ * Determine the blocklist index that a block should be in for the given
+ * number of free chunks.
+ */
+static inline int32
+SlabBlocklistIndex(SlabContext *slab, int nfree)
+{
+ int32 index;
+ int32 blocklist_shift = slab->blocklist_shift;
+
+ Assert(nfree >= 0 && nfree <= slab->chunksPerBlock);
+
+ /*
+ * Determine the blocklist index based on the number of free chunks. We
+ * must ensure that 0 free chunks is dedicated to index 0. Everything
+ * else must be >= 1 and < SLAB_BLOCKLIST_COUNT.
+ *
+ * To make this as efficient as possible, we exploit some two's complement
+ * arithmetic where we reverse the sign before bit shifting. This results
+ * in an nfree of 0 using index 0 and anything non-zero staying non-zero.
+ * This is exploiting 0 and -0 being the same in two's complement. When
+ * we're done, we just need to flip the sign back over again for a
+ * positive index.
+ */
+ index = -((-nfree) >> blocklist_shift);
+
+ if (nfree == 0)
+ Assert(index == 0);
+ else
+ Assert(index >= 1 && index < SLAB_BLOCKLIST_COUNT);
+
+ return index;
+}
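+
+/*
+ * Worked example (editor's note, not part of the upstream source), assuming
+ * chunksPerBlock = 100: SlabContextCreate() then picks blocklist_shift = 6
+ * (the smallest shift with 100 >> shift < SLAB_BLOCKLIST_COUNT - 1), so
+ * -((-nfree) >> 6) maps nfree = 0 to index 0 (full blocks), nfree 1..64 to
+ * index 1, and nfree 65..100 to index 2.
+ */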
+
+/*
+ * SlabFindNextBlockListIndex
+ * Search blocklist for blocks which have free chunks and return the
+ * index of the blocklist found containing at least 1 block with free
+ * chunks. If no block can be found we return 0.
+ *
+ * Note: We give priority to fuller blocks so that these are filled before
+ * emptier blocks. This is done to increase the chances that mostly-empty
+ * blocks will eventually become completely empty so they can be free'd.
+ */
+static int32
+SlabFindNextBlockListIndex(SlabContext *slab)
+{
+ /* start at 1 as blocklist[0] is for full blocks. */
+ for (int i = 1; i < SLAB_BLOCKLIST_COUNT; i++)
+ {
+ /* return the first found non-empty index */
+ if (!dlist_is_empty(&slab->blocklist[i]))
+ return i;
+ }
+
+ /* no blocks with free space */
+ return 0;
+}
+
+/*
+ * SlabGetNextFreeChunk
+ * Return the next free chunk in block and update the block to account
+ * for the returned chunk now being used.
+ */
+static inline MemoryChunk *
+SlabGetNextFreeChunk(SlabContext *slab, SlabBlock *block)
+{
+ MemoryChunk *chunk;
+
+ Assert(block->nfree > 0);
+
+ if (block->freehead != NULL)
+ {
+ chunk = block->freehead;
+
+ /*
+ * Pop the chunk from the linked list of free chunks. The pointer to
+ * the next free chunk is stored in the chunk itself.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(MemoryChunk *));
+ block->freehead = *(MemoryChunk **) SlabChunkGetPointer(chunk);
+
+ /* check nothing stomped on the free chunk's memory */
+ Assert(block->freehead == NULL ||
+ (block->freehead >= SlabBlockGetChunk(slab, block, 0) &&
+ block->freehead <= SlabBlockGetChunk(slab, block, slab->chunksPerBlock - 1) &&
+ SlabChunkMod(slab, block, block->freehead) == 0));
+ }
+ else
+ {
+ Assert(block->nunused > 0);
+
+ chunk = block->unused;
+ block->unused = (MemoryChunk *) (((char *) block->unused) + slab->fullChunkSize);
+ block->nunused--;
+ }
+
+ block->nfree--;
+
+ return chunk;
+}
+
+/*
+ * SlabContextCreate
+ * Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (must be statically allocated)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ *
+ * The MAXALIGN(chunkSize) may not exceed MEMORYCHUNK_MAX_VALUE
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+ const char *name,
+ Size blockSize,
+ Size chunkSize)
+{
+ int chunksPerBlock;
+ Size fullChunkSize;
+ SlabContext *slab;
+ int i;
+
+ /* ensure MemoryChunk's size is properly maxaligned */
+ StaticAssertDecl(Slab_CHUNKHDRSZ == MAXALIGN(Slab_CHUNKHDRSZ),
+ "sizeof(MemoryChunk) is not maxaligned");
+ Assert(MAXALIGN(chunkSize) <= MEMORYCHUNK_MAX_VALUE);
+
+ /*
+ * Ensure there's enough space to store the pointer to the next free chunk
+ * in the memory of the (otherwise) unused allocation.
+ */
+ if (chunkSize < sizeof(MemoryChunk *))
+ chunkSize = sizeof(MemoryChunk *);
+
+ /* length of the maxaligned chunk including the chunk header */
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* ensure there's always space for the sentinel byte */
+ fullChunkSize = Slab_CHUNKHDRSZ + MAXALIGN(chunkSize + 1);
+#else
+ fullChunkSize = Slab_CHUNKHDRSZ + MAXALIGN(chunkSize);
+#endif
+
+ /* compute the number of chunks that will fit on each block */
+ chunksPerBlock = (blockSize - Slab_BLOCKHDRSZ) / fullChunkSize;
+
+ /* Make sure the block can store at least one chunk. */
+ if (chunksPerBlock == 0)
+ elog(ERROR, "block size %zu for slab is too small for %zu-byte chunks",
+ blockSize, chunkSize);
+
+ slab = (SlabContext *) malloc(Slab_CONTEXT_HDRSZ(chunksPerBlock));
+ if (slab == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while creating memory context \"%s\".",
+ name)));
+ }
+
+ /*
+ * Avoid writing code that can fail between here and MemoryContextCreate;
+ * we'd leak the header if we ereport in this stretch.
+ */
+
+ /* Fill in SlabContext-specific header fields */
+ slab->chunkSize = chunkSize;
+ slab->fullChunkSize = fullChunkSize;
+ slab->blockSize = blockSize;
+ slab->chunksPerBlock = chunksPerBlock;
+ slab->curBlocklistIndex = 0;
+
+ /*
+ * Compute a shift that guarantees that shifting chunksPerBlock with it is
+ * < SLAB_BLOCKLIST_COUNT - 1. The reason that we subtract 1 from
+ * SLAB_BLOCKLIST_COUNT in this calculation is that we reserve the 0th
+ * blocklist element for blocks which have no free chunks.
+ *
+ * We calculate the number of bits to shift by rather than a divisor to
+ * divide by as performing division each time we need to find the
+ * blocklist index would be much slower.
+ */
+ slab->blocklist_shift = 0;
+ while ((slab->chunksPerBlock >> slab->blocklist_shift) >= (SLAB_BLOCKLIST_COUNT - 1))
+ slab->blocklist_shift++;
+
+ /* initialize the list to store empty blocks to be reused */
+ dclist_init(&slab->emptyblocks);
+
+ /* initialize each blocklist slot */
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
+ dlist_init(&slab->blocklist[i]);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* set the isChunkFree pointer right after the end of the context */
+ slab->isChunkFree = (bool *) ((char *) slab + sizeof(SlabContext));
+#endif
+
+ /* Finally, do the type-independent part of context creation */
+ MemoryContextCreate((MemoryContext) slab,
+ T_SlabContext,
+ MCTX_SLAB_ID,
+ parent,
+ name);
+
+ return (MemoryContext) slab;
+}
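+
+/*
+ * Illustrative sketch (editor's note, not part of the upstream source):
+ * creating a slab context for fixed-size objects. Every palloc() against
+ * this context must then request exactly sizeof(DemoNode) bytes. The
+ * "DemoNode" struct, the context name, the PG_MMGR_USAGE_SKETCH guard and
+ * the use of SLAB_DEFAULT_BLOCK_SIZE are assumptions for this example only.
+ */
+#ifdef PG_MMGR_USAGE_SKETCH
+typedef struct DemoNode
+{
+ int64 key;
+ void *payload;
+} DemoNode;
+
+static MemoryContext
+demo_create_node_slab(MemoryContext parent)
+{
+ return SlabContextCreate(parent,
+ "demo node slab",
+ SLAB_DEFAULT_BLOCK_SIZE,
+ sizeof(DemoNode));
+}
+#endif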
+
+/*
+ * SlabReset
+ * Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+void
+SlabReset(MemoryContext context)
+{
+ SlabContext *slab = (SlabContext *) context;
+ dlist_mutable_iter miter;
+ int i;
+
+ Assert(SlabIsValid(slab));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ SlabCheck(context);
+#endif
+
+ /* release any retained empty blocks */
+ dclist_foreach_modify(miter, &slab->emptyblocks)
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, miter.cur);
+
+ dclist_delete_from(&slab->emptyblocks, miter.cur);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, slab->blockSize);
+#endif
+ free(block);
+ context->mem_allocated -= slab->blockSize;
+ }
+
+ /* walk over blocklist and free the blocks */
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
+ {
+ dlist_foreach_modify(miter, &slab->blocklist[i])
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, miter.cur);
+
+ dlist_delete(miter.cur);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, slab->blockSize);
+#endif
+ free(block);
+ context->mem_allocated -= slab->blockSize;
+ }
+ }
+
+ slab->curBlocklistIndex = 0;
+
+ Assert(context->mem_allocated == 0);
+}
+
+/*
+ * SlabDelete
+ * Free all memory which is allocated in the given context.
+ */
+void
+SlabDelete(MemoryContext context)
+{
+ /* Reset to release all the SlabBlocks */
+ SlabReset(context);
+ /* And free the context header */
+ free(context);
+}
+
+/*
+ * SlabAlloc
+ * Returns a pointer to allocated memory of given size or NULL if
+ * request could not be completed; memory is added to the slab.
+ */
+void *
+SlabAlloc(MemoryContext context, Size size)
+{
+ SlabContext *slab = (SlabContext *) context;
+ SlabBlock *block;
+ MemoryChunk *chunk;
+
+ Assert(SlabIsValid(slab));
+
+ /* sanity check that this is pointing to a valid blocklist */
+ Assert(slab->curBlocklistIndex >= 0);
+ Assert(slab->curBlocklistIndex <= SlabBlocklistIndex(slab, slab->chunksPerBlock));
+
+ /* make sure we only allow correct request size */
+ if (unlikely(size != slab->chunkSize))
+ elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
+ size, slab->chunkSize);
+
+ /*
+ * Handle the case when there are no partially filled blocks available.
+ * SlabFree() will have updated the curBlocklistIndex setting it to zero
+ * to indicate that it has freed the final block. Also later in
+ * SlabAlloc() we will set the curBlocklistIndex to zero if we end up
+ * filling the final block.
+ */
+ if (unlikely(slab->curBlocklistIndex == 0))
+ {
+ dlist_head *blocklist;
+ int blocklist_idx;
+
+ /* to save allocating a new one, first check the empty blocks list */
+ if (dclist_count(&slab->emptyblocks) > 0)
+ {
+ dlist_node *node = dclist_pop_head_node(&slab->emptyblocks);
+
+ block = dlist_container(SlabBlock, node, node);
+
+ /*
+ * SlabFree() should have left this block in a valid state with
+ * all chunks free. Ensure that's the case.
+ */
+ Assert(block->nfree == slab->chunksPerBlock);
+
+ /* fetch the next chunk from this block */
+ chunk = SlabGetNextFreeChunk(slab, block);
+ }
+ else
+ {
+ block = (SlabBlock *) malloc(slab->blockSize);
+
+ if (unlikely(block == NULL))
+ return NULL;
+
+ block->slab = slab;
+ context->mem_allocated += slab->blockSize;
+
+ /* use the first chunk in the new block */
+ chunk = SlabBlockGetChunk(slab, block, 0);
+
+ block->nfree = slab->chunksPerBlock - 1;
+ block->unused = SlabBlockGetChunk(slab, block, 1);
+ block->freehead = NULL;
+ block->nunused = slab->chunksPerBlock - 1;
+ }
+
+ /* find the blocklist element for storing blocks with 1 used chunk */
+ blocklist_idx = SlabBlocklistIndex(slab, block->nfree);
+ blocklist = &slab->blocklist[blocklist_idx];
+
+ /* this list had better be empty; we just added a block on the assumption that it was */
+ Assert(dlist_is_empty(blocklist));
+
+ dlist_push_head(blocklist, &block->node);
+
+ slab->curBlocklistIndex = blocklist_idx;
+ }
+ else
+ {
+ dlist_head *blocklist = &slab->blocklist[slab->curBlocklistIndex];
+ int new_blocklist_idx;
+
+ Assert(!dlist_is_empty(blocklist));
+
+ /* grab the block from the blocklist */
+ block = dlist_head_element(SlabBlock, node, blocklist);
+
+ /* make sure we actually got a valid block, with matching nfree */
+ Assert(block != NULL);
+ Assert(slab->curBlocklistIndex == SlabBlocklistIndex(slab, block->nfree));
+ Assert(block->nfree > 0);
+
+ /* fetch the next chunk from this block */
+ chunk = SlabGetNextFreeChunk(slab, block);
+
+ /* get the new blocklist index based on the new free chunk count */
+ new_blocklist_idx = SlabBlocklistIndex(slab, block->nfree);
+
+ /*
+ * Handle the case where the blocklist index changes. This also deals
+ * with blocks becoming full, since only full blocks go on blocklist
+ * index 0.
+ */
+ if (unlikely(slab->curBlocklistIndex != new_blocklist_idx))
+ {
+ dlist_delete_from(blocklist, &block->node);
+ dlist_push_head(&slab->blocklist[new_blocklist_idx], &block->node);
+
+ if (dlist_is_empty(blocklist))
+ slab->curBlocklistIndex = SlabFindNextBlockListIndex(slab);
+ }
+ }
+
+ /*
+ * Check that the chunk pointer is actually somewhere on the block and is
+ * aligned as expected.
+ */
+ Assert(chunk >= SlabBlockGetChunk(slab, block, 0));
+ Assert(chunk <= SlabBlockGetChunk(slab, block, slab->chunksPerBlock - 1));
+ Assert(SlabChunkMod(slab, block, chunk) == 0);
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, Slab_CHUNKHDRSZ);
+
+ MemoryChunkSetHdrMask(chunk, block, MAXALIGN(slab->chunkSize),
+ MCTX_SLAB_ID);
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* slab mark to catch clobber of "unused" space */
+ Assert(slab->chunkSize < (slab->fullChunkSize - Slab_CHUNKHDRSZ));
+ set_sentinel(MemoryChunkGetPointer(chunk), size);
+ VALGRIND_MAKE_MEM_NOACCESS(((char *) chunk) +
+ Slab_CHUNKHDRSZ + slab->chunkSize,
+ slab->fullChunkSize -
+ (slab->chunkSize + Slab_CHUNKHDRSZ));
+#endif
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
+#endif
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Slab_CHUNKHDRSZ);
+
+ return MemoryChunkGetPointer(chunk);
+}
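+
+/*
+ * Illustrative note on the size check in SlabAlloc() (hypothetical names):
+ * for a context created with chunkSize = sizeof(MyNode),
+ *
+ *		MemoryContextAlloc(slabcxt, sizeof(MyNode));		succeeds
+ *		MemoryContextAlloc(slabcxt, sizeof(MyNode) + 1);	raises ERROR
+ *
+ * since any request whose size differs from chunkSize is rejected.
+ */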
+
+/*
+ * SlabFree
+ * Frees allocated memory; memory is removed from the slab.
+ */
+void
+SlabFree(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ SlabBlock *block;
+ SlabContext *slab;
+ int curBlocklistIdx;
+ int newBlocklistIdx;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Slab_CHUNKHDRSZ);
+
+ block = MemoryChunkGetBlock(chunk);
+
+ /*
+ * For speed reasons we just Assert that the referenced block is good.
+ * Future field experience may show that this Assert had better become a
+ * regular runtime test-and-elog check.
+ */
+ Assert(SlabBlockIsValid(block));
+ slab = block->slab;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ Assert(slab->chunkSize < (slab->fullChunkSize - Slab_CHUNKHDRSZ));
+ if (!sentinel_ok(pointer, slab->chunkSize))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ slab->header.name, chunk);
+#endif
+
+ /* push this chunk onto the head of the block's free list */
+ *(MemoryChunk **) pointer = block->freehead;
+ block->freehead = chunk;
+
+ block->nfree++;
+
+ Assert(block->nfree > 0);
+ Assert(block->nfree <= slab->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+ /* don't wipe the free list MemoryChunk pointer stored in the chunk */
+ wipe_mem((char *) pointer + sizeof(MemoryChunk *),
+ slab->chunkSize - sizeof(MemoryChunk *));
+#endif
+
+ curBlocklistIdx = SlabBlocklistIndex(slab, block->nfree - 1);
+ newBlocklistIdx = SlabBlocklistIndex(slab, block->nfree);
+
+ /*
+ * Check if the block needs to be moved to another element of the
+ * blocklist now that it has one more free chunk.
+ */
+ if (unlikely(curBlocklistIdx != newBlocklistIdx))
+ {
+ /* do the move */
+ dlist_delete_from(&slab->blocklist[curBlocklistIdx], &block->node);
+ dlist_push_head(&slab->blocklist[newBlocklistIdx], &block->node);
+
+ /*
+ * blocklist[curBlocklistIdx] may now be empty, or we may now be able
+ * to use a lower-indexed blocklist. We must redetermine
+ * slab->curBlocklistIndex if the current blocklist or any lower-indexed
+ * one has changed, so that we always use the list containing the
+ * fullest block(s).
+ */
+ if (slab->curBlocklistIndex >= curBlocklistIdx)
+ {
+ slab->curBlocklistIndex = SlabFindNextBlockListIndex(slab);
+
+ /*
+ * We know there must be a block with at least 1 free chunk, as we
+ * just pfree'd one. Ensure curBlocklistIndex reflects this.
+ */
+ Assert(slab->curBlocklistIndex > 0);
+ }
+ }
+
+ /* Handle when a block becomes completely empty */
+ if (unlikely(block->nfree == slab->chunksPerBlock))
+ {
+ /* remove the block */
+ dlist_delete_from(&slab->blocklist[newBlocklistIdx], &block->node);
+
+ /*
+ * To avoid thrashing malloc/free, we keep a list of empty blocks that
+ * we can reuse instead of having to malloc a new one.
+ */
+ if (dclist_count(&slab->emptyblocks) < SLAB_MAXIMUM_EMPTY_BLOCKS)
+ dclist_push_head(&slab->emptyblocks, &block->node);
+ else
+ {
+ /*
+ * When we have enough empty blocks stored already, we actually
+ * free the block.
+ */
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, slab->blockSize);
+#endif
+ free(block);
+ slab->header.mem_allocated -= slab->blockSize;
+ }
+
+ /*
+ * Check if we need to reset the blocklist index. This is required when
+ * the blocklist this block was on has become completely empty.
+ */
+ if (slab->curBlocklistIndex == newBlocklistIdx &&
+ dlist_is_empty(&slab->blocklist[newBlocklistIdx]))
+ slab->curBlocklistIndex = SlabFindNextBlockListIndex(slab);
+ }
+}
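+
+/*
+ * Sketch of the freed-chunk layout used by SlabFree() above (illustrative
+ * only): the first sizeof(MemoryChunk *) bytes of the chunk's user data are
+ * reused to hold the link to the next free chunk, so the per-block free
+ * list needs no extra storage:
+ *
+ *	+--------------------+---------------------+---------------------------+
+ *	| MemoryChunk header | next free chunk or  | remainder of chunkSize    |
+ *	| (Slab_CHUNKHDRSZ)  | NULL                | (wiped under CLOBBER)     |
+ *	+--------------------+---------------------+---------------------------+
+ *
+ * block->freehead points at the header of the most recently freed chunk.
+ */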
+
+/*
+ * SlabRealloc
+ * Change the allocated size of a chunk.
+ *
+ * As Slab is designed for allocating equally-sized chunks of memory, it can't
+ * do an actual chunk size change. We try to be gentle and allow calls with
+ * exactly the same size, as in that case we can simply return the same
+ * chunk. When the size differs, we throw an error.
+ *
+ * We could also allow requests with size < chunkSize. That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ */
+void *
+SlabRealloc(void *pointer, Size size)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ SlabBlock *block;
+ SlabContext *slab;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Slab_CHUNKHDRSZ);
+
+ block = MemoryChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Slab_CHUNKHDRSZ);
+
+ /*
+ * Try to verify that we have a sane block pointer: the block header
+ * should reference a slab context. (We use a test-and-elog, not just
+ * Assert, because it seems highly likely that we're here in error in the
+ * first place.)
+ */
+ if (!SlabBlockIsValid(block))
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+ slab = block->slab;
+
+ /* can't do actual realloc with slab, but let's try to be gentle */
+ if (size == slab->chunkSize)
+ return pointer;
+
+ elog(ERROR, "slab allocator does not support realloc()");
+ return NULL; /* keep compiler quiet */
+}
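+
+/*
+ * Illustrative example (hypothetical names): for a chunk allocated from a
+ * slab context created with chunkSize = sizeof(MyNode),
+ *
+ *		node = repalloc(node, sizeof(MyNode));			returns node unchanged
+ *		node = repalloc(node, sizeof(MyNode) * 2);		raises ERROR
+ */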
+
+/*
+ * SlabGetChunkContext
+ * Return the MemoryContext that 'pointer' belongs to.
+ */
+MemoryContext
+SlabGetChunkContext(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ SlabBlock *block;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Slab_CHUNKHDRSZ);
+
+ block = MemoryChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Slab_CHUNKHDRSZ);
+
+ Assert(SlabBlockIsValid(block));
+
+ return &block->slab->header;
+}
+
+/*
+ * SlabGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+Size
+SlabGetChunkSpace(void *pointer)
+{
+ MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
+ SlabBlock *block;
+ SlabContext *slab;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Slab_CHUNKHDRSZ);
+
+ block = MemoryChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Slab_CHUNKHDRSZ);
+
+ Assert(SlabBlockIsValid(block));
+ slab = block->slab;
+
+ return slab->fullChunkSize;
+}
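+
+/*
+ * Illustrative note (hypothetical variable): every chunk of a given slab
+ * context reports the same space, namely fullChunkSize, which covers the
+ * MemoryChunk header plus the chunk payload and any padding:
+ *
+ *		Size		space = GetMemoryChunkSpace(node);
+ *
+ * For a slab chunk this ends up calling SlabGetChunkSpace() above.
+ */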
+
+/*
+ * SlabIsEmpty
+ * Is the slab empty of any allocated space?
+ */
+bool
+SlabIsEmpty(MemoryContext context)
+{
+ Assert(SlabIsValid((SlabContext *) context));
+
+ return (context->mem_allocated == 0);
+}
+
+/*
+ * SlabStats
+ * Compute stats about memory consumption of a Slab context.
+ *
+ * printfunc: if not NULL, pass a human-readable stats string to this.
+ * passthru: pass this pointer through to printfunc.
+ * totals: if not NULL, add stats about this context into *totals.
+ * print_to_stderr: print stats to stderr if true, elog otherwise.
+ */
+void
+SlabStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr)
+{
+ SlabContext *slab = (SlabContext *) context;
+ Size nblocks = 0;
+ Size freechunks = 0;
+ Size totalspace;
+ Size freespace = 0;
+ int i;
+
+ Assert(SlabIsValid(slab));
+
+ /* Include context header in totalspace */
+ totalspace = Slab_CONTEXT_HDRSZ(slab->chunksPerBlock);
+
+ /* Add the space consumed by blocks in the emptyblocks list */
+ totalspace += dclist_count(&slab->emptyblocks) * slab->blockSize;
+
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
+ {
+ dlist_iter iter;
+
+ dlist_foreach(iter, &slab->blocklist[i])
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+
+ nblocks++;
+ totalspace += slab->blockSize;
+ freespace += slab->fullChunkSize * block->nfree;
+ freechunks += block->nfree;
+ }
+ }
+
+ if (printfunc)
+ {
+ char stats_string[200];
+
+ /* XXX should we include free chunks on empty blocks? */
+ snprintf(stats_string, sizeof(stats_string),
+ "%zu total in %zu blocks; %u empty blocks; %zu free (%zu chunks); %zu used",
+ totalspace, nblocks, dclist_count(&slab->emptyblocks),
+ freespace, freechunks, totalspace - freespace);
+ printfunc(context, passthru, stats_string, print_to_stderr);
+ }
+
+ if (totals)
+ {
+ totals->nblocks += nblocks;
+ totals->freechunks += freechunks;
+ totals->totalspace += totalspace;
+ totals->freespace += freespace;
+ }
+}
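+
+/*
+ * Illustrative usage (hypothetical context pointer): the per-context line
+ * built above is normally reached through the generic entry point, e.g.
+ *
+ *		MemoryContextStats(slabcxt);
+ *
+ * which prints one "... total in ... blocks; ... empty blocks; ... free
+ * (... chunks); ... used" line for this context (and each of its children)
+ * to stderr.
+ */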
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ * Walk through all blocks looking for inconsistencies.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+void
+SlabCheck(MemoryContext context)
+{
+ SlabContext *slab = (SlabContext *) context;
+ int i;
+ int nblocks = 0;
+ const char *name = slab->header.name;
+ dlist_iter iter;
+
+ Assert(SlabIsValid(slab));
+ Assert(slab->chunksPerBlock > 0);
+
+ /*
+ * Have a look at the empty blocks. These should have all their chunks
+ * marked as free. Ensure that's the case.
+ */
+ dclist_foreach(iter, &slab->emptyblocks)
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+
+ if (block->nfree != slab->chunksPerBlock)
+ elog(WARNING, "problem in slab %s: empty block %p should have %d free chunks but has %d chunks free",
+ name, block, slab->chunksPerBlock, block->nfree);
+ }
+
+ /* walk the non-empty block lists */
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
+ {
+ int j,
+ nfree;
+
+ /* walk all blocks on this blocklist */
+ dlist_foreach(iter, &slab->blocklist[i])
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+ MemoryChunk *cur_chunk;
+
+ /*
+ * Make sure the number of free chunks (in the block header)
+ * matches the position in the blocklist.
+ */
+ if (SlabBlocklistIndex(slab, block->nfree) != i)
+ elog(WARNING, "problem in slab %s: block %p is on blocklist %d but should be on blocklist %d",
+ name, block, i, SlabBlocklistIndex(slab, block->nfree));
+
+ /* make sure the block is not empty */
+ if (block->nfree >= slab->chunksPerBlock)
+ elog(WARNING, "problem in slab %s: empty block %p incorrectly stored on blocklist element %d",
+ name, block, i);
+
+ /* make sure the slab pointer correctly points to this context */
+ if (block->slab != slab)
+ elog(WARNING, "problem in slab %s: bogus slab link in block %p",
+ name, block);
+
+ /* reset the array of free chunks for this block */
+ memset(slab->isChunkFree, 0, (slab->chunksPerBlock * sizeof(bool)));
+ nfree = 0;
+
+ /* walk through the block's free list chunks */
+ cur_chunk = block->freehead;
+ while (cur_chunk != NULL)
+ {
+ int chunkidx = SlabChunkIndex(slab, block, cur_chunk);
+
+ /*
+ * Ensure the free list link points to something on the block
+ * at an address aligned according to the full chunk size.
+ */
+ if (cur_chunk < SlabBlockGetChunk(slab, block, 0) ||
+ cur_chunk > SlabBlockGetChunk(slab, block, slab->chunksPerBlock - 1) ||
+ SlabChunkMod(slab, block, cur_chunk) != 0)
+ elog(WARNING, "problem in slab %s: bogus free list link %p in block %p",
+ name, cur_chunk, block);
+
+ /* count the chunk and mark it free on the free chunk array */
+ nfree++;
+ slab->isChunkFree[chunkidx] = true;
+
+ /* read pointer of the next free chunk */
+ VALGRIND_MAKE_MEM_DEFINED(MemoryChunkGetPointer(cur_chunk), sizeof(MemoryChunk *));
+ cur_chunk = *(MemoryChunk **) SlabChunkGetPointer(cur_chunk);
+ }
+
+ /* check that the unused pointer matches what nunused claims */
+ if (SlabBlockGetChunk(slab, block, slab->chunksPerBlock - block->nunused) !=
+ block->unused)
+ elog(WARNING, "problem in slab %s: mismatch detected between nunused chunks and unused pointer in block %p",
+ name, block);
+
+ /*
+ * count the remaining free chunks that have yet to make it onto
+ * the block's free list.
+ */
+ cur_chunk = block->unused;
+ for (j = 0; j < block->nunused; j++)
+ {
+ int chunkidx = SlabChunkIndex(slab, block, cur_chunk);
+
+ /* count the chunk as free and mark it as such in the array */
+ nfree++;
+ if (chunkidx < slab->chunksPerBlock)
+ slab->isChunkFree[chunkidx] = true;
+
+ /* move forward 1 chunk */
+ cur_chunk = (MemoryChunk *) (((char *) cur_chunk) + slab->fullChunkSize);
+ }
+
+ for (j = 0; j < slab->chunksPerBlock; j++)
+ {
+ if (!slab->isChunkFree[j])
+ {
+ MemoryChunk *chunk = SlabBlockGetChunk(slab, block, j);
+ SlabBlock *chunkblock;
+
+ /* Allow access to the chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, Slab_CHUNKHDRSZ);
+
+ chunkblock = (SlabBlock *) MemoryChunkGetBlock(chunk);
+
+ /* Disallow access to the chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, Slab_CHUNKHDRSZ);
+
+ /*
+ * check the chunk's blockoffset correctly points back to
+ * the block
+ */
+ if (chunkblock != block)
+ elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+ name, block, chunk);
+
+ /* check the sentinel byte is intact */
+ Assert(slab->chunkSize < (slab->fullChunkSize - Slab_CHUNKHDRSZ));
+ if (!sentinel_ok(chunk, Slab_CHUNKHDRSZ + slab->chunkSize))
+ elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+ }
+ }
+
+ /*
+ * Make sure we got the expected number of free chunks (as tracked
+ * in the block header).
+ */
+ if (nfree != block->nfree)
+ elog(WARNING, "problem in slab %s: nfree in block %p is %d but %d chunk were found as free",
+ name, block, block->nfree, nfree);
+
+ nblocks++;
+ }
+ }
+
+ /* the stored empty blocks are tracked in mem_allocated too */
+ nblocks += dclist_count(&slab->emptyblocks);
+
+ Assert(nblocks * slab->blockSize == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */