From 1fe6b17873edfa1429b9f9c1616e23d41a29d0a2 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 2 Dec 2016 11:29:01 -0500 Subject: [PATCH 01/10] Add a crude facility for dealing with relative pointers. C doesn't have any sort of built-in understanding of a pointer relative to some arbitrary base address, but dynamic shared memory segments can be mapped at different addresses in different processes, so any sort of shared data structure stored within a dynamic shared memory segment can't use absolute pointers. We could use something like Size to represent a relative pointer, but then the compiler provides no type-checking. Use stupid macro tricks to get some type-checking. Patch originally by me. Concept suggested by Andres Freund. Recently resubmitted as part of Thomas Munro's work on dynamic shared memory allocation. Discussion: 20131205144434.GG12398@alap2.anarazel.de Discussion: CAEepm=1z5WLuNoJ80PaCvz6EtG9dN0j-KuHcHtU6QEfcPP5-qA@mail.gmail.com (cherry picked from commit fbc1c12a94a638cf4f577fef158175e22ab824a3) --- src/include/utils/relptr.h | 74 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 src/include/utils/relptr.h diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h new file mode 100644 index 00000000000..f01924a1edf --- /dev/null +++ b/src/include/utils/relptr.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * relptr.h + * This file contains basic declarations for relative pointers. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/relptr.h + * + *------------------------------------------------------------------------- + */ + +#ifndef RELPTR_H +#define RELPTR_H + +/* + * Relative pointers are intended to be used when storing an address that may + * be relative either to the base of the processes address space or some + * dynamic shared memory segment mapped therein. + * + * The idea here is that you declare a relative pointer as relptr(type) + * and then use relptr_access to dereference it and relptr_store to change + * it. The use of a union here is a hack, because what's stored in the + * relptr is always a Size, never an actual pointer. But including a pointer + * in the union allows us to use stupid macro tricks to provide some measure + * of type-safety. + */ +#define relptr(type) union { type *relptr_type; Size relptr_off; } + +/* + * pgindent gets confused by declarations of the type relptr(type), so it's + * useful to give them a name that doesn't include parentheses. + */ +#define relptr_declare(type, name) \ + typedef union { type *relptr_type; Size relptr_off; } name; + +#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P +#define relptr_access(base, rp) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \ + (base + (rp).relptr_off))) +#else +/* + * If we don't have __builtin_types_compatible_p, assume we might not have + * __typeof__ either. + */ +#define relptr_access(base, rp) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + (void *) ((rp).relptr_off == 0 ? NULL : (base + (rp).relptr_off))) +#endif + +#define relptr_is_null(rp) \ + ((rp).relptr_off == 0) + +#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P +#define relptr_store(base, rp, val) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \ + (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base))) +#else +/* + * If we don't have __builtin_types_compatible_p, assume we might not have + * __typeof__ either. + */ +#define relptr_store(base, rp, val) \ + (AssertVariableIsOfTypeMacro(base, char *), \ + (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base))) +#endif + +#define relptr_copy(rp1, rp2) \ + ((rp1).relptr_off = (rp2).relptr_off) + +#endif /* RELPTR_H */ From fd76bbfe17675cd5d1b049456443a5c30f815864 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 2 Dec 2016 12:03:30 -0500 Subject: [PATCH 02/10] Management of free memory pages. This is intended as infrastructure for a full-fledged allocator for dynamic shared memory. The interface looks a bit like a real allocator, but only supports allocating and freeing memory in multiples of the 4kB page size. Further, to free memory, you must know the size of the span you wish to free, in pages. While these are make it unsuitable as an allocator in and of itself, it still serves as very useful scaffolding for a full-fledged allocator. Robert Haas and Thomas Munro. This code is mostly the same as my 2014 submission, but Thomas fixed quite a few bugs and made some changes to the interface. Discussion: CA+TgmobkeWptGwiNa+SGFWsTLzTzD-CeLz0KcE-y6LFgoUus4A@mail.gmail.com Discussion: CAEepm=1z5WLuNoJ80PaCvz6EtG9dN0j-KuHcHtU6QEfcPP5-qA@mail.gmail.com (cherry picked from commit 13e14a78ea15f4c581cae408b5010c13961c96de) --- src/backend/utils/mmgr/Makefile | 2 +- src/backend/utils/mmgr/freepage.c | 1886 +++++++++++++++++++++++++++++ src/include/utils/freepage.h | 99 ++ src/tools/pgindent/typedefs.list | 7 + 4 files changed, 1993 insertions(+), 1 deletion(-) create mode 100644 src/backend/utils/mmgr/freepage.c create mode 100644 src/include/utils/freepage.h diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile index 2d24fa0124c..4071dcf15c9 100644 --- a/src/backend/utils/mmgr/Makefile +++ b/src/backend/utils/mmgr/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/utils/mmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = aset.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o +OBJS = aset.o freepage.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o # In PostgreSQL, this is under src/common. It has been backported, but because # we haven't merged the changes that introduced the src/common directory, it diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c new file mode 100644 index 00000000000..8c017a425a4 --- /dev/null +++ b/src/backend/utils/mmgr/freepage.c @@ -0,0 +1,1886 @@ +/*------------------------------------------------------------------------- + * + * freepage.c + * Management of free memory pages. + * + * The intention of this code is to provide infrastructure for memory + * allocators written specifically for PostgreSQL. At least in the case + * of dynamic shared memory, we can't simply use malloc() or even + * relatively thin wrappers like palloc() which sit on top of it, because + * no allocator built into the operating system will deal with relative + * pointers. In the future, we may find other cases in which greater + * control over our own memory management seems desirable. + * + * A FreePageManager keeps track of which 4kB pages of memory are currently + * unused from the point of view of some higher-level memory allocator. + * Unlike a user-facing allocator such as palloc(), a FreePageManager can + * only allocate and free in units of whole pages, and freeing an + * allocation can only be done given knowledge of its length in pages. + * + * Since a free page manager has only a fixed amount of dedicated memory, + * and since there is no underlying allocator, it uses the free pages + * it is given to manage to store its bookkeeping data. It keeps multiple + * freelists of runs of pages, sorted by the size of the run; the head of + * each freelist is stored in the FreePageManager itself, and the first + * page of each run contains a relative pointer to the next run. See + * FreePageManagerGetInternal for more details on how the freelists are + * managed. + * + * To avoid memory fragmentation, it's important to consolidate adjacent + * spans of pages whenever possible; otherwise, large allocation requests + * might not be satisfied even when sufficient contiguous space is + * available. Therefore, in addition to the freelists, we maintain an + * in-memory btree of free page ranges ordered by page number. If a + * range being freed precedes or follows a range that is already free, + * the existing range is extended; if it exactly bridges the gap between + * free ranges, then the two existing ranges are consolidated with the + * newly-freed range to form one great big range of free pages. + * + * When there is only one range of free pages, the btree is trivial and + * is stored within the FreePageManager proper; otherwise, pages are + * allocated from the area under management as needed. Even in cases + * where memory fragmentation is very severe, only a tiny fraction of + * the pages under management are consumed by this btree. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/freepage.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" + +#include "utils/freepage.h" +#include "utils/relptr.h" + + +/* Magic numbers to identify various page types */ +#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0 +#define FREE_PAGE_LEAF_MAGIC 0x98eae728 +#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9 + +/* Doubly linked list of spans of free pages; stored in first page of span. */ +struct FreePageSpanLeader +{ + int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */ + Size npages; /* number of pages in span */ + RelptrFreePageSpanLeader prev; + RelptrFreePageSpanLeader next; +}; + +/* Common header for btree leaf and internal pages. */ +typedef struct FreePageBtreeHeader +{ + int magic; /* FREE_PAGE_LEAF_MAGIC or + * FREE_PAGE_INTERNAL_MAGIC */ + Size nused; /* number of items used */ + RelptrFreePageBtree parent; /* uplink */ +} FreePageBtreeHeader; + +/* Internal key; points to next level of btree. */ +typedef struct FreePageBtreeInternalKey +{ + Size first_page; /* low bound for keys on child page */ + RelptrFreePageBtree child; /* downlink */ +} FreePageBtreeInternalKey; + +/* Leaf key; no payload data. */ +typedef struct FreePageBtreeLeafKey +{ + Size first_page; /* first page in span */ + Size npages; /* number of pages in span */ +} FreePageBtreeLeafKey; + +/* Work out how many keys will fit on a page. */ +#define FPM_ITEMS_PER_INTERNAL_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeInternalKey)) +#define FPM_ITEMS_PER_LEAF_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeLeafKey)) + +/* A btree page of either sort */ +struct FreePageBtree +{ + FreePageBtreeHeader hdr; + union + { + FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE]; + FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE]; + } u; +}; + +/* Results of a btree search */ +typedef struct FreePageBtreeSearchResult +{ + FreePageBtree *page; + Size index; + bool found; + unsigned split_pages; +} FreePageBtreeSearchResult; + +/* Helper functions */ +static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, + FreePageBtree *btp); +static Size FreePageBtreeCleanup(FreePageManager *fpm); +static FreePageBtree *FreePageBtreeFindLeftSibling(char *base, + FreePageBtree *btp); +static FreePageBtree *FreePageBtreeFindRightSibling(char *base, + FreePageBtree *btp); +static Size FreePageBtreeFirstKey(FreePageBtree *btp); +static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm); +static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, + Size index, Size first_page, FreePageBtree *child); +static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, + Size first_page, Size npages); +static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno); +static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, + Size index); +static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp); +static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result); +static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page); +static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page); +static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm, + FreePageBtree *btp); +static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp); +static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf); +static void FreePageManagerDumpSpans(FreePageManager *fpm, + FreePageSpanLeader *span, Size expected_pages, + StringInfo buf); +static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages, + Size *first_page); +static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, + Size npages, bool soft); +static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno); +static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, + Size npages); +static Size FreePageManagerLargestContiguous(FreePageManager *fpm); +static void FreePageManagerUpdateLargest(FreePageManager *fpm); + +#if FPM_EXTRA_ASSERTS +static Size sum_free_pages(FreePageManager *fpm); +#endif + +/* + * Initialize a new, empty free page manager. + * + * 'fpm' should reference caller-provided memory large enough to contain a + * FreePageManager. We'll initialize it here. + * + * 'base' is the address to which all pointers are relative. When managing + * a dynamic shared memory segment, it should normally be the base of the + * segment. When managing backend-private memory, it can be either NULL or, + * if managing a single contiguous extent of memory, the start of that extent. + */ +void +FreePageManagerInitialize(FreePageManager *fpm, char *base) +{ + Size f; + + relptr_store(base, fpm->self, fpm); + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL); + fpm->btree_depth = 0; + fpm->btree_recycle_count = 0; + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->contiguous_pages = 0; + fpm->contiguous_pages_dirty = true; +#ifdef FPM_EXTRA_ASSERTS + fpm->free_pages = 0; +#endif + + for (f = 0; f < FPM_NUM_FREELISTS; f++) + relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL); +} + +/* + * Allocate a run of pages of the given length from the free page manager. + * The return value indicates whether we were able to satisfy the request; + * if true, the first page of the allocation is stored in *first_page. + */ +bool +FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) +{ + bool result; + Size contiguous_pages; + + result = FreePageManagerGetInternal(fpm, npages, first_page); + + /* + * It's a bit counterintuitive, but allocating pages can actually create + * opportunities for cleanup that create larger ranges. We might pull a + * key out of the btree that enables the item at the head of the btree + * recycle list to be inserted; and then if there are more items behind it + * one of those might cause two currently-separated ranges to merge, + * creating a single range of contiguous pages larger than any that + * existed previously. It might be worth trying to improve the cleanup + * algorithm to avoid such corner cases, but for now we just notice the + * condition and do the appropriate reporting. + */ + contiguous_pages = FreePageBtreeCleanup(fpm); + if (fpm->contiguous_pages < contiguous_pages) + fpm->contiguous_pages = contiguous_pages; + + /* + * FreePageManagerGetInternal may have set contiguous_pages_dirty. + * Recompute contigous_pages if so. + */ + FreePageManagerUpdateLargest(fpm); + +#ifdef FPM_EXTRA_ASSERTS + if (result) + { + Assert(fpm->free_pages >= npages); + fpm->free_pages -= npages; + } + Assert(fpm->free_pages == sum_free_pages(fpm)); + Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); +#endif + return result; +} + +#ifdef FPM_EXTRA_ASSERTS +static void +sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum) +{ + char *base = fpm_segment_base(fpm); + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC || + btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + ++*sum; + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + Size index; + + + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + sum_free_pages_recurse(fpm, child, sum); + } + } +} +static Size +sum_free_pages(FreePageManager *fpm) +{ + FreePageSpanLeader *recycle; + char *base = fpm_segment_base(fpm); + Size sum = 0; + int list; + + /* Count the spans by scanning the freelists. */ + for (list = 0; list < FPM_NUM_FREELISTS; ++list) + { + + if (!relptr_is_null(fpm->freelist[list])) + { + FreePageSpanLeader *candidate = + relptr_access(base, fpm->freelist[list]); + + do + { + sum += candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + } + + /* Count btree internal pages. */ + if (fpm->btree_depth > 0) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + sum_free_pages_recurse(fpm, root, &sum); + } + + /* Count the recycle list. */ + for (recycle = relptr_access(base, fpm->btree_recycle); + recycle != NULL; + recycle = relptr_access(base, recycle->next)) + { + Assert(recycle->npages == 1); + ++sum; + } + + return sum; +} +#endif + +/* + * Compute the size of the largest run of pages that the user could + * succesfully get. + */ +static Size +FreePageManagerLargestContiguous(FreePageManager *fpm) +{ + char *base; + Size largest; + + base = fpm_segment_base(fpm); + largest = 0; + if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1])) + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]); + do + { + if (candidate->npages > largest) + largest = candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + else + { + Size f = FPM_NUM_FREELISTS - 1; + + do + { + --f; + if (!relptr_is_null(fpm->freelist[f])) + { + largest = f + 1; + break; + } + } while (f > 0); + } + + return largest; +} + +/* + * Recompute the size of the largest run of pages that the user could + * succesfully get, if it has been marked dirty. + */ +static void +FreePageManagerUpdateLargest(FreePageManager *fpm) +{ + if (!fpm->contiguous_pages_dirty) + return; + + fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm); + fpm->contiguous_pages_dirty = false; +} + +/* + * Transfer a run of pages to the free page manager. + */ +void +FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages) +{ + Size contiguous_pages; + + Assert(npages > 0); + + /* Record the new pages. */ + contiguous_pages = + FreePageManagerPutInternal(fpm, first_page, npages, false); + + /* + * If the new range we inserted into the page manager was contiguous with + * an existing range, it may have opened up cleanup opportunities. + */ + if (contiguous_pages > npages) + { + Size cleanup_contiguous_pages; + + cleanup_contiguous_pages = FreePageBtreeCleanup(fpm); + if (cleanup_contiguous_pages > contiguous_pages) + contiguous_pages = cleanup_contiguous_pages; + } + + /* See if we now have a new largest chunk. */ + if (fpm->contiguous_pages < contiguous_pages) + fpm->contiguous_pages = contiguous_pages; + + /* + * The earlier call to FreePageManagerPutInternal may have set + * contiguous_pages_dirty if it needed to allocate internal pages, so + * recompute contiguous_pages if necessary. + */ + FreePageManagerUpdateLargest(fpm); + +#ifdef FPM_EXTRA_ASSERTS + fpm->free_pages += npages; + Assert(fpm->free_pages == sum_free_pages(fpm)); + Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); +#endif +} + +/* + * Produce a debugging dump of the state of a free page manager. + */ +char * +FreePageManagerDump(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + StringInfoData buf; + FreePageSpanLeader *recycle; + bool dumped_any_freelist = false; + Size f; + + /* Initialize output buffer. */ + initStringInfo(&buf); + + /* Dump general stuff. */ + appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n", + fpm->self.relptr_off, fpm->contiguous_pages); + + /* Dump btree. */ + if (fpm->btree_depth > 0) + { + FreePageBtree *root; + + appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth); + root = relptr_access(base, fpm->btree_root); + FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf); + } + else if (fpm->singleton_npages > 0) + { + appendStringInfo(&buf, "singleton: %zu(%zu)\n", + fpm->singleton_first_page, fpm->singleton_npages); + } + + /* Dump btree recycle list. */ + recycle = relptr_access(base, fpm->btree_recycle); + if (recycle != NULL) + { + appendStringInfo(&buf, "btree recycle:"); + FreePageManagerDumpSpans(fpm, recycle, 1, &buf); + } + + /* Dump free lists. */ + for (f = 0; f < FPM_NUM_FREELISTS; ++f) + { + FreePageSpanLeader *span; + + if (relptr_is_null(fpm->freelist[f])) + continue; + if (!dumped_any_freelist) + { + appendStringInfo(&buf, "freelists:\n"); + dumped_any_freelist = true; + } + appendStringInfo(&buf, " %zu:", f + 1); + span = relptr_access(base, fpm->freelist[f]); + FreePageManagerDumpSpans(fpm, span, f + 1, &buf); + } + + /* And return result to caller. */ + return buf.data; +} + + +/* + * The first_page value stored at index zero in any non-root page must match + * the first_page value stored in its parent at the index which points to that + * page. So when the value stored at index zero in a btree page changes, we've + * got to walk up the tree adjusting ancestor keys until we reach an ancestor + * where that key isn't index zero. This function should be called after + * updating the first key on the target page; it will propagate the change + * upward as far as needed. + * + * We assume here that the first key on the page has not changed enough to + * require changes in the ordering of keys on its ancestor pages. Thus, + * if we search the parent page for the first key greater than or equal to + * the first key on the current page, the downlink to this page will be either + * the exact index returned by the search (if the first key decreased) + * or one less (if the first key increased). + */ +static void +FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + Size first_page; + FreePageBtree *parent; + FreePageBtree *child; + + /* This might be either a leaf or an internal page. */ + Assert(btp->hdr.nused > 0); + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + first_page = btp->u.leaf_key[0].first_page; + } + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + first_page = btp->u.internal_key[0].first_page; + } + child = btp; + + /* Loop until we find an ancestor that does not require adjustment. */ + for (;;) + { + Size s; + + parent = relptr_access(base, child->hdr.parent); + if (parent == NULL) + break; + s = FreePageBtreeSearchInternal(parent, first_page); + + /* Key is either at index s or index s-1; figure out which. */ + if (s >= parent->hdr.nused) + { + Assert(s == parent->hdr.nused); + --s; + } + else + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + if (check != child) + { + Assert(s > 0); + --s; + } + } + +#ifdef USE_ASSERT_CHECKING + /* Debugging double-check. */ + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + Assert(s < parent->hdr.nused); + Assert(child == check); + } +#endif + + /* Update the parent key. */ + parent->u.internal_key[s].first_page = first_page; + + /* + * If this is the first key in the parent, go up another level; else + * done. + */ + if (s > 0) + break; + child = parent; + } +} + +/* + * Attempt to reclaim space from the free-page btree. The return value is + * the largest range of contiguous pages created by the cleanup operation. + */ +static Size +FreePageBtreeCleanup(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + Size max_contiguous_pages = 0; + + /* Attempt to shrink the depth of the btree. */ + while (!relptr_is_null(fpm->btree_root)) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + /* If the root contains only one key, reduce depth by one. */ + if (root->hdr.nused == 1) + { + /* Shrink depth of tree by one. */ + Assert(fpm->btree_depth > 0); + --fpm->btree_depth; + if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + /* If root is a leaf, convert only entry to singleton range. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages; + } + else + { + FreePageBtree *newroot; + + /* If root is an internal page, make only child the root. */ + Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + relptr_copy(fpm->btree_root, root->u.internal_key[0].child); + newroot = relptr_access(base, fpm->btree_root); + relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL); + } + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root)); + } + else if (root->hdr.nused == 2 && + root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Size end_of_first; + Size start_of_second; + + end_of_first = root->u.leaf_key[0].first_page + + root->u.leaf_key[0].npages; + start_of_second = root->u.leaf_key[1].first_page; + + if (end_of_first + 1 == start_of_second) + { + Size root_page = fpm_pointer_to_page(base, root); + + if (end_of_first == root_page) + { + FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page); + FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages + + root->u.leaf_key[1].npages + 1; + fpm->btree_depth = 0; + relptr_store(base, fpm->btree_root, + (FreePageBtree *) NULL); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + Assert(max_contiguous_pages == 0); + max_contiguous_pages = fpm->singleton_npages; + } + } + + /* Whether it worked or not, it's time to stop. */ + break; + } + else + { + /* Nothing more to do. Stop. */ + break; + } + } + + /* + * Attempt to free recycled btree pages. We skip this if releasing the + * recycled page would require a btree page split, because the page we're + * trying to recycle would be consumed by the split, which would be + * counterproductive. + * + * We also currently only ever attempt to recycle the first page on the + * list; that could be made more aggressive, but it's not clear that the + * complexity would be worthwhile. + */ + while (fpm->btree_recycle_count > 0) + { + FreePageBtree *btp; + Size first_page; + Size contiguous_pages; + + btp = FreePageBtreeGetRecycled(fpm); + first_page = fpm_pointer_to_page(base, btp); + contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true); + if (contiguous_pages == 0) + { + FreePageBtreeRecycle(fpm, first_page); + break; + } + else + { + if (contiguous_pages > max_contiguous_pages) + max_contiguous_pages = contiguous_pages; + } + } + + return max_contiguous_pages; +} + +/* + * Consider consolidating the given page with its left or right sibling, + * if it's fairly empty. + */ +static void +FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *np; + Size max; + + /* + * We only try to consolidate pages that are less than a third full. We + * could be more aggressive about this, but that might risk performing + * consolidation only to end up splitting again shortly thereafter. Since + * the btree should be very small compared to the space under management, + * our goal isn't so much to ensure that it always occupies the absolutely + * smallest possible number of pages as to reclaim pages before things get + * too egregiously out of hand. + */ + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + max = FPM_ITEMS_PER_LEAF_PAGE; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + max = FPM_ITEMS_PER_INTERNAL_PAGE; + } + if (btp->hdr.nused >= max / 3) + return; + + /* + * If we can fit our right sibling's keys onto this page, consolidate. + */ + np = FreePageBtreeFindRightSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + } + else + { + memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + FreePageBtreeUpdateParentPointers(base, btp); + } + FreePageBtreeRemovePage(fpm, np); + return; + } + + /* + * If we can fit our keys onto our left sibling's page, consolidate. In + * this case, we move our keys onto the other page rather than visca + * versa, to avoid having to adjust ancestor keys. + */ + np = FreePageBtreeFindLeftSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + } + else + { + memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + FreePageBtreeUpdateParentPointers(base, np); + } + FreePageBtreeRemovePage(fpm, btp); + return; + } +} + +/* + * Find the passed page's left sibling; that is, the page at the same level + * of the tree whose keyspace immediately precedes ours. + */ +static FreePageBtree * +FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move left. */ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index > 0) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index - 1].child); + break; + } + Assert(index == 0); + ++levels; + } + + /* Descend left. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Find the passed page's right sibling; that is, the page at the same level + * of the tree whose keyspace immediately follows ours. + */ +static FreePageBtree * +FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move right. */ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index < p->hdr.nused - 1) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index + 1].child); + break; + } + Assert(index == p->hdr.nused - 1); + ++levels; + } + + /* Descend left. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[0].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Get the first key on a btree page. + */ +static Size +FreePageBtreeFirstKey(FreePageBtree *btp) +{ + Assert(btp->hdr.nused > 0); + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + return btp->u.leaf_key[0].first_page; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + return btp->u.internal_key[0].first_page; + } +} + +/* + * Get a page from the btree recycle list for use as a btree page. + */ +static FreePageBtree * +FreePageBtreeGetRecycled(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *newhead; + + Assert(victim != NULL); + newhead = relptr_access(base, victim->next); + if (newhead != NULL) + relptr_copy(newhead->prev, victim->prev); + relptr_store(base, fpm->btree_recycle, newhead); + Assert(fpm_pointer_is_page_aligned(base, victim)); + fpm->btree_recycle_count--; + return (FreePageBtree *) victim; +} + +/* + * Insert an item into an internal page. + */ +static void +FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, + Size first_page, FreePageBtree *child) +{ + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index], + sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index)); + btp->u.internal_key[index].first_page = first_page; + relptr_store(base, btp->u.internal_key[index].child, child); + ++btp->hdr.nused; +} + +/* + * Insert an item into a leaf page. + */ +static void +FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, + Size npages) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + btp->u.leaf_key[index].first_page = first_page; + btp->u.leaf_key[index].npages = npages; + ++btp->hdr.nused; +} + +/* + * Put a page on the btree recycle list. + */ +static void +FreePageBtreeRecycle(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = 1; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->btree_recycle, span); + fpm->btree_recycle_count++; +} + +/* + * Remove an item from the btree at the given position on the given page. + */ +static void +FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(index < btp->hdr.nused); + + /* When last item is removed, extirpate entire page from btree. */ + if (btp->hdr.nused == 1) + { + FreePageBtreeRemovePage(fpm, btp); + return; + } + + /* Physically remove the key from the page. */ + --btp->hdr.nused; + if (index < btp->hdr.nused) + memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + + /* If we just removed the first key, adjust ancestor keys. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, btp); + + /* Consider whether to consolidate this page with a sibling. */ + FreePageBtreeConsolidate(fpm, btp); +} + +/* + * Remove a page from the btree. Caller is responsible for having relocated + * any keys from this page that are still wanted. The page is placed on the + * recycled list. + */ +static void +FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *parent; + Size index; + Size first_page; + + for (;;) + { + /* Find parent page. */ + parent = relptr_access(base, btp->hdr.parent); + if (parent == NULL) + { + /* We are removing the root page. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->btree_depth = 0; + Assert(fpm->singleton_first_page == 0); + Assert(fpm->singleton_npages == 0); + return; + } + + /* + * If the parent contains only one item, we need to remove it as well. + */ + if (parent->hdr.nused > 1) + break; + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + btp = parent; + } + + /* Find and remove the downlink. */ + first_page = FreePageBtreeFirstKey(btp); + if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + index = FreePageBtreeSearchLeaf(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.leaf_key[index], + &parent->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) + * (parent->hdr.nused - index - 1)); + } + else + { + index = FreePageBtreeSearchInternal(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.internal_key[index], + &parent->u.internal_key[index + 1], + sizeof(FreePageBtreeInternalKey) + * (parent->hdr.nused - index - 1)); + } + parent->hdr.nused--; + Assert(parent->hdr.nused > 0); + + /* Recycle the page. */ + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + + /* Adjust ancestor keys if needed. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + + /* Consider whether to consolidate the parent with a sibling. */ + FreePageBtreeConsolidate(fpm, parent); +} + +/* + * Search the btree for an entry for the given first page and initialize + * *result with the results of the search. result->page and result->index + * indicate either the position of an exact match or the position at which + * the new key should be inserted. result->found is true for an exact match, + * otherwise false. result->split_pages will contain the number of additional + * btree pages that will be needed when performing a split to insert a key. + * Except as described above, the contents of fields in the result object are + * undefined on return. + */ +static void +FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *btp = relptr_access(base, fpm->btree_root); + Size index; + + result->split_pages = 1; + + /* If the btree is empty, there's nothing to find. */ + if (btp == NULL) + { + result->page = NULL; + result->found = false; + return; + } + + /* Descend until we hit a leaf. */ + while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + FreePageBtree *child; + bool found_exact; + + index = FreePageBtreeSearchInternal(btp, first_page); + found_exact = index < btp->hdr.nused && + btp->u.internal_key[index].first_page == first_page; + + /* + * If we found an exact match we descend directly. Otherwise, we + * descend into the child to the left if possible so that we can find + * the insertion point at that child's high end. + */ + if (!found_exact && index > 0) + --index; + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Descend to appropriate child page. */ + Assert(index < btp->hdr.nused); + child = relptr_access(base, btp->u.internal_key[index].child); + Assert(relptr_access(base, child->hdr.parent) == btp); + btp = child; + } + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Search leaf page. */ + index = FreePageBtreeSearchLeaf(btp, first_page); + + /* Assemble results. */ + result->page = btp; + result->index = index; + result->found = index < btp->hdr.nused && + first_page == btp->u.leaf_key[index].first_page; +} + +/* + * Search an internal page for the first key greater than or equal to a given + * page number. Returns the index of that key, or one greater than the number + * of keys on the page if none. + */ +static Size +FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.internal_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Search a leaf page for the first key greater than or equal to a given + * page number. Returns the index of that key, or one greater than the number + * of keys on the page if none. + */ +static Size +FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.leaf_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Allocate a new btree page and move half the keys from the provided page + * to the new page. Caller is responsible for making sure that there's a + * page available from fpm->btree_recycle. Returns a pointer to the new page, + * to which caller must add a downlink. + */ +static FreePageBtree * +FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp) +{ + FreePageBtree *newsibling; + + newsibling = FreePageBtreeGetRecycled(fpm); + newsibling->hdr.magic = btp->hdr.magic; + newsibling->hdr.nused = btp->hdr.nused / 2; + relptr_copy(newsibling->hdr.parent, btp->hdr.parent); + btp->hdr.nused -= newsibling->hdr.nused; + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + memcpy(&newsibling->u.leaf_key, + &btp->u.leaf_key[btp->hdr.nused], + sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused); + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + memcpy(&newsibling->u.internal_key, + &btp->u.internal_key[btp->hdr.nused], + sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused); + FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling); + } + + return newsibling; +} + +/* + * When internal pages are split or merged, the parent pointers of their + * children must be updated. + */ +static void +FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp) +{ + Size i; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + for (i = 0; i < btp->hdr.nused; ++i) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[i].child); + relptr_store(base, child->hdr.parent, btp); + } +} + +/* + * Debugging dump of btree data. + */ +static void +FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + Size pageno = fpm_pointer_to_page(base, btp); + Size index; + FreePageBtree *check_parent; + + check_stack_depth(); + check_parent = relptr_access(base, btp->hdr.parent); + appendStringInfo(buf, " %zu@%d %c", pageno, level, + btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l'); + if (parent != check_parent) + appendStringInfo(buf, " [actual parent %zu, expected %zu]", + fpm_pointer_to_page(base, check_parent), + fpm_pointer_to_page(base, parent)); + appendStringInfoChar(buf, ':'); + for (index = 0; index < btp->hdr.nused; ++index) + { + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + appendStringInfo(buf, " %zu->%zu", + btp->u.internal_key[index].first_page, + btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE); + else + appendStringInfo(buf, " %zu(%zu)", + btp->u.leaf_key[index].first_page, + btp->u.leaf_key[index].npages); + } + appendStringInfo(buf, "\n"); + + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf); + } + } +} + +/* + * Debugging dump of free-span data. + */ +static void +FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, + Size expected_pages, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + + while (span != NULL) + { + if (span->npages != expected_pages) + appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span), + span->npages); + else + appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span)); + span = relptr_access(base, span->next); + } + + appendStringInfo(buf, "\n"); +} + +/* + * This function allocates a run of pages of the given length from the free + * page manager. + */ +static bool +FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = NULL; + FreePageSpanLeader *prev; + FreePageSpanLeader *next; + FreePageBtreeSearchResult result; + Size victim_page = 0; /* placate compiler */ + Size f; + + /* + * Search for a free span. + * + * Right now, we use a simple best-fit policy here, but it's possible for + * this to result in memory fragmentation if we're repeatedly asked to + * allocate chunks just a little smaller than what we have available. + * Hopefully, this is unlikely, because we expect most requests to be + * single pages or superblock-sized chunks -- but no policy can be optimal + * under all circumstances unless it has knowledge of future allocation + * patterns. + */ + for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f) + { + /* Skip empty freelists. */ + if (relptr_is_null(fpm->freelist[f])) + continue; + + /* + * All of the freelists except the last one contain only items of a + * single size, so we just take the first one. But the final free + * list contains everything too big for any of the other lists, so we + * need to search the list. + */ + if (f < FPM_NUM_FREELISTS - 1) + victim = relptr_access(base, fpm->freelist[f]); + else + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[f]); + do + { + if (candidate->npages >= npages && (victim == NULL || + victim->npages > candidate->npages)) + { + victim = candidate; + if (victim->npages == npages) + break; + } + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + break; + } + + /* If we didn't find an allocatable span, return failure. */ + if (victim == NULL) + return false; + + /* Remove span from free list. */ + Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC); + prev = relptr_access(base, victim->prev); + next = relptr_access(base, victim->next); + if (prev != NULL) + relptr_copy(prev->next, victim->next); + else + relptr_copy(fpm->freelist[f], victim->next); + if (next != NULL) + relptr_copy(next->prev, victim->prev); + victim_page = fpm_pointer_to_page(base, victim); + + /* Decide whether we might be invalidating contiguous_pages. */ + if (f == FPM_NUM_FREELISTS - 1 && + victim->npages == fpm->contiguous_pages) + { + /* + * The victim span came from the oversized freelist, and had the same + * size as the longest span. There may or may not be another one of + * the same size, so contiguous_pages must be recomputed just to be + * safe. + */ + fpm->contiguous_pages_dirty = true; + } + else if (f + 1 == fpm->contiguous_pages && + relptr_is_null(fpm->freelist[f])) + { + /* + * The victim span came from a fixed sized freelist, and it was the + * list for spans of the same size as the current longest span, and + * the list is now empty after removing the victim. So + * contiguous_pages must be recomputed without a doubt. + */ + fpm->contiguous_pages_dirty = true; + } + + /* + * If we haven't initialized the btree yet, the victim must be the single + * span stored within the FreePageManager itself. Otherwise, we need to + * update the btree. + */ + if (relptr_is_null(fpm->btree_root)) + { + Assert(victim_page == fpm->singleton_first_page); + Assert(victim->npages == fpm->singleton_npages); + Assert(victim->npages >= npages); + fpm->singleton_first_page += npages; + fpm->singleton_npages -= npages; + if (fpm->singleton_npages > 0) + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + } + else + { + /* + * If the span we found is exactly the right size, remove it from the + * btree completely. Otherwise, adjust the btree entry to reflect the + * still-unallocated portion of the span, and put that portion on the + * appropriate free list. + */ + FreePageBtreeSearch(fpm, victim_page, &result); + Assert(result.found); + if (victim->npages == npages) + FreePageBtreeRemove(fpm, result.page, result.index); + else + { + FreePageBtreeLeafKey *key; + + /* Adjust btree to reflect remaining pages. */ + Assert(victim->npages > npages); + key = &result.page->u.leaf_key[result.index]; + Assert(key->npages == victim->npages); + key->first_page += npages; + key->npages -= npages; + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put the unallocated pages back on the appropriate free list. */ + FreePagePushSpanLeader(fpm, victim_page + npages, + victim->npages - npages); + } + } + + /* Return results to caller. */ + *first_page = fpm_pointer_to_page(base, victim); + return true; +} + +/* + * Put a range of pages into the btree and freelists, consolidating it with + * existing free spans just before and/or after it. If 'soft' is true, + * only perform the insertion if it can be done without allocating new btree + * pages; if false, do it always. Returns 0 if the soft flag caused the + * insertion to be skipped, or otherwise the size of the contiguous span + * created by the insertion. This may be larger than npages if we're able + * to consolidate with an adjacent range. *internal_pages_used is set to + * true if the btree allocated pages for internal purposes, which might + * invalidate the current largest run requiring it to be recomputed. + */ +static Size +FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, + bool soft) +{ + char *base = fpm_segment_base(fpm); + FreePageBtreeSearchResult result; + FreePageBtreeLeafKey *prevkey = NULL; + FreePageBtreeLeafKey *nextkey = NULL; + FreePageBtree *np; + Size nindex; + + Assert(npages > 0); + + /* We can store a single free span without initializing the btree. */ + if (fpm->btree_depth == 0) + { + if (fpm->singleton_npages == 0) + { + /* Don't have a span yet; store this one. */ + fpm->singleton_first_page = first_page; + fpm->singleton_npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return fpm->singleton_npages; + } + else if (fpm->singleton_first_page + fpm->singleton_npages == + first_page) + { + /* New span immediately follows sole existing span. */ + fpm->singleton_npages += npages; + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else if (first_page + npages == fpm->singleton_first_page) + { + /* New span immediately precedes sole existing span. */ + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + fpm->singleton_first_page = first_page; + fpm->singleton_npages += npages; + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else + { + /* Not contiguous; we need to initialize the btree. */ + Size root_page; + FreePageBtree *root; + + if (!relptr_is_null(fpm->btree_recycle)) + root = FreePageBtreeGetRecycled(fpm); + else if (FreePageManagerGetInternal(fpm, 1, &root_page)) + root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); + else + { + /* We'd better be able to get a page from the existing range. */ + elog(FATAL, "free page manager btree is corrupt"); + } + + /* Create the btree and move the preexisting range into it. */ + root->hdr.magic = FREE_PAGE_LEAF_MAGIC; + root->hdr.nused = 1; + relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL); + root->u.leaf_key[0].first_page = fpm->singleton_first_page; + root->u.leaf_key[0].npages = fpm->singleton_npages; + relptr_store(base, fpm->btree_root, root); + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->btree_depth = 1; + + /* + * Corner case: it may be that the btree root took the very last + * free page. In that case, the sole btree entry covers a zero + * page run, which is invalid. Overwrite it with the entry we're + * trying to insert and get out. + */ + if (root->u.leaf_key[0].npages == 0) + { + root->u.leaf_key[0].first_page = first_page; + root->u.leaf_key[0].npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return npages; + } + + /* Fall through to insert the new key. */ + } + } + + /* Search the btree. */ + FreePageBtreeSearch(fpm, first_page, &result); + Assert(!result.found); + if (result.index > 0) + prevkey = &result.page->u.leaf_key[result.index - 1]; + if (result.index < result.page->hdr.nused) + { + np = result.page; + nindex = result.index; + nextkey = &result.page->u.leaf_key[result.index]; + } + else + { + np = FreePageBtreeFindRightSibling(base, result.page); + nindex = 0; + if (np != NULL) + nextkey = &np->u.leaf_key[0]; + } + + /* Consolidate with the previous entry if possible. */ + if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page) + { + bool remove_next = false; + Size result; + + Assert(prevkey->first_page + prevkey->npages == first_page); + prevkey->npages = (first_page - prevkey->first_page) + npages; + + /* Check whether we can *also* consolidate with the following entry. */ + if (nextkey != NULL && + prevkey->first_page + prevkey->npages >= nextkey->first_page) + { + Assert(prevkey->first_page + prevkey->npages == + nextkey->first_page); + prevkey->npages = (nextkey->first_page - prevkey->first_page) + + nextkey->npages; + FreePagePopSpanLeader(fpm, nextkey->first_page); + remove_next = true; + } + + /* Put the span on the correct freelist and save size. */ + FreePagePopSpanLeader(fpm, prevkey->first_page); + FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages); + result = prevkey->npages; + + /* + * If we consolidated with both the preceding and following entries, + * we must remove the following entry. We do this last, because + * removing an element from the btree may invalidate pointers we hold + * into the current data structure. + * + * NB: The btree is technically in an invalid state a this point + * because we've already updated prevkey to cover the same key space + * as nextkey. FreePageBtreeRemove() shouldn't notice that, though. + */ + if (remove_next) + FreePageBtreeRemove(fpm, np, nindex); + + return result; + } + + /* Consolidate with the next entry if possible. */ + if (nextkey != NULL && first_page + npages >= nextkey->first_page) + { + Size newpages; + + /* Compute new size for span. */ + Assert(first_page + npages == nextkey->first_page); + newpages = (nextkey->first_page - first_page) + nextkey->npages; + + /* Put span on correct free list. */ + FreePagePopSpanLeader(fpm, nextkey->first_page); + FreePagePushSpanLeader(fpm, first_page, newpages); + + /* Update key in place. */ + nextkey->first_page = first_page; + nextkey->npages = newpages; + + /* If reducing first key on page, ancestors might need adjustment. */ + if (nindex == 0) + FreePageBtreeAdjustAncestorKeys(fpm, np); + + return nextkey->npages; + } + + /* Split leaf page and as many of its ancestors as necessary. */ + if (result.split_pages > 0) + { + /* + * NB: We could consider various coping strategies here to avoid a + * split; most obviously, if np != result.page, we could target that + * page instead. More complicated shuffling strategies could be + * possible as well; basically, unless every single leaf page is 100% + * full, we can jam this key in there if we try hard enough. It's + * unlikely that trying that hard is worthwhile, but it's possible we + * might need to make more than no effort. For now, we just do the + * easy thing, which is nothing. + */ + + /* If this is a soft insert, it's time to give up. */ + if (soft) + return 0; + + /* Check whether we need to allocate more btree pages to split. */ + if (result.split_pages > fpm->btree_recycle_count) + { + Size pages_needed; + Size recycle_page; + Size i; + + /* + * Allocate the required number of pages and split each one in + * turn. This should never fail, because if we've got enough + * spans of free pages kicking around that we need additional + * storage space just to remember them all, then we should + * certainly have enough to expand the btree, which should only + * ever use a tiny number of pages compared to the number under + * management. If it does, something's badly screwed up. + */ + pages_needed = result.split_pages - fpm->btree_recycle_count; + for (i = 0; i < pages_needed; ++i) + { + if (!FreePageManagerGetInternal(fpm, 1, &recycle_page)) + elog(FATAL, "free page manager btree is corrupt"); + FreePageBtreeRecycle(fpm, recycle_page); + } + + /* + * The act of allocating pages to recycle may have invalidated the + * results of our previous btree reserch, so repeat it. (We could + * recheck whether any of our split-avoidance strategies that were + * not viable before now are, but it hardly seems worthwhile, so + * we don't bother. Consolidation can't be possible now if it + * wasn't previously.) + */ + FreePageBtreeSearch(fpm, first_page, &result); + + /* + * The act of allocating pages for use in constructing our btree + * should never cause any page to become more full, so the new + * split depth should be no greater than the old one, and perhaps + * less if we fortutiously allocated a chunk that freed up a slot + * on the page we need to update. + */ + Assert(result.split_pages <= fpm->btree_recycle_count); + } + + /* If we still need to perform a split, do it. */ + if (result.split_pages > 0) + { + FreePageBtree *split_target = result.page; + FreePageBtree *child = NULL; + Size key = first_page; + + for (;;) + { + FreePageBtree *newsibling; + FreePageBtree *parent; + + /* Identify parent page, which must receive downlink. */ + parent = relptr_access(base, split_target->hdr.parent); + + /* Split the page - downlink not added yet. */ + newsibling = FreePageBtreeSplitPage(fpm, split_target); + + /* + * At this point in the loop, we're always carrying a pending + * insertion. On the first pass, it's the actual key we're + * trying to insert; on subsequent passes, it's the downlink + * that needs to be added as a result of the split performed + * during the previous loop iteration. Since we've just split + * the page, there's definitely room on one of the two + * resulting pages. + */ + if (child == NULL) + { + Size index; + FreePageBtree *insert_into; + + insert_into = key < newsibling->u.leaf_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchLeaf(insert_into, key); + FreePageBtreeInsertLeaf(insert_into, index, key, npages); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + else + { + Size index; + FreePageBtree *insert_into; + + insert_into = + key < newsibling->u.internal_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchInternal(insert_into, key); + FreePageBtreeInsertInternal(base, insert_into, index, + key, child); + relptr_store(base, child->hdr.parent, insert_into); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + + /* If the page we just split has no parent, split the root. */ + if (parent == NULL) + { + FreePageBtree *newroot; + + newroot = FreePageBtreeGetRecycled(fpm); + newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC; + newroot->hdr.nused = 2; + relptr_store(base, newroot->hdr.parent, + (FreePageBtree *) NULL); + newroot->u.internal_key[0].first_page = + FreePageBtreeFirstKey(split_target); + relptr_store(base, newroot->u.internal_key[0].child, + split_target); + relptr_store(base, split_target->hdr.parent, newroot); + newroot->u.internal_key[1].first_page = + FreePageBtreeFirstKey(newsibling); + relptr_store(base, newroot->u.internal_key[1].child, + newsibling); + relptr_store(base, newsibling->hdr.parent, newroot); + relptr_store(base, fpm->btree_root, newroot); + fpm->btree_depth++; + + break; + } + + /* If the parent page isn't full, insert the downlink. */ + key = newsibling->u.internal_key[0].first_page; + if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE) + { + Size index; + + index = FreePageBtreeSearchInternal(parent, key); + FreePageBtreeInsertInternal(base, parent, index, + key, newsibling); + relptr_store(base, newsibling->hdr.parent, parent); + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + break; + } + + /* The parent also needs to be split, so loop around. */ + child = newsibling; + split_target = parent; + } + + /* + * The loop above did the insert, so just need to update the free + * list, and we're done. + */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; + } + } + + /* Physically add the key to the page. */ + Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE); + FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages); + + /* If new first key on page, ancestors might need adjustment. */ + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put it on the free list. */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; +} + +/* + * Remove a FreePageSpanLeader from the linked-list that contains it, either + * because we're changing the size of the span, or because we're allocating it. + */ +static void +FreePagePopSpanLeader(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *span; + FreePageSpanLeader *next; + FreePageSpanLeader *prev; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + + next = relptr_access(base, span->next); + prev = relptr_access(base, span->prev); + if (next != NULL) + relptr_copy(next->prev, span->prev); + if (prev != NULL) + relptr_copy(prev->next, span->next); + else + { + Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1; + + Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE); + relptr_copy(fpm->freelist[f], span->next); + } +} + +/* + * Initialize a new FreePageSpanLeader and put it on the appropriate free list. + */ +static void +FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages) +{ + char *base = fpm_segment_base(fpm); + Size f = Min(npages, FPM_NUM_FREELISTS) - 1; + FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = npages; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->freelist[f], span); +} diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h new file mode 100644 index 00000000000..5e1305bc25c --- /dev/null +++ b/src/include/utils/freepage.h @@ -0,0 +1,99 @@ +/*------------------------------------------------------------------------- + * + * freepage.h + * Management of page-organized free memory. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/freepage.h + * + *------------------------------------------------------------------------- + */ + +#ifndef FREEPAGE_H +#define FREEPAGE_H + +#include "storage/lwlock.h" +#include "utils/relptr.h" + +/* Forward declarations. */ +typedef struct FreePageSpanLeader FreePageSpanLeader; +typedef struct FreePageBtree FreePageBtree; +typedef struct FreePageManager FreePageManager; + +/* + * PostgreSQL normally uses 8kB pages for most things, but many common + * architecture/operating system pairings use a 4kB page size for memory + * allocation, so we do that here also. + */ +#define FPM_PAGE_SIZE 4096 + +/* + * Each freelist except for the last contains only spans of one particular + * size. Everything larger goes on the last one. In some sense this seems + * like a waste since most allocations are in a few common sizes, but it + * means that small allocations can simply pop the head of the relevant list + * without needing to worry about whether the object we find there is of + * precisely the correct size (because we know it must be). + */ +#define FPM_NUM_FREELISTS 129 + +/* Define relative pointer types. */ +relptr_declare(FreePageBtree, RelptrFreePageBtree); +relptr_declare(FreePageManager, RelptrFreePageManager); +relptr_declare(FreePageSpanLeader, RelptrFreePageSpanLeader); + +/* Everything we need in order to manage free pages (see freepage.c) */ +struct FreePageManager +{ + RelptrFreePageManager self; + RelptrFreePageBtree btree_root; + RelptrFreePageSpanLeader btree_recycle; + unsigned btree_depth; + unsigned btree_recycle_count; + Size singleton_first_page; + Size singleton_npages; + Size contiguous_pages; + bool contiguous_pages_dirty; + RelptrFreePageSpanLeader freelist[FPM_NUM_FREELISTS]; +#ifdef FPM_EXTRA_ASSERTS + /* For debugging only, pages put minus pages gotten. */ + Size free_pages; +#endif +}; + +/* Macros to convert between page numbers (expressed as Size) and pointers. */ +#define fpm_page_to_pointer(base, page) \ + (AssertVariableIsOfTypeMacro(page, Size), \ + (base) + FPM_PAGE_SIZE * (page)) +#define fpm_pointer_to_page(base, ptr) \ + (((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE) + +/* Macro to convert an allocation size to a number of pages. */ +#define fpm_size_to_pages(sz) \ + (((sz) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) + +/* Macros to check alignment of absolute and relative pointers. */ +#define fpm_pointer_is_page_aligned(base, ptr) \ + (((Size) (((char *) (ptr)) - (base))) % FPM_PAGE_SIZE == 0) +#define fpm_relptr_is_page_aligned(base, relptr) \ + ((relptr).relptr_off % FPM_PAGE_SIZE == 0) + +/* Macro to find base address of the segment containing a FreePageManager. */ +#define fpm_segment_base(fpm) \ + (((char *) fpm) - fpm->self.relptr_off) + +/* Macro to access a FreePageManager's largest consecutive run of pages. */ +#define fpm_largest(fpm) \ + (fpm->contiguous_pages) + +/* Functions to manipulate the free page map. */ +extern void FreePageManagerInitialize(FreePageManager *fpm, char *base); +extern bool FreePageManagerGet(FreePageManager *fpm, Size npages, + Size *first_page); +extern void FreePageManagerPut(FreePageManager *fpm, Size first_page, + Size npages); +extern char *FreePageManagerDump(FreePageManager *fpm); + +#endif /* FREEPAGE_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 0c55d44bdb2..c59af3ee8f9 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -638,6 +638,13 @@ Form_pg_ts_template Form_pg_type Form_pg_user_mapping FormatNode +FreePageBtree +FreePageBtreeHeader +FreePageBtreeInternalKey +FreePageBtreeLeafKey +FreePageBtreeSearchResult +FreePageManager +FreePageSpanLeader FromCharDateMode FromExpr FuncCall From d56867e016615bc85d341a2e78e4a12fa2ce2404 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 23 Aug 2016 14:32:23 -0400 Subject: [PATCH 03/10] Extend dsm API with a new function dsm_unpin_segment. If you have previously pinned a segment and decide that you don't actually want to keep it around until shutdown, this new API lets you remove the pin. This is pretty trivial except on Windows, where it requires closing the duplicate handle that was used to implement the pin. Thomas Munro and Amit Kapila, reviewed by Amit Kapila and by me. (cherry picked from commit 0fda682e542c9acd368588e50a1993fecd3b73e2) Changes from original commit: remove API changes and windows compatibility code to keep binary compatibility with 6.x --- src/backend/storage/ipc/dsm.c | 103 +++++++++++++++++++++++++++-- src/backend/storage/ipc/dsm_impl.c | 4 +- src/include/storage/dsm.h | 1 + src/include/storage/dsm_impl.h | 2 +- 4 files changed, 101 insertions(+), 9 deletions(-) diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c index a487146c1db..2dcded71bd5 100644 --- a/src/backend/storage/ipc/dsm.c +++ b/src/backend/storage/ipc/dsm.c @@ -82,6 +82,7 @@ typedef struct dsm_control_item { dsm_handle handle; uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */ + bool pinned; } dsm_control_item; /* Layout of the dynamic shared memory control segment. */ @@ -490,6 +491,7 @@ dsm_create(Size size) dsm_control->item[i].handle = seg->handle; /* refcnt of 1 triggers destruction, so start at 2 */ dsm_control->item[i].refcnt = 2; + dsm_control->item[i].pinned = false; seg->control_slot = i; LWLockRelease(DynamicSharedMemoryControlLock); return seg; @@ -515,6 +517,7 @@ dsm_create(Size size) dsm_control->item[nitems].handle = seg->handle; /* refcnt of 1 triggers destruction, so start at 2 */ dsm_control->item[nitems].refcnt = 2; + dsm_control->item[nitems].pinned = false; seg->control_slot = nitems; dsm_control->nitems++; LWLockRelease(DynamicSharedMemoryControlLock); @@ -753,6 +756,9 @@ dsm_detach(dsm_segment *seg) /* If new reference count is 1, try to destroy the segment. */ if (refcnt == 1) { + /* A pinned segment should never reach 1. */ + Assert(!dsm_control->item[control_slot].pinned); + /* * If we fail to destroy the segment here, or are killed before we * finish doing so, the reference count will remain at 1, which @@ -805,11 +811,11 @@ dsm_pin_mapping(dsm_segment *seg) } /* - * Keep a dynamic shared memory segment until postmaster shutdown. + * Keep a dynamic shared memory segment until postmaster shutdown, or until + * dsm_unpin_segment is called. * - * This function should not be called more than once per segment; - * on Windows, doing so will create unnecessary handles which will - * consume system resources to no benefit. + * This function should not be called more than once per segment, unless the + * segment is explicitly unpinned with dsm_unpin_segment in between calls. * * Note that this function does not arrange for the current process to * keep the segment mapped indefinitely; if that behavior is desired, @@ -822,13 +828,98 @@ dsm_pin_segment(dsm_segment *seg) /* * Bump reference count for this segment in shared memory. This will * ensure that even if there is no session which is attached to this - * segment, it will remain until postmaster shutdown. + * segment, it will remain until postmaster shutdown or an explicit call + * to unpin. */ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (dsm_control->item[seg->control_slot].pinned) + elog(ERROR, "cannot pin a segment that is already pinned"); + dsm_impl_pin_segment(seg->handle, seg->impl_private); + dsm_control->item[seg->control_slot].pinned = true; dsm_control->item[seg->control_slot].refcnt++; LWLockRelease(DynamicSharedMemoryControlLock); +} - dsm_impl_pin_segment(seg->handle, seg->impl_private); +/* + * Unpin a dynamic shared memory segment that was previously pinned with + * dsm_pin_segment. This function should not be called unless dsm_pin_segment + * was previously called for this segment. + * + * The argument is a dsm_handle rather than a dsm_segment in case you want + * to unpin a segment to which you haven't attached. This turns out to be + * useful if, for example, a reference to one shared memory segment is stored + * within another shared memory segment. You might want to unpin the + * referenced segment before destroying the referencing segment. + */ +void +dsm_unpin_segment(dsm_handle handle) +{ + uint32 control_slot = INVALID_CONTROL_SLOT; + bool destroy = false; + uint32 i; + + /* Find the control slot for the given handle. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + for (i = 0; i < dsm_control->nitems; ++i) + { + /* Skip unused slots. */ + if (dsm_control->item[i].refcnt == 0) + continue; + + /* If we've found our handle, we can stop searching. */ + if (dsm_control->item[i].handle == handle) + { + control_slot = i; + break; + } + } + + /* + * We should definitely have found the slot, and it should not already be + * in the process of going away, because this function should only be + * called on a segment which is pinned. + */ + if (control_slot == INVALID_CONTROL_SLOT) + elog(ERROR, "cannot unpin unknown segment handle"); + if (!dsm_control->item[control_slot].pinned) + elog(ERROR, "cannot unpin a segment that is not pinned"); + Assert(dsm_control->item[control_slot].refcnt > 1); + + /* Note that 1 means no references (0 means unused slot). */ + if (--dsm_control->item[control_slot].refcnt == 1) + destroy = true; + dsm_control->item[control_slot].pinned = false; + + /* Now we can release the lock. */ + LWLockRelease(DynamicSharedMemoryControlLock); + + /* Clean up resources if that was the last reference. */ + if (destroy) + { + void *junk_impl_private = NULL; + void *junk_mapped_address = NULL; + Size junk_mapped_size = 0; + + /* + * For an explanation of how error handling works in this case, see + * comments in dsm_detach. Note that if we reach this point, the + * current process certainly does not have the segment mapped, because + * if it did, the reference count would have still been greater than 1 + * even after releasing the reference count held by the pin. The fact + * that there can't be a dsm_segment for this handle makes it OK to + * pass the mapped size, mapped address, and private data as NULL + * here. + */ + if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } } /* diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c index ebe84618ad2..bf949fbd91d 100644 --- a/src/backend/storage/ipc/dsm_impl.c +++ b/src/backend/storage/ipc/dsm_impl.c @@ -1055,8 +1055,8 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, #endif /* - * Implementation-specific actions that must be performed when a segment - * is to be preserved until postmaster shutdown. + * Implementation-specific actions that must be performed when a segment is to + * be preserved even when no backend has it attached. * * Except on Windows, we don't need to do anything at all. But since Windows * cleans up segments automatically when no references remain, we duplicate diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h index 9ff49162c18..766f94e3d64 100644 --- a/src/include/storage/dsm.h +++ b/src/include/storage/dsm.h @@ -41,6 +41,7 @@ extern void dsm_detach(dsm_segment *seg); /* Resource management functions. */ extern void dsm_pin_mapping(dsm_segment *seg); extern void dsm_pin_segment(dsm_segment *seg); +extern void dsm_unpin_segment(dsm_handle h); extern dsm_segment *dsm_find_mapping(dsm_handle h); /* Informational functions. */ diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h index 32cfed2ee9c..37155354a6e 100644 --- a/src/include/storage/dsm_impl.h +++ b/src/include/storage/dsm_impl.h @@ -72,7 +72,7 @@ extern bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, /* Some implementations cannot resize segments. Can this one? */ extern bool dsm_impl_can_resize(void); -/* Implementation-dependent actions required to keep segment until shudown. */ +/* Implementation-dependent actions required to keep segment until shutdown. */ extern void dsm_impl_pin_segment(dsm_handle handle, void *impl_private); #endif /* DSM_IMPL_H */ From b7a2ca11b529079333414ca86fc48dc029f450b4 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 2 Dec 2016 12:34:36 -0500 Subject: [PATCH 04/10] Introduce dynamic shared memory areas. Programmers discovered decades ago that it was useful to have a simple interface for allocating and freeing memory, which is why malloc() and free() were invented. Unfortunately, those handy tools don't work with dynamic shared memory segments because those are specific to PostgreSQL and are not necessarily mapped at the same address in every cooperating process. So invent our own allocator instead. This makes it possible for processes cooperating as part of parallel query execution to allocate and free chunks of memory without having to reserve them prior to the start of execution. It could also be used for longer lived objects; for example, we could consider storing data for pg_stat_statements or the stats collector in shared memory using these interfaces, rather than writing them to files. Basically, anything that needs shared memory but can't predict in advance how much it's going to need might find this useful. Thomas Munro and Robert Haas. The original code (of mine) on which Thomas based his work was actually designed to be a new backend-local memory allocator for PostgreSQL, but that hasn't gone anywhere - or not yet, anyway. Thomas took that work and performed major refactoring and extensive modifications to make it work with dynamic shared memory, including the addition of appropriate locking. Discussion: CA+TgmobkeWptGwiNa+SGFWsTLzTzD-CeLz0KcE-y6LFgoUus4A@mail.gmail.com Discussion: CAEepm=1z5WLuNoJ80PaCvz6EtG9dN0j-KuHcHtU6QEfcPP5-qA@mail.gmail.com (cherry picked from commit 13df76a537cca3b8884911d8fdf7c89a457a8dd3) Changes from original commit: removed extra argument from dsm_create() call --- src/backend/utils/mmgr/Makefile | 2 +- src/backend/utils/mmgr/dsa.c | 2202 ++++++++++++++++++++++++++++++ src/include/utils/dsa.h | 108 ++ src/tools/pgindent/typedefs.list | 6 + 4 files changed, 2317 insertions(+), 1 deletion(-) create mode 100644 src/backend/utils/mmgr/dsa.c create mode 100644 src/include/utils/dsa.h diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile index 4071dcf15c9..ee473f313d4 100644 --- a/src/backend/utils/mmgr/Makefile +++ b/src/backend/utils/mmgr/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/utils/mmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = aset.o freepage.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o +OBJS = aset.o dsa.o freepage.o mcxt.o memaccounting.o mpool.o portalmem.o memprot.o vmem_tracker.o redzone_handler.o runaway_cleaner.o idle_tracker.o event_version.o # In PostgreSQL, this is under src/common. It has been backported, but because # we haven't merged the changes that introduced the src/common directory, it diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c new file mode 100644 index 00000000000..eeaf143d0a9 --- /dev/null +++ b/src/backend/utils/mmgr/dsa.c @@ -0,0 +1,2202 @@ +/*------------------------------------------------------------------------- + * + * dsa.c + * Dynamic shared memory areas. + * + * This module provides dynamic shared memory areas which are built on top of + * DSM segments. While dsm.c allows segments of memory of shared memory to be + * created and shared between backends, it isn't designed to deal with small + * objects. A DSA area is a shared memory heap usually backed by one or more + * DSM segments which can allocate memory using dsa_allocate() and dsa_free(). + * Alternatively, it can be created in pre-existing shared memory, including a + * DSM segment, and then create extra DSM segments as required. Unlike the + * regular system heap, it deals in pseudo-pointers which must be converted to + * backend-local pointers before they are dereferenced. These pseudo-pointers + * can however be shared with other backends, and can be used to construct + * shared data structures. + * + * Each DSA area manages a set of DSM segments, adding new segments as + * required and detaching them when they are no longer needed. Each segment + * contains a number of 4KB pages, a free page manager for tracking + * consecutive runs of free pages, and a page map for tracking the source of + * objects allocated on each page. Allocation requests above 8KB are handled + * by choosing a segment and finding consecutive free pages in its free page + * manager. Allocation requests for smaller sizes are handled using pools of + * objects of a selection of sizes. Each pool consists of a number of 16 page + * (64KB) superblocks allocated in the same way as large objects. Allocation + * of large objects and new superblocks is serialized by a single LWLock, but + * allocation of small objects from pre-existing superblocks uses one LWLock + * per pool. Currently there is one pool, and therefore one lock, per size + * class. Per-core pools to increase concurrency and strategies for reducing + * the resulting fragmentation are areas for future research. Each superblock + * is managed with a 'span', which tracks the superblock's freelist. Free + * requests are handled by looking in the page map to find which span an + * address was allocated from, so that small objects can be returned to the + * appropriate free list, and large object pages can be returned directly to + * the free page map. When allocating, simple heuristics for selecting + * segments and superblocks try to encourage occupied memory to be + * concentrated, increasing the likelihood that whole superblocks can become + * empty and be returned to the free page manager, and whole segments can + * become empty and be returned to the operating system. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/dsa.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/dsa.h" +#include "utils/freepage.h" +#include "utils/memutils.h" + +/* + * The size of the initial DSM segment that backs a dsa_area created by + * dsa_create. After creating some number of segments of this size we'll + * double this size, and so on. Larger segments may be created if necessary + * to satisfy large requests. + */ +#define DSA_INITIAL_SEGMENT_SIZE ((Size) (1 * 1024 * 1024)) + +/* + * How many segments to create before we double the segment size. If this is + * low, then there is likely to be a lot of wasted space in the largest + * segment. If it is high, then we risk running out of segment slots (see + * dsm.c's limits on total number of segments), or limiting the total size + * an area can manage when using small pointers. + */ +#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4 + +/* + * The number of bits used to represent the offset part of a dsa_pointer. + * This controls the maximum size of a segment, the maximum possible + * allocation size and also the maximum number of segments per area. + */ +#if SIZEOF_DSA_POINTER == 4 +#define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */ +#else +#define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */ +#endif + +/* + * The maximum number of DSM segments that an area can own, determined by + * the number of bits remaining (but capped at 1024). + */ +#define DSA_MAX_SEGMENTS \ + Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH))) + +/* The bitmask for extracting the offset from a dsa_pointer. */ +#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1) + +/* The maximum size of a DSM segment. */ +#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH) + +/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */ +#define DSA_PAGES_PER_SUPERBLOCK 16 + +/* + * A magic number used as a sanity check for following DSM segments belonging + * to a DSA area (this number will be XORed with the area handle and + * the segment index). + */ +#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608 + +/* Build a dsa_pointer given a segment number and offset. */ +#define DSA_MAKE_POINTER(segment_number, offset) \ + (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset)) + +/* Extract the segment number from a dsa_pointer. */ +#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH) + +/* Extract the offset from a dsa_pointer. */ +#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK) + +/* The type used for index segment indexes (zero based). */ +typedef Size dsa_segment_index; + +/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */ +#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0) + +/* + * How many bins of segments do we have? The bins are used to categorize + * segments by their largest contiguous run of free pages. + */ +#define DSA_NUM_SEGMENT_BINS 16 + +/* + * What is the lowest bin that holds segments that *might* have n contiguous + * free pages? There is no point in looking in segments in lower bins; they + * definitely can't service a request for n free pages. + */ +#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1) + +/* Macros for access to locks. */ +#define DSA_AREA_LOCK(area) (&area->control->lock) +#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock) + +/* + * The header for an individual segment. This lives at the start of each DSM + * segment owned by a DSA area including the first segment (where it appears + * as part of the dsa_area_control struct). + */ +typedef struct +{ + /* Sanity check magic value. */ + uint32 magic; + /* Total number of pages in this segment (excluding metadata area). */ + Size usable_pages; + /* Total size of this segment in bytes. */ + Size size; + + /* + * Index of the segment that precedes this one in the same segment bin, or + * DSA_SEGMENT_INDEX_NONE if this is the first one. + */ + dsa_segment_index prev; + + /* + * Index of the segment that follows this one in the same segment bin, or + * DSA_SEGMENT_INDEX_NONE if this is the last one. + */ + dsa_segment_index next; + /* The index of the bin that contains this segment. */ + Size bin; + + /* + * A flag raised to indicate that this segment is being returned to the + * operating system and has been unpinned. + */ + bool freed; +} dsa_segment_header; + +/* + * Metadata for one superblock. + * + * For most blocks, span objects are stored out-of-line; that is, the span + * object is not stored within the block itself. But, as an exception, for a + * "span of spans", the span object is stored "inline". The allocation is + * always exactly one page, and the dsa_area_span object is located at + * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS, + * and the remaining fields are used just as they would be in an ordinary + * block. We can't allocate spans out of ordinary superblocks because + * creating an ordinary superblock requires us to be able to allocate a span + * *first*. Doing it this way avoids that circularity. + */ +typedef struct +{ + dsa_pointer pool; /* Containing pool. */ + dsa_pointer prevspan; /* Previous span. */ + dsa_pointer nextspan; /* Next span. */ + dsa_pointer start; /* Starting address. */ + Size npages; /* Length of span in pages. */ + uint16 size_class; /* Size class. */ + uint16 ninitialized; /* Maximum number of objects ever allocated. */ + uint16 nallocatable; /* Number of objects currently allocatable. */ + uint16 firstfree; /* First object on free list. */ + uint16 nmax; /* Maximum number of objects ever possible. */ + uint16 fclass; /* Current fullness class. */ +} dsa_area_span; + +/* + * Given a pointer to an object in a span, access the index of the next free + * object in the same span (ie in the span's freelist) as an L-value. + */ +#define NextFreeObjectIndex(object) (* (uint16 *) (object)) + +/* + * Small allocations are handled by dividing a single block of memory into + * many small objects of equal size. The possible allocation sizes are + * defined by the following array. Larger size classes are spaced more widely + * than smaller size classes. We fudge the spacing for size classes >1kB to + * avoid space wastage: based on the knowledge that we plan to allocate 64kB + * blocks, we bump the maximum object size up to the largest multiple of + * 8 bytes that still lets us fit the same number of objects into one block. + * + * NB: Because of this fudging, if we were ever to use differently-sized blocks + * for small allocations, these size classes would need to be reworked to be + * optimal for the new size. + * + * NB: The optimal spacing for size classes, as well as the size of the blocks + * out of which small objects are allocated, is not a question that has one + * right answer. Some allocators (such as tcmalloc) use more closely-spaced + * size classes than we do here, while others (like aset.c) use more + * widely-spaced classes. Spacing the classes more closely avoids wasting + * memory within individual chunks, but also means a larger number of + * potentially-unfilled blocks. + */ +static const uint16 dsa_size_classes[] = { + sizeof(dsa_area_span), 0, /* special size classes */ + 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */ + 80, 96, 112, 128, /* 4 classes separated by 16 bytes */ + 160, 192, 224, 256, /* 4 classes separated by 32 bytes */ + 320, 384, 448, 512, /* 4 classes separated by 64 bytes */ + 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */ + 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */ + 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */ + 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */ +}; +#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes) + +/* Special size classes. */ +#define DSA_SCLASS_BLOCK_OF_SPANS 0 +#define DSA_SCLASS_SPAN_LARGE 1 + +/* + * The following lookup table is used to map the size of small objects + * (less than 1kB) onto the corresponding size class. To use this table, + * round the size of the object up to the next multiple of 8 bytes, and then + * index into this array. + */ +static char dsa_size_class_map[] = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, + 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 +}; +#define DSA_SIZE_CLASS_MAP_QUANTUM 8 + +/* + * Superblocks are binned by how full they are. Generally, each fullness + * class corresponds to one quartile, but the block being used for + * allocations is always at the head of the list for fullness class 1, + * regardless of how full it really is. + */ +#define DSA_FULLNESS_CLASSES 4 + +/* + * Maximum length of a DSA name. + */ +#define DSA_MAXLEN 64 + +/* + * A dsa_area_pool represents a set of objects of a given size class. + * + * Perhaps there should be multiple pools for the same size class for + * contention avoidance, but for now there is just one! + */ +typedef struct +{ + /* A lock protecting access to this pool. */ + LWLock lock; + /* A set of linked lists of spans, arranged by fullness. */ + dsa_pointer spans[DSA_FULLNESS_CLASSES]; + /* Should we pad this out to a cacheline boundary? */ +} dsa_area_pool; + +/* + * The control block for an area. This lives in shared memory, at the start of + * the first DSM segment controlled by this area. + */ +typedef struct +{ + /* The segment header for the first segment. */ + dsa_segment_header segment_header; + /* The handle for this area. */ + dsa_handle handle; + /* The handles of the segments owned by this area. */ + dsm_handle segment_handles[DSA_MAX_SEGMENTS]; + /* Lists of segments, binned by maximum contiguous run of free pages. */ + dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS]; + /* The object pools for each size class. */ + dsa_area_pool pools[DSA_NUM_SIZE_CLASSES]; + /* The total size of all active segments. */ + Size total_segment_size; + /* The maximum total size of backing storage we are allowed. */ + Size max_total_segment_size; + /* Highest used segment index in the history of this area. */ + dsa_segment_index high_segment_index; + /* The reference count for this area. */ + int refcnt; + /* A flag indicating that this area has been pinned. */ + bool pinned; + /* The number of times that segments have been freed. */ + Size freed_segment_counter; + /* The LWLock tranche ID. */ + int lwlock_tranche_id; + char lwlock_tranche_name[DSA_MAXLEN]; + /* The general lock (protects everything except object pools). */ + LWLock lock; +} dsa_area_control; + +/* Given a pointer to a pool, find a dsa_pointer. */ +#define DsaAreaPoolToDsaPointer(area, p) \ + DSA_MAKE_POINTER(0, (char *) p - (char *) area->control) + +/* + * A dsa_segment_map is stored within the backend-private memory of each + * individual backend. It holds the base address of the segment within that + * backend, plus the addresses of key objects within the segment. Those + * could instead be derived from the base address but it's handy to have them + * around. + */ +typedef struct +{ + dsm_segment *segment; /* DSM segment */ + char *mapped_address; /* Address at which segment is mapped */ + dsa_segment_header *header; /* Header (same as mapped_address) */ + FreePageManager *fpm; /* Free page manager within segment. */ + dsa_pointer *pagemap; /* Page map within segment. */ +} dsa_segment_map; + +/* + * Per-backend state for a storage area. Backends obtain one of these by + * creating an area or attaching to an existing one using a handle. Each + * process that needs to use an area uses its own object to track where the + * segments are mapped. + */ +struct dsa_area +{ + /* Pointer to the control object in shared memory. */ + dsa_area_control *control; + + /* The lock tranche for this process. */ + LWLockTranche lwlock_tranche; + + /* Has the mapping been pinned? */ + bool mapping_pinned; + + /* + * This backend's array of segment maps, ordered by segment index + * corresponding to control->segment_handles. Some of the area's segments + * may not be mapped in in this backend yet, and some slots may have been + * freed and need to be detached; these operations happen on demand. + */ + dsa_segment_map segment_maps[DSA_MAX_SEGMENTS]; + + /* The highest segment index this backend has ever mapped. */ + dsa_segment_index high_segment_index; + + /* The last observed freed_segment_counter. */ + Size freed_segment_counter; +}; + +#define DSA_SPAN_NOTHING_FREE ((uint16) -1) +#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE) + +/* Given a pointer to a segment_map, obtain a segment index number. */ +#define get_segment_index(area, segment_map_ptr) \ + (segment_map_ptr - &area->segment_maps[0]) + +static void init_span(dsa_area *area, dsa_pointer span_pointer, + dsa_area_pool *pool, dsa_pointer start, Size npages, + uint16 size_class); +static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool, + int fromclass, int toclass); +static inline dsa_pointer alloc_object(dsa_area *area, int size_class); +static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, + int size_class); +static dsa_segment_map *get_segment_by_index(dsa_area *area, + dsa_segment_index index); +static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer); +static void unlink_span(dsa_area *area, dsa_area_span *span); +static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, + dsa_pointer span_pointer, int fclass); +static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map); +static dsa_segment_map *get_best_segment(dsa_area *area, Size npages); +static dsa_segment_map *make_new_segment(dsa_area *area, Size requested_pages); +static dsa_area *create_internal(void *place, size_t size, + int tranche_id, const char *tranche_name, + dsm_handle control_handle, + dsm_segment *control_segment); +static dsa_area *attach_internal(void *place, dsm_segment *segment, + dsa_handle handle); +static void check_for_freed_segments(dsa_area *area); + +/* + * Create a new shared area in a new DSM segment. Further DSM segments will + * be allocated as required to extend the available space. + * + * We can't allocate a LWLock tranche_id within this function, because tranche + * IDs are a scarce resource; there are only 64k available, using low numbers + * when possible matters, and we have no provision for recycling them. So, + * we require the caller to provide one. The caller must also provide the + * tranche name, so that we can distinguish LWLocks belonging to different + * DSAs. + */ +dsa_area * +dsa_create(int tranche_id, const char *tranche_name) +{ + dsm_segment *segment; + dsa_area *area; + + /* + * Create the DSM segment that will hold the shared control object and the + * first segment of usable space. + */ + segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE); + + /* + * All segments backing this area are pinned, so that DSA can explicitly + * control their lifetime (otherwise a newly created segment belonging to + * this area might be freed when the only backend that happens to have it + * mapped in ends, corrupting the area). + */ + dsm_pin_segment(segment); + + /* Create a new DSA area with the control objet in this segment. */ + area = create_internal(dsm_segment_address(segment), + DSA_INITIAL_SEGMENT_SIZE, + tranche_id, tranche_name, + dsm_segment_handle(segment), segment); + + /* Clean up when the control segment detaches. */ + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(dsm_segment_address(segment))); + + return area; +} + +/* + * Create a new shared area in an existing shared memory space, which may be + * either DSM or Postmaster-initialized memory. DSM segments will be + * allocated as required to extend the available space, though that can be + * prevented with dsa_set_size_limit(area, size) using the same size provided + * to dsa_create_in_place. + * + * Areas created in-place must eventually be released by the backend that + * created them and all backends that attach to them. This can be done + * explicitly with dsa_release_in_place, or, in the special case that 'place' + * happens to be in a pre-existing DSM segment, by passing in a pointer to the + * segment so that a detach hook can be registered with the containing DSM + * segment. + * + * See dsa_create() for a note about the tranche arguments. + */ +dsa_area * +dsa_create_in_place(void *place, size_t size, + int tranche_id, const char *tranche_name, + dsm_segment *segment) +{ + dsa_area *area; + + area = create_internal(place, size, tranche_id, tranche_name, + DSM_HANDLE_INVALID, NULL); + + /* + * Clean up when the control segment detaches, if a containing DSM segment + * was provided. + */ + if (segment != NULL) + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(place)); + + return area; +} + +/* + * Obtain a handle that can be passed to other processes so that they can + * attach to the given area. Cannot be called for areas created with + * dsa_create_in_place. + */ +dsa_handle +dsa_get_handle(dsa_area *area) +{ + Assert(area->control->handle != DSM_HANDLE_INVALID); + return area->control->handle; +} + +/* + * Attach to an area given a handle generated (possibly in another process) by + * dsa_get_area_handle. The area must have been created with dsa_create (not + * dsa_create_in_place). + */ +dsa_area * +dsa_attach(dsa_handle handle) +{ + dsm_segment *segment; + dsa_area *area; + + /* + * An area handle is really a DSM segment handle for the first segment, so + * we go ahead and attach to that. + */ + segment = dsm_attach(handle); + if (segment == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to dsa_handle"))); + + area = attach_internal(dsm_segment_address(segment), segment, handle); + + /* Clean up when the control segment detaches. */ + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(dsm_segment_address(segment))); + + return area; +} + +/* + * Attach to an area that was created with dsa_create_in_place. The caller + * must somehow know the location in memory that was used when the area was + * created, though it may be mapped at a different virtual address in this + * process. + * + * See dsa_create_in_place for note about releasing in-place areas, and the + * optional 'segment' argument which can be provided to allow automatic + * release if the containing memory happens to be a DSM segment. + */ +dsa_area * +dsa_attach_in_place(void *place, dsm_segment *segment) +{ + dsa_area *area; + + area = attach_internal(place, NULL, DSM_HANDLE_INVALID); + + /* + * Clean up when the control segment detaches, if a containing DSM segment + * was provided. + */ + if (segment != NULL) + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(place)); + + return area; +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. The 'segment' argument is ignored but provides an + * interface suitable for on_dsm_detach, for the convenience of users who want + * to create a DSA segment inside an existing DSM segment and have it + * automatically released when the containing DSM segment is detached. + * 'place' should be the address of the place where the area was created. + * + * This callback is automatically registered for the DSM segment containing + * the control object of in-place areas when a segment is provided to + * dsa_create_in_place or dsa_attach_in_place, and also for all areas created + * with dsa_create. + */ +void +dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place) +{ + dsa_release_in_place(DatumGetPointer(place)); +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. The 'code' argument is ignored but provides an + * interface suitable for on_shmem_exit or before_shmem_exit, for the + * convenience of users who want to create a DSA segment inside shared memory + * other than a DSM segment and have it automatically release at backend exit. + * 'place' should be the address of the place where the area was created. + */ +void +dsa_on_shmem_exit_release_in_place(int code, Datum place) +{ + dsa_release_in_place(DatumGetPointer(place)); +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX' + * callbacks so that this is managed automatically, because failure to release + * an area created in-place leaks its segments permanently. + * + * This is also called automatically for areas produced by dsa_create or + * dsa_attach as an implementation detail. + */ +void +dsa_release_in_place(void *place) +{ + dsa_area_control *control = (dsa_area_control *) place; + int i; + + LWLockAcquire(&control->lock, LW_EXCLUSIVE); + Assert(control->segment_header.magic == + (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0)); + Assert(control->refcnt > 0); + if (--control->refcnt == 0) + { + for (i = 0; i <= control->high_segment_index; ++i) + { + dsm_handle handle; + + handle = control->segment_handles[i]; + if (handle != DSM_HANDLE_INVALID) + dsm_unpin_segment(handle); + } + } + LWLockRelease(&control->lock); +} + +/* + * Keep a DSA area attached until end of session or explicit detach. + * + * By default, areas are owned by the current resource owner, which means they + * are detached automatically when that scope ends. + */ +void +dsa_pin_mapping(dsa_area *area) +{ + int i; + + Assert(!area->mapping_pinned); + area->mapping_pinned = true; + + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_pin_mapping(area->segment_maps[i].segment); +} + +/* + * Allocate memory in this storage area. The return value is a dsa_pointer + * that can be passed to other processes, and converted to a local pointer + * with dsa_get_address. If no memory is available, returns + * InvalidDsaPointer. + */ +dsa_pointer +dsa_allocate(dsa_area *area, Size size) +{ + uint16 size_class; + dsa_pointer start_pointer; + dsa_segment_map *segment_map; + + Assert(size > 0); + + /* + * If bigger than the largest size class, just grab a run of pages from + * the free page manager, instead of allocating an object from a pool. + * There will still be a span, but it's a special class of span that + * manages this whole allocation and simply gives all pages back to the + * free page manager when dsa_free is called. + */ + if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1]) + { + Size npages = fpm_size_to_pages(size); + Size first_page; + dsa_pointer span_pointer; + dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE]; + + /* Obtain a span object. */ + span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); + if (!DsaPointerIsValid(span_pointer)) + return InvalidDsaPointer; + + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + + /* Find a segment from which to allocate. */ + segment_map = get_best_segment(area, npages); + if (segment_map == NULL) + segment_map = make_new_segment(area, npages); + if (segment_map == NULL) + { + /* Can't make any more segments: game over. */ + LWLockRelease(DSA_AREA_LOCK(area)); + dsa_free(area, span_pointer); + return InvalidDsaPointer; + } + + /* + * Ask the free page manager for a run of pages. This should always + * succeed, since both get_best_segment and make_new_segment should + * only return a non-NULL pointer if it actually contains enough + * contiguous freespace. If it does fail, something in our backend + * private state is out of whack, so use FATAL to kill the process. + */ + if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) + elog(FATAL, + "dsa_allocate could not find %zu free pages", npages); + LWLockRelease(DSA_AREA_LOCK(area)); + + start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map), + first_page * FPM_PAGE_SIZE); + + /* Initialize span and pagemap. */ + LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), + LW_EXCLUSIVE); + init_span(area, span_pointer, pool, start_pointer, npages, + DSA_SCLASS_SPAN_LARGE); + segment_map->pagemap[first_page] = span_pointer; + LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); + + return start_pointer; + } + + /* Map allocation to a size class. */ + if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM) + { + int mapidx; + + /* For smaller sizes we have a lookup table... */ + mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) / + DSA_SIZE_CLASS_MAP_QUANTUM) - 1; + size_class = dsa_size_class_map[mapidx]; + } + else + { + uint16 min; + uint16 max; + + /* ... and for the rest we search by binary chop. */ + min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1]; + max = lengthof(dsa_size_classes) - 1; + + while (min < max) + { + uint16 mid = (min + max) / 2; + uint16 class_size = dsa_size_classes[mid]; + + if (class_size < size) + min = mid + 1; + else + max = mid; + } + + size_class = min; + } + Assert(size <= dsa_size_classes[size_class]); + Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]); + + /* + * Attempt to allocate an object from the appropriate pool. This might + * return InvalidDsaPointer if there's no space available. + */ + return alloc_object(area, size_class); +} + +/* + * Free memory obtained with dsa_allocate. + */ +void +dsa_free(dsa_area *area, dsa_pointer dp) +{ + dsa_segment_map *segment_map; + int pageno; + dsa_pointer span_pointer; + dsa_area_span *span; + char *superblock; + char *object; + Size size; + int size_class; + + /* Make sure we don't have a stale segment in the slot 'dp' refers to. */ + check_for_freed_segments(area); + + /* Locate the object, span and pool. */ + segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp)); + pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE; + span_pointer = segment_map->pagemap[pageno]; + span = dsa_get_address(area, span_pointer); + superblock = dsa_get_address(area, span->start); + object = dsa_get_address(area, dp); + size_class = span->size_class; + size = dsa_size_classes[size_class]; + + /* + * Special case for large objects that live in a special span: we return + * those pages directly to the free page manager and free the span. + */ + if (span->size_class == DSA_SCLASS_SPAN_LARGE) + { + +#ifdef CLOBBER_FREED_MEMORY + memset(object, 0x7f, span->npages * FPM_PAGE_SIZE); +#endif + + /* Give pages back to free page manager. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + FreePageManagerPut(segment_map->fpm, + DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, + span->npages); + LWLockRelease(DSA_AREA_LOCK(area)); + /* Unlink span. */ + LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), + LW_EXCLUSIVE); + unlink_span(area, span); + LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); + /* Free the span object so it can be reused. */ + dsa_free(area, span_pointer); + return; + } + +#ifdef CLOBBER_FREED_MEMORY + memset(object, 0x7f, size); +#endif + + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + + /* Put the object on the span's freelist. */ + Assert(object >= superblock); + Assert(object < superblock + DSA_SUPERBLOCK_SIZE); + Assert((object - superblock) % size == 0); + NextFreeObjectIndex(object) = span->firstfree; + span->firstfree = (object - superblock) / size; + ++span->nallocatable; + + /* + * See if the span needs to moved to a different fullness class, or be + * freed so its pages can be given back to the segment. + */ + if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1) + { + /* + * The block was completely full and is located in the + * highest-numbered fullness class, which is never scanned for free + * chunks. We must move it to the next-lower fullness class. + */ + unlink_span(area, span); + add_span_to_fullness_class(area, span, span_pointer, + DSA_FULLNESS_CLASSES - 2); + + /* + * If this is the only span, and there is no active span, then we + * should probably move this span to fullness class 1. (Otherwise if + * you allocate exactly all the objects in the only span, it moves to + * class 3, then you free them all, it moves to 2, and then is given + * back, leaving no active span). + */ + } + else if (span->nallocatable == span->nmax && + (span->fclass != 1 || span->prevspan != InvalidDsaPointer)) + { + /* + * This entire block is free, and it's not the active block for this + * size class. Return the memory to the free page manager. We don't + * do this for the active block to prevent hysteresis: if we + * repeatedly allocate and free the only chunk in the active block, it + * will be very inefficient if we deallocate and reallocate the block + * every time. + */ + destroy_superblock(area, span_pointer); + } + + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); +} + +/* + * Obtain a backend-local address for a dsa_pointer. 'dp' must point to + * memory allocated by the given area (possibly in another process) that + * hasn't yet been freed. This may cause a segment to be mapped into the + * current process if required, and may cause freed segments to be unmapped. + */ +void * +dsa_get_address(dsa_area *area, dsa_pointer dp) +{ + dsa_segment_index index; + Size offset; + + /* Convert InvalidDsaPointer to NULL. */ + if (!DsaPointerIsValid(dp)) + return NULL; + + /* Process any requests to detach from freed segments. */ + check_for_freed_segments(area); + + /* Break the dsa_pointer into its components. */ + index = DSA_EXTRACT_SEGMENT_NUMBER(dp); + offset = DSA_EXTRACT_OFFSET(dp); + Assert(index < DSA_MAX_SEGMENTS); + + /* Check if we need to cause this segment to be mapped in. */ + if (unlikely(area->segment_maps[index].mapped_address == NULL)) + { + /* Call for effect (we don't need the result). */ + get_segment_by_index(area, index); + } + + return area->segment_maps[index].mapped_address + offset; +} + +/* + * Pin this area, so that it will continue to exist even if all backends + * detach from it. In that case, the area can still be reattached to if a + * handle has been recorded somewhere. + */ +void +dsa_pin(dsa_area *area) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + if (area->control->pinned) + { + LWLockRelease(DSA_AREA_LOCK(area)); + elog(ERROR, "dsa_area already pinned"); + } + area->control->pinned = true; + ++area->control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Undo the effects of dsa_pin, so that the given area can be freed when no + * backends are attached to it. May be called only if dsa_pin has been + * called. + */ +void +dsa_unpin(dsa_area *area) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + Assert(area->control->refcnt > 1); + if (!area->control->pinned) + { + LWLockRelease(DSA_AREA_LOCK(area)); + elog(ERROR, "dsa_area not pinned"); + } + area->control->pinned = false; + --area->control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Set the total size limit for this area. This limit is checked whenever new + * segments need to be allocated from the operating system. If the new size + * limit is already exceeded, this has no immediate effect. + * + * Note that the total virtual memory usage may be temporarily larger than + * this limit when segments have been freed, but not yet detached by all + * backends that have attached to them. + */ +void +dsa_set_size_limit(dsa_area *area, Size limit) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + area->control->max_total_segment_size = limit; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Aggressively free all spare memory in the hope of returning DSM segments to + * the operating system. + */ +void +dsa_trim(dsa_area *area) +{ + int size_class; + + /* + * Trim in reverse pool order so we get to the spans-of-spans last, just + * in case any become entirely free while processing all the other pools. + */ + for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class) + { + dsa_area_pool *pool = &area->control->pools[size_class]; + dsa_pointer span_pointer; + + if (size_class == DSA_SCLASS_SPAN_LARGE) + { + /* Large object frees give back segments aggressively already. */ + continue; + } + + /* + * Search fullness class 1 only. That is where we expect to find an + * entirely empty superblock (entirely empty superblocks in other + * fullness classes are returned to the free page map by dsa_free). + */ + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + span_pointer = pool->spans[1]; + while (DsaPointerIsValid(span_pointer)) + { + dsa_area_span *span = dsa_get_address(area, span_pointer); + dsa_pointer next = span->nextspan; + + if (span->nallocatable == span->nmax) + destroy_superblock(area, span_pointer); + + span_pointer = next; + } + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); + } +} + +/* + * Print out debugging information about the internal state of the shared + * memory area. + */ +void +dsa_dump(dsa_area *area) +{ + Size i, + j; + + /* + * Note: This gives an inconsistent snapshot as it acquires and releases + * individual locks as it goes... + */ + + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + fprintf(stderr, "dsa_area handle %x:\n", area->control->handle); + fprintf(stderr, " max_total_segment_size: %zu\n", + area->control->max_total_segment_size); + fprintf(stderr, " total_segment_size: %zu\n", + area->control->total_segment_size); + fprintf(stderr, " refcnt: %d\n", area->control->refcnt); + fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f'); + fprintf(stderr, " segment bins:\n"); + for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) + { + if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_index segment_index; + + fprintf(stderr, + " segment bin %zu (at least %d contiguous pages free):\n", + i, 1 << (i - 1)); + segment_index = area->control->segment_bins[i]; + while (segment_index != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *segment_map; + + segment_map = + get_segment_by_index(area, segment_index); + + fprintf(stderr, + " segment index %zu, usable_pages = %zu, " + "contiguous_pages = %zu, mapped at %p\n", + segment_index, + segment_map->header->usable_pages, + fpm_largest(segment_map->fpm), + segment_map->mapped_address); + segment_index = segment_map->header->next; + } + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + + fprintf(stderr, " pools:\n"); + for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) + { + bool found = false; + + LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE); + for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) + if (DsaPointerIsValid(area->control->pools[i].spans[j])) + found = true; + if (found) + { + if (i == DSA_SCLASS_BLOCK_OF_SPANS) + fprintf(stderr, " pool for blocks of span objects:\n"); + else if (i == DSA_SCLASS_SPAN_LARGE) + fprintf(stderr, " pool for large object spans:\n"); + else + fprintf(stderr, + " pool for size class %zu (object size %hu bytes):\n", + i, dsa_size_classes[i]); + for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) + { + if (!DsaPointerIsValid(area->control->pools[i].spans[j])) + fprintf(stderr, " fullness class %zu is empty\n", j); + else + { + dsa_pointer span_pointer = area->control->pools[i].spans[j]; + + fprintf(stderr, " fullness class %zu:\n", j); + while (DsaPointerIsValid(span_pointer)) + { + dsa_area_span *span; + + span = dsa_get_address(area, span_pointer); + fprintf(stderr, + " span descriptor at %016lx, " + "superblock at %016lx, pages = %zu, " + "objects free = %hu/%hu\n", + span_pointer, span->start, span->npages, + span->nallocatable, span->nmax); + span_pointer = span->nextspan; + } + } + } + } + LWLockRelease(DSA_SCLASS_LOCK(area, i)); + } +} + +/* + * Return the smallest size that you can successfully provide to + * dsa_create_in_place. + */ +Size +dsa_minimum_size(void) +{ + Size size; + int pages = 0; + + size = MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager)); + + /* Figure out how many pages we need, including the page map... */ + while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages) + { + ++pages; + size += sizeof(dsa_pointer); + } + + return pages * FPM_PAGE_SIZE; +} + +/* + * Workhorse function for dsa_create and dsa_create_in_place. + */ +static dsa_area * +create_internal(void *place, size_t size, + int tranche_id, const char *tranche_name, + dsm_handle control_handle, + dsm_segment *control_segment) +{ + dsa_area_control *control; + dsa_area *area; + dsa_segment_map *segment_map; + Size usable_pages; + Size total_pages; + Size metadata_bytes; + int i; + + /* Sanity check on the space we have to work in. */ + if (size < dsa_minimum_size()) + elog(ERROR, "dsa_area space must be at least %zu, but %zu provided", + dsa_minimum_size(), size); + + /* Now figure out how much space is usuable */ + total_pages = size / FPM_PAGE_SIZE; + metadata_bytes = + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager)) + + total_pages * sizeof(dsa_pointer); + /* Add padding up to next page boundary. */ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + Assert(metadata_bytes <= size); + usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE; + + /* + * Initialize the dsa_area_control object located at the start of the + * space. + */ + control = (dsa_area_control *) place; + control->segment_header.magic = + DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0; + control->segment_header.next = DSA_SEGMENT_INDEX_NONE; + control->segment_header.prev = DSA_SEGMENT_INDEX_NONE; + control->segment_header.usable_pages = usable_pages; + control->segment_header.freed = false; + control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE; + control->handle = control_handle; + control->max_total_segment_size = SIZE_MAX; + control->total_segment_size = size; + memset(&control->segment_handles[0], 0, + sizeof(dsm_handle) * DSA_MAX_SEGMENTS); + control->segment_handles[0] = control_handle; + for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) + control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE; + control->high_segment_index = 0; + control->refcnt = 1; + control->freed_segment_counter = 0; + control->lwlock_tranche_id = tranche_id; + strlcpy(control->lwlock_tranche_name, tranche_name, DSA_MAXLEN); + + /* + * Create the dsa_area object that this backend will use to access the + * area. Other backends will need to obtain their own dsa_area object by + * attaching. + */ + area = palloc(sizeof(dsa_area)); + area->control = control; + area->mapping_pinned = false; + memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); + area->high_segment_index = 0; + area->lwlock_tranche.array_base = &area->control->pools[0]; + area->lwlock_tranche.array_stride = sizeof(dsa_area_pool); + area->lwlock_tranche.name = control->lwlock_tranche_name; + LWLockRegisterTranche(control->lwlock_tranche_id, &area->lwlock_tranche); + LWLockInitialize(&control->lock, control->lwlock_tranche_id); + for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) + LWLockInitialize(DSA_SCLASS_LOCK(area, i), + control->lwlock_tranche_id); + + /* Set up the segment map for this process's mapping. */ + segment_map = &area->segment_maps[0]; + segment_map->segment = control_segment; + segment_map->mapped_address = place; + segment_map->header = (dsa_segment_header *) place; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_area_control))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Set up the free page map. */ + FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); + /* There can be 0 usable pages if size is dsa_minimum_size(). */ + + if (usable_pages > 0) + FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, + usable_pages); + + /* Put this segment into the appropriate bin. */ + control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0; + segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); + + return area; +} + +/* + * Workhorse function for dsa_attach and dsa_attach_in_place. + */ +static dsa_area * +attach_internal(void *place, dsm_segment *segment, dsa_handle handle) +{ + dsa_area_control *control; + dsa_area *area; + dsa_segment_map *segment_map; + + control = (dsa_area_control *) place; + Assert(control->handle == handle); + Assert(control->segment_handles[0] == handle); + Assert(control->segment_header.magic == + (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0)); + + /* Build the backend-local area object. */ + area = palloc(sizeof(dsa_area)); + area->control = control; + area->mapping_pinned = false; + memset(&area->segment_maps[0], 0, + sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); + area->high_segment_index = 0; + area->lwlock_tranche.array_base = &area->control->pools[0]; + area->lwlock_tranche.array_stride = sizeof(dsa_area_pool); + area->lwlock_tranche.name = control->lwlock_tranche_name; + LWLockRegisterTranche(control->lwlock_tranche_id, &area->lwlock_tranche); + + /* Set up the segment map for this process's mapping. */ + segment_map = &area->segment_maps[0]; + segment_map->segment = segment; /* NULL for in-place */ + segment_map->mapped_address = place; + segment_map->header = (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Bump the reference count. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + ++control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); + + return area; +} + +/* + * Add a new span to fullness class 1 of the indicated pool. + */ +static void +init_span(dsa_area *area, + dsa_pointer span_pointer, + dsa_area_pool *pool, dsa_pointer start, Size npages, + uint16 size_class) +{ + dsa_area_span *span = dsa_get_address(area, span_pointer); + Size obsize = dsa_size_classes[size_class]; + + /* + * The per-pool lock must be held because we manipulate the span list for + * this pool. + */ + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + + /* Push this span onto the front of the span list for fullness class 1. */ + if (DsaPointerIsValid(pool->spans[1])) + { + dsa_area_span *head = (dsa_area_span *) + dsa_get_address(area, pool->spans[1]); + + head->prevspan = span_pointer; + } + span->pool = DsaAreaPoolToDsaPointer(area, pool); + span->nextspan = pool->spans[1]; + span->prevspan = InvalidDsaPointer; + pool->spans[1] = span_pointer; + + span->start = start; + span->npages = npages; + span->size_class = size_class; + span->ninitialized = 0; + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + { + /* + * A block-of-spans contains its own descriptor, so mark one object as + * initialized and reduce the count of allocatable objects by one. + * Doing this here has the side effect of also reducing nmax by one, + * which is important to make sure we free this object at the correct + * time. + */ + span->ninitialized = 1; + span->nallocatable = FPM_PAGE_SIZE / obsize - 1; + } + else if (size_class != DSA_SCLASS_SPAN_LARGE) + span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize; + span->firstfree = DSA_SPAN_NOTHING_FREE; + span->nmax = span->nallocatable; + span->fclass = 1; +} + +/* + * Transfer the first span in one fullness class to the head of another + * fullness class. + */ +static bool +transfer_first_span(dsa_area *area, + dsa_area_pool *pool, int fromclass, int toclass) +{ + dsa_pointer span_pointer; + dsa_area_span *span; + dsa_area_span *nextspan; + + /* Can't do it if source list is empty. */ + span_pointer = pool->spans[fromclass]; + if (!DsaPointerIsValid(span_pointer)) + return false; + + /* Remove span from head of source list. */ + span = dsa_get_address(area, span_pointer); + pool->spans[fromclass] = span->nextspan; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = InvalidDsaPointer; + } + + /* Add span to head of target list. */ + span->nextspan = pool->spans[toclass]; + pool->spans[toclass] = span_pointer; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = span_pointer; + } + span->fclass = toclass; + + return true; +} + +/* + * Allocate one object of the requested size class from the given area. + */ +static inline dsa_pointer +alloc_object(dsa_area *area, int size_class) +{ + dsa_area_pool *pool = &area->control->pools[size_class]; + dsa_area_span *span; + dsa_pointer block; + dsa_pointer result; + char *object; + Size size; + + /* + * Even though ensure_active_superblock can in turn call alloc_object if + * it needs to allocate a new span, that's always from a different pool, + * and the order of lock acquisition is always the same, so it's OK that + * we hold this lock for the duration of this function. + */ + Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + + /* + * If there's no active superblock, we must successfully obtain one or + * fail the request. + */ + if (!DsaPointerIsValid(pool->spans[1]) && + !ensure_active_superblock(area, pool, size_class)) + { + result = InvalidDsaPointer; + } + else + { + /* + * There should be a block in fullness class 1 at this point, and it + * should never be completely full. Thus we can either pop an object + * from the free list or, failing that, initialize a new object. + */ + Assert(DsaPointerIsValid(pool->spans[1])); + span = (dsa_area_span *) + dsa_get_address(area, pool->spans[1]); + Assert(span->nallocatable > 0); + block = span->start; + Assert(size_class < DSA_NUM_SIZE_CLASSES); + size = dsa_size_classes[size_class]; + if (span->firstfree != DSA_SPAN_NOTHING_FREE) + { + result = block + span->firstfree * size; + object = dsa_get_address(area, result); + span->firstfree = NextFreeObjectIndex(object); + } + else + { + result = block + span->ninitialized * size; + ++span->ninitialized; + } + --span->nallocatable; + + /* If it's now full, move it to the highest-numbered fullness class. */ + if (span->nallocatable == 0) + transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1); + } + + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); + + return result; +} + +/* + * Ensure an active (i.e. fullness class 1) superblock, unless all existing + * superblocks are completely full and no more can be allocated. + * + * Fullness classes K of 0..N are loosely intended to represent blocks whose + * utilization percentage is at least K/N, but we only enforce this rigorously + * for the highest-numbered fullness class, which always contains exactly + * those blocks that are completely full. It's otherwise acceptable for a + * block to be in a higher-numbered fullness class than the one to which it + * logically belongs. In addition, the active block, which is always the + * first block in fullness class 1, is permitted to have a higher allocation + * percentage than would normally be allowable for that fullness class; we + * don't move it until it's completely full, and then it goes to the + * highest-numbered fullness class. + * + * It might seem odd that the active block is the head of fullness class 1 + * rather than fullness class 0, but experience with other allocators has + * shown that it's usually better to allocate from a block that's moderately + * full rather than one that's nearly empty. Insofar as is reasonably + * possible, we want to avoid performing new allocations in a block that would + * otherwise become empty soon. + */ +static bool +ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, + int size_class) +{ + dsa_pointer span_pointer; + dsa_pointer start_pointer; + Size obsize = dsa_size_classes[size_class]; + Size nmax; + int fclass; + Size npages = 1; + Size first_page; + Size i; + dsa_segment_map *segment_map; + + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + + /* + * Compute the number of objects that will fit in a block of this size + * class. Span-of-spans blocks are just a single page, and the first + * object isn't available for use because it describes the block-of-spans + * itself. + */ + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + nmax = FPM_PAGE_SIZE / obsize - 1; + else + nmax = DSA_SUPERBLOCK_SIZE / obsize; + + /* + * If fullness class 1 is empty, try to find a span to put in it by + * scanning higher-numbered fullness classes (excluding the last one, + * whose blocks are certain to all be completely full). + */ + for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) + { + span_pointer = pool->spans[fclass]; + + while (DsaPointerIsValid(span_pointer)) + { + int tfclass; + dsa_area_span *span; + dsa_area_span *nextspan; + dsa_area_span *prevspan; + dsa_pointer next_span_pointer; + + span = (dsa_area_span *) + dsa_get_address(area, span_pointer); + next_span_pointer = span->nextspan; + + /* Figure out what fullness class should contain this span. */ + tfclass = (nmax - span->nallocatable) + * (DSA_FULLNESS_CLASSES - 1) / nmax; + + /* Look up next span. */ + if (DsaPointerIsValid(span->nextspan)) + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + else + nextspan = NULL; + + /* + * If utilization has dropped enough that this now belongs in some + * other fullness class, move it there. + */ + if (tfclass < fclass) + { + /* Remove from the current fullness class list. */ + if (pool->spans[fclass] == span_pointer) + { + /* It was the head; remove it. */ + Assert(!DsaPointerIsValid(span->prevspan)); + pool->spans[fclass] = span->nextspan; + if (nextspan != NULL) + nextspan->prevspan = InvalidDsaPointer; + } + else + { + /* It was not the head. */ + Assert(DsaPointerIsValid(span->prevspan)); + prevspan = (dsa_area_span *) + dsa_get_address(area, span->prevspan); + prevspan->nextspan = span->nextspan; + } + if (nextspan != NULL) + nextspan->prevspan = span->prevspan; + + /* Push onto the head of the new fullness class list. */ + span->nextspan = pool->spans[tfclass]; + pool->spans[tfclass] = span_pointer; + span->prevspan = InvalidDsaPointer; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = span_pointer; + } + span->fclass = tfclass; + } + + /* Advance to next span on list. */ + span_pointer = next_span_pointer; + } + + /* Stop now if we found a suitable block. */ + if (DsaPointerIsValid(pool->spans[1])) + return true; + } + + /* + * If there are no blocks that properly belong in fullness class 1, pick + * one from some other fullness class and move it there anyway, so that we + * have an allocation target. Our last choice is to transfer a block + * that's almost empty (and might become completely empty soon if left + * alone), but even that is better than failing, which is what we must do + * if there are no blocks at all with freespace. + */ + Assert(!DsaPointerIsValid(pool->spans[1])); + for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) + if (transfer_first_span(area, pool, fclass, 1)) + return true; + if (!DsaPointerIsValid(pool->spans[1]) && + transfer_first_span(area, pool, 0, 1)) + return true; + + /* + * We failed to find an existing span with free objects, so we need to + * allocate a new superblock and construct a new span to manage it. + * + * First, get a dsa_area_span object to describe the new superblock block + * ... unless this allocation is for a dsa_area_span object, in which case + * that's surely not going to work. We handle that case by storing the + * span describing a block-of-spans inline. + */ + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + { + span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); + if (!DsaPointerIsValid(span_pointer)) + return false; + npages = DSA_PAGES_PER_SUPERBLOCK; + } + + /* Find or create a segment and allocate the superblock. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + segment_map = get_best_segment(area, npages); + if (segment_map == NULL) + { + segment_map = make_new_segment(area, npages); + if (segment_map == NULL) + { + LWLockRelease(DSA_AREA_LOCK(area)); + return false; + } + } + if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) + { + LWLockRelease(DSA_AREA_LOCK(area)); + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + dsa_free(area, span_pointer); + return false; + } + LWLockRelease(DSA_AREA_LOCK(area)); + + /* Compute the start of the superblock. */ + start_pointer = + DSA_MAKE_POINTER(get_segment_index(area, segment_map), + first_page * FPM_PAGE_SIZE); + + /* + * If this is a block-of-spans, carve the descriptor right out of the + * allocated space. + */ + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + { + /* + * We have a pointer into the segment. We need to build a dsa_pointer + * from the segment index and offset into the segment. + */ + span_pointer = start_pointer; + } + + /* Initialize span and pagemap. */ + init_span(area, span_pointer, pool, start_pointer, npages, size_class); + for (i = 0; i < npages; ++i) + segment_map->pagemap[first_page + i] = span_pointer; + + return true; +} + +/* + * Return the segment map corresponding to a given segment index, mapping the + * segment in if necessary. For internal segment book-keeping, this is called + * with the area lock held. It is also called by dsa_free and dsa_get_address + * without any locking, relying on the fact they have a known live segment + * index and they always call check_for_freed_segments to ensures that any + * freed segment occupying the same slot is detached first. + */ +static dsa_segment_map * +get_segment_by_index(dsa_area *area, dsa_segment_index index) +{ + if (unlikely(area->segment_maps[index].mapped_address == NULL)) + { + dsm_handle handle; + dsm_segment *segment; + dsa_segment_map *segment_map; + + /* + * If we are reached by dsa_free or dsa_get_address, there must be at + * least one object allocated in the referenced segment. Otherwise, + * their caller has a double-free or access-after-free bug, which we + * have no hope of detecting. So we know it's safe to access this + * array slot without holding a lock; it won't change underneath us. + * Furthermore, we know that we can see the latest contents of the + * slot, as explained in check_for_freed_segments, which those + * functions call before arriving here. + */ + handle = area->control->segment_handles[index]; + + /* It's an erro to try to access an unused slot. */ + if (handle == DSM_HANDLE_INVALID) + elog(ERROR, + "dsa_area could not attach to a segment that has been freed"); + + segment = dsm_attach(handle); + if (segment == NULL) + elog(ERROR, "dsa_area could not attach to segment"); + if (area->mapping_pinned) + dsm_pin_mapping(segment); + segment_map = &area->segment_maps[index]; + segment_map->segment = segment; + segment_map->mapped_address = dsm_segment_address(segment); + segment_map->header = + (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Remember the highest index this backend has ever mapped. */ + if (area->high_segment_index < index) + area->high_segment_index = index; + + Assert(segment_map->header->magic == + (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index)); + } + + return &area->segment_maps[index]; +} + +/* + * Return a superblock to the free page manager. If the underlying segment + * has become entirely free, then return it to the operating system. + * + * The appropriate pool lock must be held. + */ +static void +destroy_superblock(dsa_area *area, dsa_pointer span_pointer) +{ + dsa_area_span *span = dsa_get_address(area, span_pointer); + int size_class = span->size_class; + dsa_segment_map *segment_map; + + segment_map = + get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); + + /* Remove it from its fullness class list. */ + unlink_span(area, span); + + /* + * Note: Here we acquire the area lock while we already hold a per-pool + * lock. We never hold the area lock and then take a pool lock, or we + * could deadlock. + */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + FreePageManagerPut(segment_map->fpm, + DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, + span->npages); + /* Check if the segment is now entirely free. */ + if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages) + { + dsa_segment_index index = get_segment_index(area, segment_map); + + /* If it's not the segment with extra control data, free it. */ + if (index != 0) + { + /* + * Give it back to the OS, and allow other backends to detect that + * they need to detach. + */ + unlink_segment(area, segment_map); + segment_map->header->freed = true; + Assert(area->control->total_segment_size >= + segment_map->header->size); + area->control->total_segment_size -= + segment_map->header->size; + dsm_unpin_segment(dsm_segment_handle(segment_map->segment)); + dsm_detach(segment_map->segment); + area->control->segment_handles[index] = DSM_HANDLE_INVALID; + ++area->control->freed_segment_counter; + segment_map->segment = NULL; + segment_map->header = NULL; + segment_map->mapped_address = NULL; + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + + /* + * Span-of-spans blocks store the span which describes them within the + * block itself, so freeing the storage implicitly frees the descriptor + * also. If this is a block of any other type, we need to separately free + * the span object also. This recursive call to dsa_free will acquire the + * span pool's lock. We can't deadlock because the acquisition order is + * always some other pool and then the span pool. + */ + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + dsa_free(area, span_pointer); +} + +static void +unlink_span(dsa_area *area, dsa_area_span *span) +{ + if (DsaPointerIsValid(span->nextspan)) + { + dsa_area_span *next = dsa_get_address(area, span->nextspan); + + next->prevspan = span->prevspan; + } + if (DsaPointerIsValid(span->prevspan)) + { + dsa_area_span *prev = dsa_get_address(area, span->prevspan); + + prev->nextspan = span->nextspan; + } + else + { + dsa_area_pool *pool = dsa_get_address(area, span->pool); + + pool->spans[span->fclass] = span->nextspan; + } +} + +static void +add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, + dsa_pointer span_pointer, + int fclass) +{ + dsa_area_pool *pool = dsa_get_address(area, span->pool); + + if (DsaPointerIsValid(pool->spans[fclass])) + { + dsa_area_span *head = dsa_get_address(area, + pool->spans[fclass]); + + head->prevspan = span_pointer; + } + span->prevspan = InvalidDsaPointer; + span->nextspan = pool->spans[fclass]; + pool->spans[fclass] = span_pointer; + span->fclass = fclass; +} + +/* + * Detach from an area that was either created or attached to by this process. + */ +void +dsa_detach(dsa_area *area) +{ + int i; + + /* Detach from all segments. */ + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_detach(area->segment_maps[i].segment); + + /* + * Note that 'detaching' (= detaching from DSM segments) doesn't include + * 'releasing' (= adjusting the reference count). It would be nice to + * combine these operations, but client code might never get around to + * calling dsa_detach because of an error path, and a detach hook on any + * particular segment is too late to detach other segments in the area + * without risking a 'leak' warning in the non-error path. + */ + + /* Free the backend-local area object. */ + pfree(area); +} + +/* + * Unlink a segment from the bin that contains it. + */ +static void +unlink_segment(dsa_area *area, dsa_segment_map *segment_map) +{ + if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *prev; + + prev = get_segment_by_index(area, segment_map->header->prev); + prev->header->next = segment_map->header->next; + } + else + { + Assert(area->control->segment_bins[segment_map->header->bin] == + get_segment_index(area, segment_map)); + area->control->segment_bins[segment_map->header->bin] = + segment_map->header->next; + } + if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *next; + + next = get_segment_by_index(area, segment_map->header->next); + next->header->prev = segment_map->header->prev; + } +} + +/* + * Find a segment that could satisfy a request for 'npages' of contiguous + * memory, or return NULL if none can be found. This may involve attaching to + * segments that weren't previously attached so that we can query their free + * pages map. + */ +static dsa_segment_map * +get_best_segment(dsa_area *area, Size npages) +{ + Size bin; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + + /* + * Start searching from the first bin that *might* have enough contiguous + * pages. + */ + for (bin = contiguous_pages_to_segment_bin(npages); + bin < DSA_NUM_SEGMENT_BINS; + ++bin) + { + /* + * The minimum contiguous size that any segment in this bin should + * have. We'll re-bin if we see segments with fewer. + */ + Size threshold = 1 << (bin - 1); + dsa_segment_index segment_index; + + /* Search this bin for a segment with enough contiguous space. */ + segment_index = area->control->segment_bins[bin]; + while (segment_index != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *segment_map; + dsa_segment_index next_segment_index; + Size contiguous_pages; + + segment_map = get_segment_by_index(area, segment_index); + next_segment_index = segment_map->header->next; + contiguous_pages = fpm_largest(segment_map->fpm); + + /* Not enough for the request, still enough for this bin. */ + if (contiguous_pages >= threshold && contiguous_pages < npages) + { + segment_index = next_segment_index; + continue; + } + + /* Re-bin it if it's no longer in the appropriate bin. */ + if (contiguous_pages < threshold) + { + Size new_bin; + + new_bin = contiguous_pages_to_segment_bin(contiguous_pages); + + /* Remove it from its current bin. */ + unlink_segment(area, segment_map); + + /* Push it onto the front of its new bin. */ + segment_map->header->prev = DSA_SEGMENT_INDEX_NONE; + segment_map->header->next = + area->control->segment_bins[new_bin]; + segment_map->header->bin = new_bin; + area->control->segment_bins[new_bin] = segment_index; + if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *next; + + next = get_segment_by_index(area, + segment_map->header->next); + Assert(next->header->bin == new_bin); + next->header->prev = segment_index; + } + + /* + * But fall through to see if it's enough to satisfy this + * request anyway.... + */ + } + + /* Check if we are done. */ + if (contiguous_pages >= npages) + return segment_map; + + /* Continue searching the same bin. */ + segment_index = next_segment_index; + } + } + + /* Not found. */ + return NULL; +} + +/* + * Create a new segment that can handle at least requested_pages. Returns + * NULL if the requested total size limit or maximum allowed number of + * segments would be exceeded. + */ +static dsa_segment_map * +make_new_segment(dsa_area *area, Size requested_pages) +{ + dsa_segment_index new_index; + Size metadata_bytes; + Size total_size; + Size total_pages; + Size usable_pages; + dsa_segment_map *segment_map; + dsm_segment *segment; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + + /* Find a segment slot that is not in use (linearly for now). */ + for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index) + { + if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID) + break; + } + if (new_index == DSA_MAX_SEGMENTS) + return NULL; + + /* + * If the total size limit is already exceeded, then we exit early and + * avoid arithmetic wraparound in the unsigned expressions below. + */ + if (area->control->total_segment_size >= + area->control->max_total_segment_size) + return NULL; + + /* + * The size should be at least as big as requested, and at least big + * enough to follow a geometric series that approximately doubles the + * total storage each time we create a new segment. We use geometric + * growth because the underlying DSM system isn't designed for large + * numbers of segments (otherwise we might even consider just using one + * DSM segment for each large allocation and for each superblock, and then + * we wouldn't need to use FreePageManager). + * + * We decide on a total segment size first, so that we produce tidy + * power-of-two sized segments. This is a good property to have if we + * move to huge pages in the future. Then we work back to the number of + * pages we can fit. + */ + total_size = DSA_INITIAL_SEGMENT_SIZE * + ((Size) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE)); + total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE); + total_size = Min(total_size, + area->control->max_total_segment_size - + area->control->total_segment_size); + + total_pages = total_size / FPM_PAGE_SIZE; + metadata_bytes = + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager)) + + sizeof(dsa_pointer) * total_pages; + + /* Add padding up to next page boundary. */ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + if (total_size <= metadata_bytes) + return NULL; + usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE; + Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size); + + /* See if that is enough... */ + if (requested_pages > usable_pages) + { + /* + * We'll make an odd-sized segment, working forward from the requested + * number of pages. + */ + usable_pages = requested_pages; + metadata_bytes = + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager)) + + usable_pages * sizeof(dsa_pointer); + + /* Add padding up to next page boundary. */ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE; + + /* Is that too large for dsa_pointer's addressing scheme? */ + if (total_size > DSA_MAX_SEGMENT_SIZE) + return NULL; + + /* Would that exceed the limit? */ + if (total_size > area->control->max_total_segment_size - + area->control->total_segment_size) + return NULL; + } + + /* Create the segment. */ + segment = dsm_create(total_size); + if (segment == NULL) + return NULL; + dsm_pin_segment(segment); + if (area->mapping_pinned) + dsm_pin_mapping(segment); + + /* Store the handle in shared memory to be found by index. */ + area->control->segment_handles[new_index] = + dsm_segment_handle(segment); + /* Track the highest segment index in the history of the area. */ + if (area->control->high_segment_index < new_index) + area->control->high_segment_index = new_index; + /* Track the highest segment index this backend has ever mapped. */ + if (area->high_segment_index < new_index) + area->high_segment_index = new_index; + /* Track total size of all segments. */ + area->control->total_segment_size += total_size; + Assert(area->control->total_segment_size <= + area->control->max_total_segment_size); + + /* Build a segment map for this segment in this backend. */ + segment_map = &area->segment_maps[new_index]; + segment_map->segment = segment; + segment_map->mapped_address = dsm_segment_address(segment); + segment_map->header = (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Set up the free page map. */ + FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); + FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, + usable_pages); + + /* Set up the segment header and put it in the appropriate bin. */ + segment_map->header->magic = + DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index; + segment_map->header->usable_pages = usable_pages; + segment_map->header->size = total_size; + segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); + segment_map->header->prev = DSA_SEGMENT_INDEX_NONE; + segment_map->header->next = + area->control->segment_bins[segment_map->header->bin]; + segment_map->header->freed = false; + area->control->segment_bins[segment_map->header->bin] = new_index; + if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *next = + get_segment_by_index(area, segment_map->header->next); + + Assert(next->header->bin == segment_map->header->bin); + next->header->prev = new_index; + } + + return segment_map; +} + +/* + * Check if any segments have been freed by destroy_superblock, so we can + * detach from them in this backend. This function is called by + * dsa_get_address and dsa_free to make sure that a dsa_pointer they have + * received can be resolved to the correct segment. + * + * The danger we want to defend against is that there could be an old segment + * mapped into a given slot in this backend, and the dsa_pointer they have + * might refer to some new segment in the same slot. So those functions must + * be sure to process all instructions to detach from a freed segment that had + * been generated by the time this process received the dsa_pointer, before + * they call get_segment_by_index. + */ +static void +check_for_freed_segments(dsa_area *area) +{ + Size freed_segment_counter; + + /* + * Any other process that has freed a segment has incremented + * free_segment_counter while holding an LWLock, and that must precede any + * backend creating a new segment in the same slot while holding an + * LWLock, and that must precede the creation of any dsa_pointer pointing + * into the new segment which might reach us here, and the caller must + * have sent the dsa_pointer to this process using appropriate memory + * synchronization (some kind of locking or atomic primitive or system + * call). So all we need to do on the reading side is ask for the load of + * freed_segment_counter to follow the caller's load of the dsa_pointer it + * has, and we can be sure to detect any segments that had been freed as + * of the time that the dsa_pointer reached this process. + */ + pg_read_barrier(); + freed_segment_counter = area->control->freed_segment_counter; + if (unlikely(area->freed_segment_counter != freed_segment_counter)) + { + int i; + + /* Check all currently mapped segments to find what's been freed. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + for (i = 0; i <= area->high_segment_index; ++i) + { + if (area->segment_maps[i].header != NULL && + area->segment_maps[i].header->freed) + { + dsm_detach(area->segment_maps[i].segment); + area->segment_maps[i].segment = NULL; + area->segment_maps[i].header = NULL; + area->segment_maps[i].mapped_address = NULL; + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + area->freed_segment_counter = freed_segment_counter; + } +} diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h new file mode 100644 index 00000000000..4ef5c241c91 --- /dev/null +++ b/src/include/utils/dsa.h @@ -0,0 +1,108 @@ +/*------------------------------------------------------------------------- + * + * dsa.h + * Dynamic shared memory areas. + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/utils/dsa.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSA_H +#define DSA_H + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/dsm.h" + +/* The opaque type used for an area. */ +struct dsa_area; +typedef struct dsa_area dsa_area; + +/* + * If this system doesn't support atomic operations on 64 bit values then + * we fall back to 32 bit dsa_pointer. For testing purposes, + * USE_SMALL_DSA_POINTER can be defined to force the use of 32 bit + * dsa_pointer even on systems that support 64 bit atomics. + */ +#ifndef PG_HAVE_ATOMIC_U64_SUPPORT +#define SIZEOF_DSA_POINTER 4 +#else +#ifdef USE_SMALL_DSA_POINTER +#define SIZEOF_DSA_POINTER 4 +#else +#define SIZEOF_DSA_POINTER 8 +#endif +#endif + +/* + * The type of 'relative pointers' to memory allocated by a dynamic shared + * area. dsa_pointer values can be shared with other processes, but must be + * converted to backend-local pointers before they can be dereferenced. See + * dsa_get_address. Also, an atomic version and appropriately sized atomic + * operations. + */ +#if DSA_POINTER_SIZEOF == 4 +typedef uint32 dsa_pointer; +typedef pg_atomic_uint32 dsa_pointer_atomic; +#define dsa_pointer_atomic_init pg_atomic_init_u32 +#define dsa_pointer_atomic_read pg_atomic_read_u32 +#define dsa_pointer_atomic_write pg_atomic_write_u32 +#define dsa_pointer_atomic_fetch_add pg_atomic_fetch_add_u32 +#define dsa_pointer_atomic_compare_exchange pg_atomic_compare_exchange_u32 +#else +typedef uint64 dsa_pointer; +typedef pg_atomic_uint64 dsa_pointer_atomic; +#define dsa_pointer_atomic_init pg_atomic_init_u64 +#define dsa_pointer_atomic_read pg_atomic_read_u64 +#define dsa_pointer_atomic_write pg_atomic_write_u64 +#define dsa_pointer_atomic_fetch_add pg_atomic_fetch_add_u64 +#define dsa_pointer_atomic_compare_exchange pg_atomic_compare_exchange_u64 +#endif + +/* A sentinel value for dsa_pointer used to indicate failure to allocate. */ +#define InvalidDsaPointer ((dsa_pointer) 0) + +/* Check if a dsa_pointer value is valid. */ +#define DsaPointerIsValid(x) ((x) != InvalidDsaPointer) + +/* + * The type used for dsa_area handles. dsa_handle values can be shared with + * other processes, so that they can attach to them. This provides a way to + * share allocated storage with other processes. + * + * The handle for a dsa_area is currently implemented as the dsm_handle + * for the first DSM segment backing this dynamic storage area, but client + * code shouldn't assume that is true. + */ +typedef dsm_handle dsa_handle; + +extern void dsa_startup(void); + +extern dsa_area *dsa_create(int tranche_id, const char *tranche_name); +extern dsa_area *dsa_create_in_place(void *place, Size size, + int tranche_id, const char *tranche_name, + dsm_segment *segment); +extern dsa_area *dsa_attach(dsa_handle handle); +extern dsa_area *dsa_attach_in_place(void *place, dsm_segment *segment); +extern void dsa_release_in_place(void *place); +extern void dsa_on_dsm_detach_release_in_place(dsm_segment *, Datum); +extern void dsa_on_shmem_exit_release_in_place(int, Datum); +extern void dsa_pin_mapping(dsa_area *area); +extern void dsa_detach(dsa_area *area); +extern void dsa_pin(dsa_area *area); +extern void dsa_unpin(dsa_area *area); +extern void dsa_set_size_limit(dsa_area *area, Size limit); +extern Size dsa_minimum_size(void); +extern dsa_handle dsa_get_handle(dsa_area *area); +extern dsa_pointer dsa_allocate(dsa_area *area, Size size); +extern void dsa_free(dsa_area *area, dsa_pointer dp); +extern void *dsa_get_address(dsa_area *area, dsa_pointer dp); +extern void dsa_trim(dsa_area *area); +extern void dsa_dump(dsa_area *area); + +#endif /* DSA_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index c59af3ee8f9..818e305138a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2141,6 +2141,12 @@ dlist_iter dlist_mutable_iter dlist_node ds_state +dsa_area +dsa_area_control +dsa_area_pool +dsa_area_span +dsa_segment_header +dsa_segment_map dsm_control_header dsm_control_item dsm_handle From f79d880a762337856855324b79f6743151e7eca6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 15 Nov 2023 10:34:28 +0100 Subject: [PATCH 05/10] Fix dsa.c with different resource owners. The comments in dsa.c suggested that areas were owned by resource owners, but it was not in fact tracked explicitly. The DSM attachments held by the dsa were owned by resource owners, but not the area itself. That led to confusion if you used one resource owner to attach or create the area, but then switched to a different resource owner before allocating or even just accessing the allocations in the area with dsa_get_address(). The additional DSM segments associated with the area would get owned by a different resource owner than the initial segment. To fix, add an explicit 'resowner' field to dsa_area. It replaces the 'mapping_pinned' flag; resowner == NULL now indicates that the mapping is pinned. This is arguably a bug fix, but I'm not backpatching because it doesn't seem to be a live bug in the back branches. In 'master', it is a bug because commit b8bff07daa made ResourceOwners more strict so that you are no longer allowed to remember new resources in a ResourceOwner after you have started to release it. Merely accessing a dsa pointer might need to attach a new DSM segment, and before this commit it was temporarily remembered in the current owner for a very brief period even if the DSA was pinned. And that could happen in AtEOXact_PgStat(), which is called after the owner is already released. Reported-by: Alexander Lakhin Reviewed-by: Alexander Lakhin, Thomas Munro, Andres Freund Discussion: https://www.postgresql.org/message-id/11b70743-c5f3-3910-8e5b-dd6c115ff829%40gmail.com (cherry picked from commit postgres/postgres@a8b330ffb6f71db8ef306a7110dade6ca64cf773) --- src/backend/utils/mmgr/dsa.c | 38 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index eeaf143d0a9..670b12f792d 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -58,6 +58,7 @@ #include "utils/dsa.h" #include "utils/freepage.h" #include "utils/memutils.h" +#include "utils/resowner.h" /* * The size of the initial DSM segment that backs a dsa_area created by @@ -365,8 +366,13 @@ struct dsa_area /* The lock tranche for this process. */ LWLockTranche lwlock_tranche; - /* Has the mapping been pinned? */ - bool mapping_pinned; + /* + * All the mappings are owned by this. The dsa_area itself is not + * directly tracked by the ResourceOwner, but the effect is the same. NULL + * if the attachment has session lifespan, i.e if dsa_pin_mapping() has + * been called. + */ + ResourceOwner resowner; /* * This backend's array of segment maps, ordered by segment index @@ -643,12 +649,14 @@ dsa_pin_mapping(dsa_area *area) { int i; - Assert(!area->mapping_pinned); - area->mapping_pinned = true; + if (area->resowner != NULL) + { + area->resowner = NULL; - for (i = 0; i <= area->high_segment_index; ++i) - if (area->segment_maps[i].segment != NULL) - dsm_pin_mapping(area->segment_maps[i].segment); + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_pin_mapping(area->segment_maps[i].segment); + } } /* @@ -1203,7 +1211,7 @@ create_internal(void *place, size_t size, */ area = palloc(sizeof(dsa_area)); area->control = control; - area->mapping_pinned = false; + area->resowner = CurrentResourceOwner; memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); area->high_segment_index = 0; area->lwlock_tranche.array_base = &area->control->pools[0]; @@ -1262,7 +1270,7 @@ attach_internal(void *place, dsm_segment *segment, dsa_handle handle) /* Build the backend-local area object. */ area = palloc(sizeof(dsa_area)); area->control = control; - area->mapping_pinned = false; + area->resowner = CurrentResourceOwner; memset(&area->segment_maps[0], 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); area->high_segment_index = 0; @@ -1679,6 +1687,7 @@ get_segment_by_index(dsa_area *area, dsa_segment_index index) dsm_handle handle; dsm_segment *segment; dsa_segment_map *segment_map; + ResourceOwner oldowner; /* * If we are reached by dsa_free or dsa_get_address, there must be at @@ -1697,11 +1706,12 @@ get_segment_by_index(dsa_area *area, dsa_segment_index index) elog(ERROR, "dsa_area could not attach to a segment that has been freed"); + oldowner = CurrentResourceOwner; + CurrentResourceOwner = area->resowner; segment = dsm_attach(handle); + CurrentResourceOwner = oldowner; if (segment == NULL) elog(ERROR, "dsa_area could not attach to segment"); - if (area->mapping_pinned) - dsm_pin_mapping(segment); segment_map = &area->segment_maps[index]; segment_map->segment = segment; segment_map->mapped_address = dsm_segment_address(segment); @@ -2000,6 +2010,7 @@ make_new_segment(dsa_area *area, Size requested_pages) Size usable_pages; dsa_segment_map *segment_map; dsm_segment *segment; + ResourceOwner oldowner; Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); @@ -2084,12 +2095,13 @@ make_new_segment(dsa_area *area, Size requested_pages) } /* Create the segment. */ + oldowner = CurrentResourceOwner; + CurrentResourceOwner = area->resowner; segment = dsm_create(total_size); + CurrentResourceOwner = oldowner; if (segment == NULL) return NULL; dsm_pin_segment(segment); - if (area->mapping_pinned) - dsm_pin_mapping(segment); /* Store the handle in shared memory to be found by index. */ area->control->segment_handles[new_index] = From e5c2f5d64a1a96cd1ec4f1c4ab91c9ed05c5ac51 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 15 Nov 2023 10:34:31 +0100 Subject: [PATCH 06/10] Add test_dsa module. This covers basic calls within a single backend process, and also calling dsa_allocate() or dsa_get_address() while in a different resource owners. The latter case was fixed by the previous commit. Discussion: https://www.postgresql.org/message-id/11b70743-c5f3-3910-8e5b-dd6c115ff829%40gmail.com (cherry picked from commit postgres/postgres@325f54033e59499cfc087932f68eec5356061640) Changes from original commit: test was adapted to 6.x remove meson.build --- src/test/modules/Makefile | 1 + src/test/modules/test_dsa/Makefile | 28 +++++ .../modules/test_dsa/expected/test_dsa.out | 13 ++ src/test/modules/test_dsa/sql/test_dsa.sql | 4 + src/test/modules/test_dsa/test_dsa--1.0.sql | 12 ++ src/test/modules/test_dsa/test_dsa.c | 111 ++++++++++++++++++ src/test/modules/test_dsa/test_dsa.control | 4 + 7 files changed, 173 insertions(+) create mode 100644 src/test/modules/test_dsa/Makefile create mode 100644 src/test/modules/test_dsa/expected/test_dsa.out create mode 100644 src/test/modules/test_dsa/sql/test_dsa.sql create mode 100644 src/test/modules/test_dsa/test_dsa--1.0.sql create mode 100644 src/test/modules/test_dsa/test_dsa.c create mode 100644 src/test/modules/test_dsa/test_dsa.control diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 191734ab621..b8ea609b3d7 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -8,5 +8,6 @@ SUBDIRS = test_planner SUBDIRS += connection SUBDIRS += test_extensions SUBDIRS += test_bloomfilter +SUBDIRS += test_dsa $(recurse) diff --git a/src/test/modules/test_dsa/Makefile b/src/test/modules/test_dsa/Makefile new file mode 100644 index 00000000000..bcddb84b061 --- /dev/null +++ b/src/test/modules/test_dsa/Makefile @@ -0,0 +1,28 @@ +# src/test/modules/test_dsa/Makefile + +MODULE_big = test_dsa +OBJS = \ + $(WIN32RES) \ + test_dsa.o +PGFILEDESC = "test_dsa - test code for dynamic shared memory areas" + +EXTENSION = test_dsa +DATA = test_dsa--1.0.sql + +REGRESS = test_dsa + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_dsa +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +installcheck: install + +test: clean all install + psql postgres -f sql/test_dsa.sql 2>&1 diff --git a/src/test/modules/test_dsa/expected/test_dsa.out b/src/test/modules/test_dsa/expected/test_dsa.out new file mode 100644 index 00000000000..266010e77fe --- /dev/null +++ b/src/test/modules/test_dsa/expected/test_dsa.out @@ -0,0 +1,13 @@ +CREATE EXTENSION test_dsa; +SELECT test_dsa_basic(); + test_dsa_basic +---------------- + +(1 row) + +SELECT test_dsa_resowners(); + test_dsa_resowners +-------------------- + +(1 row) + diff --git a/src/test/modules/test_dsa/sql/test_dsa.sql b/src/test/modules/test_dsa/sql/test_dsa.sql new file mode 100644 index 00000000000..c3d8db94372 --- /dev/null +++ b/src/test/modules/test_dsa/sql/test_dsa.sql @@ -0,0 +1,4 @@ +CREATE EXTENSION test_dsa; + +SELECT test_dsa_basic(); +SELECT test_dsa_resowners(); diff --git a/src/test/modules/test_dsa/test_dsa--1.0.sql b/src/test/modules/test_dsa/test_dsa--1.0.sql new file mode 100644 index 00000000000..2904cb23525 --- /dev/null +++ b/src/test/modules/test_dsa/test_dsa--1.0.sql @@ -0,0 +1,12 @@ +/* src/test/modules/test_dsa/test_dsa--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_dsa" to load this file. \quit + +CREATE FUNCTION test_dsa_basic() + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION test_dsa_resowners() + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c new file mode 100644 index 00000000000..5ad4c405d79 --- /dev/null +++ b/src/test/modules/test_dsa/test_dsa.c @@ -0,0 +1,111 @@ +/*-------------------------------------------------------------------------- + * + * test_dsa.c + * Test dynamic shared memory areas (DSAs) + * + * Copyright (c) 2022-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_dsa/test_dsa.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "fmgr.h" +#include "utils/dsa.h" +#include "storage/lwlock.h" +#include "utils/resowner.h" + +PG_MODULE_MAGIC; + +/* Test basic DSA functionality */ +PG_FUNCTION_INFO_V1(test_dsa_basic); +Datum +test_dsa_basic(PG_FUNCTION_ARGS) +{ + int tranche_id; + dsa_area *a; + dsa_pointer p[100]; + + /* XXX: this tranche is leaked */ + tranche_id = LWLockNewTrancheId(); + + a = dsa_create(tranche_id, "test_dsa"); + for (int i = 0; i < 100; i++) + { + p[i] = dsa_allocate(a, 1000); + snprintf(dsa_get_address(a, p[i]), 1000, "foobar%d", i); + } + + for (int i = 0; i < 100; i++) + { + char buf[100]; + + snprintf(buf, 100, "foobar%d", i); + if (strcmp(dsa_get_address(a, p[i]), buf) != 0) + elog(ERROR, "no match"); + } + + for (int i = 0; i < 100; i++) + { + dsa_free(a, p[i]); + } + + dsa_detach(a); + + PG_RETURN_VOID(); +} + +/* Test using DSA across different resource owners */ +PG_FUNCTION_INFO_V1(test_dsa_resowners); +Datum +test_dsa_resowners(PG_FUNCTION_ARGS) +{ + int tranche_id; + dsa_area *a; + dsa_pointer p[10000]; + ResourceOwner oldowner; + ResourceOwner childowner; + + /* XXX: this tranche is leaked */ + tranche_id = LWLockNewTrancheId(); + + /* Create DSA in parent resource owner */ + a = dsa_create(tranche_id, "test_dsa"); + + /* + * Switch to child resource owner, and do a bunch of allocations in the + * DSA + */ + oldowner = CurrentResourceOwner; + childowner = ResourceOwnerCreate(oldowner, "test_dsa temp owner"); + CurrentResourceOwner = childowner; + + for (int i = 0; i < 10000; i++) + { + p[i] = dsa_allocate(a, 1000); + snprintf(dsa_get_address(a, p[i]), 1000, "foobar%d", i); + } + + /* Also test freeing, by freeing some of the allocations. */ + for (int i = 0; i < 500; i++) + dsa_free(a, p[i]); + + /* Release the child resource owner */ + CurrentResourceOwner = oldowner; + ResourceOwnerRelease(childowner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, false); + ResourceOwnerRelease(childowner, + RESOURCE_RELEASE_LOCKS, + true, false); + ResourceOwnerRelease(childowner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, false); + ResourceOwnerDelete(childowner); + + dsa_detach(a); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_dsa/test_dsa.control b/src/test/modules/test_dsa/test_dsa.control new file mode 100644 index 00000000000..ac9674b2193 --- /dev/null +++ b/src/test/modules/test_dsa/test_dsa.control @@ -0,0 +1,4 @@ +comment = 'Test code for dynamic shared memory areas' +default_version = '1.0' +module_pathname = '$libdir/test_dsa' +relocatable = true From 08acba1a573abce7354b57be31ca8f6f907842af Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Tue, 13 Feb 2024 14:18:51 +0500 Subject: [PATCH 07/10] Allow DSM segments to be created as pinned dsm_create and dsm_attach assumed that a current resource owner was always in place. Exploration with the API show that this is inconvenient: sometimes one must create a dummy resowner, create/attach the DSM, only to pin the mapping later, which is wasteful. Change create/attach so that if there is no current resowner, the dsm is effectively pinned right from the start. Discussion: https://postgr.es/m/20170324232710.32acsfsvjqfgc6ud@alvherre.pgsql Reviewed by Thomas Munro. This is backport of commit 767bc02 Co-authored-by: Alvaro Herrera --- src/backend/storage/ipc/dsm.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c index 2dcded71bd5..d5b200d1a28 100644 --- a/src/backend/storage/ipc/dsm.c +++ b/src/backend/storage/ipc/dsm.c @@ -453,6 +453,13 @@ dsm_set_control_handle(dsm_handle h) /* * Create a new dynamic shared memory segment. + * + * If there is a non-NULL CurrentResourceOwner, the new segment is associated + * with it and must be detached before the resource owner releases, or a + * warning will be logged. If CurrentResourceOwner is NULL, the segment + * remains attached until explicitely detached or the session ends. + * Creating with a NULL CurrentResourceOwner is equivalent to creating + * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping. */ dsm_segment * dsm_create(Size size) @@ -535,6 +542,11 @@ dsm_create(Size size) * This can happen if we're asked to attach the segment, but then everyone * else detaches it (causing it to be destroyed) before we get around to * attaching it. + * + * If there is a non-NULL CurrentResourceOwner, the attached segment is + * associated with it and must be detached before the resource owner releases, + * or a warning will be logged. Otherwise the segment remains attached until + * explicitely detached or the session ends. See the note atop dsm_create(). */ dsm_segment * dsm_attach(dsm_handle h) @@ -1055,7 +1067,8 @@ dsm_create_descriptor(void) { dsm_segment *seg; - ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + if (CurrentResourceOwner) + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment)); dlist_push_head(&dsm_segment_list, &seg->node); @@ -1067,7 +1080,8 @@ dsm_create_descriptor(void) seg->mapped_size = 0; seg->resowner = CurrentResourceOwner; - ResourceOwnerRememberDSM(CurrentResourceOwner, seg); + if (CurrentResourceOwner) + ResourceOwnerRememberDSM(CurrentResourceOwner, seg); slist_init(&seg->on_detach); From 4d70df88f7bb211be90e7fac86c179967677b5c8 Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Wed, 12 Nov 2025 11:56:49 +0300 Subject: [PATCH 08/10] Fix typo --- src/backend/utils/mmgr/dsa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 670b12f792d..1badafb9624 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -1701,7 +1701,7 @@ get_segment_by_index(dsa_area *area, dsa_segment_index index) */ handle = area->control->segment_handles[index]; - /* It's an erro to try to access an unused slot. */ + /* It's an error to try to access an unused slot. */ if (handle == DSM_HANDLE_INVALID) elog(ERROR, "dsa_area could not attach to a segment that has been freed"); From 18e5420cac579c8c5e5746838aa5dd845282bbf8 Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Wed, 12 Nov 2025 15:01:53 +0300 Subject: [PATCH 09/10] Add tests from the modules directory Rewrite the test_bloomfilter function call to make the test_bloomfilter test passed --- .github/workflows/build-gpdb.yml | 3 +++ .../modules/test_bloomfilter/expected/test_bloomfilter.out | 5 +---- src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql | 5 +---- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-gpdb.yml b/.github/workflows/build-gpdb.yml index 6872dd66f32..3455e25fadb 100644 --- a/.github/workflows/build-gpdb.yml +++ b/.github/workflows/build-gpdb.yml @@ -168,6 +168,9 @@ jobs: }, {"test":"ic-mirrorless", "make_configs":["src/test/isolation2:installcheck-mirrorless"] + }, + {"test":"ic-modules", + "make_configs":["src/test/modules:installcheck"] } ] }' diff --git a/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out b/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out index 21c068867d6..8868f8d1a3c 100644 --- a/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out +++ b/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out @@ -1,9 +1,6 @@ CREATE EXTENSION test_bloomfilter; -- See README for explanation of arguments: -SELECT test_bloomfilter(power => 23, - nelements => 838861, - seed => -1, - tests => 1); +SELECT test_bloomfilter(23, 838861); test_bloomfilter ------------------ diff --git a/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql b/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql index 9ec159ce404..1b236a190e9 100644 --- a/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql +++ b/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql @@ -1,10 +1,7 @@ CREATE EXTENSION test_bloomfilter; -- See README for explanation of arguments: -SELECT test_bloomfilter(power => 23, - nelements => 838861, - seed => -1, - tests => 1); +SELECT test_bloomfilter(23, 838861); -- Equivalent "10 bits per element" tests for all possible bitset sizes: -- From b069fc48a998022aed56f91922e5202d9e143056 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 18 Feb 2019 09:53:26 +1300 Subject: [PATCH 10/10] Fix race in dsm_unpin_segment() when handles are reused. Teach dsm_unpin_segment() to skip segments that are in the process of being destroyed by another backend, when searching for a handle. Such a segment cannot possibly be the one we are looking for, even if its handle matches. Another slot might hold a recently created segment that has the same handle value by coincidence, and we need to keep searching for that one. The bug caused rare "cannot unpin a segment that is not pinned" errors on 10 and 11. Similar to commit 6c0fb941 for dsm_attach(). Back-patch to 10, where dsm_unpin_segment() landed. Author: Thomas Munro Reported-by: Justin Pryzby Tested-by: Justin Pryzby (along with other recent DSA/DSM fixes) Discussion: https://postgr.es/m/20190216023854.GF30291@telsasoft.com cherry-picked from 0b55aaa --- src/backend/storage/ipc/dsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c index d5b200d1a28..f9e1249b980 100644 --- a/src/backend/storage/ipc/dsm.c +++ b/src/backend/storage/ipc/dsm.c @@ -874,8 +874,8 @@ dsm_unpin_segment(dsm_handle handle) LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); for (i = 0; i < dsm_control->nitems; ++i) { - /* Skip unused slots. */ - if (dsm_control->item[i].refcnt == 0) + /* Skip unused slots and segments that are concurrently going away. */ + if (dsm_control->item[i].refcnt <= 1) continue; /* If we've found our handle, we can stop searching. */