
Commit f547dfd

[SEC] Make SEC owned by hpa_shard, simplify the code, add stats, lock per bin

1 parent f5f0f06

36 files changed: +1257 / -1269 lines

Makefile.in (1 addition & 1 deletion)

@@ -133,7 +133,6 @@ C_SRCS := $(srcroot)src/jemalloc.c \
 $(srcroot)src/nstime.c \
 $(srcroot)src/pa.c \
 $(srcroot)src/pa_extra.c \
-$(srcroot)src/pai.c \
 $(srcroot)src/pac.c \
 $(srcroot)src/pages.c \
 $(srcroot)src/peak_event.c \
@@ -228,6 +227,7 @@ TESTS_UNIT := \
 $(srcroot)test/unit/hash.c \
 $(srcroot)test/unit/hook.c \
 $(srcroot)test/unit/hpa.c \
+$(srcroot)test/unit/hpa_sec_integration.c \
 $(srcroot)test/unit/hpa_thp_always.c \
 $(srcroot)test/unit/hpa_vectorized_madvise.c \
 $(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \

include/jemalloc/internal/arena_externs.h (1 addition & 1 deletion)

@@ -46,7 +46,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
 size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
 bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats,
-hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
+hpa_shard_stats_t *hpastats);
 void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
 edata_t *arena_extent_alloc_large(
 tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);

include/jemalloc/internal/ctl.h (0 additions & 1 deletion)

@@ -51,7 +51,6 @@ typedef struct ctl_arena_stats_s {
 arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
 pac_estats_t estats[SC_NPSIZES];
 hpa_shard_stats_t hpastats;
-sec_stats_t secstats;
 } ctl_arena_stats_t;
 
 typedef struct ctl_stats_s {

include/jemalloc/internal/edata.h (1 addition & 1 deletion)

@@ -222,7 +222,7 @@ struct edata_s {
 * ssssssss [...] ssssssss ssssnnnn nnnnnnnn
 */
 size_t e_size_esn;
-#define EDATA_SIZE_MASK ((size_t) ~(PAGE - 1))
+#define EDATA_SIZE_MASK ((size_t)~(PAGE - 1))
 #define EDATA_ESN_MASK ((size_t)PAGE - 1)
 /* Base extent size, which may not be a multiple of PAGE. */
 size_t e_bsize;

include/jemalloc/internal/hpa.h (13 additions & 4 deletions)

@@ -11,6 +11,7 @@
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/pai.h"
 #include "jemalloc/internal/psset.h"
+#include "jemalloc/internal/sec.h"
 
 typedef struct hpa_central_s hpa_central_t;
 struct hpa_central_s {
@@ -80,6 +81,7 @@ typedef struct hpa_shard_stats_s hpa_shard_stats_t;
 struct hpa_shard_stats_s {
 psset_stats_t psset_stats;
 hpa_shard_nonderived_stats_t nonderived_stats;
+sec_stats_t secstats;
 };
 
 typedef struct hpa_shard_s hpa_shard_t;
@@ -92,6 +94,10 @@ struct hpa_shard_s {
 
 /* The central allocator we get our hugepages from. */
 hpa_central_t *central;
+
+/* Small extent cache */
+sec_t sec;
+
 /* Protects most of this shard's state. */
 malloc_mutex_t mtx;
 /*
@@ -167,9 +173,9 @@ bool hpa_hugepage_size_exceeds_limit(void);
 bool hpa_supported(void);
 bool hpa_central_init(
 hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks);
-bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
-base_t *base, edata_cache_t *edata_cache, unsigned ind,
-const hpa_shard_opts_t *opts);
+bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
+emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
+const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts);
 
 void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
 void hpa_shard_stats_merge(
@@ -182,15 +188,18 @@ void hpa_shard_stats_merge(
 */
 void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
+/* Flush caches that shard may be using */
+void hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard);
 
 void hpa_shard_set_deferral_allowed(
 tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed);
 void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
 
 /*
 * We share the fork ordering with the PA and arena prefork handling; that's why
-* these are 3 and 4 rather than 0 and 1.
+* these are 2, 3 and 4 rather than 0 and 1.
 */
+void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
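
As a reading aid, a minimal sketch of the updated initialization call, assuming a hypothetical wrapper and placeholder variable names (only the hpa_shard_init() signature above is taken from this commit): the shard now sets up its embedded SEC inside hpa_shard_init(), so the caller passes sec_opts along with the HPA options instead of initializing a separate sec_t.

#include "jemalloc/internal/hpa.h"

/*
 * Sketch only: hypothetical call site showing the new signature; the real
 * caller lives in pa.c and may differ.
 */
static bool
hpa_shard_init_sketch(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
    emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts) {
    /*
     * The tsdn and sec_opts arguments are new; the embedded SEC is
     * initialized as part of shard init.  Returns true on error, following
     * the usual jemalloc convention.
     */
    return hpa_shard_init(tsdn, shard, central, emap, base, edata_cache,
        ind, opts, sec_opts);
}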

include/jemalloc/internal/pa.h (4 additions & 8 deletions)

@@ -96,12 +96,6 @@ struct pa_shard_s {
 /* Allocates from a PAC. */
 pac_t pac;
 
-/*
-* We place a small extent cache in front of the HPA, since we intend
-* these configurations to use many fewer arenas, and therefore have a
-* higher risk of hot locks.
-*/
-sec_t hpa_sec;
 hpa_shard_t hpa_shard;
 
 /* The source of edata_t objects. */
@@ -166,6 +160,9 @@ void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
 */
 void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard);
 
+/* Flush any caches used by shard */
+void pa_shard_flush(tsdn_t *tsdn, pa_shard_t *shard);
+
 /* Gets an edata for the given allocation. */
 edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size,
 size_t alignment, bool slab, szind_t szind, bool zero, bool guarded,
@@ -233,8 +230,7 @@ void pa_shard_basic_stats_merge(
 
 void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
 pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
-hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
-size_t *resident);
+hpa_shard_stats_t *hpa_stats_out, size_t *resident);
 
 /*
 * Reads the PA-owned mutex stats into the output stats array, at the
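
The header only declares pa_shard_flush(). As a hedged sketch of the layering (assumed, not this commit's actual pa.c, and relying on the pre-existing pa_shard_uses_hpa() helper), flushing would simply delegate to the HPA shard, which now owns the SEC:

#include "jemalloc/internal/pa.h"

/* Sketch only: the PA shard no longer has its own sec_t to flush. */
static void
pa_shard_flush_sketch(tsdn_t *tsdn, pa_shard_t *shard) {
    if (pa_shard_uses_hpa(shard)) {
        /* The embedded SEC lives inside the HPA shard now. */
        hpa_shard_flush(tsdn, &shard->hpa_shard);
    }
}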

include/jemalloc/internal/pai.h (0 additions & 37 deletions)

@@ -13,25 +13,13 @@ struct pai_s {
 edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size,
 size_t alignment, bool zero, bool guarded, bool frequent_reuse,
 bool *deferred_work_generated);
-/*
-* Returns the number of extents added to the list (which may be fewer
-* than requested, in case of OOM). The list should already be
-* initialized. The only alignment guarantee is page-alignment, and
-* the results are not necessarily zeroed.
-*/
-size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size,
-size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
-bool *deferred_work_generated);
 bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 size_t old_size, size_t new_size, bool zero,
 bool *deferred_work_generated);
 bool (*shrink)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 size_t old_size, size_t new_size, bool *deferred_work_generated);
 void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 bool *deferred_work_generated);
-/* This function empties out list as a side-effect of being called. */
-void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
-edata_list_active_t *list, bool *deferred_work_generated);
 uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
 };
 
@@ -47,14 +35,6 @@ pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
 frequent_reuse, deferred_work_generated);
 }
 
-static inline size_t
-pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
-edata_list_active_t *results, bool frequent_reuse,
-bool *deferred_work_generated) {
-return self->alloc_batch(tsdn, self, size, nallocs, results,
-frequent_reuse, deferred_work_generated);
-}
-
 static inline bool
 pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
 size_t new_size, bool zero, bool *deferred_work_generated) {
@@ -75,26 +55,9 @@ pai_dalloc(
 self->dalloc(tsdn, self, edata, deferred_work_generated);
 }
 
-static inline void
-pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
-bool *deferred_work_generated) {
-self->dalloc_batch(tsdn, self, list, deferred_work_generated);
-}
-
 static inline uint64_t
 pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
 return self->time_until_deferred_work(tsdn, self);
 }
 
-/*
-* An implementation of batch allocation that simply calls alloc once for
-* each item in the list.
-*/
-size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size,
-size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
-bool *deferred_work_generated);
-/* Ditto, for dalloc. */
-void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self,
-edata_list_active_t *list, bool *deferred_work_generated);
-
 #endif /* JEMALLOC_INTERNAL_PAI_H */
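
With alloc_batch/dalloc_batch gone from the pai_t vtable, walking a list of extents is now the caller's job. A hedged sketch of that pattern, mirroring what the removed pai_dalloc_batch_default() used to do; the function name is a placeholder, and edata_list_active_first/remove are the existing jemalloc list helpers:

#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/pai.h"

/* Sketch only: free every extent on a list through the remaining pai_dalloc(). */
static void
dalloc_list_sketch(tsdn_t *tsdn, pai_t *pai, edata_list_active_t *list,
    bool *deferred_work_generated) {
    edata_t *edata;
    while ((edata = edata_list_active_first(list)) != NULL) {
        edata_list_active_remove(list, edata);
        pai_dalloc(tsdn, pai, edata, deferred_work_generated);
    }
}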

include/jemalloc/internal/sec.h (62 additions & 57 deletions)

@@ -17,91 +17,96 @@
 * knowledge of the underlying PAI implementation).
 */
 
-/*
-* For now, this is just one field; eventually, we'll probably want to get more
-* fine-grained data out (like per-size class statistics).
-*/
+typedef struct sec_bin_stats_s sec_bin_stats_t;
+struct sec_bin_stats_s {
+/* Number of alloc requests that did not find extent in this bin */
+size_t nmisses;
+/* Number of successful alloc requests. */
+size_t nhits;
+/* Number of dallocs causing the flush */
+size_t ndalloc_flush;
+/* Number of dallocs not causing the flush */
+size_t ndalloc_noflush;
+};
 typedef struct sec_stats_s sec_stats_t;
 struct sec_stats_s {
 /* Sum of bytes_cur across all shards. */
 size_t bytes;
+
+/* Totals of bin_stats. */
+sec_bin_stats_t total;
 };
 
+static inline void
+sec_bin_stats_init(sec_bin_stats_t *stats) {
+stats->ndalloc_flush = 0;
+stats->nmisses = 0;
+stats->nhits = 0;
+stats->ndalloc_noflush = 0;
+}
+
+static inline void
+sec_bin_stats_accum(sec_bin_stats_t *dst, sec_bin_stats_t *src) {
+dst->nmisses += src->nmisses;
+dst->nhits += src->nhits;
+dst->ndalloc_flush += src->ndalloc_flush;
+dst->ndalloc_noflush += src->ndalloc_noflush;
+}
+
 static inline void
 sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
 dst->bytes += src->bytes;
+sec_bin_stats_accum(&dst->total, &src->total);
 }
 
 /* A collections of free extents, all of the same size. */
 typedef struct sec_bin_s sec_bin_t;
 struct sec_bin_s {
 /*
-* When we fail to fulfill an allocation, we do a batch-alloc on the
-* underlying allocator to fill extra items, as well. We drop the SEC
-* lock while doing so, to allow operations on other bins to succeed.
-* That introduces the possibility of other threads also trying to
-* allocate out of this bin, failing, and also going to the backing
-* allocator. To avoid a thundering herd problem in which lots of
-* threads do batch allocs and overfill this bin as a result, we only
-* allow one batch allocation at a time for a bin. This bool tracks
-* whether or not some thread is already batch allocating.
-*
-* Eventually, the right answer may be a smarter sharding policy for the
-* bins (e.g. a mutex per bin, which would also be more scalable
-* generally; the batch-allocating thread could hold it while
-* batch-allocating).
+* Protects the data members of the bin.
 */
-bool being_batch_filled;
+malloc_mutex_t mtx;
 
 /*
-* Number of bytes in this particular bin (as opposed to the
-* sec_shard_t's bytes_cur. This isn't user visible or reported in
-* stats; rather, it allows us to quickly determine the change in the
-* centralized counter when flushing.
+* Number of bytes in this particular bin.
 */
 size_t bytes_cur;
 edata_list_active_t freelist;
+sec_bin_stats_t stats;
 };
 
-typedef struct sec_shard_s sec_shard_t;
-struct sec_shard_s {
-/*
-* We don't keep per-bin mutexes, even though that would allow more
-* sharding; this allows global cache-eviction, which in turn allows for
-* better balancing across free lists.
-*/
-malloc_mutex_t mtx;
-/*
-* A SEC may need to be shut down (i.e. flushed of its contents and
-* prevented from further caching). To avoid tricky synchronization
-* issues, we just track enabled-status in each shard, guarded by a
-* mutex. In practice, this is only ever checked during brief races,
-* since the arena-level atomic boolean tracking HPA enabled-ness means
-* that we won't go down these pathways very often after custom extent
-* hooks are installed.
-*/
-bool enabled;
+typedef struct sec_s sec_t;
+struct sec_s {
+sec_opts_t opts;
 sec_bin_t *bins;
-/* Number of bytes in all bins in the shard. */
-size_t bytes_cur;
-/* The next pszind to flush in the flush-some pathways. */
-pszind_t to_flush_next;
+pszind_t npsizes;
 };
 
-typedef struct sec_s sec_t;
-struct sec_s {
-pai_t pai;
-pai_t *fallback;
+static inline bool
+sec_is_used(sec_t *sec) {
+return sec->opts.nshards != 0;
+}
 
-sec_opts_t opts;
-sec_shard_t *shards;
-pszind_t npsizes;
-};
+static inline bool
+sec_size_supported(sec_t *sec, size_t size) {
+return sec_is_used(sec) && size <= sec->opts.max_alloc;
+}
+
+/* If sec does not have extent available, it will return NULL. */
+edata_t *sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size);
+void sec_fill(tsdn_t *tsdn, sec_t *sec, size_t size,
+edata_list_active_t *result, size_t nallocs);
+
+/*
+* Upon return dalloc_list may be empty if edata is consumed by sec or
+* non-empty if there are extents that need to be flushed from cache.
+*/
+void sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list);
+
+bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts);
 
-bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
-const sec_opts_t *opts);
-void sec_flush(tsdn_t *tsdn, sec_t *sec);
-void sec_disable(tsdn_t *tsdn, sec_t *sec);
+/* Fills to_flush with extents that need to be deallocated */
+void sec_flush(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *to_flush);
 
 /*
 * Morally, these two stats methods probably ought to be a single one (and the
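
To make the reworked, fallback-free interface concrete, a hedged usage sketch follows. The owner_* function names and fallback_* helpers are placeholders (in this commit the real caller is the hpa_shard, whose sources are not shown here); the sketch relies only on the behavior documented above: sec_alloc() returns NULL on a miss, and sec_dalloc() leaves on the caller's list any extents that still need a real deallocation.

#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/sec.h"

/* Placeholders for the owner's backing-allocator calls; not part of sec.h. */
static edata_t *fallback_alloc(tsdn_t *tsdn, size_t size);
static void fallback_dalloc(tsdn_t *tsdn, edata_t *edata);

/* Sketch only: allocation path through the cache. */
static edata_t *
owner_alloc_sketch(tsdn_t *tsdn, sec_t *sec, size_t size) {
    if (!sec_size_supported(sec, size)) {
        /* Too large (or SEC disabled); go straight to the backing allocator. */
        return fallback_alloc(tsdn, size);
    }
    /* Fast path: reuse a cached extent of this size, if one is available. */
    edata_t *edata = sec_alloc(tsdn, sec, size);
    if (edata == NULL) {
        edata = fallback_alloc(tsdn, size);
    }
    return edata;
}

/* Sketch only: deallocation path; the SEC may hand back extents to flush. */
static void
owner_dalloc_sketch(tsdn_t *tsdn, sec_t *sec, edata_t *edata) {
    edata_list_active_t list;
    edata_list_active_init(&list);
    edata_list_active_append(&list, edata);
    /*
     * The SEC keeps whatever fits under its limits; anything left on the
     * list afterwards (the passed extent and/or flushed ones) must be
     * deallocated for real by the caller.
     */
    sec_dalloc(tsdn, sec, &list);
    edata_t *e;
    while ((e = edata_list_active_first(&list)) != NULL) {
        edata_list_active_remove(&list, e);
        fallback_dalloc(tsdn, e);
    }
}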
