FlatMap: add method for batched GPU construction #1610
@@ -0,0 +1,210 @@
// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and
// other Axom Project Developers. See the top-level COPYRIGHT file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)

#ifndef Axom_Core_FlatMap_Util_HPP
#define Axom_Core_FlatMap_Util_HPP

#include "axom/config.hpp"
#include "axom/core/FlatMap.hpp"
#include "axom/core/execution/reductions.hpp"

namespace axom
{
namespace detail
{

struct SpinLock
{
  int value {0};

  AXOM_HOST_DEVICE bool tryLock()
  {
    int still_locked = 0;

Member
Any chance the axom atomics can be used/updated to handle/help with this logic?

Contributor (Author)
I think adding this to the axom atomics would depend on support within RAJA for atomics with memory ordering. Otherwise, the logic to implement that might get a little nasty.

Member
IIRC, RAJA's default atomics don't support memory ordering. RAJA can be configured to use desul atomics, which do support memory ordering. Unfortunately, we only support using those through the original RAJA atomic interface, so we only provide a default that we define: https://github.com/LLNL/RAJA/blob/develop/include/RAJA/policy/desul/atomic.hpp#L22. We should revisit whether we want to switch to desul atomics by default in RAJA. I think the last time we discussed this, there were still some cases where RAJA atomics were faster than desul. If we did switch to desul by default (which is what Kokkos uses), then we could support the full desul interface. @publixsubfan let me know if you think we should go this route.

Contributor (Author)
Maybe we could play around with a partial desul default? Something like "default to desul for ordered atomics, but use the original backend for unordered."

Contributor (Author)
I did have a PR for the ordered atomics here: llnl/RAJA#1616, if we wanted to try and clean that up.

Member
Thanks -- since this is somewhat of a one-off and it's not super easy to consolidate it into
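For reference only, here is a minimal sketch of how the lock could be written against a desul-style ordered-atomic interface, as discussed above. The `desul::atomic_exchange` call and the `MemoryOrderAcquire`/`MemoryOrderRelease`/`MemoryScopeDevice` tag types are assumptions about desul's API, and the `OrderedSpinLock` name is hypothetical; none of this is part of the PR.

```cpp
// Hypothetical sketch assuming a desul-style ordered-atomic interface were
// reachable from axom; not part of this PR.
#include <desul/atomics.hpp>
#include "axom/core/Macros.hpp"  // AXOM_HOST_DEVICE

struct OrderedSpinLock
{
  int value {0};

  AXOM_HOST_DEVICE bool tryLock()
  {
    // Acquire-ordered exchange: returns the previous value, so 0 means we
    // acquired the lock.
    int was_locked = desul::atomic_exchange(&value, 1,
                                            desul::MemoryOrderAcquire(),
                                            desul::MemoryScopeDevice());
    return was_locked == 0;
  }

  AXOM_HOST_DEVICE void unlock()
  {
    // Release-ordered exchange publishes the group metadata updates made
    // while the lock was held.
    desul::atomic_exchange(&value, 0,
                           desul::MemoryOrderRelease(),
                           desul::MemoryScopeDevice());
  }
};
```

If ordered exchanges like these were available, the separate `__threadfence()` / `std::atomic_thread_fence()` calls in `tryLock()`/`unlock()` could be folded into the exchange itself.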
#if defined(__HIP_DEVICE_COMPILE__)
    still_locked = __hip_atomic_exchange(&value, 1, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT);
#elif defined(AXOM_USE_RAJA) && defined(__CUDA_ARCH__)
    still_locked = RAJA::atomicExchange<RAJA::cuda_atomic>(&value, 1);
    // We really want an acquire-fenced atomic here
    __threadfence();
#elif defined(AXOM_USE_RAJA) && defined(AXOM_USE_OPENMP)
    still_locked = RAJA::atomicExchange<RAJA::omp_atomic>(&value, 1);
    std::atomic_thread_fence(std::memory_order_acquire);
#endif
    return !still_locked;
  }

  AXOM_HOST_DEVICE void unlock()
  {
#if defined(__HIP_DEVICE_COMPILE__)
    __hip_atomic_exchange(&value, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
#elif defined(AXOM_USE_RAJA) && defined(__CUDA_ARCH__)
    // We really want a release-fenced atomic here
    __threadfence();
    RAJA::atomicExchange<RAJA::cuda_atomic>(&value, 0);
#elif defined(AXOM_USE_RAJA) && defined(AXOM_USE_OPENMP)
    std::atomic_thread_fence(std::memory_order_release);
    RAJA::atomicExchange<RAJA::omp_atomic>(&value, 0);
#else
    value = 0;
#endif
  }
};

}  // namespace detail

template <typename KeyType, typename ValueType, typename Hash>
template <typename ExecSpace>
auto FlatMap<KeyType, ValueType, Hash>::create(ArrayView<KeyType> keys,
                                               ArrayView<ValueType> values,
                                               Allocator allocator) -> FlatMap
{
  assert(keys.size() == values.size());

  const IndexType num_elems = keys.size();

  FlatMap new_map(allocator);
  new_map.reserve(num_elems);
publixsubfan marked this conversation as resolved.
  using HashResult = typename Hash::result_type;
  using GroupBucket = detail::flat_map::GroupBucket;

  // Grab some needed internal fields from the flat map.
  // We're going to be constructing metadata and the K-V pairs directly
  // in-place.
  const int ngroups_pow_2 = new_map.m_numGroups2;
  const auto meta_group = new_map.m_metadata.view();
  const auto buckets = new_map.m_buckets.view();

  // Construct an array of locks per-group. This guards metadata updates for
  // each insertion.
  const IndexType num_groups = 1 << ngroups_pow_2;
  Array<detail::SpinLock> lock_vec(num_groups, num_groups, allocator.getID());
  const auto group_locks = lock_vec.view();

  // Map bucket slots to k-v pair indices. This is used to deduplicate pairs
  // with the same key value.
  Array<IndexType> key_index_dedup_vec(0, 0, allocator.getID());
  key_index_dedup_vec.resize(num_groups * GroupBucket::Size, -1);
  const auto key_index_dedup = key_index_dedup_vec.view();

  // Map k-v pair indices to bucket slots. This is essentially the inverse of
  // the above mapping.
  Array<IndexType> key_index_to_bucket_vec(num_elems, num_elems, allocator.getID());
  const auto key_index_to_bucket = key_index_to_bucket_vec.view();
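
  // First pass: probe the group metadata for every input k-v pair in parallel.
  // Each thread locks one group at a time, then either records the bucket of a
  // previously inserted duplicate key (the highest-indexed pair wins) or claims
  // the first empty slot; if the group is full, it sets the group's overflow
  // bit and continues along the probe sequence.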
  for_all<ExecSpace>(
    num_elems,
    AXOM_LAMBDA(IndexType idx) {
      // Hash keys.
      auto hash = Hash {}(keys[idx]);

      // We use the k MSBs of the hash as the initial group probe point,
      // where ngroups = 2^k.
      int bitshift_right = ((CHAR_BIT * sizeof(HashResult)) - ngroups_pow_2);
      HashResult curr_group = hash >> bitshift_right;
      curr_group &= ((1 << ngroups_pow_2) - 1);

      std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);

      IndexType duplicate_bucket_index = -1;
      IndexType empty_bucket_index = -1;
      int iteration = 0;
      while(iteration < meta_group.size())
      {
        // Try to lock the group. We do this in a non-blocking manner to avoid
        // intra-warp progress hazards.
        bool group_locked = group_locks[curr_group].tryLock();

        if(group_locked)
        {
          // On every bucket visit, check previously filled buckets for
          // duplicate keys.
          int empty_slot_index =
            meta_group[curr_group].visitHashOrEmptyBucket(hash_8, [&](int matching_slot) {
              IndexType bucket_index = curr_group * GroupBucket::Size + matching_slot;

              if(keys[key_index_dedup[bucket_index]] == keys[idx])
              {
                // Highest-indexed k-v pair wins.
                axom::atomicMax<ExecSpace>(&key_index_dedup[bucket_index], idx);
                key_index_to_bucket[idx] = bucket_index;
                duplicate_bucket_index = bucket_index;
              }
            });

          if(duplicate_bucket_index == -1)
          {
            if(empty_slot_index == GroupBucket::InvalidSlot)
            {
              // Group is full. Set the overflow bit for the group.
              meta_group[curr_group].template setOverflow<true>(hash_8);
            }
            else
            {
              // Got to the end of the probe sequence without a duplicate.
              // Update the empty bucket index.
              empty_bucket_index = curr_group * GroupBucket::Size + empty_slot_index;
              meta_group[curr_group].template setBucket<true>(empty_slot_index, hash_8);
              key_index_dedup[empty_bucket_index] = idx;
              key_index_to_bucket[idx] = empty_bucket_index;
            }
          }
          // Unlock the group once we're done.
          group_locks[curr_group].unlock();

          if(duplicate_bucket_index != -1 || empty_bucket_index != -1)
          {
            // We've found an empty slot or a duplicate key to place the
            // value at. Empty slots should only occur at the end of the
            // probe sequence, since we're only inserting.
            break;
          }
          else
          {
            // Move to the next group.
            curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) % meta_group.size();
            iteration++;
          }
        }
      }
    });
  // Add a counter for deduplicated (unique) inserts.
  axom::ReduceSum<ExecSpace, IndexType> total_inserts(0);

  // Using the key-deduplication map, assign unique k-v pairs to buckets.
  for_all<ExecSpace>(
    num_elems,
    AXOM_LAMBDA(IndexType kv_idx) {
      IndexType bucket_idx = key_index_to_bucket[kv_idx];
      IndexType winning_idx = key_index_dedup[bucket_idx];
      // Place the k-v pair at bucket_idx.
      if(kv_idx == winning_idx)
      {
#if defined(__CUDA_ARCH__)
        // HACK: the std::pair constructor is not host-device annotated, and
        // CUDA requires passing --expt-relaxed-constexpr for it to work.
        // Instead of requiring this flag, construct each member of the pair
        // individually.
        KeyType& key_dst = const_cast<KeyType&>(buckets[bucket_idx].get().first);
        ValueType& value_dst = buckets[bucket_idx].get().second;
        new(&key_dst) KeyType {keys[kv_idx]};
        new(&value_dst) ValueType {values[kv_idx]};
#else
        new(&buckets[bucket_idx]) KeyValuePair(keys[kv_idx], values[kv_idx]);
#endif
        total_inserts += 1;
      }
    });

  new_map.m_size = total_inserts.get();
  new_map.m_loadCount = total_inserts.get();

  return new_map;
}

}  // namespace axom

#endif
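
To illustrate how the new batched construction path is meant to be called, here is a hedged usage sketch. The `create()` signature is taken from the diff above; everything else is an illustrative assumption, in particular the execution-space tag, the unified-memory allocator setup via `axom::getUmpireResourceAllocatorID`, constructing an `Allocator` from an allocator ID, and calling `create()` as a static member.

```cpp
// Usage sketch only. Assumptions: a CUDA+Umpire build, unified memory for the
// key/value arrays, that the Allocator parameter can be built from an
// allocator ID, and that create() is callable as a static member of FlatMap.
#include "axom/core.hpp"

void buildMapOnDevice()
{
  using ExecSpace = axom::CUDA_EXEC<256>;
  const int allocId = axom::getUmpireResourceAllocatorID(umpire::resource::Unified);

  // Prepare keys and values in device-accessible memory.
  constexpr axom::IndexType N = 1000;
  axom::Array<int> keys(N, N, allocId);
  axom::Array<double> values(N, N, allocId);

  const auto keys_v = keys.view();
  const auto values_v = values.view();
  axom::for_all<ExecSpace>(N, AXOM_LAMBDA(axom::IndexType i) {
    keys_v[i] = static_cast<int>(i);
    values_v[i] = 2.0 * i;
  });

  // Assumption: an Allocator can be constructed from the allocator ID; adjust
  // to however the Allocator type used by FlatMap is actually obtained.
  axom::Allocator allocator(allocId);

  // One call builds the whole map on the GPU from the two arrays; pairs with
  // duplicate keys are deduplicated, with the highest-indexed pair winning.
  auto map = axom::FlatMap<int, double>::create<ExecSpace>(keys.view(), values.view(), allocator);
}
```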