Merged

Changes from all commits (22 commits):
5f1ca78
FlatMap: initial commit of batched construction function
publixsubfan Mar 25, 2025
299e2d7
FlatMap: add initial test for batched capability
publixsubfan Mar 25, 2025
d3dbc51
FlatMap: add test for batches with duplicates
publixsubfan Mar 25, 2025
525d957
FlatMap: add test for pathological keys (all the same)
publixsubfan Mar 26, 2025
e167d53
FlatMap: handle duplicates in batched insert
publixsubfan Mar 27, 2025
c1fecc4
FlatMap: fix preallocated bucket logic
publixsubfan Mar 27, 2025
ee48d0e
FlatMap: loop over k-v pairs instead of bucket slots when placing ele…
publixsubfan Mar 28, 2025
f57da66
FlatMap: remove a print
publixsubfan Mar 28, 2025
86fe169
FlatMap: add a performance benchmark test driver
publixsubfan Mar 28, 2025
dc8b7c2
FlatMap: fixes for batched-insert with no RAJA
publixsubfan Mar 28, 2025
4d5f8a6
FlatMap: document create() method
publixsubfan Mar 28, 2025
ac562f7
Allocator fixup
publixsubfan Jul 9, 2025
fda8b64
FlatMap: fixed batched construction tests on GPU
publixsubfan Jul 9, 2025
eaaf890
Add workaround for CUDA construction of std::pair
publixsubfan Jul 9, 2025
ad9823f
Remove RAJA-specific logic in favor of axom-based wrappers
publixsubfan Jul 9, 2025
5233223
Update RELEASE-NOTES
publixsubfan Jul 9, 2025
7c289a6
Update copyright notice
publixsubfan Jul 9, 2025
c14cae8
FlatMap: improve preallocation logic
publixsubfan Jul 11, 2025
dd19921
FlatMapUtil: make some requested changes
publixsubfan Jul 11, 2025
96cfb5b
FlatMap: add some documentation
publixsubfan Jul 12, 2025
3c2a675
FlatMap: test that second kv pair overwrites
publixsubfan Jul 21, 2025
f83228f
FlatMap: document constant-hash test
publixsubfan Jul 21, 2025
2 changes: 2 additions & 0 deletions RELEASE-NOTES.md
@@ -40,6 +40,8 @@ The Axom project release numbers follow [Semantic Versioning](http://semver.org/)
- 2D and 3D implementations for `axom::for_all` were added.
- Adds `axom::FlatMapView`, a helper class associated with `axom::FlatMap` to support queries from
within a GPU kernel.
- Adds an `axom::FlatMap::create()` method to support constructing a hash map over a batch of keys
and values on the GPU or with OpenMP.
- Adds support for custom allocators to `axom::FlatMap`.
- Primal: Adds ability to perform sample-based shaping on tetrahedral shapes.
- Improves efficiency of volume fraction computation from quadrature samples during sample-based shaping.
1 change: 1 addition & 0 deletions src/axom/core/CMakeLists.txt
@@ -68,6 +68,7 @@ set(core_headers
MapCollection.hpp
FlatMap.hpp
FlatMapView.hpp
FlatMapUtil.hpp
DeviceHash.hpp
NumericArray.hpp
NumericLimits.hpp
32 changes: 29 additions & 3 deletions src/axom/core/FlatMap.hpp
@@ -602,7 +602,7 @@ class FlatMap : detail::flat_map::SequentialLookupPolicy<typename Hash::result_t
*
* \param count the number of elements to fit without a rehash
*/
void reserve(IndexType count) { rehash(std::ceil(count / MAX_LOAD_FACTOR)); }
void reserve(IndexType count) { rehash(count); }

/*!
* \brief Returns a read-only view of the FlatMap.
@@ -613,6 +613,30 @@
ConstView view() const;
/// @}

/*!
* \brief Constructs and returns a FlatMap given a set of key-value pairs.
*
* Duplicate keys are handled by selecting the last value in the values
* array corresponding to the equivalent key.
*
* \param keys [in] array of keys for the pairs to insert
* \param values [in] array of values for the pairs to insert
* \param allocator [in] allocator to use for the constructed FlatMap
*
* \tparam ExecSpace the execution space in which to perform the batched
* construction
*
* \return the constructed FlatMap
*
* \pre keys.size() == values.size()
* \pre {keys, values}.getAllocatorID() is accessible from ExecSpace
* \pre allocator is accessible from ExecSpace
*/
template <typename ExecSpace>
static FlatMap create(axom::ArrayView<KeyType> keys,
axom::ArrayView<ValueType> values,
Allocator allocator = Allocator {});
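
A minimal host-side sketch of this entry point (the initializer-list `axom::Array` construction and the `size()` call are assumptions about the surrounding axom API, not part of this diff):

```cpp
// Sketch: build a FlatMap<int, int> from parallel key/value arrays on
// the host. A device build would instead use a device execution space
// (e.g. a CUDA/HIP policy) plus device-accessible arrays and allocator,
// per the preconditions above.
axom::Array<int> keys {1, 2, 3, 2};  // note the duplicate key 2
axom::Array<int> values {10, 20, 30, 40};

auto map =
  axom::FlatMap<int, int>::create<axom::SEQ_EXEC>(keys.view(), values.view());

// Duplicate keys resolve to the last corresponding value, so the entry
// for key 2 holds 40; three unique keys are inserted.
assert(map.size() == 3);
```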

private:
friend class FlatMapView<KeyType, ValueType, false, Hash>;
friend class FlatMapView<KeyType, ValueType, true, Hash>;
@@ -715,13 +739,13 @@ FlatMap<KeyType, ValueType, Hash>::FlatMap(IndexType bucket_count, Allocator all
, m_loadCount(0)
{
IndexType minBuckets = MIN_NUM_BUCKETS;
bucket_count = axom::utilities::max(minBuckets, bucket_count);
bucket_count = axom::utilities::max<IndexType>(minBuckets, bucket_count / MAX_LOAD_FACTOR);
// Get the smallest power-of-two number of groups satisfying:
// N * GroupSize - 1 >= minBuckets
// TODO: we should add a countl_zero overload for 64-bit integers
Member:
👍
{
std::int32_t numGroups = std::ceil((bucket_count + 1) / (double)BucketsPerGroup);
m_numGroups2 = 31 - (axom::utilities::countl_zero(numGroups));
m_numGroups2 = 32 - (axom::utilities::countl_zero(numGroups - 1));
}

IndexType numGroupsRounded = 1 << m_numGroups2;
@@ -860,4 +884,6 @@ auto FlatMap<KeyType, ValueType, Hash>::erase(const_iterator pos) -> iterator

} // namespace axom

#include "FlatMapUtil.hpp"

#endif // Axom_Core_FlatMap_HPP
210 changes: 210 additions & 0 deletions src/axom/core/FlatMapUtil.hpp
@@ -0,0 +1,210 @@
// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and
// other Axom Project Developers. See the top-level COPYRIGHT file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)

#ifndef Axom_Core_FlatMap_Util_HPP
#define Axom_Core_FlatMap_Util_HPP

#include "axom/config.hpp"
#include "axom/core/FlatMap.hpp"
#include "axom/core/execution/reductions.hpp"

namespace axom
{
namespace detail
{

struct SpinLock
{
int value {0};

AXOM_HOST_DEVICE bool tryLock()
{
int still_locked = 0;
Member:
Any chance the axom atomics can be used/updated to handle/help with this logic? (Mostly b/c that could harden the axom atomics. If you think this is a one-off and not useful elsewhere, it's fine as is.)

Contributor (author):
I think adding this to the axom atomics would be dependent on support from within RAJA for atomics with memory ordering. Otherwise the logic to implement that might get a little nasty.

Member:
IIRC, RAJA default atomics don't support memory ordering. RAJA can be configured to use desul atomics, which do support memory ordering. Unfortunately, we only support using those through the original RAJA atomic interface, and so we only provide a default we define: https://github.com/LLNL/RAJA/blob/develop/include/RAJA/policy/desul/atomic.hpp#L22.

We should revisit whether we want to switch to desul atomics by default in RAJA. I think the last time we discussed this, there were still some cases where RAJA atomics were faster than desul. If we did switch to desul by default (which is what Kokkos uses), then we could support the full desul interface.

@publixsubfan let me know if you think we should go this route.

Contributor (author):
Maybe we could play around with a partial desul default? Something like "default for ordered atomics, but use the original backend for unordered".

Contributor (author):
I did have a PR for the ordered atomics here: llnl/RAJA#1616, if we wanted to try and clean that up.

Member:
Thanks -- since this is somewhat of a one-off and it's not super easy to consolidate it into axom::atomics, I think it's fine as is.
#if defined(__HIP_DEVICE_COMPILE__)
still_locked = __hip_atomic_exchange(&value, 1, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT);
#elif defined(AXOM_USE_RAJA) && defined(__CUDA_ARCH__)
still_locked = RAJA::atomicExchange<RAJA::cuda_atomic>(&value, 1);
// We really want an acquire-fenced atomic here
__threadfence();
#elif defined(AXOM_USE_RAJA) && defined(AXOM_USE_OPENMP)
still_locked = RAJA::atomicExchange<RAJA::omp_atomic>(&value, 1);
std::atomic_thread_fence(std::memory_order_acquire);
#endif
return !still_locked;
}

AXOM_HOST_DEVICE void unlock()
{
#if defined(__HIP_DEVICE_COMPILE__)
__hip_atomic_exchange(&value, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
#elif defined(AXOM_USE_RAJA) && defined(__CUDA_ARCH__)
// We really want a release-fenced atomic here
__threadfence();
RAJA::atomicExchange<RAJA::cuda_atomic>(&value, 0);
#elif defined(AXOM_USE_RAJA) && defined(AXOM_USE_OPENMP)
std::atomic_thread_fence(std::memory_order_release);
RAJA::atomicExchange<RAJA::omp_atomic>(&value, 0);
#else
value = 0;
#endif
}
};

} // namespace detail
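
A note on intended use: callers acquire this lock non-blockingly, as in the hedged sketch below (the `tryOnce` helper is hypothetical; the pattern is condensed from the batched-insert kernel later in this file). A blocking spin inside `tryLock()` could hang on GPUs, where the threads of a warp advance in lockstep and the lock holder may never be scheduled while a sibling thread spins.

```cpp
// Hypothetical helper showing the non-blocking pattern used with
// detail::SpinLock in the batched-insert kernel below.
AXOM_HOST_DEVICE void tryOnce(axom::detail::SpinLock& lock, bool& done)
{
  if(lock.tryLock())
  {
    // ...critical section: inspect and update one group's metadata...
    lock.unlock();
    done = true;
  }
  // On failure, return without spinning; the caller's probe loop comes
  // back around and retries this group on a later iteration.
}
```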

template <typename KeyType, typename ValueType, typename Hash>
template <typename ExecSpace>
auto FlatMap<KeyType, ValueType, Hash>::create(ArrayView<KeyType> keys,
ArrayView<ValueType> values,
Allocator allocator) -> FlatMap
{
assert(keys.size() == values.size());

const IndexType num_elems = keys.size();

FlatMap new_map(allocator);
new_map.reserve(num_elems);

using HashResult = typename Hash::result_type;
using GroupBucket = detail::flat_map::GroupBucket;

// Grab some needed internal fields from the flat map.
// We're going to be constructing metadata and the K-V pairs directly
// in-place.
const int ngroups_pow_2 = new_map.m_numGroups2;
const auto meta_group = new_map.m_metadata.view();
const auto buckets = new_map.m_buckets.view();

// Construct an array of locks per-group. This guards metadata updates for
// each insertion.
const IndexType num_groups = 1 << ngroups_pow_2;
Array<detail::SpinLock> lock_vec(num_groups, num_groups, allocator.getID());
const auto group_locks = lock_vec.view();

// Map bucket slots to k-v pair indices. This is used to deduplicate pairs
// with the same key value.
Array<IndexType> key_index_dedup_vec(0, 0, allocator.getID());
key_index_dedup_vec.resize(num_groups * GroupBucket::Size, -1);
const auto key_index_dedup = key_index_dedup_vec.view();

// Map k-v pair indices to bucket slots. This is essentially the inverse of
// the above mapping.
Array<IndexType> key_index_to_bucket_vec(num_elems, num_elems, allocator.getID());
const auto key_index_to_bucket = key_index_to_bucket_vec.view();

for_all<ExecSpace>(
num_elems,
AXOM_LAMBDA(IndexType idx) {
// Hash keys.
auto hash = Hash {}(keys[idx]);

// We use the k MSBs of the hash as the initial group probe point,
// where ngroups = 2^k.
int bitshift_right = ((CHAR_BIT * sizeof(HashResult)) - ngroups_pow_2);
HashResult curr_group = hash >> bitshift_right;
curr_group &= ((1 << ngroups_pow_2) - 1);

std::uint8_t hash_8 = static_cast<std::uint8_t>(hash);

IndexType duplicate_bucket_index = -1;
IndexType empty_bucket_index = -1;
int iteration = 0;
while(iteration < meta_group.size())
{
// Try to lock the group. We do this in a non-blocking manner to avoid
// intra-warp progress hazards.
bool group_locked = group_locks[curr_group].tryLock();

if(group_locked)
{
// Every bucket visit - check prior filled buckets for duplicate
// keys.
int empty_slot_index =
meta_group[curr_group].visitHashOrEmptyBucket(hash_8, [&](int matching_slot) {
IndexType bucket_index = curr_group * GroupBucket::Size + matching_slot;

if(keys[key_index_dedup[bucket_index]] == keys[idx])
{
// Highest-indexed kv pair wins.
axom::atomicMax<ExecSpace>(&key_index_dedup[bucket_index], idx);
key_index_to_bucket[idx] = bucket_index;
duplicate_bucket_index = bucket_index;
}
});

if(duplicate_bucket_index == -1)
{
if(empty_slot_index == GroupBucket::InvalidSlot)
{
// Group is full. Set overflow bit for the group.
meta_group[curr_group].template setOverflow<true>(hash_8);
}
else
{
// Got to end of probe sequence without a duplicate.
// Update empty bucket index.
empty_bucket_index = curr_group * GroupBucket::Size + empty_slot_index;
meta_group[curr_group].template setBucket<true>(empty_slot_index, hash_8);
key_index_dedup[empty_bucket_index] = idx;
key_index_to_bucket[idx] = empty_bucket_index;
}
}
// Unlock group once we're done.
group_locks[curr_group].unlock();

if(duplicate_bucket_index != -1 || empty_bucket_index != -1)
{
// We've found an empty slot or a duplicate key to place the
// value at. Empty slots should only occur at the end of the
// probe sequence, since we're only inserting.
break;
}
else
{
// Move to next group.
curr_group = (curr_group + LookupPolicy {}.getNext(iteration)) % meta_group.size();
iteration++;
}
}
}
});

// Add a counter for deduplicated inserts.
axom::ReduceSum<ExecSpace, IndexType> total_inserts(0);

// Using key-deduplication map, assign unique k-v pairs to buckets.
for_all<ExecSpace>(
num_elems,
AXOM_LAMBDA(IndexType kv_idx) {
IndexType bucket_idx = key_index_to_bucket[kv_idx];
IndexType winning_idx = key_index_dedup[bucket_idx];
// Place k-v pair at bucket_idx.
if(kv_idx == winning_idx)
{
#if defined(__CUDA_ARCH__)
// HACK: std::pair constructor is not host-device annotated, but CUDA
// requires passing in --expt-relaxed-constexpr for it to work.
// Instead of requiring this flag, construct each member of the pair
// individually.
KeyType& key_dst = const_cast<KeyType&>(buckets[bucket_idx].get().first);
ValueType& value_dst = buckets[bucket_idx].get().second;
new(&key_dst) KeyType {keys[kv_idx]};
new(&value_dst) ValueType {values[kv_idx]};
#else
new(&buckets[bucket_idx]) KeyValuePair(keys[kv_idx], values[kv_idx]);
#endif
total_inserts += 1;
}
});

new_map.m_size = total_inserts.get();
new_map.m_loadCount = total_inserts.get();

return new_map;
}
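
An end-to-end device-side sketch of how `create()` composes with the recently added `FlatMapView` (the `device_map_allocator` variable and the `find()`-style lookup on the view are assumptions for illustration; only `create()` and `view()` appear in this PR):

```cpp
// Sketch: batched construction in a CUDA execution space, followed by
// queries from a kernel through a read-only view.
using ExecSpace = axom::CUDA_EXEC<256>;
const int device_alloc = axom::execution_space<ExecSpace>::allocatorID();
const axom::IndexType n = 1000;

axom::Array<int> keys(n, n, device_alloc);
axom::Array<double> values(n, n, device_alloc);
// ...fill keys/values on the device...

// Per the preconditions, the map's allocator must also be accessible
// from ExecSpace; device_map_allocator stands in for such an allocator.
auto map = axom::FlatMap<int, double>::create<ExecSpace>(keys.view(),
                                                         values.view(),
                                                         device_map_allocator);

const auto keys_v = keys.view();
const auto map_view = map.view();
axom::for_all<ExecSpace>(
  n,
  AXOM_LAMBDA(axom::IndexType i) {
    auto it = map_view.find(keys_v[i]);  // lookup interface assumed
    // ...use it->second...
  });
```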

} // namespace axom

#endif
31 changes: 31 additions & 0 deletions src/axom/core/detail/FlatTable.hpp
@@ -175,6 +175,37 @@ struct GroupBucket
return InvalidSlot;
}

/*!
* \brief Visits matching hash buckets until an empty bucket is encountered.
*
* This is used when performing batched insertion: since elements are only
* inserted, not deleted, an empty bucket will always be encountered only at
* the very end of a given probe sequence.
* The visitor function is used to allow for detecting duplicate keys.
*
* \param [in] hash reduced hash to search for
* \param [in] visitor functor to call for each matching bucket slot
*
* \return the first empty slot found, or InvalidSlot
*/
template <typename Func>
AXOM_HOST_DEVICE int visitHashOrEmptyBucket(std::uint8_t hash, Func&& visitor) const
{
std::uint8_t reducedHash = reduceHash(hash);
for(int i = 0; i < Size; i++)
{
if(metadata.buckets[i] == reducedHash)
{
visitor(i);
}
else if(metadata.buckets[i] == GroupBucket::Empty)
{
return i;
}
}
return InvalidSlot;
}
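
The call shape, condensed from the batched-insert loop in FlatMapUtil.hpp (`group` and `hash_8` stand in for a `GroupBucket` instance and a reduced hash; reduced-hash matches must still be confirmed against the full keys):

```cpp
// Condensed from FlatMap::create(): the visitor fires for each slot in
// this group whose reduced hash matches; an empty slot ends the scan.
int empty_slot_index =
  group.visitHashOrEmptyBucket(hash_8, [&](int matching_slot) {
    // Reduced hashes can collide, so compare full keys here before
    // treating matching_slot as a duplicate of the key being inserted.
  });
if(empty_slot_index == GroupBucket::InvalidSlot)
{
  // No empty slot in this group: it is full, and the caller sets the
  // group's overflow bit and continues the probe at the next group.
}
```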

template <bool Atomic = false>
AXOM_HOST_DEVICE void setBucket(int index, std::uint8_t hash)
{
7 changes: 7 additions & 0 deletions src/axom/core/examples/CMakeLists.txt
@@ -96,3 +96,10 @@ if(AXOM_ENABLE_TESTS)
endif()
endforeach()
endif()

axom_add_executable(
NAME core_flatmap_perf_ex
SOURCES core_flatmap_perf.cpp
OUTPUT_DIR ${EXAMPLE_OUTPUT_DIRECTORY}
DEPENDS_ON core
FOLDER axom/core/examples )