diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index f778a6b117d..caec2af3ac2 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,7 +19,6 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ - autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2..9090345722d 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 87069858f49..b3773088917 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,8 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 302f823fab7..fa9142cdf80 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -1,3 +1,34 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. //Reproducible Floating Point Accumulations via Binned Floating Point //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. //ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. @@ -26,6 +57,10 @@ #include #include #include +#include +#include + +#include namespace hpx::parallel::detail::rfa { template @@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa { static constexpr int FOLD = FOLD_; private: - std::array data = {0}; + std::array data = {{0}}; ///Floating-point precision bin width static constexpr auto BIN_WIDTH = @@ -351,21 +386,21 @@ namespace hpx::parallel::detail::rfa { ///Get index of float-point precision ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///need to have to sum it reproducibly. Higher indices correspond to smaller ///bins. static inline constexpr int binned_dindex(const ftype x) { int exp = EXP(x); if (exp == 0) { - if (x == 0.0) + if (x == static_cast(0.0)) { return MAXINDEX; } else { std::frexp(x, &exp); - return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); } } return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; @@ -373,7 +408,7 @@ namespace hpx::parallel::detail::rfa { ///Get index of manually specified binned double precision ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. + ///indices correspond to smaller bins. inline int binned_index() const { return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - @@ -416,7 +451,9 @@ namespace hpx::parallel::detail::rfa { int shift = binned_index() - X_index; if (shift > 0) { -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -425,7 +462,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = carry((i - shift) * inccarY); } const ftype* const bins = binned_bins(X_index); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int j = 0; j < FOLD; j++) { if (j >= shift) @@ -457,16 +496,19 @@ namespace hpx::parallel::detail::rfa { if (binned_index0()) { M = primary(0); - ftype qd = x * COMPRESSION; + ftype qd = x * static_cast(COMPRESSION); auto& ql = get_bits(qd); ql |= 1; qd += M; primary(0) = qd; M -= qd; - M *= EXPANSION * 0.5; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); x += M; x += M; -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 1; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -485,7 +527,9 @@ namespace hpx::parallel::detail::rfa { { ftype qd = x; auto& ql = get_bits(qd); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -550,7 +594,7 @@ namespace hpx::parallel::detail::rfa { int i = 0; if (ISNANINF(primary(0))) - return primary(0); + return (double) primary(0); if (ISZERO(primary(0))) return 0.0; @@ -564,29 +608,36 @@ namespace hpx::parallel::detail::rfa { { scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = std::max( - std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); if (X_index == 0) { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; i = 2; } else { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); i = 1; } for (; i < scaled; i++) { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; } if (i == FOLD) { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) (bins[FOLD - 1])) * scale_down; return Y * scale_up; } @@ -597,20 +648,23 @@ namespace hpx::parallel::detail::rfa { Y *= scale_up; for (; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); } else { - Y += carry(0) * (bins[0] / 6.0); + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); for (i = 1; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); } return Y; } @@ -627,7 +681,7 @@ namespace hpx::parallel::detail::rfa { if (ISNANINF(primary(0))) return primary(0); if (ISZERO(primary(0))) - return 0.0; + return 0.0f; //Note that the following order of summation is in order of decreasing //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and @@ -636,20 +690,22 @@ namespace hpx::parallel::detail::rfa { const auto* const bins = binned_bins(X_index); if (X_index == 0) { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; i = 2; } else { - Y += (double) carry(0) * (double) (bins[0] / 6.0); + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); i = 1; } for (; i < FOLD; i++) { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); @@ -694,8 +750,10 @@ namespace hpx::parallel::detail::rfa { if (shift > 0) { const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -705,7 +763,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = x.carry(i * inccarX) + carry((i - shift) * inccarY); } -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i == shift) @@ -717,8 +777,10 @@ namespace hpx::parallel::detail::rfa { else if (shift < 0) { const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i < -shift) @@ -731,8 +793,10 @@ namespace hpx::parallel::detail::rfa { else if (shift == 0) { const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; @@ -771,7 +835,7 @@ namespace hpx::parallel::detail::rfa { } ///Return the endurance of the binned fp - constexpr int endurance() const + constexpr size_t endurance() const { return ENDURANCE; } @@ -867,11 +931,11 @@ namespace hpx::parallel::detail::rfa { { if (std::is_same_v) { - return binned_conv_single(1, 1); + return static_cast(binned_conv_single(1, 1)); } else { - return binned_conv_double(1, 1); + return static_cast(binned_conv_double(1, 1)); } } @@ -888,7 +952,8 @@ namespace hpx::parallel::detail::rfa { { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - @@ -973,7 +1038,7 @@ namespace hpx::parallel::detail::rfa { T max_abs_val = input[0]; for (size_t i = 0; i < N; i++) { - max_abs_val = max(max_abs_val, std::abs(input[i])); + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); } add(input, N, max_abs_val); } @@ -1142,4 +1207,4 @@ namespace hpx::parallel::detail::rfa { } }; -} // namespace hpx::parallel::detail::rfa \ No newline at end of file +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 92dd2e7f3dc..5a06c509efd 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -4,8 +4,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#pragma once - #include #include #include @@ -19,6 +17,7 @@ #include #include #include +#include #include #include "test_utils.hpp" @@ -27,11 +26,12 @@ int seed = std::random_device{}(); std::mt19937 gen(seed); template -T get_rand( - T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) { return LO + - static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); } /////////////////////////////////////////////////////////////////////////////// @@ -42,10 +42,12 @@ void test_reduce1(IteratorTag) { // check if different type for deterministic and nondeeterministic // and same result i.e. correct computation - using base_iterator_det = std::vector::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; std::vector deterministic(LEN); @@ -75,8 +77,8 @@ void test_reduce1(IteratorTag) FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); - HPX_TEST_EQ(r1, r3); - HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); } template ::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; constexpr FloatTypeDeterministic num_bounds_det = @@ -129,7 +132,8 @@ template void test_orig_reduce_determinism(IteratorTag) { - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; constexpr auto num_bounds_ndet =