diff --git a/include/algorithm.h b/include/algorithm.h
new file mode 100644
index 0000000..d7870d7
--- /dev/null
+++ b/include/algorithm.h
@@ -0,0 +1,6 @@
+
+#pragma once
+
+#include "algorithm/extrema.h"
+#include "algorithm/reduce.h"
+#include "algorithm/sort.h"
diff --git a/include/algorithm/extrema.h b/include/algorithm/extrema.h
new file mode 100644
index 0000000..45a7aba
--- /dev/null
+++ b/include/algorithm/extrema.h
@@ -0,0 +1,193 @@
+
+#pragma once
+
+#include "internal/defines.h"
+
+#if defined(XTD_CUDA_BACKEND) || defined(XTD_HIP_BACKEND)
+#include <thrust/extrema.h>
+#elif defined(XTD_SYCL_BACKEND)
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#else
+#include <algorithm>
+#endif
+
+namespace xtd {
+
+  template <typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator min_element(ForwardIterator first,
+                                                                 ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::min_element(thrust::device, first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::min_element(thrust::hip::par, first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::min_element(oneapi::dpl::execution::dpcpp_default, first, last);
+#else
+    return std::min_element(first, last);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator min_element(ExecutionPolicy&& policy,
+                                                                 ForwardIterator first,
+                                                                 ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::min_element(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::min_element(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::min_element(std::forward<ExecutionPolicy>(policy), first, last);
+#else
+    return std::min_element(std::forward<ExecutionPolicy>(policy), first, last);
+#endif
+  }
+
+  template <typename ForwardIterator, typename BinaryPredicate>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator min_element(ForwardIterator first,
+                                                                 ForwardIterator last,
+                                                                 BinaryPredicate comp) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::min_element(thrust::device, first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::min_element(thrust::hip::par, first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::min_element(oneapi::dpl::execution::dpcpp_default, first, last, comp);
+#else
+    return std::min_element(first, last, comp);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryPredicate>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator min_element(ExecutionPolicy&& policy,
+                                                                 ForwardIterator first,
+                                                                 ForwardIterator last,
+                                                                 BinaryPredicate comp) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::min_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::min_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::min_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#else
+    return std::min_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#endif
+  }
+
+  template <typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator max_element(ForwardIterator first,
+                                                                 ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::max_element(thrust::device, first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::max_element(thrust::hip::par, first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::max_element(oneapi::dpl::execution::dpcpp_default, first, last);
+#else
+    return std::max_element(first, last);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator max_element(ExecutionPolicy&& policy,
+                                                                 ForwardIterator first,
+                                                                 ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::max_element(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::max_element(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::max_element(std::forward<ExecutionPolicy>(policy), first, last);
+#else
+    return std::max_element(std::forward<ExecutionPolicy>(policy), first, last);
+#endif
+  }
+
+  template <typename ForwardIterator, typename BinaryPredicate>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator max_element(ForwardIterator first,
+                                                                 ForwardIterator last,
+                                                                 BinaryPredicate comp) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::max_element(thrust::device, first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::max_element(thrust::hip::par, first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::max_element(oneapi::dpl::execution::dpcpp_default, first, last, comp);
+#else
+    return std::max_element(first, last, comp);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryPredicate>
+  XTD_HOST_FUNCTION inline constexpr ForwardIterator max_element(ExecutionPolicy&& policy,
+                                                                 ForwardIterator first,
+                                                                 ForwardIterator last,
+                                                                 BinaryPredicate comp) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::max_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::max_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::max_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#else
+    return std::max_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#endif
+  }
+
+  template <typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr std::pair<ForwardIterator, ForwardIterator> minmax_element(
+      ForwardIterator first, ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::minmax_element(thrust::device, first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::minmax_element(thrust::hip::par, first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::minmax_element(oneapi::dpl::execution::dpcpp_default, first, last);
+#else
+    return std::minmax_element(first, last);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr std::pair<ForwardIterator, ForwardIterator> minmax_element(
+      ExecutionPolicy&& policy, ForwardIterator first, ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::minmax_element(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::minmax_element(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::minmax_element(std::forward<ExecutionPolicy>(policy), first, last);
+#else
+    return std::minmax_element(std::forward<ExecutionPolicy>(policy), first, last);
+#endif
+  }
+
+  template <typename ForwardIterator, typename BinaryPredicate>
+  XTD_HOST_FUNCTION inline constexpr std::pair<ForwardIterator, ForwardIterator> minmax_element(
+      ForwardIterator first, ForwardIterator last, BinaryPredicate comp) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::minmax_element(thrust::device, first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::minmax_element(thrust::hip::par, first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::minmax_element(oneapi::dpl::execution::dpcpp_default, first, last, comp);
+#else
+    return std::minmax_element(first, last, comp);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryPredicate>
+  XTD_HOST_FUNCTION inline constexpr std::pair<ForwardIterator, ForwardIterator> minmax_element(
+      ExecutionPolicy&& policy, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::minmax_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::minmax_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::minmax_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#else
+    return std::minmax_element(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#endif
+  }
+
+}  // namespace xtd
diff --git a/include/algorithm/reduce.h b/include/algorithm/reduce.h
new file mode 100644
index 0000000..e5def9e
--- /dev/null
+++ b/include/algorithm/reduce.h
@@ -0,0 +1,107 @@
+
+#pragma once
+
+#include "internal/defines.h"
+
+#if defined(XTD_CUDA_BACKEND) || defined(XTD_HIP_BACKEND)
+#include <thrust/reduce.h>
+#elif defined(XTD_SYCL_BACKEND)
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#else
+#include <algorithm>
+#endif
+
+namespace xtd {
+
+  template <typename InputIterator>
+  XTD_HOST_FUNCTION inline constexpr typename std::iterator_traits<InputIterator>::value_type
+  reduce(InputIterator first, InputIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::reduce(thrust::device, first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::reduce(thrust::hip::par, first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::reduce(oneapi::dpl::execution::dpcpp_default, first, last);
+#else
+    return std::reduce(first, last);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator>
+  XTD_HOST_FUNCTION inline constexpr typename std::iterator_traits<ForwardIterator>::value_type
+  reduce(ExecutionPolicy&& policy, ForwardIterator first, ForwardIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::reduce(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::reduce(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::reduce(std::forward<ExecutionPolicy>(policy), first, last);
+#else
+    return std::reduce(std::forward<ExecutionPolicy>(policy), first, last);
+#endif
+  }
+
+  template <typename InputIterator, typename T>
+  XTD_HOST_FUNCTION inline constexpr T reduce(InputIterator first, InputIterator last, T init) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::reduce(thrust::device, first, last, init);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::reduce(thrust::hip::par, first, last, init);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::reduce(oneapi::dpl::execution::dpcpp_default, first, last, init);
+#else
+    return std::reduce(first, last, init);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator, typename T>
+  XTD_HOST_FUNCTION inline constexpr T reduce(ExecutionPolicy&& policy,
+                                              ForwardIterator first,
+                                              ForwardIterator last,
+                                              T init) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::reduce(std::forward<ExecutionPolicy>(policy), first, last, init);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::reduce(std::forward<ExecutionPolicy>(policy), first, last, init);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::reduce(std::forward<ExecutionPolicy>(policy), first, last, init);
+#else
+    return std::reduce(std::forward<ExecutionPolicy>(policy), first, last, init);
+#endif
+  }
+
+  template <typename InputIterator, typename T, typename BinaryOperation>
+  XTD_HOST_FUNCTION inline constexpr T reduce(InputIterator first,
+                                              InputIterator last,
+                                              T init,
+                                              BinaryOperation op) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::reduce(thrust::device, first, last, init, op);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::reduce(thrust::hip::par, first, last, init, op);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::reduce(oneapi::dpl::execution::dpcpp_default, first, last, init, op);
+#else
+    return std::reduce(first, last, init, op);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename ForwardIterator, typename T, typename BinaryOperation>
+  XTD_HOST_FUNCTION inline constexpr T reduce(ExecutionPolicy&& policy,
+                                              ForwardIterator first,
+                                              ForwardIterator last,
+                                              T init,
+                                              BinaryOperation op) {
+#if defined(XTD_CUDA_BACKEND)
+    return thrust::reduce(std::forward<ExecutionPolicy>(policy), first, last, init, op);
+#elif defined(XTD_HIP_BACKEND)
+    return rocthrust::reduce(std::forward<ExecutionPolicy>(policy), first, last, init, op);
+#elif defined(XTD_SYCL_BACKEND)
+    return oneapi::dpl::reduce(std::forward<ExecutionPolicy>(policy), first, last, init, op);
+#else
+    return std::reduce(std::forward<ExecutionPolicy>(policy), first, last, init, op);
+#endif
+  }
+
+}  // namespace xtd
diff --git a/include/algorithm/sort.h b/include/algorithm/sort.h
new file mode 100644
index 0000000..2df3f92
--- /dev/null
+++ b/include/algorithm/sort.h
@@ -0,0 +1,77 @@
+
+#pragma once
+
+#include "internal/defines.h"
+
+#if defined(XTD_CUDA_BACKEND) || defined(XTD_HIP_BACKEND)
+#include <thrust/sort.h>
+#elif defined(XTD_SYCL_BACKEND)
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#else
+#include <algorithm>
+#endif
+
+namespace xtd {
+
+  template <typename RandomAccessIterator>
+  XTD_HOST_FUNCTION inline constexpr void sort(RandomAccessIterator first,
+                                               RandomAccessIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    thrust::sort(thrust::device, first, last);
+#elif defined(XTD_HIP_BACKEND)
+    rocthrust::sort(thrustd::hip::par, first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    oneapi::dpl::sort(oneapi::dpl::execution::dpcpp_default, first, last);
+#else
+    std::sort(first, last);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename RandomAccessIterator>
+  XTD_HOST_FUNCTION inline constexpr void sort(ExecutionPolicy&& policy,
+                                               RandomAccessIterator first,
+                                               RandomAccessIterator last) {
+#if defined(XTD_CUDA_BACKEND)
+    thrust::sort(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_HIP_BACKEND)
+    rocthrust::sort(std::forward<ExecutionPolicy>(policy), first, last);
+#elif defined(XTD_SYCL_BACKEND)
+    oneapi::dpl::sort(std::forward<ExecutionPolicy>(policy), first, last);
+#else
+    std::sort(std::forward<ExecutionPolicy>(policy), first, last);
+#endif
+  }
+
+  template <typename RandomAccessIterator, typename Compare>
+  XTD_HOST_FUNCTION inline constexpr void sort(RandomAccessIterator first,
+                                               RandomAccessIterator last,
+                                               Compare comp) {
+#if defined(XTD_CUDA_BACKEND)
+    thrust::sort(thrust::device, first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    rocthrust::sort(thrust::hip::par, first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    oneapi::dpl::sort(oneapi::dpl::execution::dpcpp_default, first, last, comp);
+#else
+    std::sort(first, last, comp);
+#endif
+  }
+
+  template <typename ExecutionPolicy, typename RandomAccessIterator, typename Compare>
+  XTD_HOST_FUNCTION inline constexpr void sort(ExecutionPolicy&& policy,
+                                               RandomAccessIterator first,
+                                               RandomAccessIterator last,
+                                               Compare comp) {
+#if defined(XTD_CUDA_BACKEND)
+    thrust::sort(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_HIP_BACKEND)
+    rocthrust::sort(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#elif defined(XTD_SYCL_BACKEND)
+    oneapi::dpl::sort(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#else
+    std::sort(std::forward<ExecutionPolicy>(policy), first, last, comp);
+#endif
+  }
+
+}  // namespace xtd
diff --git a/include/internal/defines.h b/include/internal/defines.h
index 8be3730..9b229a8 100644
--- a/include/internal/defines.h
+++ b/include/internal/defines.h
@@ -10,9 +10,21 @@
 #if defined(__CUDACC__) || defined(__HIPCC__)
 // CUDA or HIP/ROCm compiler
 #define XTD_DEVICE_FUNCTION __host__ __device__
+#define XTD_HOST_FUNCTION __host__
 #else
 // SYCL or standard C++ code
 #define XTD_DEVICE_FUNCTION
+#define XTD_HOST_FUNCTION
+#endif
+
+#if defined(__CUDACC__)
+#define XTD_CUDA_BACKEND
+#elif defined(__HIPCC__)
+#define XTD_HIP_BACKEND
+#elif defined(__SYCL_COMPILER_VERSION)
+#define XTD_SYCL_BACKEND
+#else
+#define XTD_SERIAL_BACKEND
 #endif
 
 // XTD_TARGET_...
diff --git a/test/Makefile b/test/Makefile
index 404e240..bdd7598 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -7,12 +7,12 @@ CXX := g++
 GCC_TOOLCHAIN := $(abspath $(dir $(shell which $(CXX)))/..)
 GCC_TARGET    := $(shell $(CXX) -dumpmachine)
 # Catch2 needs -Wno-unused-variable
-HOST_CXXFLAGS := -O2 -fPIC -pthread -march=native -Wall -Wextra -Werror -Wfatal-errors -Wno-unused-variable
+HOST_CXXFLAGS := -O2 -fPIC -pthread -march=native -Wall -Wextra -Wfatal-errors -Wno-unused-variable
 
 # Compiler flags supported by GCC but not by the LLVM-based compilers (clang, hipcc, icpx, etc.)
 LLVM_UNSUPPORTED_CXXFLAGS := --param vect-max-version-for-alias-checks=50 -Werror=format-contains-nul -Wno-non-template-friend -Werror=return-local-addr -Werror=unused-but-set-variable
 
-CXXFLAGS := -std=c++17 $(HOST_CXXFLAGS) -g
+CXXFLAGS := -std=c++20 $(HOST_CXXFLAGS) -g
 LDFLAGS := -O2 -fPIC -pthread -Wl,-E -lstdc++fs -ldl
 
 # CUDA
@@ -25,17 +25,17 @@ else
 # CUDA platform at $(CUDA_BASE)
   CUDA_LIBDIR := $(CUDA_BASE)/lib64
   CUDA_DEPS := $(CUDA_LIBDIR)/libcudart.so
-  CUDA_ARCH := 60 70 80
+  CUDA_ARCH := 60 70 80 90
   CUDA_CXXFLAGS := -I$(CUDA_BASE)/include
   CUDA_LDFLAGS := -L$(CUDA_LIBDIR) -lcudart -lcudadevrt
   CUDA_NVCC := $(CUDA_BASE)/bin/nvcc
   define CUFLAGS_template
     $(2)NVCC_FLAGS := $$(foreach ARCH,$(1),-gencode arch=compute_$$(ARCH),code=[sm_$$(ARCH),compute_$$(ARCH)]) -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --expt-relaxed-constexpr --expt-extended-lambda --generate-line-info --source-in-ptx --display-error-number --threads $$(words $(1)) --cudart=shared
-    $(2)NVCC_COMMON := -std=c++17 -O3 -g $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS)'
+    $(2)NVCC_COMMON := -std=c++20 -O3 -g $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS)'
     $(2)CUDA_CUFLAGS := $$($(2)NVCC_COMMON)
   endef
   $(eval $(call CUFLAGS_template,$(CUDA_ARCH),))
-  NVCC_COMMON := -std=c++17 -O3 -g $(NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS)'
+  NVCC_COMMON := -std=c++20 -O3 -g $(NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS)'
   CUDA_CUFLAGS := $(NVCC_COMMON)
 endif
 
@@ -51,7 +51,7 @@ else
   ROCM_DEPS := $(ROCM_LIBDIR)/libamdhip64.so
   ROCM_ARCH := gfx900 gfx90a gfx1030
   ROCM_HIPCC := $(ROCM_BASE)/bin/hipcc
-  HIPCC_CXXFLAGS := -fno-gpu-rdc $(foreach ARCH,$(ROCM_ARCH),--offload-arch=$(ARCH)) $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) --target=$(GCC_TARGET) --gcc-toolchain=$(GCC_TOOLCHAIN) -I$(ROCM_BASE)/include/hip -Wno-unused-result
+  HIPCC_CXXFLAGS := -fno-gpu-rdc $(foreach ARCH,$(ROCM_ARCH),--offload-arch=$(ARCH)) $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) --target=$(GCC_TARGET) --gcc-toolchain=$(GCC_TOOLCHAIN) -I$(ROCM_BASE)/include/hip -I$(ROCM_BASE)/include/thrust -Wno-unused-result
   HIPCC_LDFLAGS := $(LDFLAGS) --target=$(GCC_TARGET) --gcc-toolchain=$(GCC_TOOLCHAIN)
 endif
 
@@ -67,7 +67,7 @@ else
   SYCL_CXX          := $(SYCL_BASE)/bin/icpx
   SYCL_CPU_TARGET   := -fsycl-targets=spir64_x86_64
   SYCL_FLAGS        := -fsycl -fp-model=precise
-  SYCL_CXXFLAGS     := $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) $(SYCL_FLAGS) -Wno-unused-variable
+  SYCL_CXXFLAGS     := $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) $(SYCL_FLAGS) -Wno-unused-variable -Wunused-parameter
   SYCL_LDFLAGS      :=
 
   # Check for Intel GPU existence
diff --git a/test/max_element/max_element_t.cc b/test/max_element/max_element_t.cc
new file mode 100644
index 0000000..42a70ba
--- /dev/null
+++ b/test/max_element/max_element_t.cc
@@ -0,0 +1,42 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+TEST_CASE("max_elementCPU", "[max_element]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  SECTION("Default comparison") {
+    auto max = xtd::max_element(values.begin(), values.end());
+    REQUIRE(*max == N - 1);
+  }
+
+  SECTION("Greater comparison") {
+    auto max = xtd::max_element(values.begin(), values.end(), std::greater<int>());
+    REQUIRE(*max == 0);
+  }
+
+  SECTION("Unseq execution policy") {
+    auto max = xtd::max_element(std::execution::unseq, values.begin(), values.end());
+    REQUIRE(*max == N - 1);
+  }
+
+  SECTION("Unseq execution policy with greater comparison") {
+    auto max =
+        xtd::max_element(std::execution::unseq, values.begin(), values.end(), std::greater<int>());
+    REQUIRE(*max == 0);
+  }
+}
diff --git a/test/max_element/max_element_t.cu b/test/max_element/max_element_t.cu
new file mode 100644
index 0000000..61115e0
--- /dev/null
+++ b/test/max_element/max_element_t.cu
@@ -0,0 +1,48 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/cuda_check.h"
+#include <cuda_runtime.h>
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/extrema.h>
+
+TEST_CASE("max_elementCUDA", "[max_element]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  int* d_values;
+  CUDA_CHECK(cudaMalloc(&d_values, N * sizeof(int)));
+  CUDA_CHECK(cudaMemcpy(d_values, values.data(), N * sizeof(int), cudaMemcpyHostToDevice));
+
+  SECTION("Default comparison") {
+    auto max_iter = xtd::max_element(d_values, d_values + N);
+    int max;
+	thrust::copy(thrust::device, max_iter, max_iter+1, &max);
+    REQUIRE(max == N - 1);
+  }
+
+  SECTION("Greater comparison") {
+    auto max_iter = xtd::max_element(d_values, d_values + N, std::greater<int>());
+    int max;
+	thrust::copy(thrust::device, max_iter, max_iter+1, &max);
+    REQUIRE(max == 0);
+  }
+
+  CUDA_CHECK(cudaFree(d_values));
+}
diff --git a/test/max_element/max_element_t.hip.cc b/test/max_element/max_element_t.hip.cc
new file mode 100644
index 0000000..82b2c7e
--- /dev/null
+++ b/test/max_element/max_element_t.hip.cc
@@ -0,0 +1,45 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/hip_check.h"
+#include <hip_runtime.h>
+#include <rocthrust/copy.h>
+
+TEST_CASE("max_elementHIP", "[max_element]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  int* d_values;
+  HIP_CHECK(hipMalloc(&d_values, N * sizeof(int)));
+  HIP_CHECK(hipMemcpy(d_values, values.data(), N * sizeof(int), hipMemcpyHostToDevice));
+
+  SECTION("Default comparison") {
+    auto max_iter = xtd::max_element(d_values, d_values + N);
+	int max;
+	thrust::copy(thrust::hip::par, max_iter, max_iter + 1, &max);
+    REQUIRE(max == N - 1);
+  }
+
+  SECTION("Greater comparison") {
+    auto max_iter = xtd::max_element(d_values, d_values + N, std::greater<int>());
+	int max;
+	thrust::copy(thrust::hip::par, max_iter, max_iter + 1, &max);
+    REQUIRE(max == 0);
+  }
+
+  HIP_CHECK(hipFree(d_values));
+}
diff --git a/test/max_element/max_element_t.sycl.cc b/test/max_element/max_element_t.sycl.cc
new file mode 100644
index 0000000..b28c4fd
--- /dev/null
+++ b/test/max_element/max_element_t.sycl.cc
@@ -0,0 +1,47 @@
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include <sycl/sycl.hpp>
+
+TEST_CASE("sortSYCL", "[sort]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+#ifdef ONEAPI_CPU
+  auto queue = sycl::queue{sycl::cpu_selector_v, sycl::property::queue::in_order()};
+#else
+  if (sycl::device::get_devices(sycl::info::device_type::gpu).size() == 0) {
+    std::cout << "No SYCL GPUs found, the test will be skipped." << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+  auto queue = sycl::queue{sycl::gpu_selector_v, sycl::property::queue::in_order()};
+#endif
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  auto *d_values = sycl::malloc_device<int>(N, queue);
+  queue.memcpy(d_values, values.data(), N * sizeof(int)).wait();
+
+  SECTION("Default comparison") {
+    auto max = xtd::max_element(d_values, d_values + N);
+    REQUIRE(*max == N - 1);
+  }
+
+  SECTION("Greater comparison") {
+    auto max = xtd::max_element(d_values, d_values + N, std::greater<int>{});
+    REQUIRE(*max == 0);
+  }
+
+  sycl::free(d_values, queue);
+}
diff --git a/test/min_element/min_element_t.cc b/test/min_element/min_element_t.cc
new file mode 100644
index 0000000..36edadc
--- /dev/null
+++ b/test/min_element/min_element_t.cc
@@ -0,0 +1,42 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+TEST_CASE("min_elementCPU", "[min_element]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  SECTION("Default comparison") {
+    auto min = xtd::min_element(values.begin(), values.end());
+    REQUIRE(*min == 0);
+  }
+
+  SECTION("Greater comparison") {
+    auto min = xtd::min_element(values.begin(), values.end(), std::greater<int>());
+     REQUIRE(*min == N - 1);
+  }
+
+  SECTION("Unseq execution policy") {
+    auto min = xtd::min_element(std::execution::unseq, values.begin(), values.end());
+    REQUIRE(*min == 0);
+  }
+
+  SECTION("Unseq execution policy with greater comparison") {
+    auto min =
+        xtd::min_element(std::execution::unseq, values.begin(), values.end(), std::greater<int>());
+    REQUIRE(*min == N - 1);
+  }
+}
diff --git a/test/min_element/min_element_t.cu b/test/min_element/min_element_t.cu
new file mode 100644
index 0000000..6fde3d4
--- /dev/null
+++ b/test/min_element/min_element_t.cu
@@ -0,0 +1,46 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/cuda_check.h"
+#include <cuda_runtime.h>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+TEST_CASE("min_elementCUDA", "[min_element]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  int* d_values;
+  CUDA_CHECK(cudaMalloc(&d_values, N * sizeof(int)));
+  CUDA_CHECK(cudaMemcpy(d_values, values.data(), N * sizeof(int), cudaMemcpyHostToDevice));
+
+  SECTION("Default comparison") {
+    auto min_iter = xtd::min_element(d_values, d_values + N);
+	int min;
+	thrust::copy(thrust::device, min_iter , min_iter + 1, &min);
+    REQUIRE(min == 0);
+  }
+
+  SECTION("Greater comparison") {
+    auto min_iter = xtd::min_element(d_values, d_values + N, std::greater<int>());
+	int min;
+	thrust::copy(thrust::device, min_iter, min_iter + 1, &min);
+    REQUIRE(min == N - 1);
+  }
+
+  CUDA_CHECK(cudaFree(d_values));
+}
diff --git a/test/min_element/min_element_t.hip.cc b/test/min_element/min_element_t.hip.cc
new file mode 100644
index 0000000..7292494
--- /dev/null
+++ b/test/min_element/min_element_t.hip.cc
@@ -0,0 +1,45 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/hip_check.h"
+#include <hip_runtime.h>
+#include <rocthrust/copy.h>
+
+TEST_CASE("min_elementHIP", "[min_element]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  int* d_values;
+  HIP_CHECK(hipMalloc(&d_values, N * sizeof(int)));
+  HIP_CHECK(hipMemcpy(d_values, values.data(), N * sizeof(int), hipMemcpyHostToDevice));
+
+  SECTION("Default comparison") {
+    auto min_iter = xtd::min_element(d_values, d_values + N);
+	int min;
+	thrust::copy(thrust::hip::par, min_iter, min_iter + 1, &min);
+    REQUIRE(min == 0);
+  }
+
+  SECTION("Greater comparison") {
+    auto min_iter = xtd::min_element(d_values, d_values + N, std::greater<int>());
+	int min;
+	thrust::copy(thrust::hip::par, min_iter, min_iter + 1, &min);
+    REQUIRE(min == N - 1);
+  }
+
+  HIP_CHECK(hipFree(d_values));
+}
diff --git a/test/min_element/min_element_t.sycl.cc b/test/min_element/min_element_t.sycl.cc
new file mode 100644
index 0000000..bfca1d8
--- /dev/null
+++ b/test/min_element/min_element_t.sycl.cc
@@ -0,0 +1,47 @@
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include <sycl/sycl.hpp>
+
+TEST_CASE("sortSYCL", "[sort]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+#ifdef ONEAPI_CPU
+  auto queue = sycl::queue{sycl::cpu_selector_v, sycl::property::queue::in_order()};
+#else
+  if (sycl::device::get_devices(sycl::info::device_type::gpu).size() == 0) {
+    std::cout << "No SYCL GPUs found, the test will be skipped." << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+  auto queue = sycl::queue{sycl::gpu_selector_v, sycl::property::queue::in_order()};
+#endif
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  auto *d_values = sycl::malloc_device<int>(N, queue);
+  queue.memcpy(d_values, values.data(), N * sizeof(int)).wait();
+
+  SECTION("Default comparison") {
+    auto min = xtd::min_element(d_values, d_values + N);
+    REQUIRE(*min == 0);
+  }
+
+  SECTION("Greater comparison") {
+    auto min = xtd::min_element(d_values, d_values + N, std::greater<int>{});
+    REQUIRE(*min == N - 1);
+  }
+
+  sycl::free(d_values, queue);
+}
diff --git a/test/reduce/reduce_t.cc b/test/reduce/reduce_t.cc
new file mode 100644
index 0000000..5b7f89c
--- /dev/null
+++ b/test/reduce/reduce_t.cc
@@ -0,0 +1,52 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+TEST_CASE("reduceCPU", "[reduce]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+
+  SECTION("Default reduction") {
+    auto red = xtd::reduce(values.begin(), values.end());
+    REQUIRE(red == std::reduce(values.begin(), values.end()));
+  }
+
+  SECTION("Less comparison") {
+    auto red = xtd::reduce(values.begin(), values.end(), -1, std::less<int>());
+    REQUIRE(red == std::reduce(values.begin(), values.end(), -1, std::less<int>()));
+  }
+
+  SECTION("Unseq execution policy") {
+    int red = xtd::reduce(std::execution::unseq, values.begin(), values.end());
+    REQUIRE(red == std::reduce(values.begin(), values.end()));
+  }
+
+  SECTION("Unseq execution policy with less comparison") {
+    auto red =
+        xtd::reduce(std::execution::unseq, values.begin(), values.end(), -1, std::less<int>());
+    REQUIRE(red ==
+            std::reduce(std::execution::unseq, values.begin(), values.end(), -1, std::less<int>()));
+  }
+
+  SECTION("Reduction with initial value") {
+    auto red = xtd::reduce(values.begin(), values.end(), 1);
+    REQUIRE(red == std::reduce(values.begin(), values.end(), 1));
+  }
+
+  SECTION("Reduction with initial value and unseq policy") {
+    auto red = xtd::reduce(std::execution::unseq, values.begin(), values.end(), 1);
+    REQUIRE(red == std::reduce(std::execution::unseq, values.begin(), values.end(), 1));
+  }
+}
diff --git a/test/reduce/reduce_t.cu b/test/reduce/reduce_t.cu
new file mode 100644
index 0000000..7810df6
--- /dev/null
+++ b/test/reduce/reduce_t.cu
@@ -0,0 +1,49 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/cuda_check.h"
+#include <cuda_runtime.h>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+TEST_CASE("reduceCUDA", "[reduce]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  int* d_values;
+  CUDA_CHECK(cudaMallocAsync(&d_values, N * sizeof(int), stream));
+  CUDA_CHECK(
+      cudaMemcpyAsync(d_values, values.data(), N * sizeof(int), cudaMemcpyHostToDevice, stream));
+
+  SECTION("Default reduction") {
+    auto red = xtd::reduce(d_values, d_values + N);
+    REQUIRE(red == std::reduce(values.begin(), values.end()));
+  }
+
+  SECTION("Less comparison") {
+    auto red = xtd::reduce(d_values, d_values + N, -1, std::less<int>());
+    REQUIRE(red == std::reduce(values.begin(), values.end(), -1, std::less<int>()));
+  }
+
+  SECTION("Reduction with initial value") {
+    auto red = xtd::reduce(d_values, d_values + N, 1);
+    REQUIRE(red == std::reduce(values.begin(), values.end(), 1));
+  }
+}
diff --git a/test/reduce/reduce_t.hip.cc b/test/reduce/reduce_t.hip.cc
new file mode 100644
index 0000000..575a603
--- /dev/null
+++ b/test/reduce/reduce_t.hip.cc
@@ -0,0 +1,48 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/hip_check.h"
+#include <hip_runtime.h>
+#include <rocthrust/copy.h>
+
+TEST_CASE("reduceHIP", "[reduce]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  int* d_values;
+  HIP_CHECK(hipMallocAsync(&d_values, N * sizeof(int), stream));
+  HIP_CHECK(
+      hipMemcpyAsync(d_values, values.data(), N * sizeof(int), hipMemcpyHostToDevice, stream));
+
+  SECTION("Default reduction") {
+    auto red = xtd::reduce(d_values, d_values + N);
+    REQUIRE(red == std::reduce(values.begin(), values.end()));
+  }
+
+  SECTION("Less comparison") {
+    auto red = xtd::reduce(d_values, d_values + N, -1, std::less<int>());
+    REQUIRE(red == std::reduce(values.begin(), values.end(), -1, std::less<int>()));
+  }
+
+  SECTION("Reduction with initial value") {
+    auto red = xtd::reduce(d_values, d_values + N, 1);
+    REQUIRE(red == std::reduce(values.begin(), values.end(), 1));
+  }
+}
diff --git a/test/reduce/reduce_t.sycl.cc b/test/reduce/reduce_t.sycl.cc
new file mode 100644
index 0000000..ada03d8
--- /dev/null
+++ b/test/reduce/reduce_t.sycl.cc
@@ -0,0 +1,52 @@
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include <sycl/sycl.hpp>
+
+TEST_CASE("reduceSYCL", "[reduce]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+#ifdef ONEAPI_CPU
+  auto queue = sycl::queue{sycl::cpu_selector_v, sycl::property::queue::in_order()};
+#else
+  if (sycl::device::get_devices(sycl::info::device_type::gpu).size() == 0) {
+    std::cout << "No SYCL GPUs found, the test will be skipped." << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+  auto queue = sycl::queue{sycl::gpu_selector_v, sycl::property::queue::in_order()};
+#endif
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  auto *d_values = sycl::malloc_device<int>(N, queue);
+  queue.memcpy(d_values, values.data(), N * sizeof(int)).wait();
+
+  SECTION("Default reduction") {
+    auto red = xtd::reduce(d_values, d_values + N);
+    REQUIRE(red == std::reduce(values.begin(), values.end()));
+  }
+
+  SECTION("Less comparison") {
+    auto red = xtd::reduce(d_values, d_values + N, -1, std::less<int>());
+    REQUIRE(red == std::reduce(values.begin(), values.end(), -1, std::less<int>()));
+  }
+
+  SECTION("Reduction with initial value") {
+    auto red = xtd::reduce(d_values, d_values + N, 1);
+    REQUIRE(red == std::reduce(values.begin(), values.end(), 1));
+  }
+
+  sycl::free(d_values, queue);
+}
diff --git a/test/sort/sort_t.cc b/test/sort/sort_t.cc
new file mode 100644
index 0000000..d0a95b1
--- /dev/null
+++ b/test/sort/sort_t.cc
@@ -0,0 +1,41 @@
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+TEST_CASE("sortCPU", "[sort]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  SECTION("Default comparison") {
+    xtd::sort(values.begin(), values.end());
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N)));
+  }
+
+  SECTION("Greater comparison") {
+    xtd::sort(values.begin(), values.end(), std::greater<int>());
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N) | std::views::reverse));
+  }
+
+  SECTION("Unseq execution policy") {
+    xtd::sort(std::execution::unseq, values.begin(), values.end());
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N)));
+  }
+
+  SECTION("Unseq execution policy with greater comparison") {
+    xtd::sort(std::execution::unseq, values.begin(), values.end(), std::greater<int>());
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N) | std::views::reverse));
+  }
+}
diff --git a/test/sort/sort_t.cu b/test/sort/sort_t.cu
new file mode 100644
index 0000000..ec34bf8
--- /dev/null
+++ b/test/sort/sort_t.cu
@@ -0,0 +1,52 @@
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/cuda_check.h"
+#include <cuda_runtime.h>
+
+TEST_CASE("sortCUDA", "[sort]") {
+  int deviceCount;
+  cudaError_t cudaStatus = cudaGetDeviceCount(&deviceCount);
+
+  if (cudaStatus != cudaSuccess || deviceCount == 0) {
+    std::cout << "No NVIDIA GPUs found, the test will be skipped." << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+  CUDA_CHECK(cudaSetDevice(0));
+
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  int* d_values;
+  CUDA_CHECK(cudaMalloc(&d_values, N * sizeof(int)));
+  CUDA_CHECK(cudaMemcpy(d_values, values.data(), N * sizeof(int), cudaMemcpyHostToDevice));
+
+  SECTION("Default comparison") {
+    xtd::sort(d_values, d_values + N);
+    CUDA_CHECK(cudaMemcpy(values.data(), d_values, N * sizeof(int), cudaMemcpyDeviceToHost));
+
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N)));
+  }
+
+  SECTION("Greater comparison") {
+    xtd::sort(d_values, d_values + N, std::greater<int>{});
+    CUDA_CHECK(cudaMemcpy(values.data(), d_values, N * sizeof(int), cudaMemcpyDeviceToHost));
+
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N) | std::views::reverse));
+  }
+
+  CUDA_CHECK(cudaFree(d_values));
+}
diff --git a/test/sort/sort_t.hip.cc b/test/sort/sort_t.hip.cc
new file mode 100644
index 0000000..90c25c2
--- /dev/null
+++ b/test/sort/sort_t.hip.cc
@@ -0,0 +1,52 @@
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include "common/hip_check.h"
+#include <hip_runtime.h>
+
+TEST_CASE("sortHIP", "[sort]") {
+  int deviceCount;
+  hipError_t hipStatus = hipGetDeviceCount(&deviceCount);
+
+  if (hipStatus != hipSuccess || deviceCount == 0) {
+    std::cout << "No AMD GPUs found, the test will be skipped." << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+  HIP_CHECK(hipSetDevice(0));
+
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  int* d_values;
+  HIP_CHECK(hipMalloc(&d_values, N * sizeof(int)));
+  HIP_CHECK(hipMemcpy(d_values, values.data(), N * sizeof(int), hipMemcpyHostToDevice));
+
+  SECTION("Default comparison") {
+    xtd::sort(d_values, d_values + N);
+    HIP_CHECK(hipMemcpy(values.data(), d_values, N * sizeof(int), hipMemcpyDeviceToHost));
+
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N)));
+  }
+
+  SECTION("Greater comparison") {
+    xtd::sort(d_values, d_values + N, std::greater<int>{});
+    HIP_CHECK(hipMemcpy(values.data(), d_values, N * sizeof(int), hipMemcpyDeviceToHost));
+
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N) | std::views::reverse));
+  }
+
+  HIP_CHECK(hipFree(d_values));
+}
diff --git a/test/sort/sort_t.sycl.cc b/test/sort/sort_t.sycl.cc
new file mode 100644
index 0000000..878dcfe
--- /dev/null
+++ b/test/sort/sort_t.sycl.cc
@@ -0,0 +1,51 @@
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "algorithm.h"
+
+#include <sycl/sycl.hpp>
+
+TEST_CASE("sortSYCL", "[sort]") {
+  const int N = 100;
+  std::random_device rd;
+  std::mt19937 rng(rd());
+
+#ifdef ONEAPI_CPU
+  auto queue = sycl::queue{sycl::cpu_selector_v, sycl::property::queue::in_order()};
+#else
+  if (sycl::device::get_devices(sycl::info::device_type::gpu).size() == 0) {
+    std::cout << "No SYCL GPUs found, the test will be skipped." << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+  auto queue = sycl::queue{sycl::gpu_selector_v, sycl::property::queue::in_order()};
+#endif
+
+  std::vector<int> values(N);
+  std::iota(values.begin(), values.end(), 0);
+  std::shuffle(values.begin(), values.end(), rng);
+
+  auto *d_values = sycl::malloc_device<int>(N, queue);
+  queue.memcpy(d_result, values.data(), N * sizeof(int)).wait();
+
+  SECTION("Default comparison") {
+    xtd::sort(d_values, d_values + N);
+    queue.memcpy(values.data(), d_values, N * sizeof(int)).wait();
+
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N)));
+  }
+
+  SECTION("Greater comparison") {
+    xtd::sort(d_values, d_values + N, std::greater<int>{});
+    queue.memcpy(values.data(), d_values, N * sizeof(int)).wait();
+
+    REQUIRE(std::ranges::equal(values, std::views::iota(0, N) | std::views::reverse));
+  }
+
+  sycl::free(d_values, queue);
+}