From 4bd0c2a04644a0bed5ec04f06a9f9fbc846da400 Mon Sep 17 00:00:00 2001
From: Nick Breed <nick@streamhpc.com>
Date: Mon, 4 Nov 2024 11:22:27 +0000
Subject: [PATCH 01/17] Set c++ version to 17 and create warning

---
 CMakeLists.txt                  | 10 +++++++++-
 test/cpp_wrapper/CMakeLists.txt | 11 ++++++++++-
 test/package/CMakeLists.txt     | 11 ++++++++++-
 test/parity/CMakeLists.txt      | 10 +++++++++-
 4 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff8bab44a..1bb71e733 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -142,7 +142,9 @@ include(cmake/VerifyCompiler.cmake)
 option(DISABLE_WERROR "Disable building with Werror" ON)
 
 # Build CXX flags
-set(CMAKE_CXX_STANDARD 11)
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 if(DISABLE_WERROR)
@@ -154,6 +156,12 @@ if(CODE_COVERAGE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
 endif()
 
+if (CMAKE_CXX_STANDARD EQUAL 14)
+  message(WARNING "C++14 will be deprecated in the next major release")
+elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
+  message(FATAL_ERROR "Only C++14 and C++17 are supported")
+endif()
+
 # HIP on Windows: xhip is required with clang++ to get __half defined
 if (WIN32)
   add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-xhip>")
diff --git a/test/cpp_wrapper/CMakeLists.txt b/test/cpp_wrapper/CMakeLists.txt
index 9aee5b320..8043696ad 100644
--- a/test/cpp_wrapper/CMakeLists.txt
+++ b/test/cpp_wrapper/CMakeLists.txt
@@ -33,10 +33,19 @@ list(APPEND CMAKE_MODULE_PATH
 )
 
 set(CMAKE_CXX_COMPILER g++)
-set(CMAKE_CXX_STANDARD 11)
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
+
+if (CMAKE_CXX_STANDARD EQUAL 14)
+  message(WARNING "C++14 will be deprecated in the next major release")
+elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
+  message(FATAL_ERROR "Only C++14 and C++17 are supported")
+endif()
+
 include(cmake/Dependencies.cmake)
 
 # Find rocRAND
diff --git a/test/package/CMakeLists.txt b/test/package/CMakeLists.txt
index 78250ee30..055dda460 100644
--- a/test/package/CMakeLists.txt
+++ b/test/package/CMakeLists.txt
@@ -45,10 +45,19 @@ else()
 find_package(hip REQUIRED CONFIG PATHS $ENV{ROCM_PATH})
 endif()
 
-set(CMAKE_CXX_STANDARD 11)
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
+
+if (CMAKE_CXX_STANDARD EQUAL 14)
+  message(WARNING "C++14 will be deprecated in the next major release")
+elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
+  message(FATAL_ERROR "Only C++14 and C++17 are supported")
+endif()
+
 # Find rocRAND
 find_package(rocrand REQUIRED CONFIG HINTS ${rocrand_DIR} PATHS "$ENV{ROCM_PATH}/rocrand")
 
diff --git a/test/parity/CMakeLists.txt b/test/parity/CMakeLists.txt
index 41b3fa0b7..263f2fcd0 100644
--- a/test/parity/CMakeLists.txt
+++ b/test/parity/CMakeLists.txt
@@ -56,10 +56,18 @@ else()
     set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
 endif()
 
-set(CMAKE_CXX_STANDARD 14)
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
+if (CMAKE_CXX_STANDARD EQUAL 14)
+  message(WARNING "C++14 will be deprecated in the next major release")
+elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
+  message(FATAL_ERROR "Only C++14 and C++17 are supported")
+endif()
+
 # Find rocRAND
 find_package(rocrand REQUIRED CONFIG HINTS ${rocrand_DIR} PATHS "$ENV{ROCM_PATH}/rocrand")
 

From 92d94e09aaeb90dffe7a40c62f818846a511edeb Mon Sep 17 00:00:00 2001
From: Nick Breed <nick@streamhpc.com>
Date: Mon, 4 Nov 2024 11:23:10 +0000
Subject: [PATCH 02/17] Fix nodiscard warnings with C++17

---
 test/test_rocrand_hipgraphs.cpp     |  5 +++++
 test/test_rocrand_kernel_mtgp32.cpp | 14 +++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/test/test_rocrand_hipgraphs.cpp b/test/test_rocrand_hipgraphs.cpp
index 912c99c55..d4dadcfde 100644
--- a/test/test_rocrand_hipgraphs.cpp
+++ b/test/test_rocrand_hipgraphs.cpp
@@ -186,8 +186,13 @@ TEST_P(rocrand_hipgraph_generate_tests, poisson_test)
 
 TEST(rocrand_hipgraph_generate_tests, hipgraphs_doc_sample){
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-result"
+
 #include "hipgraphs_doc_sample.hpp"
 
+#pragma GCC diagnostic pop
+
 }
 
 INSTANTIATE_TEST_SUITE_P(rocrand_hipgraph_generate_tests,
diff --git a/test/test_rocrand_kernel_mtgp32.cpp b/test/test_rocrand_kernel_mtgp32.cpp
index 2fd6b37cc..4837b477d 100644
--- a/test/test_rocrand_kernel_mtgp32.cpp
+++ b/test/test_rocrand_kernel_mtgp32.cpp
@@ -286,7 +286,7 @@ TEST(rocrand_kernel_mtgp32, rocrand)
     typedef rocrand_state_mtgp32 state_type;
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 
@@ -320,7 +320,7 @@ TEST(rocrand_kernel_mtgp32, rocrand_uniform)
     typedef rocrand_state_mtgp32 state_type;
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 
@@ -352,7 +352,7 @@ TEST(rocrand_kernel_mtgp32, rocrand_normal)
     typedef rocrand_state_mtgp32 state_type;
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 
@@ -387,7 +387,7 @@ TEST(rocrand_kernel_mtgp32, rocrand_normal_double)
     typedef rocrand_state_mtgp32 state_type;
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 
@@ -423,7 +423,7 @@ TEST(rocrand_kernel_mtgp32, rocrand_log_normal)
     typedef rocrand_state_mtgp32 state_type;
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 
@@ -463,7 +463,7 @@ TEST(rocrand_kernel_mtgp32, rocrand_log_normal_double)
     typedef rocrand_state_mtgp32 state_type;
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 
@@ -508,7 +508,7 @@ TEST_P(rocrand_kernel_mtgp32_poisson, rocrand_poisson)
     const double lambda = GetParam();
 
     state_type* states;
-    hipMallocHelper(&states, sizeof(state_type) * 8);
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
 
     ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
 

From 01598420a25a8ab0de7143fcf698a55a0e0606cf Mon Sep 17 00:00:00 2001
From: Nick Breed <nick@streamhpc.com>
Date: Mon, 4 Nov 2024 11:30:20 +0000
Subject: [PATCH 03/17] Build for both c++ 14 and 17

---
 .gitlab-ci.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 93e3a4fc8..82b26027f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -163,6 +163,7 @@ copyright-date:
       -D DISABLE_WERROR=OFF
       -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
       -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
+      -D CMAKE_CXX_STANDARD="$BUILD_VERSION"
     - cmake --build $CI_PROJECT_DIR/build
     - if [[ "${BUILD_SHARED_LIBS}" = "ON" ]]; then cmake --build $CI_PROJECT_DIR/build --target package; fi
 
@@ -189,6 +190,7 @@ copyright-date:
       -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
       -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
       -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda
+      -D CMAKE_CXX_STANDARD=14
     - cmake --build $CI_PROJECT_DIR/build
     - if [[ "${BUILD_SHARED_LIBS}" = "ON" ]]; then cmake --build $CI_PROJECT_DIR/build --target package; fi
 
@@ -217,6 +219,7 @@ build:rocm-cmake-minimum:
   variables:
     BUILD_SHARED_LIBS: "ON"
     BUILD_BENCHMARK_TUNING: "ON"
+    BUILD_VERSION: 14
 
 build:rocm-hipcc-cmake-minimum:
   tags:
@@ -228,6 +231,7 @@ build:rocm-hipcc-cmake-minimum:
     - .save-artifacts
   variables:
     BUILD_SHARED_LIBS: "ON"
+    BUILD_VERSION: 14
 
 build:rocm-static-cmake-minimum:
   tags:
@@ -238,6 +242,7 @@ build:rocm-static-cmake-minimum:
     - .rocm:build
   variables:
     BUILD_SHARED_LIBS: "OFF"
+    BUILD_VERSION: 14
 
 build:rocm-cmake-latest:
   tags:
@@ -248,6 +253,10 @@ build:rocm-cmake-latest:
     - .rocm:build
   variables:
     BUILD_SHARED_LIBS: "ON"
+  parallel:
+    matrix:
+      - BUILD_VERSION: [14, 17]
+
 
 build:nvcc-cmake-minimum:
   tags:
@@ -324,6 +333,7 @@ benchmark:benchmark-tuning:
       -D BENCHMARK_TUNING_BLOCK_OPTIONS="${BENCHMARK_TUNING_BLOCK_OPTIONS}"
       -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
       -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
+      -D CMAKE_CXX_STANDARD=14
     - cmake --build $CI_PROJECT_DIR/build --target benchmark_rocrand_tuning
     - $CI_PROJECT_DIR/build/benchmark/tuning/benchmark_rocrand_tuning --benchmark_out_format=json --benchmark_out=$CI_PROJECT_DIR/build/rocrand_config_tuning_${GPU_TARGET}_${CI_JOB_ID}.json
   artifacts:
@@ -604,6 +614,7 @@ test:nvcc-parity:
       -D CMAKE_INSTALL_PREFIX:PATH="$CI_PROJECT_DIR/build/install"
       -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}/lib/cmake"
       -D DISABLE_WERROR=OFF *>&1
+      -D CMAKE_CXX_STANDARD=14
     # Building
     - cmake --build "$CI_PROJECT_DIR/build" *>&1
 
@@ -632,6 +643,7 @@ test:windows:
       -D CMAKE_BUILD_TYPE=Release
       -D CMAKE_CXX_COMPILER:FILEPATH="${env:HIP_PATH}/bin/clang++.exe"
       -D CMAKE_PREFIX_PATH:FILEPATH="${env:HIP_PATH}/lib/cmake;$CI_PROJECT_DIR/build/install" *>&1
+      -D CMAKE_CXX_STANDARD=14
     # Build package test
     - cmake --build "$CI_PROJECT_DIR/build_install_test"
     # Copy rocRAND.dll to the package test build directory
@@ -663,6 +675,7 @@ test:windows:
       -D CMAKE_CXX_COMPILER=${COMPILER}
       -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
       -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
+      -D CMAKE_CXX_STANDARD=14
     - cmake --build ${ROCRAND_STAT_TESTS_DIR}/build
     - mkdir ${LOGS_DIR}
     - cd ${ROCRAND_STAT_TESTS_DIR}/build

From 3d0003593dca4f2abea1539cf21f4f6173a8e6bd Mon Sep 17 00:00:00 2001
From: Nick Breed <nick@streamhpc.com>
Date: Mon, 4 Nov 2024 11:52:53 +0000
Subject: [PATCH 04/17] Add to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 45fcaae6c..1d0d25cc0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Documentation for rocRAND is available at
 
 ### Changed
 * Updated several `gfx942` auto tuning parameters.
+* Changed the default C++ standard from 14 to 17 (set `CMAKE_CXX_STANDARD=14` to keep building with C++14). C++14 will be deprecated in the next major release.
 
 ### Fixed
 * Fixed an issue where `mt19937.hpp` would cause kernel errors during auto tuning.

From ef2e1019f184fa94b24535a53547b2033817a56f Mon Sep 17 00:00:00 2001
From: Sander Bos <sander@streamhpc.com>
Date: Fri, 22 Nov 2024 12:56:27 +0000
Subject: [PATCH 05/17] Fix warnings when generating sphinx docs

---
 docs/api-reference/cpp-api.rst    |   2 +-
 docs/conf.py                      |   7 +
 docs/doxygen/Doxyfile             | 852 ++++++++++++++++++++----------
 library/include/rocrand/rocrand.h |  47 +-
 4 files changed, 620 insertions(+), 288 deletions(-)

diff --git a/docs/api-reference/cpp-api.rst b/docs/api-reference/cpp-api.rst
index 6db52ae88..afa7705f3 100644
--- a/docs/api-reference/cpp-api.rst
+++ b/docs/api-reference/cpp-api.rst
@@ -11,7 +11,7 @@ C/C++ API reference
 This chapter describes the rocRAND C and C++ API.
 
 API index
-===========
+=========
 
 To search an API, refer to the API :ref:`genindex`.
 
diff --git a/docs/conf.py b/docs/conf.py
index 0362d9f2b..0c7945207 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -44,3 +44,10 @@
 
 for sphinx_var in ROCmDocs.SPHINX_VARS:
     globals()[sphinx_var] = getattr(docs_core, sphinx_var)
+
+# Suppresses "WARNING: toctree directive not expected with external-toc"
+# Ideally our code would be fixed, so suppression isn't needed:
+# https://github.com/executablebooks/sphinx-external-toc/issues/36
+suppress_warnings = ["etoc.toctree"]
+
+cpp_id_attributes = ["__forceinline__", "__device__", "__host__", "rocrand_status"]
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index b0ae0c543..fddcb7265 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.17
+# Doxyfile 1.9.8
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -12,6 +12,16 @@
 # For lists, items can also be appended using:
 # TAG += value [value, ...]
 # Values that contain spaces should be placed between quotes (\" \").
+#
+# Note:
+#
+# Use doxygen to compare the used configuration file with the template
+# configuration file:
+# doxygen -x [configFile]
+# Use doxygen to compare the used configuration file with the template
+# configuration file without replacing the environment variables or CMake type
+# replacement variables:
+# doxygen -x_noenv [configFile]
 
 #---------------------------------------------------------------------------
 # Project related configuration options
@@ -60,16 +70,28 @@ PROJECT_LOGO           =
 
 OUTPUT_DIRECTORY       = .
 
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096
+# sub-directories (in 2 levels) under the output directory of each output format
+# and will distribute the generated files over these directories. Enabling this
 # option can be useful when feeding doxygen a huge amount of source files, where
 # putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
+# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to
+# control the number of sub-directories.
 # The default value is: NO.
 
 CREATE_SUBDIRS         = NO
 
+# Controls the number of sub-directories that will be created when
+# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every
+# level increment doubles the number of directories, resulting in 4096
+# directories at level 8 which is the default and also the maximum value. The
+# sub-directories are organized in 2 levels, the first level always has a fixed
+# number of 16 directories.
+# Minimum value: 0, maximum value: 8, default value: 8.
+# This tag requires that the tag CREATE_SUBDIRS is set to YES.
+
+CREATE_SUBDIRS_LEVEL   = 8
+
 # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
 # characters to appear in the names of generated files. If set to NO, non-ASCII
 # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
@@ -81,26 +103,18 @@ ALLOW_UNICODE_NAMES    = NO
 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
 # documentation generated by doxygen is written. Doxygen will use this
 # information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian,
+# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English
+# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek,
+# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with
+# English messages), Korean, Korean-en (Korean with English messages), Latvian,
+# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese,
+# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish,
+# Swedish, Turkish, Ukrainian and Vietnamese.
 # The default value is: English.
 
 OUTPUT_LANGUAGE        = English
 
-# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all generated output in the proper direction.
-# Possible values are: None, LTR, RTL and Context.
-# The default value is: None.
-
-OUTPUT_TEXT_DIRECTION  = None
-
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -217,6 +231,14 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO, doxygen's
+# special commands can be used and the contents of the docstring documentation
+# blocks are shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -240,25 +262,19 @@ TAB_SIZE               = 8
 # the documentation. An alias has the form:
 # name=value
 # For example adding
-# "sideeffect=@par Side Effects:\n"
+# "sideeffect=@par Side Effects:^^"
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines (in the resulting output). You can put ^^ in the value part of an
-# alias to insert a newline as if a physical newline was in the original file.
-# When you need a literal { or } or , in the value part of an alias you have to
-# escape them by means of a backslash (\), this can lead to conflicts with the
-# commands \{ and \} for these it is advised to use the version @{ and @} or use
-# a double escape (\\{ and \\})
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
 
 ALIASES                = "rocrand_internal="
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -300,18 +316,21 @@ OPTIMIZE_OUTPUT_SLICE  = NO
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
 # language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
-# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
-# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
 # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
 # tries to guess whether the code is fixed or free formatted code, this is the
-# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
-# .inc files as Fortran files (default is PHP), and .f files as C (default is
-# Fortran), use: inc=Fortran f=C.
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
@@ -334,6 +353,17 @@ MARKDOWN_SUPPORT       = YES
 
 TOC_INCLUDE_HEADINGS   = 5
 
+# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to
+# generate identifiers for the Markdown headings. Note: Every identifier is
+# unique.
+# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a
+# sequence number starting at 0 and GITHUB use the lower case version of title
+# with any whitespace replaced by '-' and punctuation characters removed.
+# The default value is: DOXYGEN.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+MARKDOWN_ID_STYLE      = DOXYGEN
+
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
 # be prevented in individual cases by putting a % sign in front of the word or
@@ -445,6 +475,27 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
+# If the TIMESTAMP tag is set different from NO then each generated page will
+# contain the date or date and time when the page was generated. Setting this to
+# NO can help when comparing the output of multiple runs.
+# Possible values are: YES, NO, DATETIME and DATE.
+# The default value is: NO.
+
+TIMESTAMP              = NO
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -508,6 +559,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -519,7 +577,8 @@ HIDE_UNDOC_MEMBERS     = NO
 # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
 # undocumented classes that are normally visible in the class hierarchy. If set
 # to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
+# will also hide undocumented C++ concepts if enabled. This option has no effect
+# if EXTRACT_ALL is enabled.
 # The default value is: NO.
 
 HIDE_UNDOC_CLASSES     = NO
@@ -545,12 +604,20 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# (including Cygwin) ands Mac users are advised to set this option to NO.
-# The default value is: system dependent.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# Possible values are: SYSTEM, NO and YES.
+# The default value is: SYSTEM.
 
 CASE_SENSE_NAMES       = YES
 
@@ -568,6 +635,12 @@ HIDE_SCOPE_NAMES       = NO
 
 HIDE_COMPOUND_REFERENCE= NO
 
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE        = YES
+
 # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
 # the files that are included by a file in the documentation of that file.
 # The default value is: YES.
@@ -725,7 +798,8 @@ FILE_VERSION_FILTER    =
 # output files in an output format independent way. To create the layout file
 # that represents doxygen's defaults, run doxygen with the -l option. You can
 # optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
 #
 # Note that if you run doxygen from a directory containing a file called
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
@@ -771,24 +845,50 @@ WARNINGS               = YES
 WARN_IF_UNDOCUMENTED   = YES
 
 # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
 # The default value is: YES.
 
 WARN_IF_DOC_ERROR      = YES
 
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation. If
-# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
 # The default value is: NO.
 
 WARN_NO_PARAMDOC       = NO
 
+# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about
+# undocumented enumeration values. If set to NO, doxygen will accept
+# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: NO.
+
+WARN_IF_UNDOC_ENUM_VAL = NO
+
 # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves
+# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not
+# write the warning messages in between other messages but write them at the end
+# of a run. In case a WARN_LOGFILE is defined, the warning messages will,
+# besides being written to that file, also be shown at the end of a run, unless
+# the WARN_LOGFILE is defined as - (i.e. standard output, stdout); in that case
+# the behavior will remain as with the setting FAIL_ON_WARNINGS.
+# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT.
 # The default value is: NO.
 
 WARN_AS_ERROR          = YES
@@ -799,13 +899,27 @@ WARN_AS_ERROR          = YES
 # and the warning text. Optionally the format may contain $version, which will
 # be replaced by the version of the file (if it could be obtained via
 # FILE_VERSION_FILTER)
+# See also: WARN_LINE_FORMAT
 # The default value is: $file:$line: $text.
 
 WARN_FORMAT            = "$file:$line: $text"
 
+# In the $text part of the WARN_FORMAT command it is possible that a reference
+# to a more specific place is given. To make it easier to jump to this place
+# (outside of doxygen) the user can define a custom "cut" / "paste" string.
+# Example:
+# WARN_LINE_FORMAT = "'vi $file +$line'"
+# See also: WARN_FORMAT
+# The default value is: at line $line of file $file.
+
+WARN_LINE_FORMAT       = "at line $line of file $file"
+
 # The WARN_LOGFILE tag can be used to specify a file to which warning and error
 # messages should be written. If left blank the output is written to standard
-# error (stderr).
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When - is specified
+# as the file, the warning and error messages are written to standard output
+# (stdout).
 
 WARN_LOGFILE           =
 
@@ -828,12 +942,23 @@ INPUT                  = ../mainpage.dox \
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# See also: INPUT_FILE_ENCODING
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
 
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. The INPUT_FILE_ENCODING tag can be used to specify
+# character encoding on a per file pattern basis. Doxygen will compare the file
+# name with each pattern and apply the encoding (instead of the default
+# INPUT_ENCODING) if there is a match. The character encodings are a list of the
+# form: pattern=encoding (like *.php=ISO-8859-1). See the INPUT_ENCODING tag
+# for further information on supported encodings.
+
+INPUT_FILE_ENCODING    =
+
 # If the value of the INPUT tag contains directories, you can use the
 # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
 # *.h) to filter out the source-files in the directories.
@@ -842,13 +967,15 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
-# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
-# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen
-# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf and *.ice.
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm,
+# *.cpp, *.cppm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl,
+# *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, *.php,
+# *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be
+# provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          =
 
@@ -887,10 +1014,7 @@ EXCLUDE_PATTERNS       =
 # (namespaces, classes, functions, etc.) that should be excluded from the
 # output. The symbol name can be a fully qualified name, a word, or if the
 # wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
+# ANamespace::AClass, ANamespace::*Test
 
 EXCLUDE_SYMBOLS        = rocrand_device::* \
                          rocrand_impl::*
@@ -936,6 +1060,11 @@ IMAGE_PATH             =
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
 #
+# Note that doxygen will use the data processed and written to standard output
+# for further processing, therefore nothing else, like debug statements or used
+# commands (so in case of a Windows batch file always use @echo OFF), should be
+# written to standard output.
+#
 # Note that for custom extensions or not directly supported extensions you also
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # properly processed by doxygen.
@@ -977,6 +1106,15 @@ FILTER_SOURCE_PATTERNS =
 
 USE_MDFILE_AS_MAINPAGE =
 
+# The Fortran standard specifies that for fixed formatted Fortran code all
+# characters from position 72 are to be considered as comment. A common
+# extension is to allow longer lines before the automatic comment starts. The
+# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can
+# be processed before the automatic comment starts.
+# Minimum value: 7, maximum value: 10000, default value: 72.
+
+FORTRAN_COMMENT_AFTER  = 72
+
 #---------------------------------------------------------------------------
 # Configuration options related to source browsing
 #---------------------------------------------------------------------------
@@ -1063,6 +1201,46 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see:
+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
+# performance. This can be particularly helpful with template rich C++ code for
+# which doxygen's built-in parser lacks the necessary type information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS
+# tag is set to YES then doxygen will add the directory of each input to the
+# include path.
+# The default value is: YES.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_ADD_INC_PATHS    = YES
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the -p option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1074,17 +1252,11 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = NO
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
+# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes)
+# that should be ignored while generating the index headers. The IGNORE_PREFIX
+# tag works for classes, function and member names. The entity will be placed in
+# the alphabetical list under the first letter of the entity name that remains
+# after removing the prefix.
 # This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
 IGNORE_PREFIX          =
@@ -1163,7 +1335,12 @@ HTML_STYLESHEET        = ../_doxygen/stylesheet.css
 # Doxygen will copy the style sheet files to the output directory.
 # Note: The order of the extra style sheet files is of importance (e.g. the last
 # style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
+# list).
+# Note: Since the styling of scrollbars can currently not be overruled in
+# Webkit/Chromium, the styling will be left out of the default doxygen.css if
+# one or more extra stylesheets have been specified. So if scrollbar
+# customization is desired it has to be added explicitly. For an example see the
+# documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_EXTRA_STYLESHEET  = ../_doxygen/extra_stylesheet.css
@@ -1178,9 +1355,22 @@ HTML_EXTRA_STYLESHEET  = ../_doxygen/extra_stylesheet.css
 
 HTML_EXTRA_FILES       =
 
+# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output
+# should be rendered with a dark or light theme.
+# Possible values are: LIGHT always generate light mode output, DARK always
+# generate dark mode output, AUTO_LIGHT automatically set the mode according to
+# the user preference, use light mode if no preference is set (the default),
+# AUTO_DARK automatically set the mode according to the user preference, use
+# dark mode if no preference is set and TOGGLE allow the user to switch between
+# light and dark mode via a button.
+# The default value is: AUTO_LIGHT.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE        = AUTO_LIGHT
+
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
+# this color. Hue is specified as an angle on a color-wheel, see
 # https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
@@ -1190,7 +1380,7 @@ HTML_EXTRA_FILES       =
 HTML_COLORSTYLE_HUE    = 220
 
 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
 # value of 255 will produce the most vivid colors.
 # Minimum value: 0, maximum value: 255, default value: 100.
 # This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1208,15 +1398,6 @@ HTML_COLORSTYLE_SAT    = 100
 
 HTML_COLORSTYLE_GAMMA  = 80
 
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
 # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
 # documentation will contain a main index with vertical navigation menus that
 # are dynamically created via JavaScript. If disabled, the navigation index will
@@ -1236,6 +1417,13 @@ HTML_DYNAMIC_MENUS     = YES
 
 HTML_DYNAMIC_SECTIONS  = NO
 
+# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be
+# dynamically folded and expanded in the generated HTML source code.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_CODE_FOLDING      = YES
+
 # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
 # shown in the various tree structured indices initially; the user can expand
 # and collapse entries dynamically later on. Doxygen will expand the tree to
@@ -1251,10 +1439,11 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/xcode/), introduced with OSX
-# 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
 # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
 # genXcode/_index.html for more information.
@@ -1271,6 +1460,13 @@ GENERATE_DOCSET        = NO
 
 DOCSET_FEEDNAME        = "Doxygen generated docs"
 
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL         =
+
 # This tag specifies a string that should uniquely identify the documentation
 # set bundle. This should be a reverse domain-name style string, e.g.
 # com.mycompany.MyDocSet. Doxygen will append .docset to the name.
@@ -1296,8 +1492,12 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# among others the download links, offline (the HTML help workshop was already
+# many years in maintenance mode). You can download the HTML help workshop
+# installation executable from the web archives (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1327,7 +1527,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
@@ -1354,6 +1554,16 @@ BINARY_TOC             = NO
 
 TOC_EXPAND             = NO
 
+# The SITEMAP_URL tag is used to specify the full URL of the place where the
+# generated documentation will be placed on the server by the user during the
+# deployment of the documentation. The generated sitemap is called sitemap.xml
+# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL
+# is specified no sitemap is generated. For information about the sitemap
+# protocol see https://www.sitemaps.org
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SITEMAP_URL            =
+
 # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
 # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
 # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
@@ -1372,7 +1582,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1380,8 +1591,8 @@ QHP_NAMESPACE          =
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1389,16 +1600,16 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
@@ -1410,9 +1621,9 @@ QHP_CUST_FILTER_ATTRS  =
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1455,16 +1666,28 @@ DISABLE_INDEX          = NO
 # to work a browser that supports JavaScript, DHTML, CSS and frames is required
 # (i.e. any modern browser). Windows users are probably better off using the
 # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 GENERATE_TREEVIEW      = NONE
 
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR           = NO
+
 # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
 # doxygen will group on one line in the generated HTML documentation.
 #
@@ -1489,6 +1712,24 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS       = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1498,17 +1739,6 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
 # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
 # to create new LaTeX commands to be used in formulas as building blocks. See
 # the section "Including formulas" for details.
@@ -1526,11 +1756,29 @@ FORMULA_MACROFILE      =
 
 USE_MATHJAX            = NO
 
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION        = MathJax_2
+
 # When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
 # Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
+# compatibility. This is the name for MathJax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for MathJax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
 # The default value is: HTML-CSS.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1543,22 +1791,29 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
 # extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1605,7 +1860,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1618,8 +1874,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1728,29 +1985,31 @@ PAPER_TYPE             = a4wide
 
 EXTRA_PACKAGES         =
 
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
 #
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. Certain
+# commands have a special meaning inside the header (and footer); for a
+# description of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
 
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
 # LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_FOOTER           =
@@ -1783,18 +2042,26 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 USE_PDFLATEX           = YES
 
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
+# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error.
+# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch
+# mode nothing is printed on the terminal, errors are scrolled as if <return> is
+# hit at every error; missing files that TeX tries to input or request from
+# keyboard input (\read on a not open input stream) cause the job to abort,
+# NON_STOP In nonstop mode the diagnostic message will appear on the terminal,
+# but there is no possibility of user interaction just like in batch mode,
+# SCROLL In scroll mode, TeX will stop only for missing files to input or if
+# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at
+# each error, asking for user intervention.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1807,16 +2074,6 @@ LATEX_BATCHMODE        = NO
 
 LATEX_HIDE_INDICES     = NO
 
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
 # https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
@@ -1825,14 +2082,6 @@ LATEX_SOURCE_CODE      = NO
 
 LATEX_BIB_STYLE        = plain
 
-# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_TIMESTAMP        = NO
-
 # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
 # path from which the emoji images will be read. If a relative path is entered,
 # it will be relative to the LATEX_OUTPUT directory. If left blank the
@@ -1897,16 +2146,6 @@ RTF_STYLESHEET_FILE    =
 
 RTF_EXTENSIONS_FILE    =
 
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
 #---------------------------------------------------------------------------
 # Configuration options related to the man page output
 #---------------------------------------------------------------------------
@@ -2003,27 +2242,44 @@ GENERATE_DOCBOOK       = NO
 
 DOCBOOK_OUTPUT         = docbook
 
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
 #---------------------------------------------------------------------------
 # Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures
 # the structure of the code including all documentation. Note that this feature
 # is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
 
+#---------------------------------------------------------------------------
+# Configuration options related to Sqlite3 output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_SQLITE3 tag is set to YES doxygen will generate a Sqlite3
+# database with symbols found by doxygen stored in tables.
+# The default value is: NO.
+
+GENERATE_SQLITE3       = NO
+
+# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be
+# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put
+# in front of it.
+# The default directory is: sqlite3.
+# This tag requires that the tag GENERATE_SQLITE3 is set to YES.
+
+SQLITE3_OUTPUT         = sqlite3
+
+# If the SQLITE3_RECREATE_DB tag is set to YES, the existing doxygen_sqlite3.db
+# database file will be recreated with each doxygen run. If set to NO, doxygen
+# will warn if a database file is already found and not modify it.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_SQLITE3 is set to YES.
+
+SQLITE3_RECREATE_DB    = YES
+
 #---------------------------------------------------------------------------
 # Configuration options related to the Perl module output
 #---------------------------------------------------------------------------
@@ -2098,7 +2354,8 @@ SEARCH_INCLUDES        = YES
 
 # The INCLUDE_PATH tag can be used to specify one or more directories that
 # contain include files that are not input files but should be processed by the
-# preprocessor.
+# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of
+# RECURSIVE has no effect here.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
 INCLUDE_PATH           =
@@ -2166,15 +2423,15 @@ TAGFILES               =
 
 GENERATE_TAGFILE       = html/tagfile.xml
 
-# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
+# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces
+# will be listed in the class and namespace index. If set to NO, only the
+# inherited external classes will be listed.
 # The default value is: NO.
 
 ALLEXTERNALS           = NO
 
 # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
+# in the topic index. If set to NO, only the current project's groups will be
 # listed.
 # The default value is: YES.
 
@@ -2188,25 +2445,9 @@ EXTERNAL_GROUPS        = YES
 EXTERNAL_PAGES         = YES
 
 #---------------------------------------------------------------------------
-# Configuration options related to the dot tool
+# Configuration options related to diagram generator tools
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
 # If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
 # The default value is: YES.
@@ -2215,10 +2456,10 @@ HIDE_UNDOC_RELATIONS   = YES
 
 # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
 # available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
 # Bell Labs. The other options in this section have no effect if this option is
 # set to NO
-# The default value is: NO.
+# The default value is: YES.
 
 HAVE_DOT               = NO
 
@@ -2232,49 +2473,73 @@ HAVE_DOT               = NO
 
 DOT_NUM_THREADS        = 0
 
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
+# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of
+# subgraphs. When you want a differently looking font in the dot files that
+# doxygen generates you can specify fontname, fontcolor and fontsize attributes.
+# For details please see <a href=https://graphviz.org/doc/info/attrs.html>Node,
+# Edge and Graph Attributes specification</a> You need to make sure dot is able
+# to find the font, which can be done by putting it in a standard location or by
+# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font. Default graphviz fontsize is 14.
+# The default value is: fontname=Helvetica,fontsize=10.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_FONTNAME           = FreeSans
+DOT_COMMON_ATTR        = "fontname=FreeSans,fontsize=10"
 
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
+# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can
+# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. <a
+# href=https://graphviz.org/doc/info/arrows.html>Complete documentation about
+# arrows shapes.</a>
+# The default value is: labelfontname=Helvetica,labelfontsize=10.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_FONTSIZE           = 10
+DOT_EDGE_ATTR          = "labelfontname=FreeSans,labelfontsize=10"
 
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
+# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes
+# around nodes set 'shape=plain' or 'shape=plaintext' <a
+# href=https://www.graphviz.org/doc/info/shapes.html>Shapes specification</a>
+# The default value is: shape=box,height=0.2,width=0.4.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NODE_ATTR          = "shape=box,height=0.2,width=0.4"
+
+# You can set the path where dot can find the font specified with fontname in
+# DOT_COMMON_ATTR and other dot attributes.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_FONTPATH           =
 
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will
+# generate a graph for each documented class showing the direct and indirect
+# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and
+# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case
+# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the
+# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used.
+# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance
+# relations will be shown as texts / links.
+# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 CLASS_GRAPH            = YES
 
 # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
 # graph for each documented class showing the direct and indirect implementation
 # dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
+# class with other documented classes. Explicit enabling a collaboration graph,
+# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the
+# command \collaborationgraph. Disabling a collaboration graph can be
+# accomplished by means of the command \hidecollaborationgraph.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 COLLABORATION_GRAPH    = YES
 
 # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
+# groups, showing the direct groups dependencies. Explicit enabling a group
+# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means
+# of the command \groupgraph. Disabling a group graph can be accomplished by
+# means of the command \hidegroupgraph. See also the chapter Grouping in the
+# manual.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2297,10 +2562,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2312,7 +2599,9 @@ TEMPLATE_RELATIONS     = NO
 # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
 # YES then doxygen will generate a graph for each documented file showing the
 # direct and indirect include dependencies of the file with other documented
-# files.
+# files. Explicit enabling an include graph, when INCLUDE_GRAPH is set to NO,
+# can be accomplished by means of the command \includegraph. Disabling an
+# include graph can be accomplished by means of the command \hideincludegraph.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2321,7 +2610,10 @@ INCLUDE_GRAPH          = YES
 # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
 # set to YES then doxygen will generate a graph for each documented file showing
 # the direct and indirect include dependencies of the file with other documented
-# files.
+# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set
+# to NO, can be accomplished by means of the command \includedbygraph. Disabling
+# an included by graph can be accomplished by means of the command
+# \hideincludedbygraph.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2361,21 +2653,32 @@ GRAPHICAL_HIERARCHY    = YES
 # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
 # dependencies a directory has on other directories in a graphical way. The
 # dependency relations are determined by the #include relations between the
-# files in the directories.
+# files in the directories. Explicit enabling a directory graph, when
+# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command
+# \directorygraph. Disabling a directory graph can be accomplished by means of
+# the command \hidedirectorygraph.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 DIRECTORY_GRAPH        = YES
 
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH    = 1
+
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
 # generated by dot. For an explanation of the image formats see the section
 # output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
+# https://www.graphviz.org/)).
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# Possible values are: png, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd,
+# gif, gif:cairo, gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd,
+# png:cairo, png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
 # png:gdiplus:gdiplus.
 # The default value is: png.
 # This tag requires that the tag HAVE_DOT is set to YES.
@@ -2407,11 +2710,12 @@ DOT_PATH               =
 
 DOTFILE_DIRS           =
 
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
 
-MSCFILE_DIRS           =
+DIA_PATH               =
 
 # The DIAFILE_DIRS tag can be used to specify one or more directories that
 # contain dia files that are included in the documentation (see the \diafile
@@ -2420,10 +2724,10 @@ MSCFILE_DIRS           =
 DIAFILE_DIRS           =
 
 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
 
 PLANTUML_JAR_PATH      =
 
@@ -2461,18 +2765,6 @@ DOT_GRAPH_MAX_NODES    = 50
 
 MAX_DOT_GRAPH_DEPTH    = 1000
 
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
 # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10) support
@@ -2485,14 +2777,34 @@ DOT_MULTI_TARGETS      = NO
 # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
 # explaining the meaning of the various boxes and arrows in the dot generated
 # graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will
+# use a built-in version of mscgen tool to produce the charts. Alternatively,
+# the MSCGEN_TOOL tag can also specify the name of an external tool. For instance,
+# specifying prog as the value, doxygen will call the tool as prog -T
+# <outfile_format> -o <outputfile> <inputfile>. The external tool should support
+# output file formats "png", "eps", "svg", and "ismap".
+
+MSCGEN_TOOL            =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
diff --git a/library/include/rocrand/rocrand.h b/library/include/rocrand/rocrand.h
index c92fbf209..daea04824 100644
--- a/library/include/rocrand/rocrand.h
+++ b/library/include/rocrand/rocrand.h
@@ -56,25 +56,30 @@ extern "C" {
 /**
  * \brief rocRAND function call status type
  */
-typedef enum rocrand_status {
-    ROCRAND_STATUS_SUCCESS = 0, ///< No errors
+enum rocrand_status
+{
+    ROCRAND_STATUS_SUCCESS          = 0, ///< No errors
     ROCRAND_STATUS_VERSION_MISMATCH = 100, ///< Header file and linked library version do not match
     ROCRAND_STATUS_NOT_CREATED = 101, ///< Generator was not created using rocrand_create_generator
     ROCRAND_STATUS_ALLOCATION_FAILED = 102, ///< Memory allocation failed during execution
-    ROCRAND_STATUS_TYPE_ERROR = 103, ///< Generator type is wrong
-    ROCRAND_STATUS_OUT_OF_RANGE = 104, ///< Argument out of range
-    ROCRAND_STATUS_LENGTH_NOT_MULTIPLE = 105, ///< Requested size is not a multiple of quasirandom generator's dimension,
-                                              ///< or requested size is not even (see rocrand_generate_normal()),
-                                              ///< or pointer is misaligned (see rocrand_generate_normal())
+    ROCRAND_STATUS_TYPE_ERROR        = 103, ///< Generator type is wrong
+    ROCRAND_STATUS_OUT_OF_RANGE      = 104, ///< Argument out of range
+    ROCRAND_STATUS_LENGTH_NOT_MULTIPLE
+    = 105, ///< Requested size is not a multiple of quasirandom generator's dimension,
+    ///< or requested size is not even (see rocrand_generate_normal()),
+    ///< or pointer is misaligned (see rocrand_generate_normal())
     ROCRAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106, ///< GPU does not have double precision
-    ROCRAND_STATUS_LAUNCH_FAILURE = 107, ///< Kernel launch failure
-    ROCRAND_STATUS_INTERNAL_ERROR = 108 ///< Internal library error
-} rocrand_status;
+    ROCRAND_STATUS_LAUNCH_FAILURE            = 107, ///< Kernel launch failure
+    ROCRAND_STATUS_INTERNAL_ERROR            = 108 ///< Internal library error
+};
+/// \cond DO_NOT_DOCUMENT
+typedef enum rocrand_status rocrand_status;
+/// \endcond
 
 /**
  * \brief rocRAND generator type
  */
-typedef enum rocrand_rng_type
+enum rocrand_rng_type
 {
     ROCRAND_RNG_PSEUDO_DEFAULT       = 400, ///< Default pseudorandom generator
     ROCRAND_RNG_PSEUDO_XORWOW        = 401, ///< XORWOW pseudorandom generator
@@ -97,13 +102,15 @@ typedef enum rocrand_rng_type
     ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 502, ///< Scrambled Sobol32 quasirandom generator
     ROCRAND_RNG_QUASI_SOBOL64           = 504, ///< Sobol64 quasirandom generator
     ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 505 ///< Scrambled Sobol64 quasirandom generator
-
-} rocrand_rng_type;
+};
+/// \cond DO_NOT_DOCUMENT
+typedef enum rocrand_rng_type rocrand_rng_type;
+/// \endcond
 
 /**
  * \brief rocRAND generator ordering
  */
-typedef enum rocrand_ordering
+enum rocrand_ordering
 {
     ROCRAND_ORDERING_PSEUDO_BEST    = 100, ///< Best ordering for pseudorandom results
     ROCRAND_ORDERING_PSEUDO_DEFAULT = 101, ///< Default ordering for pseudorandom results
@@ -112,18 +119,24 @@ typedef enum rocrand_ordering
     ROCRAND_ORDERING_PSEUDO_DYNAMIC
     = 104, ///< Adjust to the device executing the generator. The global memory usage may be higher than with the other orderings.
     ROCRAND_ORDERING_QUASI_DEFAULT = 201 ///< n-dimensional ordering for quasirandom results
-} rocrand_ordering;
+};
+/// \cond DO_NOT_DOCUMENT
+typedef enum rocrand_ordering rocrand_ordering;
+/// \endcond
 
 /**
  * \brief rocRAND vector set
  */
-typedef enum rocrand_direction_vector_set
+enum rocrand_direction_vector_set
 {
     ROCRAND_DIRECTION_VECTORS_32_JOEKUO6           = 101,
     ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102,
     ROCRAND_DIRECTION_VECTORS_64_JOEKUO6           = 103,
     ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104,
-} rocrand_direction_vector_set;
+};
+/// \cond DO_NOT_DOCUMENT
+typedef enum rocrand_direction_vector_set rocrand_direction_vector_set;
+/// \endcond
 
 // Host API function
 

From 52dd64fb868ec7732a320ea11000a4d031a54df7 Mon Sep 17 00:00:00 2001
From: Sander Bos <sander@streamhpc.com>
Date: Mon, 2 Dec 2024 12:02:38 +0000
Subject: [PATCH 06/17] Replace GitHub URL with its issue number

---
 docs/conf.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 0c7945207..20152d799 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -46,8 +46,7 @@
     globals()[sphinx_var] = getattr(docs_core, sphinx_var)
 
 # Suppresses "WARNING: toctree directive not expected with external-toc"
-# Ideally our code would be fixed, so suppression isn't needed:
-# https://github.com/executablebooks/sphinx-external-toc/issues/36
+# Ideally suppression wouldn't be needed; see sphinx-external-toc#36
 suppress_warnings = ["etoc.toctree"]
 
 cpp_id_attributes = ["__forceinline__", "__device__", "__host__", "rocrand_status"]

From 4e2171c40f2a54187ec036aaeb76bba8b8627e9c Mon Sep 17 00:00:00 2001
From: Sander Bos <sander@streamhpc.com>
Date: Mon, 9 Dec 2024 13:22:25 +0000
Subject: [PATCH 07/17] Update documentation generation instructions

---
 README.md | 77 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 6bfdaaf19..13583e180 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # rocRAND
 
+> [!NOTE]
+> The published rocRAND documentation is available [here](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
 The rocRAND project provides functions that generate pseudorandom and quasirandom numbers.
 The rocRAND library is implemented in the [HIP](https://github.com/ROCm/HIP)
 programming language and optimized for AMD's latest discrete GPUs. It is designed to run on top
@@ -24,28 +27,6 @@ split into a separate library. As of version 6.0, hipRAND can no longer be built
 * Scrambled Sobol64
 * ThreeFry
 
-## Documentation
-
-> [!NOTE]
-> The published rocRAND documentation is available at [rocRAND](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocRAND/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
-
-To build documentation locally, use the following code:
-
-```sh
-# Go to the docs directory
-cd docs
-
-# Install Python dependencies
-python3 -m pip install -r sphinx/requirements.txt
-
-# Build the documentation
-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
-
-# E.g. serve the HTML docs locally
-cd _build/html
-python3 -m http.server
-```
-
 ## Requirements
 
 * CMake (3.16 or later)
@@ -225,6 +206,58 @@ been migrated to the new framework.
 * [Fortran wrappers](./library/src/fortran/).
 * [Python wrappers](./python/): [rocRAND](./python/rocrand).
 
+## Building the documentation locally
+
+### Requirements
+
+#### Doxygen
+
+The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues.
+
+After you have downloaded Doxygen version 1.9.4:
+
+```shell
+# Add doxygen to your PATH
+echo 'export PATH=<doxygen 1.9.4 path>/bin:$PATH' >> ~/.bashrc
+
+# Apply the updated .bashrc
+source ~/.bashrc
+
+# Confirm that you are using version 1.9.4
+doxygen --version
+```
+
+#### Python
+
+The build system uses Python version 3.10. You can try using a newer version, but that might cause issues.
+
+You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation):
+
+```shell
+# Install Python 3.10
+pyenv install 3.10
+
+# Create a Python 3.10 virtual environment
+pyenv virtualenv 3.10 venv_rocrand
+
+# Activate the virtual environment
+pyenv activate venv_rocrand
+```
+
+### Building
+
+After cloning this repository, and `cd`ing into it:
+
+```shell
+# Install Python dependencies
+python3 -m pip install -r docs/sphinx/requirements.txt
+
+# Build the documentation
+python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html
+```
+
+You can then open `docs/_build/html/index.html` in your browser to view the documentation.
+
 ## Support
 
 Bugs and feature requests can be reported through the

From 72de763c93abdb48565b40cce030cd73f030f39b Mon Sep 17 00:00:00 2001
From: Anton Gorenko <anton@streamhpc.com>
Date: Mon, 13 Jan 2025 10:16:20 +0000
Subject: [PATCH 08/17] Fix CI on rocm/dev-ubuntu

---
 .gitlab-ci.yml          | 25 ++++++++++++++-----------
 python/rocrand/setup.py |  4 +---
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 82b26027f..303d17b6d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -206,7 +206,6 @@ copyright-date:
       - $CI_PROJECT_DIR/build/CMakeCache.txt
       - $CI_PROJECT_DIR/build/*.deb
       - $CI_PROJECT_DIR/build/*.zip
-    expire_in: 2 weeks
 
 build:rocm-cmake-minimum:
   tags:
@@ -430,17 +429,17 @@ benchmark:nvcc:
     - .rules:test
   script:
     - $SUDO_CMD apt-get update -qq
-    - $SUDO_CMD apt-get install -y -qq python3 python3-pip python3-numpy
-    - $SUDO_CMD apt-get install -y -qq wget
-    - pip3 install setuptools
+    - $SUDO_CMD apt-get install -y -qq python3 python3-pip python3-venv
     - export ROCRAND_PATH=$CI_PROJECT_DIR/build/library/
     # rocRAND Wrapper with Python 3
-    - pip3 --version
     - cd $CI_PROJECT_DIR/python/rocrand
-    - python3 setup.py test
-    - pip3 install . --user
+    - python3 -m venv rocrand-venv
+    - source rocrand-venv/bin/activate
+    - python3 -m pip install setuptools numpy
+    - python3 -m pip install .
     - python3 tests/rocrand_test.py
-    - pip3 uninstall --yes rocrand
+    - python3 -m pip uninstall --yes rocrand
+    - deactivate
 
 test:rocm-python:
   tags:
@@ -524,7 +523,11 @@ test:cpp-wrapper:
       -D BUILD_FORTRAN_WRAPPER=OFF
       -D BUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF
       -D CMAKE_CXX_COMPILER=${COMPILER}
-    - $SUDO_CMD cmake --build $CI_PROJECT_DIR/build_only_install --target install
+      -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
+      -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
+      -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda
+    # Preserve $PATH when sudoing
+    - $SUDO_CMD env PATH="$PATH" cmake --build $CI_PROJECT_DIR/build_only_install --target install
     - cmake
       -S $CI_PROJECT_DIR/test/package/
       -B $CI_PROJECT_DIR/install_test
@@ -683,7 +686,7 @@ test:windows:
   artifacts:
     paths:
       - ${LOGS_DIR}/*
-    expire_in: never
+    expire_in: 3 months
 
 # TestU01 SmallCrush, 10 tests, 15 statistics, takes about 5 seconds
 statistical-test:crush-small:
diff --git a/python/rocrand/setup.py b/python/rocrand/setup.py
index 389ba9b40..aa0ebcf0a 100644
--- a/python/rocrand/setup.py
+++ b/python/rocrand/setup.py
@@ -19,14 +19,12 @@
     description="rocRAND Python Wrapper",
     long_description=readme,
     author="Advanced Micro Devices, Inc.",
-    # author_email="",
     url="https://github.com/ROCmSoftwarePlatform/rocRAND",
     license="MIT",
     packages=["rocrand"],
     install_requires=REQUIRED_PACKAGES,
-    test_suite="tests",
     command_options={
         "build_sphinx": {
             "version": ("setup.py", version),
             "release": ("setup.py", release)}},
-)
\ No newline at end of file
+)

From e5450321a003ff3005508f0a953fcc8cbc9f4991 Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@streamhpc.com>
Date: Mon, 13 Jan 2025 15:25:01 +0000
Subject: [PATCH 09/17] Make rocRAND usable from FetchContent

---
 library/CMakeLists.txt | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt
index ff67232cb..0bec72a60 100644
--- a/library/CMakeLists.txt
+++ b/library/CMakeLists.txt
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -110,6 +110,13 @@ endif()
 # This will be removed with upcoming packaging changes.
 target_include_directories(rocrand INTERFACE $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include/rocrand>)
 
+target_include_directories(
+    rocrand
+    PRIVATE
+        "${PROJECT_SOURCE_DIR}/library/include"
+        "${PROJECT_BINARY_DIR}/library/include"
+)
+
 # Build library
 if(HIP_COMPILER STREQUAL "nvcc")
     set_source_files_properties(${rocRAND_HIP_SRCS}
@@ -126,15 +133,15 @@ endif()
 rocm_set_soversion(rocrand ${rocrand_SOVERSION})
 set_target_properties(rocrand
     PROPERTIES
-        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/library"
+        RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/library"
         DEBUG_POSTFIX "-d"
 )
 
 rocm_install(
     TARGETS rocrand
     INCLUDE
-        "${CMAKE_SOURCE_DIR}/library/include"
-        "${CMAKE_BINARY_DIR}/library/include"
+        "${PROJECT_SOURCE_DIR}/library/include"
+        "${PROJECT_BINARY_DIR}/library/include"
 )
 
 set(FORTRAN_SRCS_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/rocrand/src/fortran")

From 865f4a6b300c25f7187286cfde6ea7641c2e0506 Mon Sep 17 00:00:00 2001
From: Sander Bos <sander@streamhpc.com>
Date: Wed, 15 Jan 2025 14:40:13 +0000
Subject: [PATCH 10/17] Remove extra dashes in the documentation its \param and
 \tparam

---
 library/include/rocrand/rocrand.h             | 176 +++----
 library/include/rocrand/rocrand.hpp           | 150 +++---
 library/include/rocrand/rocrand_discrete.h    | 151 +++---
 library/include/rocrand/rocrand_lfsr113.h     |  58 ++-
 library/include/rocrand/rocrand_log_normal.h  | 484 +++++++++---------
 library/include/rocrand/rocrand_mrg31k3p.h    |  48 +-
 library/include/rocrand/rocrand_mrg32k3a.h    |  48 +-
 library/include/rocrand/rocrand_mtgp32.h      |  48 +-
 library/include/rocrand/rocrand_normal.h      | 240 +++++----
 .../include/rocrand/rocrand_philox4x32_10.h   |  53 +-
 library/include/rocrand/rocrand_poisson.h     | 106 ++--
 .../rocrand/rocrand_scrambled_sobol32.h       |  32 +-
 .../rocrand/rocrand_scrambled_sobol64.h       |  34 +-
 library/include/rocrand/rocrand_sobol32.h     |  28 +-
 library/include/rocrand/rocrand_sobol64.h     |  28 +-
 .../include/rocrand/rocrand_threefry2x32_20.h |  29 +-
 .../include/rocrand/rocrand_threefry2x64_20.h |  29 +-
 .../include/rocrand/rocrand_threefry4x32_20.h |  29 +-
 .../include/rocrand/rocrand_threefry4x64_20.h |  29 +-
 library/include/rocrand/rocrand_uniform.h     | 183 ++++---
 library/include/rocrand/rocrand_xorwow.h      |  48 +-
 21 files changed, 1051 insertions(+), 980 deletions(-)

diff --git a/library/include/rocrand/rocrand.h b/library/include/rocrand/rocrand.h
index daea04824..7c8f4586d 100644
--- a/library/include/rocrand/rocrand.h
+++ b/library/include/rocrand/rocrand.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -162,8 +162,8 @@ typedef enum rocrand_direction_vector_set rocrand_direction_vector_set;
  * - ROCRAND_RNG_QUASI_SOBOL64
  * - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64
  *
- * \param generator - Pointer to generator
- * \param rng_type - Type of generator to create
+ * \param generator Pointer to generator
+ * \param rng_type Type of generator to create
  *
  * \return
  * - ROCRAND_STATUS_ALLOCATION_FAILED, if memory could not be allocated \n
@@ -185,8 +185,8 @@ rocrand_create_generator(rocrand_generator * generator, rocrand_rng_type rng_typ
  *
  * All generators are supported.
  *
- * \param generator - Pointer to generator
- * \param rng_type - Type of generator to create
+ * \param generator Pointer to generator
+ * \param rng_type Type of generator to create
  *
  * \return
  * - ROCRAND_STATUS_ALLOCATION_FAILED, if memory could not be allocated \n
@@ -212,7 +212,7 @@ rocrand_status ROCRANDAPI rocrand_create_generator_host_blocking(rocrand_generat
  *
  * Destroys random number generator and frees related memory.
  *
- * \param generator - Generator to be destroyed
+ * \param generator Generator to be destroyed
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -230,9 +230,9 @@ rocrand_destroy_generator(rocrand_generator generator);
  * Generated numbers are between \p 0 and \p 2^32, including \p 0 and
  * excluding \p 2^32.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of 32-bit unsigned integers to generate
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of 32-bit unsigned integers to generate
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -254,9 +254,9 @@ rocrand_generate(rocrand_generator generator,
  * Generated numbers are between \p 0 and \p 2^64, including \p 0 and
  * excluding \p 2^64.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of 64-bit unsigned integers to generate
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of 64-bit unsigned integers to generate
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -279,9 +279,9 @@ rocrand_status ROCRANDAPI rocrand_generate_long_long(rocrand_generator       gen
 * Generated numbers are between \p 0 and \p 2^8, including \p 0 and
 * excluding \p 2^8.
 *
-* \param generator - Generator to use
-* \param output_data - Pointer to memory to store generated numbers
-* \param n - Number of 8-bit unsigned integers to generate
+* \param generator Generator to use
+* \param output_data Pointer to memory to store generated numbers
+* \param n Number of 8-bit unsigned integers to generate
 *
 * \return
 * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -303,9 +303,9 @@ rocrand_generate_char(rocrand_generator generator,
 * Generated numbers are between \p 0 and \p 2^16, including \p 0 and
 * excluding \p 2^16.
 *
-* \param generator - Generator to use
-* \param output_data - Pointer to memory to store generated numbers
-* \param n - Number of 16-bit unsigned integers to generate
+* \param generator Generator to use
+* \param output_data Pointer to memory to store generated numbers
+* \param n Number of 16-bit unsigned integers to generate
 *
 * \return
 * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -327,9 +327,9 @@ rocrand_generate_short(rocrand_generator generator,
  * Generated numbers are between \p 0.0f and \p 1.0f, excluding \p 0.0f and
  * including \p 1.0f.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>float</tt>s to generate
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>float</tt>s to generate
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -351,9 +351,9 @@ rocrand_generate_uniform(rocrand_generator generator,
  * Generated numbers are between \p 0.0 and \p 1.0, excluding \p 0.0 and
  * including \p 1.0.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>double</tt>s to generate
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>double</tt>s to generate
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -375,9 +375,9 @@ rocrand_generate_uniform_double(rocrand_generator generator,
  * Generated numbers are between \p 0.0 and \p 1.0, excluding \p 0.0 and
  * including \p 1.0.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>half</tt>s to generate
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>half</tt>s to generate
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -396,11 +396,11 @@ rocrand_generate_uniform_half(rocrand_generator generator,
  * Generates \p n normally distributed distributed 32-bit floating-point
  * values and saves them to \p output_data.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>float</tt>s to generate
- * \param mean - Mean value of normal distribution
- * \param stddev - Standard deviation value of normal distribution
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>float</tt>s to generate
+ * \param mean Mean value of normal distribution
+ * \param stddev Standard deviation value of normal distribution
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -420,11 +420,11 @@ rocrand_generate_normal(rocrand_generator generator,
  * Generates \p n normally distributed 64-bit double-precision floating-point
  * numbers and saves them to \p output_data.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>double</tt>s to generate
- * \param mean - Mean value of normal distribution
- * \param stddev - Standard deviation value of normal distribution
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>double</tt>s to generate
+ * \param mean Mean value of normal distribution
+ * \param stddev Standard deviation value of normal distribution
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -444,11 +444,11 @@ rocrand_generate_normal_double(rocrand_generator generator,
 * Generates \p n normally distributed 16-bit half-precision floating-point
 * numbers and saves them to \p output_data.
 *
-* \param generator - Generator to use
-* \param output_data - Pointer to memory to store generated numbers
-* \param n - Number of <tt>half</tt>s to generate
-* \param mean - Mean value of normal distribution
-* \param stddev - Standard deviation value of normal distribution
+* \param generator Generator to use
+* \param output_data Pointer to memory to store generated numbers
+* \param n Number of <tt>half</tt>s to generate
+* \param mean Mean value of normal distribution
+* \param stddev Standard deviation value of normal distribution
 *
 * \return
 * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -468,11 +468,11 @@ rocrand_generate_normal_half(rocrand_generator generator,
  * Generates \p n log-normally distributed 32-bit floating-point values
  * and saves them to \p output_data.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>float</tt>s to generate
- * \param mean - Mean value of log normal distribution
- * \param stddev - Standard deviation value of log normal distribution
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>float</tt>s to generate
+ * \param mean Mean value of log normal distribution
+ * \param stddev Standard deviation value of log normal distribution
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -492,11 +492,11 @@ rocrand_generate_log_normal(rocrand_generator generator,
  * Generates \p n log-normally distributed 64-bit double-precision floating-point
  * values and saves them to \p output_data.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of <tt>double</tt>s to generate
- * \param mean - Mean value of log normal distribution
- * \param stddev - Standard deviation value of log normal distribution
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of <tt>double</tt>s to generate
+ * \param mean Mean value of log normal distribution
+ * \param stddev Standard deviation value of log normal distribution
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -516,11 +516,11 @@ rocrand_generate_log_normal_double(rocrand_generator generator,
 * Generates \p n log-normally distributed 16-bit half-precision floating-point
 * values and saves them to \p output_data.
 *
-* \param generator - Generator to use
-* \param output_data - Pointer to memory to store generated numbers
-* \param n - Number of <tt>half</tt>s to generate
-* \param mean - Mean value of log normal distribution
-* \param stddev - Standard deviation value of log normal distribution
+* \param generator Generator to use
+* \param output_data Pointer to memory to store generated numbers
+* \param n Number of <tt>half</tt>s to generate
+* \param mean Mean value of log normal distribution
+* \param stddev Standard deviation value of log normal distribution
 *
 * \return
 * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -540,10 +540,10 @@ rocrand_generate_log_normal_half(rocrand_generator generator,
  * Generates \p n Poisson-distributed 32-bit unsigned integers and
  * saves them to \p output_data.
  *
- * \param generator - Generator to use
- * \param output_data - Pointer to memory to store generated numbers
- * \param n - Number of 32-bit unsigned integers to generate
- * \param lambda - lambda for the Poisson distribution
+ * \param generator Generator to use
+ * \param output_data Pointer to memory to store generated numbers
+ * \param n Number of 32-bit unsigned integers to generate
+ * \param lambda lambda for the Poisson distribution
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -568,7 +568,7 @@ rocrand_generate_poisson(rocrand_generator generator,
  * automatically called by functions which generates random numbers like
  * rocrand_generate(), rocrand_generate_uniform() etc.
  *
- * \param generator - Generator to initialize
+ * \param generator Generator to initialize
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -584,8 +584,8 @@ rocrand_initialize_generator(rocrand_generator generator);
  * Sets the current stream for all kernel launches of the generator.
  * All functions will use this stream.
  *
- * \param generator - Generator to modify
- * \param stream - Stream to use or NULL for default stream
+ * \param generator Generator to modify
+ * \param stream Stream to use or NULL for default stream
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -611,8 +611,8 @@ rocrand_set_stream(rocrand_generator generator, hipStream_t stream);
  * second seed value. If those values smaller than 2 and/or 8, those
  * are increased with 1 and/or 7.
  *
- * \param generator - Pseudo-random number generator
- * \param seed - New seed value
+ * \param generator Pseudo-random number generator
+ * \param seed New seed value
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -636,8 +636,8 @@ rocrand_set_seed(rocrand_generator generator, unsigned long long seed);
  * 127. If those values smaller, than the requested minimum values [2, 8, 16, 128], then
  * it will be increased with the minimum values minus 1 [1, 7, 15, 127].
  *
- * \param generator - Pseudo-random number generator
- * \param seed - New seed value
+ * \param generator Pseudo-random number generator
+ * \param seed New seed value
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -657,8 +657,8 @@ rocrand_status ROCRANDAPI rocrand_set_seed_uint4(rocrand_generator generator, ui
  * Absolute offset cannot be set if generator's type is ROCRAND_RNG_PSEUDO_MTGP32 or
  * ROCRAND_RNG_PSEUDO_LFSR113.
  *
- * \param generator - Random number generator
- * \param offset - New absolute offset
+ * \param generator Random number generator
+ * \param offset New absolute offset
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -677,8 +677,8 @@ rocrand_set_offset(rocrand_generator generator, unsigned long long offset);
  * - This operation resets the generator's internal state.
  * - This operation does not change the generator's seed.
  *
- * \param generator - Random number generator
- * \param order - New ordering of results
+ * \param generator Random number generator
+ * \param order New ordering of results
  *
  * The ordering choices for pseudorandom sequences are the following.
  * Note that not all generators support all orderings. For details, see
@@ -708,8 +708,8 @@ rocrand_status ROCRANDAPI rocrand_set_ordering(rocrand_generator generator, rocr
  * - This operation resets the generator's internal state.
  * - This operation does not change the generator's offset.
  *
- * \param generator - Quasi-random number generator
- * \param dimensions - Number of dimensions
+ * \param generator Quasi-random number generator
+ * \param dimensions Number of dimensions
  *
  * \return
  * - ROCRAND_STATUS_NOT_CREATED if the generator wasn't created \n
@@ -727,7 +727,7 @@ rocrand_set_quasi_random_generator_dimensions(rocrand_generator generator,
  * Returns in \p version the version number of the dynamically linked
  * rocRAND library.
  *
- * \param version - Version of the library
+ * \param version Version of the library
  *
  * \return
  * - ROCRAND_STATUS_OUT_OF_RANGE if \p version is NULL \n
@@ -741,8 +741,8 @@ rocrand_get_version(int * version);
  *
  * Construct the histogram for the Poisson distribution with lambda \p lambda.
  *
- * \param lambda - lambda for the Poisson distribution
- * \param discrete_distribution - pointer to the histogram in device memory
+ * \param lambda lambda for the Poisson distribution
+ * \param discrete_distribution pointer to the histogram in device memory
  *
  * \return
  * - ROCRAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
@@ -761,10 +761,10 @@ rocrand_create_poisson_distribution(double lambda,
  * 32-bit unsigned integers from the range [\p offset, \p offset + \p size)
  * using \p probabilities as probabilities.
  *
- * \param probabilities - probabilities of the the distribution in host memory
- * \param size - size of \p probabilities
- * \param offset - offset of values
- * \param discrete_distribution - pointer to the histogram in device memory
+ * \param probabilities probabilities of the distribution in host memory
+ * \param size size of \p probabilities
+ * \param offset offset of values
+ * \param discrete_distribution pointer to the histogram in device memory
  *
  * \return
  * - ROCRAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
@@ -784,7 +784,7 @@ rocrand_create_discrete_distribution(const double * probabilities,
  * Destroy the histogram array for a discrete distribution created by
  * rocrand_create_poisson_distribution.
  *
- * \param discrete_distribution - pointer to the histogram in device memory
+ * \param discrete_distribution pointer to the histogram in device memory
  *
  * \return
  * - ROCRAND_STATUS_OUT_OF_RANGE if \p discrete_distribution was null \n
@@ -796,9 +796,9 @@ rocrand_destroy_discrete_distribution(rocrand_discrete_distribution discrete_dis
 /**
  * \brief Get the vector for 32-bit (scrambled-)sobol generation.
  *
- * \param vectors - location where to write the vector pointer to
+ * \param vectors location where to write the vector pointer to
  *
- * \param set - which direction vector set to use
+ * \param set which direction vector set to use
  *
  * \return
  * - ROCRAND_STATUS_OUT_OF_RANGE if \p set was invalid for this method \n
@@ -810,9 +810,9 @@ rocrand_status ROCRANDAPI rocrand_get_direction_vectors32(const unsigned int**
 /**
  * \brief Get the vector for 64-bit (scrambled-)sobol generation.
  *
- * \param vectors - location where to write the vector pointer to
+ * \param vectors location where to write the vector pointer to
  *
- * \param set - which direction vector set to use
+ * \param set which direction vector set to use
  *
  * \return
  * - ROCRAND_STATUS_OUT_OF_RANGE if \p set was invalid for this method \n
@@ -824,7 +824,7 @@ rocrand_status ROCRANDAPI rocrand_get_direction_vectors64(const unsigned long lo
 /**
  * \brief Get the scramble constants for 32-bit scrambled sobol generation.
  *
- * \param constants - location where to write the constants pointer to
+ * \param constants location where to write the constants pointer to
  *
  * \return
  * - ROCRAND_STATUS_SUCCESS if the pointer was set succesfully \n
@@ -834,7 +834,7 @@ rocrand_status ROCRANDAPI rocrand_get_scramble_constants32(const unsigned int**
 /**
  * \brief Get the scramble constants for 64-bit scrambled sobol generation.
  *
- * \param constants - location where to write the constants pointer to
+ * \param constants location where to write the constants pointer to
  *
  * \return
  * - ROCRAND_STATUS_SUCCESS if the pointer was set succesfully \n
diff --git a/library/include/rocrand/rocrand.hpp b/library/include/rocrand/rocrand.hpp
index 8120a51c1..ff6e24ed2 100644
--- a/library/include/rocrand/rocrand.hpp
+++ b/library/include/rocrand/rocrand.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +54,7 @@ class error : public std::exception
 
     /// Constructs new error object from error code \p error.
     ///
-    /// \param error - error code
+    /// \param error error code
     explicit error(error_type error) noexcept
         : m_error(error),
           m_error_string(to_string(error))
@@ -140,7 +140,7 @@ class error : public std::exception
 ///
 /// \brief Produces random integer values uniformly distributed on the interval [0, 2^(sizeof(IntType)*8) - 1].
 ///
-/// \tparam IntType - type of generated values. Only \p unsigned \p char, \p unsigned \p short and \p unsigned \p int and \p unsigned \p long \p long \p int type is supported.
+/// \tparam IntType type of generated values. Only \p unsigned \p char, \p unsigned \p short, \p unsigned \p int and \p unsigned \p long \p long \p int types are supported.
 template<class IntType = unsigned int>
 class uniform_int_distribution
 {
@@ -183,9 +183,9 @@ class uniform_int_distribution
     /// on the  interval [0, 2^(sizeof(IntType)*8) - 1], and stores them into the device memory
     /// referenced by \p output pointer.
     ///
-    /// \param g - An uniform random number generator object
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param g A uniform random number generator object
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -245,7 +245,7 @@ class uniform_int_distribution
 ///
 /// \brief Produces random floating-point values uniformly distributed on the interval (0, 1].
 ///
-/// \tparam RealType - type of generated values. Only \p float, \p double and \p half types are supported.
+/// \tparam RealType type of generated values. Only \p float, \p double and \p half types are supported.
 
 template<class RealType = float>
 class uniform_real_distribution
@@ -289,9 +289,9 @@ class uniform_real_distribution
     /// on the interval (0, 1], and stores them into the device memory referenced
     /// by \p output pointer.
     ///
-    /// \param g - An uniform random number generator object
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param g A uniform random number generator object
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -345,7 +345,7 @@ class uniform_real_distribution
 ///
 /// \brief Produces random numbers according to a normal distribution.
 ///
-/// \tparam RealType - type of generated values. Only \p float, \p double and \p half types are supported.
+/// \tparam RealType type of generated values. Only \p float, \p double and \p half types are supported.
 template<class RealType = float>
 class normal_distribution
 {
@@ -370,8 +370,8 @@ class normal_distribution
 
         /// \brief Constructs a \p param_type object with the
         /// given distribution parameters.
-        /// \param mean - mean
-        /// \param stddev - standard deviation
+        /// \param mean mean
+        /// \param stddev standard deviation
         param_type(RealType mean = 0.0, RealType stddev = 1.0)
             : m_mean(mean), m_stddev(stddev)
         {
@@ -416,15 +416,15 @@ class normal_distribution
     };
 
     /// \brief Constructs a new distribution object.
-    /// \param mean - A mean distribution parameter
-    /// \param stddev - A standard deviation distribution parameter
+    /// \param mean A mean distribution parameter
+    /// \param stddev A standard deviation distribution parameter
     normal_distribution(RealType mean = 0.0, RealType stddev = 1.0)
         : m_params(mean, stddev)
     {
     }
 
     /// \brief Constructs a new distribution object.
-    /// \param params - Distribution parameters
+    /// \param params Distribution parameters
     explicit normal_distribution(const param_type& params)
         : m_params(params)
     {
@@ -480,9 +480,9 @@ class normal_distribution
     /// Generates \p size random floating-point values distributed according to a normal distribution,
     /// and stores them into the device memory referenced by \p output pointer.
     ///
-    /// \param g - An uniform random number generator object
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param g A uniform random number generator object
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -549,7 +549,7 @@ class normal_distribution
 ///
 /// \brief Produces positive random numbers according to a log-normal distribution.
 ///
-/// \tparam RealType - type of generated values. Only \p float, \p double and \p half types are supported.
+/// \tparam RealType type of generated values. Only \p float, \p double and \p half types are supported.
 template<class RealType = float>
 class lognormal_distribution
 {
@@ -574,8 +574,8 @@ class lognormal_distribution
 
         /// \brief Constructs a \p param_type object with the
         /// given distribution parameters.
-        /// \param m - mean
-        /// \param s - standard deviation
+        /// \param m mean
+        /// \param s standard deviation
         param_type(RealType m = 0.0, RealType s = 1.0)
             : m_mean(m), m_stddev(s)
         {
@@ -620,15 +620,15 @@ class lognormal_distribution
     };
 
     /// \brief Constructs a new distribution object.
-    /// \param m - A mean distribution parameter
-    /// \param s - A standard deviation distribution parameter
+    /// \param m A mean distribution parameter
+    /// \param s A standard deviation distribution parameter
     lognormal_distribution(RealType m = 0.0, RealType s = 1.0)
         : m_params(m, s)
     {
     }
 
     /// \brief Constructs a new distribution object.
-    /// \param params - Distribution parameters
+    /// \param params Distribution parameters
     explicit lognormal_distribution(const param_type& params)
         : m_params(params)
     {
@@ -685,9 +685,9 @@ class lognormal_distribution
     /// to a log-normal distribution, and stores them into the device memory referenced
     /// by \p output pointer.
     ///
-    /// \param g - An uniform random number generator object
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param g A uniform random number generator object
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -754,7 +754,7 @@ class lognormal_distribution
 ///
 /// \brief Produces random non-negative integer values distributed according to Poisson distribution.
 ///
-/// \tparam IntType - type of generated values. Only \p unsinged \p int type is supported.
+/// \tparam IntType type of generated values. Only \p unsigned \p int type is supported.
 template<class IntType = unsigned int>
 class poisson_distribution
 {
@@ -777,7 +777,7 @@ class poisson_distribution
 
         /// \brief Constructs a \p param_type object with the
         /// given mean.
-        /// \param mean - mean to use for the distribution
+        /// \param mean mean to use for the distribution
         param_type(double mean = 1.0)
             : m_mean(mean)
         {
@@ -815,14 +815,14 @@ class poisson_distribution
     };
 
     /// \brief Constructs a new distribution object.
-    /// \param mean - A mean distribution parameter.
+    /// \param mean A mean distribution parameter.
     poisson_distribution(double mean = 1.0)
         : m_params(mean)
     {
     }
 
     /// \brief Constructs a new distribution object.
-    /// \param params - Distribution parameters
+    /// \param params Distribution parameters
     explicit poisson_distribution(const param_type& params)
         : m_params(params)
     {
@@ -873,9 +873,9 @@ class poisson_distribution
     /// to Poisson distribution, and stores them into the device memory referenced
     /// by \p output pointer.
     ///
-    /// \param g - An uniform random number generator object
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param g A uniform random number generator object
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -946,9 +946,9 @@ class philox4x32_10_engine
 
     /// \brief Constructs the pseudo-random number engine.
     ///
-    /// \param seed_value - seed value to use in the initialization of the internal state, see also seed()
-    /// \param offset_value - number of internal states that should be skipped, see also offset()
-    /// \param order_value - ordering of the sequences generated by the engine, see also order()
+    /// \param seed_value seed value to use in the initialization of the internal state, see also seed()
+    /// \param offset_value number of internal states that should be skipped, see also offset()
+    /// \param order_value ordering of the sequences generated by the engine, see also order()
     ///
     /// See also: rocrand_create_generator()
     philox4x32_10_engine(seed_type   seed_value   = DefaultSeed,
@@ -981,7 +981,7 @@ class philox4x32_10_engine
     /// passed reference to \p NULL. The lifetime of \p generator is now
     /// bound to the lifetime of the engine.
     ///
-    /// \param generator - rocRAND generator
+    /// \param generator rocRAND generator
     explicit philox4x32_10_engine(rocrand_generator& generator)
         : m_generator(generator)
     {
@@ -1035,7 +1035,7 @@ class philox4x32_10_engine
     }
 
     /// \brief Sets the random number engine's \p hipStream for kernel launches.
-    /// \param value - new \p hipStream to use
+    /// \param value new \p hipStream to use
     void stream(hipStream_t value)
     {
         rocrand_status status = rocrand_set_stream(m_generator, value);
@@ -1050,7 +1050,7 @@ class philox4x32_10_engine
     /// - This operation resets the engine's internal state.
     /// - This operation does not change the engine's seed.
     ///
-    /// \param value - New ordering
+    /// \param value New ordering
     ///
     /// See also: rocrand_set_ordering()
     void order(order_type value)
@@ -1068,7 +1068,7 @@ class philox4x32_10_engine
     /// - This operation resets the engine's internal state.
     /// - This operation does not change the engine's seed or the number of dimensions.
     ///
-    /// \param value - New absolute offset
+    /// \param value New absolute offset
     ///
     /// See also: rocrand_set_offset()
     void offset(offset_type value)
@@ -1084,7 +1084,7 @@ class philox4x32_10_engine
     /// - This operation resets the engine's internal state.
     /// - This operation does not change the engine's offset.
     ///
-    /// \param value - New seed value
+    /// \param value New seed value
     ///
     /// See also: rocrand_set_seed()
     void seed(seed_type value)
@@ -1099,8 +1099,8 @@ class philox4x32_10_engine
     /// on the interval [0, 2^32 - 1], and stores them into the device memory
     /// referenced by \p output pointer.
     ///
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// The device memory pointed by \p output must have been previously allocated
     /// and be large enough to store at least \p size values of \p IntType type.
@@ -1689,8 +1689,8 @@ class mtgp32_engine
     ///
     /// MTGP32 engine does not accept offset.
     ///
-    /// \param seed_value - seed value to use in the initialization of the internal state, see also seed()
-    /// \param order_value - ordering value from the rocrand_ordering enum
+    /// \param seed_value seed value to use in the initialization of the internal state, see also seed()
+    /// \param order_value ordering value from the rocrand_ordering enum
     ///
     /// See also: rocrand_create_generator()
     mtgp32_engine(seed_type  seed_value  = DefaultSeed,
@@ -1853,8 +1853,8 @@ class lfsr113_engine
     ///
     /// LFSR113 does not accept offset.
     ///
-    /// \param seed_value - seed value to use in the initialization of the internal state, see also seed()
-    /// \param order_value - ordering of the sequences generated by the engine, see also order()
+    /// \param seed_value seed value to use in the initialization of the internal state, see also seed()
+    /// \param order_value ordering of the sequences generated by the engine, see also order()
     ///
     /// See also: rocrand_create_generator()
     lfsr113_engine(seed_type  seed_value = {DefaultSeedX, DefaultSeedY, DefaultSeedZ, DefaultSeedW},
@@ -2049,8 +2049,8 @@ class mt19937_engine
     ///
     /// MT19937 does not accept offset.
     ///
-    /// \param seed_value - seed value to use in the initialization of the internal state, see also seed()
-    /// \param order_value - ordering of the sequences generated by the engine, see also order()
+    /// \param seed_value seed value to use in the initialization of the internal state, see also seed()
+    /// \param order_value ordering of the sequences generated by the engine, see also order()
     ///
     /// See also: rocrand_create_generator()
     mt19937_engine(seed_type  seed_value  = DefaultSeed,
@@ -2215,9 +2215,9 @@ class sobol32_engine
 
     /// \brief Constructs the pseudo-random number engine.
     ///
-    /// \param num_of_dimensions - number of dimensions to use in the initialization of the internal state, see also dimensions()
-    /// \param offset_value - number of internal states that should be skipped, see also offset()
-    /// \param order_value - ordering of the sequences generated by the engine, see also order()
+    /// \param num_of_dimensions number of dimensions to use in the initialization of the internal state, see also dimensions()
+    /// \param offset_value number of internal states that should be skipped, see also offset()
+    /// \param order_value ordering of the sequences generated by the engine, see also order()
     ///
     /// See also: rocrand_create_generator()
     sobol32_engine(dimensions_num_type num_of_dimensions = DefaultNumDimensions,
@@ -2313,7 +2313,7 @@ class sobol32_engine
     /// - This operation resets the generator's internal state.
     /// - This operation does not change the generator's offset.
     ///
-    /// \param value - Number of dimensions
+    /// \param value Number of dimensions
     ///
     /// See also: rocrand_set_quasi_random_generator_dimensions()
     void dimensions(dimensions_num_type value)
@@ -2329,8 +2329,8 @@ class sobol32_engine
     /// on the interval [0, 2^32 - 1], and stores them into the device memory
     /// referenced by \p output pointer.
     ///
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -2416,9 +2416,9 @@ class scrambled_sobol32_engine
 
     /// \brief Constructs the pseudo-random number engine.
     ///
-    /// \param num_of_dimensions - number of dimensions to use in the initialization of the internal state, see also dimensions()
-    /// \param offset_value - number of internal states that should be skipped, see also offset()
-    /// \param order_value - ordering value from the rocrand_ordering enum
+    /// \param num_of_dimensions number of dimensions to use in the initialization of the internal state, see also dimensions()
+    /// \param offset_value number of internal states that should be skipped, see also offset()
+    /// \param order_value ordering value from the rocrand_ordering enum
     ///
     /// See also: rocrand_create_generator()
     scrambled_sobol32_engine(dimensions_num_type num_of_dimensions = DefaultNumDimensions,
@@ -2516,7 +2516,7 @@ class scrambled_sobol32_engine
     /// - This operation resets the generator's internal state.
     /// - This operation does not change the generator's offset.
     ///
-    /// \param value - Number of dimensions
+    /// \param value Number of dimensions
     ///
     /// See also: rocrand_set_quasi_random_generator_dimensions()
     void dimensions(dimensions_num_type value)
@@ -2533,8 +2533,8 @@ class scrambled_sobol32_engine
     /// on the interval [0, 2^32 - 1], and stores them into the device memory
     /// referenced by \p output pointer.
     ///
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -2621,9 +2621,9 @@ class sobol64_engine
 
     /// \brief Constructs the pseudo-random number engine.
     ///
-    /// \param num_of_dimensions - number of dimensions to use in the initialization of the internal state, see also dimensions()
-    /// \param offset_value - number of internal states that should be skipped, see also offset()
-    /// \param order_value - ordering of the sequences generated by the engine, see also order()
+    /// \param num_of_dimensions number of dimensions to use in the initialization of the internal state, see also dimensions()
+    /// \param offset_value number of internal states that should be skipped, see also offset()
+    /// \param order_value ordering of the sequences generated by the engine, see also order()
     ///
     /// See also: rocrand_create_generator()
     sobol64_engine(dimensions_num_type num_of_dimensions = DefaultNumDimensions,
@@ -2719,7 +2719,7 @@ class sobol64_engine
     /// - This operation resets the generator's internal state.
     /// - This operation does not change the generator's offset.
     ///
-    /// \param value - Number of dimensions
+    /// \param value Number of dimensions
     ///
     /// See also: rocrand_set_quasi_random_generator_dimensions()
     void dimensions(dimensions_num_type value)
@@ -2735,8 +2735,8 @@ class sobol64_engine
     /// on the interval [0, 2^64 - 1], and stores them into the device memory
     /// referenced by \p output pointer.
     ///
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
@@ -2822,9 +2822,9 @@ class scrambled_sobol64_engine
 
     /// \brief Constructs the pseudo-random number engine.
     ///
-    /// \param num_of_dimensions - number of dimensions to use in the initialization of the internal state, see also dimensions()
-    /// \param offset_value - number of internal states that should be skipped, see also offset()
-    /// \param order_value - ordering of the sequences generated by the engine, see also order()
+    /// \param num_of_dimensions number of dimensions to use in the initialization of the internal state, see also dimensions()
+    /// \param offset_value number of internal states that should be skipped, see also offset()
+    /// \param order_value ordering of the sequences generated by the engine, see also order()
     ///
     /// See also: rocrand_create_generator()
     scrambled_sobol64_engine(dimensions_num_type num_of_dimensions = DefaultNumDimensions,
@@ -2922,7 +2922,7 @@ class scrambled_sobol64_engine
     /// - This operation resets the generator's internal state.
     /// - This operation does not change the generator's offset.
     ///
-    /// \param value - Number of dimensions
+    /// \param value Number of dimensions
     ///
     /// See also: rocrand_set_quasi_random_generator_dimensions()
     void dimensions(dimensions_num_type value)
@@ -2939,8 +2939,8 @@ class scrambled_sobol64_engine
     /// on the interval [0, 2^64 - 1], and stores them into the device memory
     /// referenced by \p output pointer.
     ///
-    /// \param output - Pointer to device memory to store results
-    /// \param size - Number of values to generate
+    /// \param output Pointer to device memory to store results
+    /// \param size Number of values to generate
     ///
     /// Requirements:
     /// * The device memory pointed by \p output must have been previously allocated
diff --git a/library/include/rocrand/rocrand_discrete.h b/library/include/rocrand/rocrand_discrete.h
index f1fd275c3..9805878d0 100644
--- a/library/include/rocrand/rocrand_discrete.h
+++ b/library/include/rocrand/rocrand_discrete.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -179,14 +179,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using Philox generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_philox4x32_10*        state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_philox4x32_10*        state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution);
 }
@@ -198,13 +198,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using Philox generator in \p state, and increments
  * the position of the generator by four.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return Four <tt>unsigned int</tt> values distributed according to \p discrete_distribution as \p uint4
  */
-__forceinline__ __device__ __host__ uint4 rocrand_discrete4(
-    rocrand_state_philox4x32_10* state, const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+uint4 rocrand_discrete4(rocrand_state_philox4x32_10*        state,
+                        const rocrand_discrete_distribution discrete_distribution)
 {
     const uint4 u4 = rocrand4(state);
     return uint4 {
@@ -222,14 +223,14 @@ __forceinline__ __device__ __host__ uint4 rocrand_discrete4(
  * \p discrete_distribution using MRG31k3p generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_mrg31k3p*             state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_mrg31k3p*             state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution);
 }
@@ -241,14 +242,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using MRG32k3a generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_mrg32k3a*             state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_mrg32k3a*             state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution);
 }
@@ -260,14 +261,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using XORWOW generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_xorwow*               state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_xorwow*               state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_alias(rocrand(state), *discrete_distribution);
 }
@@ -279,14 +280,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using MTGP32 generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ unsigned int
-    rocrand_discrete(rocrand_state_mtgp32*               state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__
+unsigned int rocrand_discrete(rocrand_state_mtgp32*               state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
 #ifdef ROCRAND_PREFER_CDF_OVER_ALIAS
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
@@ -302,14 +303,14 @@ __forceinline__ __device__ unsigned int
  * \p discrete_distribution using SOBOL32 generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_sobol32*              state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_sobol32*              state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
 }
@@ -321,14 +322,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using SCRAMBLED_SOBOL32 generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_scrambled_sobol32*    state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_scrambled_sobol32*    state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
 }
@@ -340,14 +341,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using SOBOL64 generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned long long int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_sobol64*              state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_sobol64*              state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
 }
@@ -359,14 +360,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using SCRAMBLED_SOBOL64 generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned long long int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_scrambled_sobol64*    state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_scrambled_sobol64*    state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
 }
@@ -378,14 +379,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using LFSR113 generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_lfsr113*              state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_lfsr113*              state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
 #ifdef ROCRAND_PREFER_CDF_OVER_ALIAS
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
@@ -401,14 +402,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using ThreeFry generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_threefry2x32_20*      state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_threefry2x32_20*      state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
 #ifdef ROCRAND_PREFER_CDF_OVER_ALIAS
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
@@ -424,14 +425,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using ThreeFry generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_threefry2x64_20*      state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_threefry2x64_20*      state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
 #ifdef ROCRAND_PREFER_CDF_OVER_ALIAS
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
@@ -447,14 +448,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using ThreeFry generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_threefry4x32_20*      state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_threefry4x32_20*      state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
 #ifdef ROCRAND_PREFER_CDF_OVER_ALIAS
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
@@ -470,14 +471,14 @@ __forceinline__ __device__ __host__ unsigned int
  * \p discrete_distribution using ThreeFry generator in \p state, and increments
  * the position of the generator by one.
  *
- * \param state - Pointer to a state to use
- * \param discrete_distribution - Related discrete distribution
+ * \param state Pointer to a state to use
+ * \param discrete_distribution Related discrete distribution
  *
  * \return <tt>unsigned int</tt> value distributed according to \p discrete_distribution
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_discrete(rocrand_state_threefry4x64_20*      state,
-                     const rocrand_discrete_distribution discrete_distribution)
+__forceinline__ __device__ __host__
+unsigned int rocrand_discrete(rocrand_state_threefry4x64_20*      state,
+                              const rocrand_discrete_distribution discrete_distribution)
 {
 #ifdef ROCRAND_PREFER_CDF_OVER_ALIAS
     return rocrand_device::detail::discrete_cdf(rocrand(state), *discrete_distribution);
diff --git a/library/include/rocrand/rocrand_lfsr113.h b/library/include/rocrand/rocrand_lfsr113.h
index 62844a699..05579bf72 100644
--- a/library/include/rocrand/rocrand_lfsr113.h
+++ b/library/include/rocrand/rocrand_lfsr113.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -230,12 +230,12 @@ typedef rocrand_device::lfsr113_engine rocrand_state_lfsr113;
  * Initializes the LFSR113 generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void
-    rocrand_init(const uint4 seed, const unsigned int subsequence, rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const uint4 seed, const unsigned int subsequence, rocrand_state_lfsr113* state)
 {
     *state = rocrand_state_lfsr113(seed, subsequence);
 }
@@ -246,15 +246,16 @@ __forceinline__ __device__ __host__ void
  * Initializes the LFSR113 generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const uint4              seed,
-                                                      const unsigned int       subsequence,
-                                                      const unsigned long long offset,
-                                                      rocrand_state_lfsr113*   state)
+__forceinline__ __device__ __host__
+void rocrand_init(const uint4              seed,
+                  const unsigned int       subsequence,
+                  const unsigned long long offset,
+                  rocrand_state_lfsr113*   state)
 {
     *state = rocrand_state_lfsr113(seed, subsequence, offset);
 }
@@ -267,11 +268,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const uint4              s
  * value from [0; 2^32 - 1] range using LFSR113 generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_lfsr113* state)
 {
     return state->next();
 }
@@ -281,11 +283,11 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_lfsr113*
  *
  * Updates the LFSR113 state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip; forwarded to the state's discard()
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long     offset,
-                                                   rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_lfsr113* state)
 {
     return state->discard(offset);
 }
@@ -296,11 +298,11 @@ __forceinline__ __device__ __host__ void skipahead(unsigned long long     offset
  * Updates the LFSR113 \p state to skip ahead by \p subsequence subsequences.
  * Each subsequence is 2^55 numbers long.
  *
- * \param subsequence - Number of subsequences to skip
- * \param state - Pointer to state to update
+ * \param subsequence Number of subsequences to skip; forwarded to discard_subsequence()
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned int           subsequence,
-                                                               rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+void skipahead_subsequence(unsigned int subsequence, rocrand_state_lfsr113* state)
 {
     return state->discard_subsequence(subsequence);
 }
@@ -311,11 +313,11 @@ __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned int
  * Updates the LFSR113 \p state to skip ahead by \p sequence sequences.
  * For LFSR113 each sequence is 2^55 numbers long (equal to the size of a subsequence).
  *
- * \param sequence - Number of sequences to skip
- * \param state - Pointer to state to update
+ * \param sequence Number of sequences to skip; handled as that many subsequences
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_sequence(unsigned int           sequence,
-                                                            rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+void skipahead_sequence(unsigned int sequence, rocrand_state_lfsr113* state)
 {
     return state->discard_subsequence(sequence);
 }
diff --git a/library/include/rocrand/rocrand_log_normal.h b/library/include/rocrand/rocrand_log_normal.h
index 068426b36..6d4457b69 100644
--- a/library/include/rocrand/rocrand_log_normal.h
+++ b/library/include/rocrand/rocrand_log_normal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -54,9 +54,9 @@
  * values, transforms them to log-normally distributed values, returns first of them, and saves
  * the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
@@ -88,15 +88,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
- * \return Two log-normally distributed \p float value as \p float2
+ * \return Two log-normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_philox4x32_10* state,
-                                                               float                        mean,
-                                                               float                        stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_philox4x32_10* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -116,15 +115,14 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_phi
  * The function uses the Box-Muller transform method to generate four normally distributed
  * values, transforms them to log-normally distributed values, and returns them.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
- * \return Four log-normally distributed \p float value as \p float4
+ * \return Four log-normally distributed \p float values as \p float4
  */
-__forceinline__ __device__ __host__ float4 rocrand_log_normal4(rocrand_state_philox4x32_10* state,
-                                                               float                        mean,
-                                                               float                        stddev)
+__forceinline__ __device__ __host__
+float4 rocrand_log_normal4(rocrand_state_philox4x32_10* state, float mean, float stddev)
 {
     float4 r = rocrand_device::detail::normal_distribution4(rocrand4(state));
     return float4 {
@@ -144,9 +142,9 @@ __forceinline__ __device__ __host__ float4 rocrand_log_normal4(rocrand_state_phi
  * \p double values, transforms them to log-normally distributed \p double values, returns
  * first of them, and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
@@ -174,14 +172,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_philox4x32_10* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_philox4x32_10* state, double mean, double stddev)
 {
     double2 r = rocrand_device::detail::normal_distribution_double2(rocrand4(state));
     return double2 {
@@ -198,14 +196,14 @@ __forceinline__ __device__ __host__ double2
  * The function uses the Box-Muller transform method to generate four normally distributed
  * values, transforms them to log-normally distributed values, and returns them.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Four log-normally distributed \p double values as \p double4
  */
-__forceinline__ __device__ __host__ double4
-    rocrand_log_normal_double4(rocrand_state_philox4x32_10* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double4 rocrand_log_normal_double4(rocrand_state_philox4x32_10* state, double mean, double stddev)
 {
     double2 r1, r2;
     r1 = rocrand_log_normal_double2(state, mean, stddev);
@@ -224,9 +222,9 @@ __forceinline__ __device__ __host__ double4
  * values, transforms them to log-normally distributed values, returns first of them,
  * and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
@@ -259,15 +257,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
- * \return Two log-normally distributed \p float value as \p float2
+ * \return Two log-normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_mrg31k3p* state,
-                                                               float                   mean,
-                                                               float                   stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_mrg31k3p* state, float mean, float stddev)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -286,9 +283,9 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_mrg
  * \p double values, transforms them to log-normally distributed \p double values, returns
  * first of them, and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
@@ -322,14 +319,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_mrg31k3p* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_mrg31k3p* state, double mean, double stddev)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -349,9 +346,9 @@ __forceinline__ __device__ __host__ double2
  * values, transforms them to log-normally distributed values, returns first of them,
  * and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
@@ -384,15 +381,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
- * \return Two log-normally distributed \p float value as \p float2
+ * \return Two log-normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_mrg32k3a* state,
-                                                               float                   mean,
-                                                               float                   stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_mrg32k3a* state, float mean, float stddev)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -414,9 +410,9 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_mrg
  * \p double values, transforms them to log-normally distributed \p double values, returns
  * first of them, and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
@@ -450,14 +446,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_mrg32k3a* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_mrg32k3a* state, double mean, double stddev)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -480,9 +476,9 @@ __forceinline__ __device__ __host__ double2
  * values, transforms them to log-normally distributed values, returns first of them,
  * and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
@@ -514,15 +510,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
- * \return Two log-normally distributed \p float value as \p float2
+ * \return Two log-normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_xorwow* state,
-                                                               float                 mean,
-                                                               float                 stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_xorwow* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -543,9 +538,9 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_xor
  * \p double values, transforms them to log-normally distributed \p double values, returns
  * first of them, and saves the second to be returned on the next call.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
@@ -581,15 +576,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2 rocrand_log_normal_double2(rocrand_state_xorwow* state,
-                                                                       double                mean,
-                                                                       double                stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_xorwow* state, double mean, double stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -611,14 +605,14 @@ __forceinline__ __device__ __host__ double2 rocrand_log_normal_double2(rocrand_s
  * Generates and returns a log-normally distributed \p float value using MTGP32
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ float
-    rocrand_log_normal(rocrand_state_mtgp32* state, float mean, float stddev)
+__forceinline__ __device__
+float rocrand_log_normal(rocrand_state_mtgp32* state, float mean, float stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -632,15 +626,14 @@ __forceinline__ __device__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
- * \return Two log-normally distributed \p float value as \p float2
+ * \return Two log-normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ float2 rocrand_log_normal2(rocrand_state_mtgp32* state,
-                                                      float                 mean,
-                                                      float                 stddev)
+__forceinline__ __device__
+float2 rocrand_log_normal2(rocrand_state_mtgp32* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -655,14 +648,14 @@ __forceinline__ __device__ float2 rocrand_log_normal2(rocrand_state_mtgp32* stat
  * Generates and returns a log-normally distributed \p double value using MTGP32
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ double
-    rocrand_log_normal_double(rocrand_state_mtgp32* state, double mean, double stddev)
+__forceinline__ __device__
+double rocrand_log_normal_double(rocrand_state_mtgp32* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -676,15 +669,14 @@ __forceinline__ __device__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ double2 rocrand_log_normal_double2(rocrand_state_mtgp32* state,
-                                                              double                mean,
-                                                              double                stddev)
+__forceinline__ __device__
+double2 rocrand_log_normal_double2(rocrand_state_mtgp32* state, double mean, double stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -702,14 +694,14 @@ __forceinline__ __device__ double2 rocrand_log_normal_double2(rocrand_state_mtgp
  * Generates and returns a log-normally distributed \p float value using SOBOL32
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_sobol32* state, float mean, float stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_sobol32* state, float mean, float stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -721,14 +713,14 @@ __forceinline__ __device__ __host__ float
  * Generates and returns a log-normally distributed \p double value using SOBOL32
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_sobol32* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_sobol32* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -740,14 +732,14 @@ __forceinline__ __device__ __host__ double
  * Generates and returns a log-normally distributed \p float value using SCRAMBLED_SOBOL32
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_scrambled_sobol32* state, float mean, float stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_scrambled_sobol32* state, float mean, float stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -759,14 +751,14 @@ __forceinline__ __device__ __host__ float
  * Generates and returns a log-normally distributed \p double value using SCRAMBLED_SOBOL32
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_scrambled_sobol32* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_scrambled_sobol32* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -778,14 +770,14 @@ __forceinline__ __device__ __host__ double
  * Generates and returns a log-normally distributed \p float value using SOBOL64
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_sobol64* state, float mean, float stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_sobol64* state, float mean, float stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -797,14 +789,14 @@ __forceinline__ __device__ __host__ float
  * Generates and returns a log-normally distributed \p double value using SOBOL64
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_sobol64* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_sobol64* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -816,14 +808,14 @@ __forceinline__ __device__ __host__ double
  * Generates and returns a log-normally distributed \p float value using SCRAMBLED_SOBOL64
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_scrambled_sobol64* state, float mean, float stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_scrambled_sobol64* state, float mean, float stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -835,14 +827,14 @@ __forceinline__ __device__ __host__ float
  * Generates and returns a log-normally distributed \p double value using SCRAMBLED_SOBOL64
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_scrambled_sobol64* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_scrambled_sobol64* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -854,14 +846,14 @@ __forceinline__ __device__ __host__ double
  * Generates and returns a log-normally distributed \p float value using LFSR113
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_lfsr113* state, float mean, float stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_lfsr113* state, float mean, float stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -875,15 +867,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_lfsr113* state,
-                                                               float                  mean,
-                                                               float                  stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_lfsr113* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -898,14 +889,14 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_lfs
  * Generates and returns a log-normally distributed \p double value using LFSR113
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_lfsr113* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_lfsr113* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -919,15 +910,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2 rocrand_log_normal_double2(rocrand_state_lfsr113* state,
-                                                                       double                 mean,
-                                                                       double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_lfsr113* state, double mean, double stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -945,14 +935,14 @@ __forceinline__ __device__ __host__ double2 rocrand_log_normal_double2(rocrand_s
  * Generates and returns a log-normally distributed \p float value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_threefry2x32_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_threefry2x32_20* state, double mean, double stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -966,15 +956,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_threefry2x32_20* state,
-                                                               float                          mean,
-                                                               float stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_threefry2x32_20* state, float mean, float stddev)
 {
     float2 r = rocrand_device::detail::normal_distribution2(rocrand2(state));
     return float2{expf(mean + (stddev * r.x)), expf(mean + (stddev * r.y))};
@@ -986,14 +975,14 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_thr
  * Generates and returns a log-normally distributed \p double value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_threefry2x32_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_threefry2x32_20* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -1007,14 +996,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_threefry2x32_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_threefry2x32_20* state, double mean, double stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1032,14 +1021,14 @@ __forceinline__ __device__ __host__ double2
  * Generates and returns a log-normally distributed \p float value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_threefry2x64_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_threefry2x64_20* state, double mean, double stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -1053,15 +1042,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_threefry2x64_20* state,
-                                                               float                          mean,
-                                                               float stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_threefry2x64_20* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1076,14 +1064,14 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_thr
  * Generates and returns a log-normally distributed \p double value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_threefry2x64_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_threefry2x64_20* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -1097,14 +1085,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_threefry2x64_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_threefry2x64_20* state, double mean, double stddev)
 {
     double2 r = rocrand_device::detail::normal_distribution_double2(rocrand2(state));
     return double2{exp(mean + (stddev * r.x)), exp(mean + (stddev * r.y))};
@@ -1116,14 +1104,14 @@ __forceinline__ __device__ __host__ double2
  * Generates and returns a log-normally distributed \p float value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_threefry4x32_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_threefry4x32_20* state, double mean, double stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -1137,15 +1125,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_threefry4x32_20* state,
-                                                               float                          mean,
-                                                               float stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_threefry4x32_20* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1160,14 +1147,14 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_thr
  * Generates and returns a log-normally distributed \p double value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_threefry4x32_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_threefry4x32_20* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -1181,14 +1168,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_threefry4x32_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_threefry4x32_20* state, double mean, double stddev)
 {
     double2 r = rocrand_device::detail::normal_distribution_double2(rocrand4(state));
     return double2{exp(mean + (stddev * r.x)), exp(mean + (stddev * r.y))};
@@ -1200,14 +1187,14 @@ __forceinline__ __device__ __host__ double2
  * Generates and returns a log-normally distributed \p float value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float
-    rocrand_log_normal(rocrand_state_threefry4x64_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+float rocrand_log_normal(rocrand_state_threefry4x64_20* state, double mean, double stddev)
 {
     float r = rocrand_device::detail::normal_distribution(rocrand(state));
     return expf(mean + (stddev * r));
@@ -1221,15 +1208,14 @@ __forceinline__ __device__ __host__ float
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_threefry4x64_20* state,
-                                                               float                          mean,
-                                                               float stddev)
+__forceinline__ __device__ __host__
+float2 rocrand_log_normal2(rocrand_state_threefry4x64_20* state, float mean, float stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1244,14 +1230,14 @@ __forceinline__ __device__ __host__ float2 rocrand_log_normal2(rocrand_state_thr
  * Generates and returns a log-normally distributed \p double value using Threefry
  * generator in \p state, and increments position of the generator by one.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Log-normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_log_normal_double(rocrand_state_threefry4x64_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double rocrand_log_normal_double(rocrand_state_threefry4x64_20* state, double mean, double stddev)
 {
     double r = rocrand_device::detail::normal_distribution_double(rocrand(state));
     return exp(mean + (stddev * r));
@@ -1265,14 +1251,14 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, transforms them to log-normally distributed values, and returns both.
  *
- * \param state  - Pointer to a state to use
- * \param mean   - Mean of the related log-normal distribution
- * \param stddev - Standard deviation of the related log-normal distribution
+ * \param state  Pointer to a state to use
+ * \param mean   Mean of the related log-normal distribution
+ * \param stddev Standard deviation of the related log-normal distribution
  *
  * \return Two log-normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_log_normal_double2(rocrand_state_threefry4x64_20* state, double mean, double stddev)
+__forceinline__ __device__ __host__
+double2 rocrand_log_normal_double2(rocrand_state_threefry4x64_20* state, double mean, double stddev)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
diff --git a/library/include/rocrand/rocrand_mrg31k3p.h b/library/include/rocrand/rocrand_mrg31k3p.h
index 38fe73db5..6bfb848bd 100644
--- a/library/include/rocrand/rocrand_mrg31k3p.h
+++ b/library/include/rocrand/rocrand_mrg31k3p.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -336,15 +336,16 @@ typedef rocrand_device::mrg31k3p_engine rocrand_state_mrg31k3p;
  * Initializes the MRG31K3P generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
-                                                      const unsigned long long subsequence,
-                                                      const unsigned long long offset,
-                                                      rocrand_state_mrg31k3p*  state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long seed,
+                  const unsigned long long subsequence,
+                  const unsigned long long offset,
+                  rocrand_state_mrg31k3p*  state)
 {
     *state = rocrand_state_mrg31k3p(seed, subsequence, offset);
 }
@@ -357,11 +358,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long s
  * value from [0; 2^32 - 1] range using MRG31K3P generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_mrg31k3p* state)
 {
     // next() in [1, ROCRAND_MRG31K3P_M1]
     return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG31K3P_UINT32_NORM);
@@ -372,11 +374,11 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg31k3p*
  *
  * Updates the MRG31K3P state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long      offset,
-                                                   rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_mrg31k3p* state)
 {
     return state->discard(offset);
 }
@@ -387,11 +389,11 @@ __forceinline__ __device__ __host__ void skipahead(unsigned long long      offse
  * Updates the MRG31K3P state in \p state to skip ahead by \p subsequence subsequences.
  * Each subsequence is 2^72 numbers long.
  *
- * \param subsequence - Number of subsequences to skip
- * \param state - Pointer to state to update
+ * \param subsequence Number of subsequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long      subsequence,
-                                                               rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg31k3p* state)
 {
     return state->discard_subsequence(subsequence);
 }
@@ -402,11 +404,11 @@ __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long lon
  * Updates the MRG31K3P state in \p state to skip ahead by \p sequence sequences.
  * Each sequence is 2^134 numbers long.
  *
- * \param sequence - Number of sequences to skip
- * \param state - Pointer to state to update
+ * \param sequence Number of sequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long      sequence,
-                                                            rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg31k3p* state)
 {
     return state->discard_sequence(sequence);
 }
diff --git a/library/include/rocrand/rocrand_mrg32k3a.h b/library/include/rocrand/rocrand_mrg32k3a.h
index 790584e65..4437ca59f 100644
--- a/library/include/rocrand/rocrand_mrg32k3a.h
+++ b/library/include/rocrand/rocrand_mrg32k3a.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -392,15 +392,16 @@ typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
  * Initializes the MRG32K3A generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
-                                                      const unsigned long long subsequence,
-                                                      const unsigned long long offset,
-                                                      rocrand_state_mrg32k3a*  state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long seed,
+                  const unsigned long long subsequence,
+                  const unsigned long long offset,
+                  rocrand_state_mrg32k3a*  state)
 {
     *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
 }
@@ -413,11 +414,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long s
  * value from [0; 2^32 - 1] range using MRG32K3A generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_mrg32k3a* state)
 {
     // next() in [1, ROCRAND_MRG32K3A_M1]
     return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
@@ -428,11 +430,11 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a*
  *
  * Updates the MRG32K3A state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long      offset,
-                                                   rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_mrg32k3a* state)
 {
     return state->discard(offset);
 }
@@ -443,11 +445,11 @@ __forceinline__ __device__ __host__ void skipahead(unsigned long long      offse
  * Updates the MRG32K3A state in \p state to skip ahead by \p subsequence subsequences.
  * Each subsequence is 2^76 numbers long.
  *
- * \param subsequence - Number of subsequences to skip
- * \param state - Pointer to state to update
+ * \param subsequence Number of subsequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long      subsequence,
-                                                               rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a* state)
 {
     return state->discard_subsequence(subsequence);
 }
@@ -458,11 +460,11 @@ __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long lon
  * Updates the MRG32K3A state in \p state to skip ahead by \p sequence sequences.
  * Each sequence is 2^127 numbers long.
  *
- * \param sequence - Number of sequences to skip
- * \param state - Pointer to state to update
+ * \param sequence Number of sequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long      sequence,
-                                                            rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a* state)
 {
     return state->discard_sequence(sequence);
 }
diff --git a/library/include/rocrand/rocrand_mtgp32.h b/library/include/rocrand/rocrand_mtgp32.h
index daf72889a..1208622bc 100644
--- a/library/include/rocrand/rocrand_mtgp32.h
+++ b/library/include/rocrand/rocrand_mtgp32.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -350,19 +350,20 @@ typedef rocrand_device::mtgp32_params mtgp32_params;
  * Initializes MTGP32 states on the host-side by allocating a state array in host
  * memory, initializes that array, and copies the result to device or host memory.
  *
- * \param state - Pointer to an array of states in device or host memory
- * \param params - Pointer to an array of type mtgp32_fast_params in host memory
- * \param n - Number of states to initialize
- * \param seed - Seed value
+ * \param state Pointer to an array of states in device or host memory
+ * \param params Pointer to an array of type mtgp32_fast_params in host memory
+ * \param n Number of states to initialize
+ * \param seed Seed value
  *
  * \return
  * - ROCRAND_STATUS_ALLOCATION_FAILED if states could not be initialized
  * - ROCRAND_STATUS_SUCCESS if states are initialized
  */
-__host__ inline rocrand_status rocrand_make_state_mtgp32(rocrand_state_mtgp32* state,
-                                                         mtgp32_fast_params    params[],
-                                                         int                   n,
-                                                         unsigned long long    seed)
+__host__
+inline rocrand_status rocrand_make_state_mtgp32(rocrand_state_mtgp32* state,
+                                                mtgp32_fast_params    params[],
+                                                int                   n,
+                                                unsigned long long    seed)
 {
     int i;
     rocrand_state_mtgp32 * h_state = (rocrand_state_mtgp32 *) malloc(sizeof(rocrand_state_mtgp32) * n);
@@ -405,15 +406,15 @@ __host__ inline rocrand_status rocrand_make_state_mtgp32(rocrand_state_mtgp32* s
  * NOTE: Not used as rocrand_make_state_mtgp32 handles loading parameters into
  * state.
  *
- * \param params - Pointer to an array of type mtgp32_fast_params in host memory
- * \param p - Pointer to a mtgp32_params structure allocated in device memory
+ * \param params Pointer to an array of type mtgp32_fast_params in host memory
+ * \param p Pointer to a mtgp32_params structure allocated in device memory
  *
  * \return
  * - ROCRAND_STATUS_ALLOCATION_FAILED if parameters could not be loaded
  * - ROCRAND_STATUS_SUCCESS if parameters are loaded
  */
-__host__ inline
-rocrand_status rocrand_make_constant(const mtgp32_fast_params params[], mtgp32_params * p)
+__host__
+inline rocrand_status rocrand_make_constant(const mtgp32_fast_params params[], mtgp32_params* p)
 {
     const int block_num = MTGP_BN_MAX;
     const int size1 = sizeof(uint32_t) * block_num;
@@ -488,11 +489,12 @@ rocrand_status rocrand_make_constant(const mtgp32_fast_params params[], mtgp32_p
  * value from [0; 2^32 - 1] range using MTGP32 generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ unsigned int rocrand(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+unsigned int rocrand(rocrand_state_mtgp32* state)
 {
     return state->next();
 }
@@ -524,12 +526,12 @@ __forceinline__ __device__ unsigned int rocrand(rocrand_state_mtgp32* state)
  * }
  * \endcode
  *
- * \param src - Pointer to a state to copy from
- * \param dest - Pointer to a state to copy to
+ * \param src Pointer to a state to copy from
+ * \param dest Pointer to a state to copy to
  *
  */
-__forceinline__ __device__ void rocrand_mtgp32_block_copy(rocrand_state_mtgp32* src,
-                                                          rocrand_state_mtgp32* dest)
+__forceinline__ __device__
+void rocrand_mtgp32_block_copy(rocrand_state_mtgp32* src, rocrand_state_mtgp32* dest)
 {
     dest->copy(src);
 }
@@ -537,11 +539,11 @@ __forceinline__ __device__ void rocrand_mtgp32_block_copy(rocrand_state_mtgp32*
 /**
  * \brief Changes parameters of a MTGP32 state.
  *
- * \param state - Pointer to a MTGP32 state
- * \param params - Pointer to new parameters
+ * \param state Pointer to a MTGP32 state
+ * \param params Pointer to new parameters
  */
-__forceinline__ __device__ void rocrand_mtgp32_set_params(rocrand_state_mtgp32* state,
-                                                          mtgp32_params*        params)
+__forceinline__ __device__
+void rocrand_mtgp32_set_params(rocrand_state_mtgp32* state, mtgp32_params* params)
 {
     state->set_params(params);
 }
diff --git a/library/include/rocrand/rocrand_normal.h b/library/include/rocrand/rocrand_normal.h
index e68720607..9f9e208e2 100644
--- a/library/include/rocrand/rocrand_normal.h
+++ b/library/include/rocrand/rocrand_normal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -356,7 +356,7 @@ __forceinline__ __device__ __host__ __half2 mrg_normal_distribution_half2(unsign
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
@@ -389,11 +389,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_philox4x3
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_philox4x32_10* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -411,11 +412,12 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_philox4
  * The function uses the Box-Muller transform method to generate four normally
  * distributed values, and returns them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Four normally distributed \p float value as \p float4
  */
-__forceinline__ __device__ __host__ float4 rocrand_normal4(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+float4 rocrand_normal4(rocrand_state_philox4x32_10* state)
 {
     return rocrand_device::detail::normal_distribution4(rocrand4(state));
 }
@@ -430,7 +432,7 @@ __forceinline__ __device__ __host__ float4 rocrand_normal4(rocrand_state_philox4
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
@@ -459,12 +461,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_p
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double values as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_normal_double2(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_philox4x32_10* state)
 {
     return rocrand_device::detail::normal_distribution_double2(rocrand4(state));
 }
@@ -479,12 +481,12 @@ __forceinline__ __device__ __host__ double2
  * The function uses the Box-Muller transform method to generate four normally
  * distributed values, and returns them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Four normally distributed \p double values as \p double4
  */
-__forceinline__ __device__ __host__ double4
-    rocrand_normal_double4(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+double4 rocrand_normal_double4(rocrand_state_philox4x32_10* state)
 {
     double2 r1, r2;
     r1 = rocrand_device::detail::normal_distribution_double2(rocrand4(state));
@@ -504,7 +506,7 @@ __forceinline__ __device__ __host__ double4
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
@@ -538,11 +540,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_mrg31k3p*
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_mrg31k3p* state)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -560,7 +563,7 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_mrg31k3
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
@@ -595,11 +598,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_m
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_mrg31k3p* state)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -618,7 +622,7 @@ __forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
@@ -652,11 +656,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_mrg32k3a*
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_mrg32k3a* state)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -674,7 +679,7 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_mrg32k3
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
@@ -709,11 +714,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_m
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_mrg32k3a* state)
 {
     auto state1 = state->next();
     auto state2 = state->next();
@@ -732,7 +738,7 @@ __forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
@@ -763,11 +769,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_xorwow* s
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_xorwow* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -784,7 +791,7 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_xorwow*
  * The function uses the Box-Muller transform method to generate two normally distributed
  * values, returns first of them, and saves the second to be returned on the next call.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
@@ -821,11 +828,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_x
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_xorwow* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -845,11 +853,12 @@ __forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ float rocrand_normal(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+float rocrand_normal(rocrand_state_mtgp32* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -864,11 +873,12 @@ __forceinline__ __device__ float rocrand_normal(rocrand_state_mtgp32* state)
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float values as \p float2
  */
-__forceinline__ __device__ float2 rocrand_normal2(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+float2 rocrand_normal2(rocrand_state_mtgp32* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -883,11 +893,12 @@ __forceinline__ __device__ float2 rocrand_normal2(rocrand_state_mtgp32* state)
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ double rocrand_normal_double(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+double rocrand_normal_double(rocrand_state_mtgp32* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -902,11 +913,12 @@ __forceinline__ __device__ double rocrand_normal_double(rocrand_state_mtgp32* st
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ double2 rocrand_normal_double2(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+double2 rocrand_normal_double2(rocrand_state_mtgp32* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -925,11 +937,12 @@ __forceinline__ __device__ double2 rocrand_normal_double2(rocrand_state_mtgp32*
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_sobol32* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -942,11 +955,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_sobol32*
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_sobol32* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -959,11 +973,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_s
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_scrambled_sobol32* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -976,12 +991,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_scrambled
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_normal_double(rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_scrambled_sobol32* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -994,11 +1009,12 @@ __forceinline__ __device__ __host__ double
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_sobol64* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_sobol64* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1011,11 +1027,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_sobol64*
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_sobol64* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_sobol64* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1028,11 +1045,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_s
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_scrambled_sobol64* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1045,12 +1063,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_scrambled
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_normal_double(rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_scrambled_sobol64* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1063,11 +1081,12 @@ __forceinline__ __device__ __host__ double
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_lfsr113* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1082,11 +1101,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_lfsr113*
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_lfsr113* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1102,11 +1122,12 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_lfsr113
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_lfsr113* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1121,11 +1142,12 @@ __forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_l
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_lfsr113* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1144,11 +1166,12 @@ __forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_threefry2x32_20* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1163,11 +1186,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry2
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_threefry2x32_20* state)
 {
     return rocrand_device::detail::normal_distribution2(rocrand2(state));
 }
@@ -1180,12 +1204,12 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefr
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_normal_double(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_threefry2x32_20* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1200,12 +1224,12 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_normal_double2(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_threefry2x32_20* state)
 {
     auto state1 = rocrand2(state);
     auto state2 = rocrand2(state);
@@ -1222,11 +1246,12 @@ __forceinline__ __device__ __host__ double2
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_threefry2x64_20* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1241,11 +1266,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry2
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_threefry2x64_20* state)
 {
     return rocrand_device::detail::normal_distribution2(rocrand(state));
 }
@@ -1258,12 +1284,12 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefr
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_normal_double(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_threefry2x64_20* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1278,12 +1304,12 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_normal_double2(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_threefry2x64_20* state)
 {
     return rocrand_device::detail::normal_distribution_double2(rocrand2(state));
 }
@@ -1296,11 +1322,12 @@ __forceinline__ __device__ __host__ double2
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_threefry4x32_20* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1315,11 +1342,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry4
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_threefry4x32_20* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1335,12 +1363,12 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefr
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_normal_double(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_threefry4x32_20* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1355,12 +1383,12 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_normal_double2(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_threefry4x32_20* state)
 {
     return rocrand_device::detail::normal_distribution_double2(rocrand4(state));
 }
@@ -1373,11 +1401,12 @@ __forceinline__ __device__ __host__ double2
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p float value
  */
-__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+float rocrand_normal(rocrand_state_threefry4x64_20* state)
 {
     return rocrand_device::detail::normal_distribution(rocrand(state));
 }
@@ -1392,11 +1421,12 @@ __forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_threefry4
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p float value as \p float2
  */
-__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+float2 rocrand_normal2(rocrand_state_threefry4x64_20* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -1412,12 +1442,12 @@ __forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_threefr
  * Used normal distribution has mean value equal to 0.0f, and standard deviation
  * equal to 1.0f.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Normally distributed \p double value
  */
-__forceinline__ __device__ __host__ double
-    rocrand_normal_double(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+double rocrand_normal_double(rocrand_state_threefry4x64_20* state)
 {
     return rocrand_device::detail::normal_distribution_double(rocrand(state));
 }
@@ -1432,12 +1462,12 @@ __forceinline__ __device__ __host__ double
  * The function uses the Box-Muller transform method to generate two normally
  * distributed values, and returns both of them.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two normally distributed \p double value as \p double2
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_normal_double2(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+double2 rocrand_normal_double2(rocrand_state_threefry4x64_20* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
diff --git a/library/include/rocrand/rocrand_philox4x32_10.h b/library/include/rocrand/rocrand_philox4x32_10.h
index 346055114..300fa086d 100644
--- a/library/include/rocrand/rocrand_philox4x32_10.h
+++ b/library/include/rocrand/rocrand_philox4x32_10.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -341,15 +341,16 @@ typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
  * Initializes the Philox generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long     seed,
-                                                      const unsigned long long     subsequence,
-                                                      const unsigned long long     offset,
-                                                      rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long     seed,
+                  const unsigned long long     subsequence,
+                  const unsigned long long     offset,
+                  rocrand_state_philox4x32_10* state)
 {
     *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
 }
@@ -362,11 +363,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long
  * value from [0; 2^32 - 1] range using Philox generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_philox4x32_10* state)
 {
     return state->next();
 }
@@ -379,11 +381,12 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x3
  * values from [0; 2^32 - 1] range using Philox generator in \p state.
  * State is incremented by four positions.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Four pseudorandom values (32-bit) as an <tt>uint4</tt>
  */
-__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+uint4 rocrand4(rocrand_state_philox4x32_10* state)
 {
     return state->next4();
 }
@@ -393,11 +396,11 @@ __forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10*
  *
  * Updates the Philox generator state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long           offset,
-                                                   rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_philox4x32_10* state)
 {
     return state->discard(offset);
 }
@@ -408,11 +411,11 @@ __forceinline__ __device__ __host__ void skipahead(unsigned long long
  * Updates the Philox generator state in \p state to skip ahead by \p subsequence subsequences.
  * Each subsequence is 4 * 2^64 numbers long.
  *
- * \param subsequence - Number of subsequences to skip
- * \param state - Pointer to state to update
+ * \param subsequence Number of subsequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence,
-                                                               rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10* state)
 {
     return state->discard_subsequence(subsequence);
 }
@@ -423,11 +426,11 @@ __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long lon
  * Updates the Philox generator state in \p state skipping \p sequence sequences ahead.
  * For Philox each sequence is 4 * 2^64 numbers long (equal to the size of a subsequence).
  *
- * \param sequence - Number of sequences to skip
- * \param state - Pointer to state to update
+ * \param sequence Number of sequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long           sequence,
-                                                            rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10* state)
 {
     return state->discard_subsequence(sequence);
 }
diff --git a/library/include/rocrand/rocrand_poisson.h b/library/include/rocrand/rocrand_poisson.h
index d539b68d6..1fc3dc956 100644
--- a/library/include/rocrand/rocrand_poisson.h
+++ b/library/include/rocrand/rocrand_poisson.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -225,8 +225,8 @@ __forceinline__ __device__ __host__ Result_Type poisson_distribution_inv(State&
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using Philox generator in \p state. State is incremented by a variable amount.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
@@ -245,13 +245,13 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_p
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using Philox generator in \p state. State is incremented by a variable amount.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Four Poisson-distributed <tt>unsigned int</tt> values as \p uint4
  */
-__forceinline__ __device__ __host__ uint4 rocrand_poisson4(rocrand_state_philox4x32_10* state,
-                                                           double                       lambda)
+__forceinline__ __device__ __host__
+uint4 rocrand_poisson4(rocrand_state_philox4x32_10* state, double lambda)
 {
     return uint4{
         rocrand_device::detail::poisson_distribution<rocrand_state_philox4x32_10*, unsigned int>(
@@ -275,8 +275,8 @@ __forceinline__ __device__ __host__ uint4 rocrand_poisson4(rocrand_state_philox4
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using MRG31k3p generator in \p state. State is incremented by a variable amount.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
@@ -296,8 +296,8 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_m
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using MRG32k3a generator in \p state. State is incremented by a variable amount.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
@@ -317,8 +317,8 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_m
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using XORWOW generator in \p state. State is incremented by a variable amount.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
@@ -338,13 +338,13 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_x
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using MTGP32 generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_mtgp32* state,
-                                                                 double                lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_mtgp32* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv<rocrand_state_mtgp32*, unsigned int>(
         state,
@@ -357,13 +357,13 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_m
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using SOBOL32 generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_sobol32* state,
-                                                                 double                 lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_sobol32* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv<rocrand_state_sobol32*, unsigned int>(
         state,
@@ -376,13 +376,13 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_s
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using SCRAMBLED_SOBOL32 generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_poisson(rocrand_state_scrambled_sobol32* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_scrambled_sobol32* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv<rocrand_state_scrambled_sobol32*,
                                                             unsigned int>(state, lambda);
@@ -394,13 +394,13 @@ __forceinline__ __device__ __host__ unsigned int
  * Generates and returns Poisson-distributed distributed random <tt>unsigned long long int</tt>
  * values using SOBOL64 generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned long long int</tt>
  */
-__forceinline__ __device__ __host__ unsigned long long int
-    rocrand_poisson(rocrand_state_sobol64* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned long long int rocrand_poisson(rocrand_state_sobol64* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv<rocrand_state_sobol64*,
                                                             unsigned long long int>(state, lambda);
@@ -412,13 +412,13 @@ __forceinline__ __device__ __host__ unsigned long long int
  * Generates and returns Poisson-distributed distributed random <tt>unsigned long long int</tt>
  * values using SCRAMBLED_SOBOL64 generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned long long int</tt>
  */
-__forceinline__ __device__ __host__ unsigned long long int
-    rocrand_poisson(rocrand_state_scrambled_sobol64* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned long long int rocrand_poisson(rocrand_state_scrambled_sobol64* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv<rocrand_state_scrambled_sobol64*,
                                                             unsigned long long int>(state, lambda);
@@ -430,13 +430,13 @@ __forceinline__ __device__ __host__ unsigned long long int
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using LFSR113 generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_lfsr113* state,
-                                                                 double                 lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_lfsr113* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv<rocrand_state_lfsr113*, unsigned int>(
         state,
@@ -449,13 +449,13 @@ __forceinline__ __device__ __host__ unsigned int rocrand_poisson(rocrand_state_l
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using ThreeFry generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_poisson(rocrand_state_threefry2x32_20* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_threefry2x32_20* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv(state, lambda);
 }
@@ -466,13 +466,13 @@ __forceinline__ __device__ __host__ unsigned int
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using ThreeFry generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_poisson(rocrand_state_threefry2x64_20* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_threefry2x64_20* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv(state, lambda);
 }
@@ -483,13 +483,13 @@ __forceinline__ __device__ __host__ unsigned int
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using ThreeFry generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_poisson(rocrand_state_threefry4x32_20* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_threefry4x32_20* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv(state, lambda);
 }
@@ -500,13 +500,13 @@ __forceinline__ __device__ __host__ unsigned int
  * Generates and returns Poisson-distributed distributed random <tt>unsigned int</tt>
  * values using ThreeFry generator in \p state. State is incremented by one position.
  *
- * \param state - Pointer to a state to use
- * \param lambda - Lambda parameter of the Poisson distribution
+ * \param state Pointer to a state to use
+ * \param lambda Lambda parameter of the Poisson distribution
  *
  * \return Poisson-distributed <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int
-    rocrand_poisson(rocrand_state_threefry4x64_20* state, double lambda)
+__forceinline__ __device__ __host__
+unsigned int rocrand_poisson(rocrand_state_threefry4x64_20* state, double lambda)
 {
     return rocrand_device::detail::poisson_distribution_inv(state, lambda);
 }
diff --git a/library/include/rocrand/rocrand_scrambled_sobol32.h b/library/include/rocrand/rocrand_scrambled_sobol32.h
index 1accefc54..67e6d7e2d 100644
--- a/library/include/rocrand/rocrand_scrambled_sobol32.h
+++ b/library/include/rocrand/rocrand_scrambled_sobol32.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -106,15 +106,16 @@ typedef rocrand_device::scrambled_sobol32_engine<false> rocrand_state_scrambled_
  * Initializes the scrambled_sobol32 generator \p state with the given
  * direction \p vectors and \p offset.
  *
- * \param vectors - Direction vectors
- * \param scramble_constant - Constant used for scrambling the sequence
- * \param offset - Absolute offset into sequence
- * \param state - Pointer to state to initialize
+ * \param vectors Direction vectors
+ * \param scramble_constant Constant used for scrambling the sequence
+ * \param offset Absolute offset into sequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned int* vectors,
-                                                      const unsigned int  scramble_constant,
-                                                      const unsigned int  offset,
-                                                      rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned int*              vectors,
+                  const unsigned int               scramble_constant,
+                  const unsigned int               offset,
+                  rocrand_state_scrambled_sobol32* state)
 {
     *state = rocrand_state_scrambled_sobol32(vectors, scramble_constant, offset);
 }
@@ -127,11 +128,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned int* vector
  * value from [0; 2^32 - 1] range using scrambled_sobol32 generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Quasirandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_scrambled_sobol32* state)
 {
     return state->next();
 }
@@ -141,11 +143,11 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_scrambled
  *
  * Updates the SCRAMBLED_SOBOL32 state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long               offset,
-                                                   rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_scrambled_sobol32* state)
 {
     return state->discard(offset);
 }
diff --git a/library/include/rocrand/rocrand_scrambled_sobol64.h b/library/include/rocrand/rocrand_scrambled_sobol64.h
index b685a7f7e..e44d1f08c 100644
--- a/library/include/rocrand/rocrand_scrambled_sobol64.h
+++ b/library/include/rocrand/rocrand_scrambled_sobol64.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -106,16 +106,16 @@ typedef rocrand_device::scrambled_sobol64_engine<false> rocrand_state_scrambled_
  * Initializes the scrambled_sobol64 generator \p state with the given
  * direction \p vectors and \p offset.
  *
- * \param vectors - Direction vectors
- * \param scramble_constant - Constant used for scrambling the sequence
- * \param offset - Absolute offset into sequence
- * \param state - Pointer to state to initialize
+ * \param vectors Direction vectors
+ * \param scramble_constant Constant used for scrambling the sequence
+ * \param offset Absolute offset into sequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void
-    rocrand_init(const unsigned long long int*    vectors,
-                 const unsigned long long int     scramble_constant,
-                 const unsigned int               offset,
-                 rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long int*    vectors,
+                  const unsigned long long int     scramble_constant,
+                  const unsigned int               offset,
+                  rocrand_state_scrambled_sobol64* state)
 {
     *state = rocrand_state_scrambled_sobol64(vectors, scramble_constant, offset);
 }
@@ -128,12 +128,12 @@ __forceinline__ __device__ __host__ void
  * value from [0; 2^64 - 1] range using scrambled_sobol64 generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Quasirandom value (64-bit) as an <tt>unsigned long long int</tt>
  */
-__forceinline__ __device__ __host__ unsigned long long int
-    rocrand(rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+unsigned long long int rocrand(rocrand_state_scrambled_sobol64* state)
 {
     return state->next();
 }
@@ -143,11 +143,11 @@ __forceinline__ __device__ __host__ unsigned long long int
  *
  * Updates the scrambled_sobol64 state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long               offset,
-                                                   rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_scrambled_sobol64* state)
 {
     return state->discard(offset);
 }
diff --git a/library/include/rocrand/rocrand_sobol32.h b/library/include/rocrand/rocrand_sobol32.h
index d46feeb1f..4a0282618 100644
--- a/library/include/rocrand/rocrand_sobol32.h
+++ b/library/include/rocrand/rocrand_sobol32.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -200,13 +200,14 @@ typedef rocrand_device::sobol32_engine<false> rocrand_state_sobol32;
  * Initializes the SOBOL32 generator \p state with the given
  * direction \p vectors and \p offset.
  *
- * \param vectors - Direction vectors
- * \param offset - Absolute offset into sequence
- * \param state - Pointer to state to initialize
+ * \param vectors Direction vectors
+ * \param offset Absolute offset into sequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned int*    vectors,
-                                                      const unsigned int     offset,
-                                                      rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned int*    vectors,
+                  const unsigned int     offset,
+                  rocrand_state_sobol32* state)
 {
     *state = rocrand_state_sobol32(vectors, offset);
 }
@@ -219,11 +220,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned int*    vec
  * value from [0; 2^32 - 1] range using Sobol32 generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Quasirandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_sobol32* state)
 {
     return state->next();
 }
@@ -233,11 +235,11 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_sobol32*
  *
  * Updates the SOBOL32 state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long     offset,
-                                                   rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_sobol32* state)
 {
     return state->discard(offset);
 }
diff --git a/library/include/rocrand/rocrand_sobol64.h b/library/include/rocrand/rocrand_sobol64.h
index 23bebe37e..5ee401281 100644
--- a/library/include/rocrand/rocrand_sobol64.h
+++ b/library/include/rocrand/rocrand_sobol64.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -201,13 +201,14 @@ typedef rocrand_device::sobol64_engine<false> rocrand_state_sobol64;
  * Initializes the sobol64 generator \p state with the given
  * direction \p vectors and \p offset.
  *
- * \param vectors - Direction vectors
- * \param offset - Absolute offset into sequence
- * \param state - Pointer to state to initialize
+ * \param vectors Direction vectors
+ * \param offset Absolute offset into sequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long int* vectors,
-                                                      const unsigned int            offset,
-                                                      rocrand_state_sobol64*        state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long int* vectors,
+                  const unsigned int            offset,
+                  rocrand_state_sobol64*        state)
 {
     *state = rocrand_state_sobol64(vectors, offset);
 }
@@ -220,11 +221,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long i
  * value from [0; 2^64 - 1] range using sobol64 generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Quasirandom value (64-bit) as an <tt>unsigned long long int</tt>
  */
-__forceinline__ __device__ __host__ unsigned long long int rocrand(rocrand_state_sobol64* state)
+__forceinline__ __device__ __host__
+unsigned long long int rocrand(rocrand_state_sobol64* state)
 {
     return state->next();
 }
@@ -234,11 +236,11 @@ __forceinline__ __device__ __host__ unsigned long long int rocrand(rocrand_state
  *
  * Updates the sobol64 state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long int offset,
-                                                   rocrand_state_sobol64* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long int offset, rocrand_state_sobol64* state)
 {
     return state->discard(offset);
 }
diff --git a/library/include/rocrand/rocrand_threefry2x32_20.h b/library/include/rocrand/rocrand_threefry2x32_20.h
index 201a65298..58c22f303 100644
--- a/library/include/rocrand/rocrand_threefry2x32_20.h
+++ b/library/include/rocrand/rocrand_threefry2x32_20.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -109,15 +109,16 @@ typedef rocrand_device::threefry2x32_20_engine rocrand_state_threefry2x32_20;
  * Initializes the Threefry generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long       seed,
-                                                      const unsigned long long       subsequence,
-                                                      const unsigned long long       offset,
-                                                      rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long       seed,
+                  const unsigned long long       subsequence,
+                  const unsigned long long       offset,
+                  rocrand_state_threefry2x32_20* state)
 {
     *state = rocrand_state_threefry2x32_20(seed, subsequence, offset);
 }
@@ -132,11 +133,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long
  *
  * Threefry2x32 has a period of 2 ^ 64 numbers.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_threefry2x32_20* state)
 {
     return state->next();
 }
@@ -149,11 +151,12 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_threefry2
  * values from [0; 2^32 - 1] range using Threefry generator in \p state.
  * State is incremented by two positions.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two pseudorandom values (32-bit) as an <tt>uint2</tt>
  */
-__forceinline__ __device__ __host__ uint2 rocrand2(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+uint2 rocrand2(rocrand_state_threefry2x32_20* state)
 {
     return state->next2();
 }
diff --git a/library/include/rocrand/rocrand_threefry2x64_20.h b/library/include/rocrand/rocrand_threefry2x64_20.h
index 698f6672e..92cd3bb63 100644
--- a/library/include/rocrand/rocrand_threefry2x64_20.h
+++ b/library/include/rocrand/rocrand_threefry2x64_20.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -109,15 +109,16 @@ typedef rocrand_device::threefry2x64_20_engine rocrand_state_threefry2x64_20;
  * Initializes the Threefry generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long       seed,
-                                                      const unsigned long long       subsequence,
-                                                      const unsigned long long       offset,
-                                                      rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long       seed,
+                  const unsigned long long       subsequence,
+                  const unsigned long long       offset,
+                  rocrand_state_threefry2x64_20* state)
 {
     *state = rocrand_state_threefry2x64_20(seed, subsequence, offset);
 }
@@ -132,11 +133,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long
  *
  * Threefry2x64 has a period of 2 ^ 128 numbers.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (64-bit) as an <tt>unsigned long long</tt>
  */
-__forceinline__ __device__ __host__ unsigned long long rocrand(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+unsigned long long rocrand(rocrand_state_threefry2x64_20* state)
 {
     return state->next();
 }
@@ -149,11 +151,12 @@ __forceinline__ __device__ __host__ unsigned long long rocrand(rocrand_state_thr
  * values from [0; 2^64 - 1] range using Threefry generator in \p state.
  * State is incremented by two positions.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two pseudorandom values (64-bit) as an <tt>ulonglong2</tt>
  */
-__forceinline__ __device__ __host__ ulonglong2 rocrand2(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+ulonglong2 rocrand2(rocrand_state_threefry2x64_20* state)
 {
     return state->next2();
 }
diff --git a/library/include/rocrand/rocrand_threefry4x32_20.h b/library/include/rocrand/rocrand_threefry4x32_20.h
index b57753fab..ba8819bd5 100644
--- a/library/include/rocrand/rocrand_threefry4x32_20.h
+++ b/library/include/rocrand/rocrand_threefry4x32_20.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -111,15 +111,16 @@ typedef rocrand_device::threefry4x32_20_engine rocrand_state_threefry4x32_20;
  * Initializes the Threefry generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long       seed,
-                                                      const unsigned long long       subsequence,
-                                                      const unsigned long long       offset,
-                                                      rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long       seed,
+                  const unsigned long long       subsequence,
+                  const unsigned long long       offset,
+                  rocrand_state_threefry4x32_20* state)
 {
     *state = rocrand_state_threefry4x32_20(seed, subsequence, offset);
 }
@@ -134,11 +135,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long
  *
  * Threefry4x32 has a period of 2 ^ 128 numbers.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_threefry4x32_20* state)
 {
     return state->next();
 }
@@ -151,11 +153,12 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_threefry4
  * values from [0; 2^32 - 1] range using Threefry generator in \p state.
  * State is incremented by four positions.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
- * \return Four pseudorandom values (32-bit) as an <tt>uint2</tt>
+ * \return Four pseudorandom values (32-bit) as an <tt>uint4</tt>
  */
-__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+uint4 rocrand4(rocrand_state_threefry4x32_20* state)
 {
     return state->next4();
 }
diff --git a/library/include/rocrand/rocrand_threefry4x64_20.h b/library/include/rocrand/rocrand_threefry4x64_20.h
index 6d4a29bee..37251fbdf 100644
--- a/library/include/rocrand/rocrand_threefry4x64_20.h
+++ b/library/include/rocrand/rocrand_threefry4x64_20.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -109,15 +109,16 @@ typedef rocrand_device::threefry4x64_20_engine rocrand_state_threefry4x64_20;
  * Initializes the Threefry generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long       seed,
-                                                      const unsigned long long       subsequence,
-                                                      const unsigned long long       offset,
-                                                      rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long       seed,
+                  const unsigned long long       subsequence,
+                  const unsigned long long       offset,
+                  rocrand_state_threefry4x64_20* state)
 {
     *state = rocrand_state_threefry4x64_20(seed, subsequence, offset);
 }
@@ -132,11 +133,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long
  *
  * Threefry4x64 has a period of 2 ^ 256 numbers.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (64-bit) as an <tt>unsigned long long</tt>
  */
-__forceinline__ __device__ __host__ unsigned long long rocrand(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+unsigned long long rocrand(rocrand_state_threefry4x64_20* state)
 {
     return state->next();
 }
@@ -149,11 +151,12 @@ __forceinline__ __device__ __host__ unsigned long long rocrand(rocrand_state_thr
  * values from [0; 2^64 - 1] range using Threefry generator in \p state.
  * State is incremented by four positions.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Four pseudorandom values (64-bit) as an <tt>ulonglong4</tt>
  */
-__forceinline__ __device__ __host__ ulonglong4 rocrand4(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+ulonglong4 rocrand4(rocrand_state_threefry4x64_20* state)
 {
     return state->next4();
 }
diff --git a/library/include/rocrand/rocrand_uniform.h b/library/include/rocrand/rocrand_uniform.h
index 2bf772b57..7eea5d998 100644
--- a/library/include/rocrand/rocrand_uniform.h
+++ b/library/include/rocrand/rocrand_uniform.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -229,11 +229,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0f, including \p 1.0f) using Philox generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_philox4x32_10* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -246,11 +247,12 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_philox4x
  * (excluding \p 0.0f, including \p 1.0f) using Philox generator in \p state, and
  * increments position of the generator by two.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two uniformly distributed \p float values from (0; 1] range as \p float2.
  */
-__forceinline__ __device__ __host__ float2 rocrand_uniform2(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+float2 rocrand_uniform2(rocrand_state_philox4x32_10* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -269,11 +271,12 @@ __forceinline__ __device__ __host__ float2 rocrand_uniform2(rocrand_state_philox
  * (excluding \p 0.0f, including \p 1.0f) using Philox generator in \p state, and
  * increments position of the generator by four.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Four uniformly distributed \p float values from (0; 1] range as \p float4.
  */
-__forceinline__ __device__ __host__ float4 rocrand_uniform4(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+float4 rocrand_uniform4(rocrand_state_philox4x32_10* state)
 {
     return rocrand_device::detail::uniform_distribution4(rocrand4(state));
 }
@@ -286,12 +289,12 @@ __forceinline__ __device__ __host__ float4 rocrand_uniform4(rocrand_state_philox
  * (excluding \p 0.0, including \p 1.0) using Philox generator in \p state, and
  * increments position of the generator by two.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_philox4x32_10* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -307,12 +310,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0, including \p 1.0) using Philox generator in \p state, and
  * increments position of the generator by four.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Two uniformly distributed \p double values from (0; 1] range as \p double2.
  */
-__forceinline__ __device__ __host__ double2
-    rocrand_uniform_double2(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+double2 rocrand_uniform_double2(rocrand_state_philox4x32_10* state)
 {
     return rocrand_device::detail::uniform_distribution_double2(rocrand4(state));
 }
@@ -325,12 +328,12 @@ __forceinline__ __device__ __host__ double2
  * (excluding \p 0.0, including \p 1.0) using Philox generator in \p state, and
  * increments position of the generator by eight.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Four uniformly distributed \p double values from (0; 1] range as \p double4.
  */
-__forceinline__ __device__ __host__ double4
-    rocrand_uniform_double4(rocrand_state_philox4x32_10* state)
+__forceinline__ __device__ __host__
+double4 rocrand_uniform_double4(rocrand_state_philox4x32_10* state)
 {
     return rocrand_device::detail::uniform_distribution_double4(rocrand4(state), rocrand4(state));
 }
@@ -343,11 +346,12 @@ __forceinline__ __device__ __host__ double4
  * (excluding \p 0.0f, including \p 1.0f) using MRG31K3P generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_mrg31k3p* state)
 {
     return rocrand_device::detail::mrg_uniform_distribution<rocrand_state_mrg31k3p>(state->next());
 }
@@ -360,20 +364,21 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_mrg31k3p
  * (excluding \p 0.0, including \p 1.0) using MRG31K3P generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_mrg31k3p* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_mrg31k3p* state)
 {
     return rocrand_device::detail::mrg_uniform_distribution_double<rocrand_state_mrg31k3p>(
         state->next());
 }
 
- /**
+/**
  * \brief Returns a uniformly distributed random <tt>float</tt> value
  * from (0; 1] range.
  *
@@ -381,16 +386,17 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
  * (excluding \p 0.0f, including \p 1.0f) using MRG32K3A generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_mrg32k3a* state)
 {
     return rocrand_device::detail::mrg_uniform_distribution<rocrand_state_mrg32k3a>(state->next());
 }
 
- /**
+/**
  * \brief Returns a uniformly distributed random <tt>double</tt> value
  * from (0; 1] range.
  *
@@ -398,20 +404,21 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_mrg32k3a
  * (excluding \p 0.0, including \p 1.0) using MRG32K3A generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_mrg32k3a* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_mrg32k3a* state)
 {
     return rocrand_device::detail::mrg_uniform_distribution_double<rocrand_state_mrg32k3a>(
         state->next());
 }
 
- /**
+/**
  * \brief Returns a uniformly distributed random <tt>float</tt> value
  * from (0; 1] range.
  *
@@ -419,16 +426,17 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
  * (excluding \p 0.0f, including \p 1.0f) using XORWOW generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_xorwow* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
 
- /**
+/**
  * \brief Returns a uniformly distributed random <tt>double</tt> value
  * from (0; 1] range.
  *
@@ -436,11 +444,12 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_xorwow*
- * (excluding \p 0.0, including \p 1.0) using MRG32K3A generator in \p state, and
+ * (excluding \p 0.0, including \p 1.0) using XORWOW generator in \p state, and
  * increments position of the generator by two.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_xorwow* state)
 {
     auto state1 = rocrand(state);
     auto state2 = rocrand(state);
@@ -448,7 +457,7 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
     return rocrand_device::detail::uniform_distribution_double(state1, state2);
 }
 
- /**
+/**
  * \brief Returns a uniformly distributed random <tt>float</tt> value
  * from (0; 1] range.
  *
@@ -456,11 +465,12 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
  * (excluding \p 0.0f, including \p 1.0f) using MTGP32 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ float rocrand_uniform(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+float rocrand_uniform(rocrand_state_mtgp32* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -473,19 +483,20 @@ __forceinline__ __device__ float rocrand_uniform(rocrand_state_mtgp32* state)
  * (excluding \p 0.0, including \p 1.0) using MTGP32 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ double rocrand_uniform_double(rocrand_state_mtgp32* state)
+__forceinline__ __device__
+double rocrand_uniform_double(rocrand_state_mtgp32* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
 
- /**
+/**
  * \brief Returns a uniformly distributed random <tt>float</tt> value
  * from (0; 1] range.
  *
@@ -493,11 +504,12 @@ __forceinline__ __device__ double rocrand_uniform_double(rocrand_state_mtgp32* s
  * (excluding \p 0.0f, including \p 1.0f) using SOBOL32 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_sobol32* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -510,14 +522,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_sobol32*
  * (excluding \p 0.0, including \p 1.0) using SOBOL32 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_sobol32* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_sobol32* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -530,11 +543,12 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
  * (excluding \p 0.0f, including \p 1.0f) using SCRAMBLED_SOBOL32 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_scrambled_sobol32* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -547,15 +561,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_scramble
  * (excluding \p 0.0, including \p 1.0) using SCRAMBLED_SOBOL32 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_scrambled_sobol32* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_scrambled_sobol32* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -568,11 +582,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0, including \p 1.0) using SOBOL64 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_sobol64* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_sobol64* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -585,11 +600,12 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_sobol64*
  * (excluding \p 0.0, including \p 1.0) using SOBOL64 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_sobol64* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_sobol64* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -602,11 +618,12 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
  * (excluding \p 0.0, including \p 1.0) using SCRAMBLED_SOBOL64 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_scrambled_sobol64* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -619,12 +636,12 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_scramble
  * (excluding \p 0.0, including \p 1.0) using SCRAMBLED_SOBOL64 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_scrambled_sobol64* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_scrambled_sobol64* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -637,11 +654,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0f, including \p 1.0f) using LFSR113 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_lfsr113* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -654,14 +672,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_lfsr113*
  * (excluding \p 0.0, including \p 1.0) using LFSR113 generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_lfsr113* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_lfsr113* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -674,11 +693,12 @@ __forceinline__ __device__ __host__ double rocrand_uniform_double(rocrand_state_
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_threefry2x32_20* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -691,15 +711,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_threefry2x32_20* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_threefry2x32_20* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -712,11 +732,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_threefry2x64_20* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -729,15 +750,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_threefry2x64_20* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_threefry2x64_20* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -750,11 +771,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_threefry4x32_20* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -767,15 +789,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_threefry4x32_20* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_threefry4x32_20* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
@@ -788,11 +810,12 @@ __forceinline__ __device__ __host__ double
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Uniformly distributed \p float value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+float rocrand_uniform(rocrand_state_threefry4x64_20* state)
 {
     return rocrand_device::detail::uniform_distribution(rocrand(state));
 }
@@ -805,15 +828,15 @@ __forceinline__ __device__ __host__ float rocrand_uniform(rocrand_state_threefry
  * (excluding \p 0.0, including \p 1.0) using ThreeFry generator in \p state, and
  * increments position of the generator by one.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * Note: In this implementation returned \p double value is generated
  * from only 32 random bits (one <tt>unsigned int</tt> value).
  *
  * \return Uniformly distributed \p double value from (0; 1] range.
  */
-__forceinline__ __device__ __host__ double
-    rocrand_uniform_double(rocrand_state_threefry4x64_20* state)
+__forceinline__ __device__ __host__
+double rocrand_uniform_double(rocrand_state_threefry4x64_20* state)
 {
     return rocrand_device::detail::uniform_distribution_double(rocrand(state));
 }
diff --git a/library/include/rocrand/rocrand_xorwow.h b/library/include/rocrand/rocrand_xorwow.h
index 14eed1120..bf3cd8bde 100644
--- a/library/include/rocrand/rocrand_xorwow.h
+++ b/library/include/rocrand/rocrand_xorwow.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -234,15 +234,16 @@ typedef rocrand_device::xorwow_engine rocrand_state_xorwow;
  * Initializes the XORWOW generator \p state with the given
  * \p seed, \p subsequence, and \p offset.
  *
- * \param seed - Value to use as a seed
- * \param subsequence - Subsequence to start at
- * \param offset - Absolute offset into subsequence
- * \param state - Pointer to state to initialize
+ * \param seed Value to use as a seed
+ * \param subsequence Subsequence to start at
+ * \param offset Absolute offset into subsequence
+ * \param state Pointer to state to initialize
  */
-__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
-                                                      const unsigned long long subsequence,
-                                                      const unsigned long long offset,
-                                                      rocrand_state_xorwow*    state)
+__forceinline__ __device__ __host__
+void rocrand_init(const unsigned long long seed,
+                  const unsigned long long subsequence,
+                  const unsigned long long offset,
+                  rocrand_state_xorwow*    state)
 {
     *state = rocrand_state_xorwow(seed, subsequence, offset);
 }
@@ -255,11 +256,12 @@ __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long s
  * value from [0; 2^32 - 1] range using XORWOW generator in \p state.
  * State is incremented by one position.
  *
- * \param state - Pointer to a state to use
+ * \param state Pointer to a state to use
  *
  * \return Pseudorandom value (32-bit) as an <tt>unsigned int</tt>
  */
-__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+unsigned int rocrand(rocrand_state_xorwow* state)
 {
     return state->next();
 }
@@ -269,11 +271,11 @@ __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_xorwow* s
  *
  * Updates the XORWOW state in \p state to skip ahead by \p offset elements.
  *
- * \param offset - Number of elements to skip
- * \param state - Pointer to state to update
+ * \param offset Number of elements to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead(unsigned long long    offset,
-                                                   rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+void skipahead(unsigned long long offset, rocrand_state_xorwow* state)
 {
     return state->discard(offset);
 }
@@ -284,11 +286,11 @@ __forceinline__ __device__ __host__ void skipahead(unsigned long long    offset,
  * Updates the XORWOW \p state to skip ahead by \p subsequence subsequences.
  * Each subsequence is 2^67 numbers long.
  *
- * \param subsequence - Number of subsequences to skip
- * \param state - Pointer to state to update
+ * \param subsequence Number of subsequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long    subsequence,
-                                                               rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+void skipahead_subsequence(unsigned long long subsequence, rocrand_state_xorwow* state)
 {
     return state->discard_subsequence(subsequence);
 }
@@ -299,11 +301,11 @@ __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long lon
  * Updates the XORWOW \p state skipping \p sequence sequences ahead.
  * For XORWOW each sequence is 2^67 numbers long (equal to the size of a subsequence).
  *
- * \param sequence - Number of sequences to skip
- * \param state - Pointer to state to update
+ * \param sequence Number of sequences to skip
+ * \param state Pointer to state to update
  */
-__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long    sequence,
-                                                            rocrand_state_xorwow* state)
+__forceinline__ __device__ __host__
+void skipahead_sequence(unsigned long long sequence, rocrand_state_xorwow* state)
 {
     return state->discard_subsequence(sequence);
 }

From b11ecb6ca8cb4988b475f9921bb4b19bcf02d851 Mon Sep 17 00:00:00 2001
From: Borys Petrov <borys@streamhpc.com>
Date: Thu, 16 Jan 2025 16:32:09 +0000
Subject: [PATCH 11/17] Documentation to CI upload

---
 .gitlab-ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 303d17b6d..ca999bf45 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -559,6 +559,10 @@ test:doc:
   extends:
     - .rules:test
     - .build:docs
+  artifacts:
+    paths:
+      - $DOCS_DIR/_build/html/
+    expire_in: 2 weeks
 
 .test:parity:
   stage: test

From f3e1320b59458c6258ab75b41c9b3b2d3eb2f99f Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches <beatriz@streamhpc.com>
Date: Fri, 17 Jan 2025 10:06:49 +0000
Subject: [PATCH 12/17] Match cuRAND benchmarks with rocRAND ones

---
 benchmark/benchmark_curand_device_api.cpp  | 121 +++---
 benchmark/benchmark_curand_generate.cpp    | 187 +++++---
 benchmark/benchmark_curand_host_api.cpp    | 323 ++++++++------
 benchmark/benchmark_curand_kernel.cpp      | 469 +++++++++++++--------
 benchmark/benchmark_curand_utils.hpp       |  15 +-
 benchmark/benchmark_rocrand_device_api.cpp | 151 +++----
 benchmark/benchmark_rocrand_kernel.cpp     | 174 +++-----
 7 files changed, 795 insertions(+), 645 deletions(-)

diff --git a/benchmark/benchmark_curand_device_api.cpp b/benchmark/benchmark_curand_device_api.cpp
index ec3123ed3..03a214b04 100644
--- a/benchmark/benchmark_curand_device_api.cpp
+++ b/benchmark/benchmark_curand_device_api.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -38,19 +38,6 @@
 #include <utility>
 #include <vector>
 
-#define CUDA_CALL(condition)                                                               \
-    do                                                                                     \
-    {                                                                                      \
-        cudaError_t error_ = condition;                                                    \
-        if(error_ != cudaSuccess)                                                          \
-        {                                                                                  \
-            std::cout << "CUDA error: " << error_ << " at " << __FILE__ << ":" << __LINE__ \
-                      << std::endl;                                                        \
-            exit(error_);                                                                  \
-        }                                                                                  \
-    }                                                                                      \
-    while(0)
-
 #define CURAND_DEFAULT_MAX_BLOCK_SIZE 256
 
 #ifndef DEFAULT_RAND_N
@@ -100,7 +87,7 @@ struct runner
 
         init_kernel<<<blocks, threads>>>(states, seed, offset);
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
     }
 
@@ -127,7 +114,7 @@ __global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
-    unsigned int       index     = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int       index     = blockIdx.x * blockDim.x + thread_id;
     unsigned int       stride    = gridDim.x * blockDim.x;
 
     __shared__ curandStateMtgp32_t state;
@@ -137,7 +124,13 @@ __global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel
     __syncthreads();
 
     const size_t r               = size % blockDim.x;
-    const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
+    const size_t size_rounded_down = size - r;
+    const size_t size_rounded_up   = r == 0 ? size : size_rounded_down + blockDim.x;
+    while(index < size_rounded_down)
+    {
+        data[index] = generator(&state);
+        index += stride;
+    }
     while(index < size_rounded_up)
     {
         auto value = generator(&state);
@@ -258,12 +251,13 @@ struct runner<curandStateSobol32_t>
     {
         this->dimensions = dimensions;
 
-        const size_t states_size = blocks * threads * dimensions;
-        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol32_t)));
-
         curandDirectionVectors32_t* h_directions;
         CURAND_CALL(
             curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
+
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol32_t)));
+
         unsigned int* directions;
         const size_t  size = dimensions * sizeof(unsigned int) * 32;
         CUDA_CALL(cudaMalloc(&directions, size));
@@ -275,7 +269,7 @@ struct runner<curandStateSobol32_t>
             directions,
             static_cast<unsigned int>(offset));
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -316,26 +310,26 @@ struct runner<curandStateScrambledSobol32_t>
     {
         this->dimensions = dimensions;
 
-        const size_t states_size = blocks * threads * dimensions;
-        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
-
         curandDirectionVectors32_t* h_directions;
+        unsigned int*               h_constants;
+
         CURAND_CALL(
             curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
+        CURAND_CALL(curandGetScrambleConstants32(&h_constants));
+
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
+
         unsigned int* directions;
-        const size_t  size = dimensions * sizeof(unsigned int) * 32;
-        CUDA_CALL(cudaMalloc(&directions, size));
-        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
+        const size_t  directions_size = dimensions * sizeof(unsigned int) * 32;
+        CUDA_CALL(cudaMalloc(&directions, directions_size));
+        CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
 
-        unsigned int* h_scramble_constants;
-        CURAND_CALL(curandGetScrambleConstants32(&h_scramble_constants));
         unsigned int* scramble_constants;
         const size_t  constants_size = dimensions * sizeof(unsigned int);
         CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
-        CUDA_CALL(cudaMemcpy(scramble_constants,
-                             h_scramble_constants,
-                             constants_size,
-                             cudaMemcpyHostToDevice));
+        CUDA_CALL(
+            cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(
@@ -344,7 +338,7 @@ struct runner<curandStateScrambledSobol32_t>
             scramble_constants,
             static_cast<unsigned int>(offset));
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -386,12 +380,13 @@ struct runner<curandStateSobol64_t>
     {
         this->dimensions = dimensions;
 
-        const size_t states_size = blocks * threads * dimensions;
-        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol64_t)));
-
         curandDirectionVectors64_t* h_directions;
         CURAND_CALL(
             curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
+
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol64_t)));
+
         unsigned long long int* directions;
         const size_t            size = dimensions * sizeof(unsigned long long) * 64;
         CUDA_CALL(cudaMalloc(&directions, size));
@@ -400,7 +395,7 @@ struct runner<curandStateSobol64_t>
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         init_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -441,26 +436,26 @@ struct runner<curandStateScrambledSobol64_t>
     {
         this->dimensions = dimensions;
 
-        const size_t states_size = blocks * threads * dimensions;
-        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
-
         curandDirectionVectors64_t* h_directions;
+        unsigned long long*         h_constants;
+
         CURAND_CALL(
             curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
+        CURAND_CALL(curandGetScrambleConstants64(&h_constants));
+
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
+
         unsigned long long* directions;
-        const size_t        size = dimensions * sizeof(unsigned long long) * 64;
-        CUDA_CALL(cudaMalloc(&directions, size));
-        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
+        const size_t        directions_size = dimensions * sizeof(unsigned long long) * 64;
+        CUDA_CALL(cudaMalloc(&directions, directions_size));
+        CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
 
-        unsigned long long* h_scramble_constants;
-        CURAND_CALL(curandGetScrambleConstants64(&h_scramble_constants));
         unsigned long long* scramble_constants;
         const size_t        constants_size = dimensions * sizeof(unsigned long long);
         CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
-        CUDA_CALL(cudaMemcpy(scramble_constants,
-                             h_scramble_constants,
-                             constants_size,
-                             cudaMemcpyHostToDevice));
+        CUDA_CALL(
+            cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
@@ -468,7 +463,7 @@ struct runner<curandStateScrambledSobol64_t>
                                                                              scramble_constants,
                                                                              offset);
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -722,7 +717,7 @@ void run_benchmark(benchmark::State&        state,
     for(size_t i = 0; i < 5; i++)
     {
         r.generate(blocks, threads, stream, data, size, generator);
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
     }
 
@@ -819,6 +814,14 @@ void add_benchmarks(const benchmark_context&                      ctx,
 
 int main(int argc, char* argv[])
 {
+    // Get the parameters before they are passed into
+    // benchmark::Initialize()
+    std::string outFormat     = "";
+    std::string filter        = "";
+    std::string consoleFormat = "";
+
+    getFormats(argc, argv, outFormat, filter, consoleFormat);
+
     benchmark::Initialize(&argc, argv);
 
     cli::Parser parser(argc, argv);
@@ -884,8 +887,20 @@ int main(int argc, char* argv[])
         b->Unit(benchmark::kMillisecond);
     }
 
+    benchmark::BenchmarkReporter* console_reporter  = getConsoleReporter(consoleFormat);
+    benchmark::BenchmarkReporter* out_file_reporter = getOutFileReporter(outFormat);
+
+    std::string spec = (filter == "" || filter == "all") ? "." : filter;
+
     // Run benchmarks
-    benchmark::RunSpecifiedBenchmarks();
+    if(outFormat == "") // default case
+    {
+        benchmark::RunSpecifiedBenchmarks(console_reporter, spec);
+    }
+    else
+    {
+        benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, spec);
+    }
     CUDA_CALL(cudaStreamDestroy(stream));
 
     return 0;
diff --git a/benchmark/benchmark_curand_generate.cpp b/benchmark/benchmark_curand_generate.cpp
index c3c450ff7..ca99a0143 100644
--- a/benchmark/benchmark_curand_generate.cpp
+++ b/benchmark/benchmark_curand_generate.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -52,11 +52,17 @@ template<typename T>
 void run_benchmark(const cli::Parser&    parser,
                    const rng_type_t      rng_type,
                    cudaStream_t          stream,
-                   generate_func_type<T> generate_func)
+                   generate_func_type<T> generate_func,
+                   const std::string&    distribution,
+                   const std::string&    engine,
+                   const double          lambda = 0.f)
 {
-    const size_t size = parser.get<size_t>("size");
-    const size_t trials = parser.get<size_t>("trials");
-    const size_t offset = parser.get<size_t>("offset");
+    const size_t      size0      = parser.get<size_t>("size");
+    const size_t      trials     = parser.get<size_t>("trials");
+    const size_t      dimensions = parser.get<size_t>("dimensions");
+    const size_t      offset     = parser.get<size_t>("offset");
+    const size_t      size       = (size0 / dimensions) * dimensions;
+    const std::string format     = parser.get<std::string>("format");
 
     T * data;
     CUDA_CALL(cudaMalloc(&data, size * sizeof(T)));
@@ -64,7 +70,6 @@ void run_benchmark(const cli::Parser&    parser,
     curandGenerator_t generator;
     CURAND_CALL(curandCreateGenerator(&generator, rng_type));
 
-    const size_t dimensions = parser.get<size_t>("dimensions");
     curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
     if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
     {
@@ -90,7 +95,6 @@ void run_benchmark(const cli::Parser&    parser,
     cudaEvent_t start, stop;
     CUDA_CALL(cudaEventCreate(&start));
     CUDA_CALL(cudaEventCreate(&stop));
-
     CUDA_CALL(cudaEventRecord(start, stream));
     for (size_t i = 0; i < trials; i++)
     {
@@ -98,21 +102,40 @@ void run_benchmark(const cli::Parser&    parser,
     }
     CUDA_CALL(cudaEventRecord(stop, stream));
     CUDA_CALL(cudaEventSynchronize(stop));
-
     float elapsed;
     CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
-
     CUDA_CALL(cudaEventDestroy(stop));
     CUDA_CALL(cudaEventDestroy(start));
 
-    std::cout << std::fixed << std::setprecision(3) << "      "
-              << "Throughput = " << std::setw(8)
-              << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30))
-              << " GB/s, Samples = " << std::setw(8)
-              << (trials * size) / (elapsed / 1e3 * (1 << 30))
-              << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials
-              << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size
-              << std::endl;
+    if(format.compare("csv") == 0)
+    {
+        std::cout << std::fixed << std::setprecision(3) << engine << "," << distribution << ","
+                  << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30)) << ","
+                  << (trials * size) / (elapsed / 1e3 * (1 << 30)) << "," << elapsed / trials << ","
+                  << elapsed << "," << size << ",";
+        if(distribution.compare("poisson") == 0 || distribution.compare("discrete-poisson") == 0)
+        {
+            std::cout << lambda;
+        }
+        std::cout << std::endl;
+    }
+    else
+    {
+        if(format.compare("console") != 0)
+        {
+            std::cout << "Unknown format specified (must be either console or csv).  Defaulting to "
+                         "console output."
+                      << std::endl;
+        }
+        std::cout << std::fixed << std::setprecision(3) << "      "
+                  << "Throughput = " << std::setw(8)
+                  << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30))
+                  << " GB/s, Samples = " << std::setw(8)
+                  << (trials * size) / (elapsed / 1e3 * (1 << 30))
+                  << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials
+                  << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size
+                  << std::endl;
+    }
 
     CURAND_CALL(curandDestroyGenerator(generator));
     CUDA_CALL(cudaFree(data));
@@ -121,94 +144,117 @@ void run_benchmark(const cli::Parser&    parser,
 void run_benchmarks(const cli::Parser& parser,
                     const rng_type_t   rng_type,
                     const std::string& distribution,
+                    const std::string& engine, // engine label forwarded to run_benchmark
                     cudaStream_t       stream)
 {
+    const std::string format = parser.get<std::string>("format"); // "console" or "csv"
     if (distribution == "uniform-uint")
     {
-        if (rng_type != CURAND_RNG_QUASI_SOBOL64 &&
-            rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
-        {
-            run_benchmark<unsigned int>(parser,
-                                        rng_type,
-                                        stream,
-                                        [](curandGenerator_t gen, unsigned int* data, size_t size)
-                                        { return curandGenerate(gen, data, size); });
-        }
+        run_benchmark<unsigned int>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, unsigned int* data, size_t size)
+            { return curandGenerate(gen, data, size); },
+            distribution,
+            engine);
     }
     if (distribution == "uniform-long-long")
     {
-        if (rng_type == CURAND_RNG_QUASI_SOBOL64 ||
-            rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
-        {
-            run_benchmark<unsigned long long>(
-                parser,
-                rng_type,
-                stream,
-                [](curandGenerator_t gen, unsigned long long* data, size_t size)
-                { return curandGenerateLongLong(gen, data, size); });
-        }
+        run_benchmark<unsigned long long>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, unsigned long long* data, size_t size)
+            { return curandGenerateLongLong(gen, data, size); },
+            distribution,
+            engine);
     }
     if (distribution == "uniform-float")
     {
-        run_benchmark<float>(parser,
-                             rng_type,
-                             stream,
-                             [](curandGenerator_t gen, float* data, size_t size)
-                             { return curandGenerateUniform(gen, data, size); });
+        run_benchmark<float>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, float* data, size_t size)
+            { return curandGenerateUniform(gen, data, size); },
+            distribution,
+            engine);
     }
     if (distribution == "uniform-double")
     {
-        run_benchmark<double>(parser,
-                              rng_type,
-                              stream,
-                              [](curandGenerator_t gen, double* data, size_t size)
-                              { return curandGenerateUniformDouble(gen, data, size); });
+        run_benchmark<double>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, double* data, size_t size)
+            { return curandGenerateUniformDouble(gen, data, size); },
+            distribution,
+            engine);
     }
     if (distribution == "normal-float")
     {
-        run_benchmark<float>(parser,
-                             rng_type,
-                             stream,
-                             [](curandGenerator_t gen, float* data, size_t size)
-                             { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); });
+        run_benchmark<float>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, float* data, size_t size)
+            { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); },
+            distribution,
+            engine);
     }
     if (distribution == "normal-double")
     {
-        run_benchmark<double>(parser,
-                              rng_type,
-                              stream,
-                              [](curandGenerator_t gen, double* data, size_t size)
-                              { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); });
+        run_benchmark<double>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, double* data, size_t size)
+            { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); },
+            distribution,
+            engine);
     }
     if (distribution == "log-normal-float")
     {
-        run_benchmark<float>(parser,
-                             rng_type,
-                             stream,
-                             [](curandGenerator_t gen, float* data, size_t size)
-                             { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); });
+        run_benchmark<float>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, float* data, size_t size)
+            { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); },
+            distribution,
+            engine);
     }
     if (distribution == "log-normal-double")
     {
-        run_benchmark<double>(parser,
-                              rng_type,
-                              stream,
-                              [](curandGenerator_t gen, double* data, size_t size)
-                              { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); });
+        run_benchmark<double>(
+            parser,
+            rng_type,
+            stream,
+            [](curandGenerator_t gen, double* data, size_t size)
+            { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); },
+            distribution,
+            engine);
     }
     if (distribution == "poisson")
     {
         const auto lambdas = parser.get<std::vector<double>>("lambda");
         for (double lambda : lambdas)
         {
-            std::cout << "    " << "lambda "
-                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            if(format.compare("csv") != 0) // csv rows carry lambda; other formats print as console
+            {
+                std::cout << "    "
+                          << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl;
+            }
             run_benchmark<unsigned int>(
                 parser,
                 rng_type,
                 stream,
                 [lambda](curandGenerator_t gen, unsigned int* data, size_t size)
-                { return curandGeneratePoisson(gen, data, size, lambda); });
+                { return curandGeneratePoisson(gen, data, size, lambda); },
+                distribution,
+                engine,
+                lambda);
         }
     }
 }
@@ -310,6 +356,7 @@ int main(int argc, char *argv[])
     cudaDeviceProp props;
     CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
 
+    std::cout << "benchmark_curand_generate" << std::endl;
     std::cout << "cuRAND: " << version << " ";
     std::cout << "Runtime: " << runtime_version << " ";
     std::cout << "Device: " << props.name;
@@ -350,7 +397,7 @@ int main(int argc, char *argv[])
         for (auto distribution : distributions)
         {
             std::cout << "  " << distribution << ":" << std::endl;
-            run_benchmarks(parser, rng_type, distribution, stream);
+            run_benchmarks(parser, rng_type, distribution, engine, stream);
         }
         std::cout << std::endl;
     }
diff --git a/benchmark/benchmark_curand_host_api.cpp b/benchmark/benchmark_curand_host_api.cpp
index ddb121f2b..596079add 100644
--- a/benchmark/benchmark_curand_host_api.cpp
+++ b/benchmark/benchmark_curand_host_api.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -21,22 +21,11 @@
 #include "benchmark_curand_utils.hpp"
 #include "cmdparser.hpp"
 
+#include <benchmark/benchmark.h>
+
 #include <cuda_runtime.h>
 #include <curand.h>
 
-#define CUDA_CALL(condition)                                                               \
-    do                                                                                     \
-    {                                                                                      \
-        cudaError_t error_ = condition;                                                    \
-        if(error_ != cudaSuccess)                                                          \
-        {                                                                                  \
-            std::cout << "CUDA error: " << error_ << " at " << __FILE__ << ":" << __LINE__ \
-                      << std::endl;                                                        \
-            exit(error_);                                                                  \
-        }                                                                                  \
-    }                                                                                      \
-    while(0)
-
 #ifndef DEFAULT_RAND_N
 const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
 #endif
@@ -48,15 +37,20 @@ using generate_func_type = std::function<curandStatus_t(curandGenerator_t, T*, s
 
 template<typename T>
 void run_benchmark(benchmark::State&     state,
-                   const rng_type_t      rng_type,
                    generate_func_type<T> generate_func,
                    const size_t          size,
+                   const bool            byte_size, // true: size is a byte count, not elements
                    const size_t          trials,
-                   const size_t          offset,
                    const size_t          dimensions,
+                   const size_t          offset,
+                   const rng_type_t      rng_type,
+                   const curandOrdering  ordering, // applied via curandSetGeneratorOrdering
                    const bool            benchmark_host,
                    cudaStream_t          stream)
 {
+    const size_t binary_div   = byte_size ? sizeof(T) : 1; // 1 unless size is in bytes
+    const size_t rounded_size = (size / binary_div / dimensions) * dimensions; // floor to dims
+
     T*                data;
     curandGenerator_t generator;
 
@@ -67,10 +61,12 @@ void run_benchmark(benchmark::State&     state,
     }
     else
     {
-        CUDA_CALL(cudaMalloc(&data, size * sizeof(T)));
+        CUDA_CALL(cudaMalloc(&data, rounded_size * sizeof(T))); // rounded_size elements
         CURAND_CALL(curandCreateGenerator(&generator, rng_type));
     }
 
+    CURAND_CALL(curandSetGeneratorOrdering(generator, ordering));
+
     curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
     if(status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
     {
@@ -96,7 +92,6 @@ void run_benchmark(benchmark::State&     state,
     cudaEvent_t start, stop;
     CUDA_CALL(cudaEventCreate(&start));
     CUDA_CALL(cudaEventCreate(&stop));
-
     for(auto _ : state)
     {
         CUDA_CALL(cudaEventRecord(start, stream));
@@ -109,6 +104,7 @@ void run_benchmark(benchmark::State&     state,
 
         float elapsed = 0.0f;
         CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
+
         state.SetIterationTime(elapsed / 1000.f);
     }
 
@@ -117,7 +113,6 @@ void run_benchmark(benchmark::State&     state,
 
     CUDA_CALL(cudaEventDestroy(stop));
     CUDA_CALL(cudaEventDestroy(start));
-
     CURAND_CALL(curandDestroyGenerator(generator));
 
     if(benchmark_host)
@@ -152,6 +147,14 @@ void configure_parser(cli::Parser& parser)
 
 int main(int argc, char* argv[])
 {
+    // Get the format/filter parameters before argv is passed into benchmark::Initialize
+    std::string outFormat     = "";
+    std::string filter        = "";
+    std::string consoleFormat = "";
+
+    getFormats(argc, argv, outFormat, filter, consoleFormat);
+
+    // Parse argv
     benchmark::Initialize(&argc, argv);
 
     // Parse arguments from command line
@@ -165,6 +168,7 @@ int main(int argc, char* argv[])
     add_common_benchmark_curand_info();
 
     const size_t              size            = parser.get<size_t>("size");
+    const bool                byte_size       = parser.get<bool>("byte-size");
     const size_t              trials          = parser.get<size_t>("trials");
     const size_t              offset          = parser.get<size_t>("offset");
     const size_t              dimensions      = parser.get<size_t>("dimensions");
@@ -172,12 +176,13 @@ int main(int argc, char* argv[])
     const bool                benchmark_host  = parser.get<bool>("host");
 
     benchmark::AddCustomContext("size", std::to_string(size));
+    benchmark::AddCustomContext("byte-size", std::to_string(byte_size));
     benchmark::AddCustomContext("trials", std::to_string(trials));
     benchmark::AddCustomContext("offset", std::to_string(offset));
     benchmark::AddCustomContext("dimensions", std::to_string(dimensions));
     benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host));
 
-    const std::vector<rng_type_t> engine_types{
+    const std::vector<rng_type_t> benchmarked_engine_types{
         CURAND_RNG_PSEUDO_MT19937,
         CURAND_RNG_PSEUDO_MTGP32,
         CURAND_RNG_PSEUDO_MRG32K3A,
@@ -189,147 +194,211 @@ int main(int argc, char* argv[])
         CURAND_RNG_PSEUDO_XORWOW,
     };
 
-    const std::string                            benchmark_name_prefix = "device_generate";
-    std::vector<benchmark::internal::Benchmark*> benchmarks            = {};
+    const std::map<curandOrdering, std::string> ordering_name_map{
+        {CURAND_ORDERING_PSEUDO_DEFAULT, "default"},
+        { CURAND_ORDERING_PSEUDO_LEGACY,  "legacy"},
+        {   CURAND_ORDERING_PSEUDO_BEST,    "best"},
+        {CURAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"},
+        { CURAND_ORDERING_PSEUDO_SEEDED,  "seeded"},
+        { CURAND_ORDERING_QUASI_DEFAULT, "default"},
+    }; // labels are embedded in the registered benchmark names
+
+    const std::map<rng_type_t, std::vector<curandOrdering>> benchmarked_orderings{
+  // clang-format off
+        {          CURAND_RNG_PSEUDO_MTGP32,
+            {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC}},
+        {         CURAND_RNG_PSEUDO_MT19937, {CURAND_ORDERING_PSEUDO_DEFAULT}},
+        {          CURAND_RNG_PSEUDO_XORWOW,
+            {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC} },
+        {        CURAND_RNG_PSEUDO_MRG32K3A,
+            {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC}},
+        {   CURAND_RNG_PSEUDO_PHILOX4_32_10,
+            {CURAND_ORDERING_PSEUDO_DEFAULT, CURAND_ORDERING_PSEUDO_DYNAMIC}},
+        {          CURAND_RNG_QUASI_SOBOL32,  {CURAND_ORDERING_QUASI_DEFAULT}},
+        {CURAND_RNG_QUASI_SCRAMBLED_SOBOL32,  {CURAND_ORDERING_QUASI_DEFAULT}},
+        {          CURAND_RNG_QUASI_SOBOL64,  {CURAND_ORDERING_QUASI_DEFAULT}},
+        {CURAND_RNG_QUASI_SCRAMBLED_SOBOL64,  {CURAND_ORDERING_QUASI_DEFAULT}},
+  // clang-format on
+    }; // looked up with .at(engine_type): every benchmarked engine needs an entry here
 
+    const std::string benchmark_name_prefix = "device_generate";
     // Add benchmarks
-    for(const rng_type_t engine_type : engine_types)
+    std::vector<benchmark::internal::Benchmark*> benchmarks = {};
+    for(const rng_type_t engine_type : benchmarked_engine_types)
     {
-        const std::string benchmark_name_engine
-            = benchmark_name_prefix + "<" + engine_name(engine_type) + ",";
+        const std::string name = engine_name(engine_type);
+        for(const curandOrdering ordering : benchmarked_orderings.at(engine_type))
+        {
+            const std::string name_engine_prefix
+                = benchmark_name_prefix + "<" + name + "," + ordering_name_map.at(ordering) + ",";
 
-        if(engine_type != CURAND_RNG_QUASI_SOBOL64
-           && engine_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
             benchmarks.emplace_back(benchmark::RegisterBenchmark(
-                (benchmark_name_engine + "uniform-uint>").c_str(),
+                (name_engine_prefix + "uniform-uint>").c_str(),
                 &run_benchmark<unsigned int>,
-                engine_type,
-                [](curandGenerator_t gen, unsigned int* data, size_t size)
-                { return curandGenerate(gen, data, size); },
+                [](curandGenerator_t gen, unsigned int* data, size_t size_gen)
+                { return curandGenerate(gen, data, size_gen); },
                 size,
+                byte_size,
                 trials,
-                offset,
                 dimensions,
+                offset,
+                engine_type, // NOTE(review): SOBOL64 engines are no longer excluded here - confirm
+                ordering,
                 benchmark_host,
                 stream));
-        else
+
+            if(engine_type == CURAND_RNG_QUASI_SOBOL64
+               || engine_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+            {
+                benchmarks.emplace_back(benchmark::RegisterBenchmark(
+                    (name_engine_prefix + "uniform-long-long>").c_str(),
+                    &run_benchmark<unsigned long long>,
+                    [](curandGenerator_t gen, unsigned long long* data, size_t size)
+                    { return curandGenerateLongLong(gen, data, size); },
+                    size,
+                    byte_size,
+                    trials,
+                    dimensions,
+                    offset,
+                    engine_type,
+                    ordering,
+                    benchmark_host,
+                    stream));
+            }
+
+            benchmarks.emplace_back(
+                benchmark::RegisterBenchmark((name_engine_prefix + "uniform-float>").c_str(),
+                                             &run_benchmark<float>,
+                                             [](curandGenerator_t gen, float* data, size_t size_gen)
+                                             { return curandGenerateUniform(gen, data, size_gen); },
+                                             size,
+                                             byte_size,
+                                             trials,
+                                             dimensions,
+                                             offset,
+                                             engine_type,
+                                             ordering,
+                                             benchmark_host,
+                                             stream));
+
             benchmarks.emplace_back(benchmark::RegisterBenchmark(
-                (benchmark_name_engine + "uniform-long-long>").c_str(),
-                &run_benchmark<unsigned long long>,
-                engine_type,
-                [](curandGenerator_t gen, unsigned long long* data, size_t size)
-                { return curandGenerateLongLong(gen, data, size); },
+                (name_engine_prefix + "uniform-double>").c_str(),
+                &run_benchmark<double>,
+                [](curandGenerator_t gen, double* data, size_t size_gen)
+                { return curandGenerateUniformDouble(gen, data, size_gen); },
                 size,
+                byte_size,
                 trials,
-                offset,
                 dimensions,
+                offset,
+                engine_type,
+                ordering,
                 benchmark_host,
                 stream));
 
-        benchmarks.emplace_back(
-            benchmark::RegisterBenchmark((benchmark_name_engine + "uniform-float>").c_str(),
-                                         &run_benchmark<float>,
-                                         engine_type,
-                                         [](curandGenerator_t gen, float* data, size_t size)
-                                         { return curandGenerateUniform(gen, data, size); },
-                                         size,
-                                         trials,
-                                         offset,
-                                         dimensions,
-                                         benchmark_host,
-                                         stream));
-
-        benchmarks.emplace_back(
-            benchmark::RegisterBenchmark((benchmark_name_engine + "uniform-double>").c_str(),
-                                         &run_benchmark<double>,
-                                         engine_type,
-                                         [](curandGenerator_t gen, double* data, size_t size)
-                                         { return curandGenerateUniformDouble(gen, data, size); },
-                                         size,
-                                         trials,
-                                         offset,
-                                         dimensions,
-                                         benchmark_host,
-                                         stream));
-
-        benchmarks.emplace_back(benchmark::RegisterBenchmark(
-            (benchmark_name_engine + "normal-float>").c_str(),
-            &run_benchmark<float>,
-            engine_type,
-            [](curandGenerator_t gen, float* data, size_t size)
-            { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); },
-            size,
-            trials,
-            offset,
-            dimensions,
-            benchmark_host,
-            stream));
-
-        benchmarks.emplace_back(benchmark::RegisterBenchmark(
-            (benchmark_name_engine + "normal-double>").c_str(),
-            &run_benchmark<double>,
-            engine_type,
-            [](curandGenerator_t gen, double* data, size_t size)
-            { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); },
-            size,
-            trials,
-            offset,
-            dimensions,
-            benchmark_host,
-            stream));
-
-        benchmarks.emplace_back(benchmark::RegisterBenchmark(
-            (benchmark_name_engine + "log-normal-float>").c_str(),
-            &run_benchmark<float>,
-            engine_type,
-            [](curandGenerator_t gen, float* data, size_t size)
-            { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); },
-            size,
-            trials,
-            offset,
-            dimensions,
-            benchmark_host,
-            stream));
-
-        benchmarks.emplace_back(benchmark::RegisterBenchmark(
-            (benchmark_name_engine + "log-normal-double>").c_str(),
-            &run_benchmark<double>,
-            engine_type,
-            [](curandGenerator_t gen, double* data, size_t size)
-            { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); },
-            size,
-            trials,
-            offset,
-            dimensions,
-            benchmark_host,
-            stream));
-
-        for(auto lambda : poisson_lambdas)
-        {
-            const std::string poisson_dis_name
-                = std::string("poisson(lambda=") + std::to_string(lambda) + ")>";
+            benchmarks.emplace_back(benchmark::RegisterBenchmark(
+                (name_engine_prefix + "normal-float>").c_str(),
+                &run_benchmark<float>,
+                [](curandGenerator_t gen, float* data, size_t size_gen)
+                { return curandGenerateNormal(gen, data, size_gen, 0.0f, 1.0f); },
+                size,
+                byte_size,
+                trials,
+                dimensions,
+                offset,
+                engine_type,
+                ordering,
+                benchmark_host,
+                stream));
 
             benchmarks.emplace_back(benchmark::RegisterBenchmark(
-                (benchmark_name_engine + poisson_dis_name).c_str(),
-                &run_benchmark<unsigned int>,
+                (name_engine_prefix + "normal-double>").c_str(),
+                &run_benchmark<double>,
+                [](curandGenerator_t gen, double* data, size_t size_gen)
+                { return curandGenerateNormalDouble(gen, data, size_gen, 0.0, 1.0); },
+                size,
+                byte_size,
+                trials,
+                dimensions,
+                offset,
                 engine_type,
-                [lambda](curandGenerator_t gen, unsigned int* data, size_t size)
-                { return curandGeneratePoisson(gen, data, size, lambda); },
+                ordering,
+                benchmark_host,
+                stream));
+
+            benchmarks.emplace_back(benchmark::RegisterBenchmark(
+                (name_engine_prefix + "log-normal-float>").c_str(),
+                &run_benchmark<float>,
+                [](curandGenerator_t gen, float* data, size_t size_gen)
+                { return curandGenerateLogNormal(gen, data, size_gen, 0.0f, 1.0f); },
                 size,
+                byte_size,
                 trials,
+                dimensions,
                 offset,
+                engine_type,
+                ordering,
+                benchmark_host,
+                stream));
+
+            benchmarks.emplace_back(benchmark::RegisterBenchmark(
+                (name_engine_prefix + "log-normal-double>").c_str(),
+                &run_benchmark<double>,
+                [](curandGenerator_t gen, double* data, size_t size_gen)
+                { return curandGenerateLogNormalDouble(gen, data, size_gen, 0.0, 1.0); },
+                size,
+                byte_size,
+                trials,
                 dimensions,
+                offset,
+                engine_type,
+                ordering,
                 benchmark_host,
                 stream));
+
+            for(auto lambda : poisson_lambdas)
+            {
+                const std::string poisson_dis_name
+                    = std::string("poisson(lambda=") + std::to_string(lambda) + ")>";
+                benchmarks.emplace_back(benchmark::RegisterBenchmark(
+                    (name_engine_prefix + poisson_dis_name).c_str(),
+                    &run_benchmark<unsigned int>,
+                    [lambda](curandGenerator_t gen, unsigned int* data, size_t size_gen)
+                    { return curandGeneratePoisson(gen, data, size_gen, lambda); },
+                    size,
+                    byte_size,
+                    trials,
+                    dimensions,
+                    offset,
+                    engine_type,
+                    ordering,
+                    benchmark_host,
+                    stream));
+            }
         }
     }
-    // Use manual timing
+
     for(auto& b : benchmarks)
     {
         b->UseManualTime();
         b->Unit(benchmark::kMillisecond);
     }
-    benchmark::RunSpecifiedBenchmarks();
+
+    benchmark::BenchmarkReporter* console_reporter  = getConsoleReporter(consoleFormat);
+    benchmark::BenchmarkReporter* out_file_reporter = getOutFileReporter(outFormat);
+
+    std::string spec = (filter == "" || filter == "all") ? "." : filter; // "." selects all
+
+    // Run benchmarks (NOTE(review): reporters are raw pointers; check who frees them)
+    if(outFormat == "") // no output-file format requested: console reporter only
+    {
+        benchmark::RunSpecifiedBenchmarks(console_reporter, spec);
+    }
+    else
+    {
+        benchmark::RunSpecifiedBenchmarks(console_reporter, out_file_reporter, spec);
+    }
+
     CUDA_CALL(cudaStreamDestroy(stream));
 
     return 0;
diff --git a/benchmark/benchmark_curand_kernel.cpp b/benchmark/benchmark_curand_kernel.cpp
index 6c205ff02..df3e7922e 100644
--- a/benchmark/benchmark_curand_kernel.cpp
+++ b/benchmark/benchmark_curand_kernel.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -36,7 +36,7 @@
 #include <curand_mtgp32_host.h>
 #include <curand_mtgp32dc_p_11213.h>
 
-#define CUPRAND_DEFAULT_MAX_BLOCK_SIZE 256
+#define CURAND_DEFAULT_MAX_BLOCK_SIZE 256
 
 #define CUDA_CALL(x) do { \
     cudaError_t error = (x);\
@@ -63,8 +63,8 @@ size_t next_power2(size_t x)
 
 template<typename GeneratorState>
 __global__
-__launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE)
-void init_kernel(GeneratorState * states,
+__launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
+void init_kernel(GeneratorState*          states,
                  const unsigned long long seed,
                  const unsigned long long offset)
 {
@@ -76,12 +76,12 @@ void init_kernel(GeneratorState * states,
 
 template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
 __global__
-__launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE)
-void generate_kernel(GeneratorState * states,
-                     T * data,
-                     const size_t size,
-                     GenerateFunc generate_func,
-                     const Extra extra)
+__launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
+void generate_kernel(GeneratorState* states,
+                     T*              data,
+                     const size_t    size,
+                     GenerateFunc    generate_func,
+                     const Extra     extra)
 {
     const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int stride = gridDim.x * blockDim.x;
@@ -112,7 +112,7 @@ struct runner
 
         init_kernel<<<blocks, threads>>>(states, seed, offset);
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError()); // check the init_kernel launch; also clears the error state
         CUDA_CALL(cudaDeviceSynchronize());
     }
 
@@ -127,12 +127,13 @@ struct runner
     }
 
     template<typename T, typename GenerateFunc, typename Extra>
-    void generate(const size_t blocks,
-                  const size_t threads,
-                  T * data,
-                  const size_t size,
+    void generate(const size_t        blocks,
+                  const size_t        threads,
+                  cudaStream_t        stream, // stream the generate kernel is launched on
+                  T*                  data,
+                  const size_t        size,
                   const GenerateFunc& generate_func,
-                  const Extra extra)
+                  const Extra         extra)
     {
-        generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
+        generate_kernel<<<blocks, threads, 0, stream>>>(states, data, size, generate_func, extra);
     }
@@ -140,12 +141,12 @@ struct runner
 
 template<typename T, typename GenerateFunc, typename Extra>
 __global__
-__launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE)
-void generate_kernel(curandStateMtgp32_t * states,
-                     T * data,
-                     const size_t size,
-                     GenerateFunc generate_func,
-                     const Extra extra)
+__launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
+void generate_kernel(curandStateMtgp32_t* states,
+                     T*                   data,
+                     const size_t         size,
+                     GenerateFunc         generate_func,
+                     const Extra          extra)
 {
     const unsigned int state_id = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -159,7 +160,13 @@ void generate_kernel(curandStateMtgp32_t * states,
     __syncthreads();
 
     const size_t r = size%blockDim.x;
-    const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
+    const size_t size_rounded_down = size - r; // largest multiple of blockDim.x <= size
+    const size_t size_rounded_up   = r == 0 ? size : size_rounded_down + blockDim.x;
+    while(index < size_rounded_down) // every index in this loop is < size
+    {
+        data[index] = generate_func(&state, extra);
+        index += stride;
+    }
     while(index < size_rounded_up)
     {
         auto value = generate_func(&state, extra);
@@ -207,18 +214,23 @@ struct runner<curandStateMtgp32_t>
     template<typename T, typename GenerateFunc, typename Extra>
     void generate(const size_t blocks,
                   const size_t /* threads */,
-                  T * data,
-                  const size_t size,
+                  cudaStream_t        stream,
+                  T*                  data,
+                  const size_t        size,
                   const GenerateFunc& generate_func,
-                  const Extra extra)
+                  const Extra         extra)
     {
-        generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
+        generate_kernel<<<std::min((size_t)200, blocks), 256, 0, stream>>>(states,
+                                                                           data,
+                                                                           size,
+                                                                           generate_func,
+                                                                           extra);
     }
 };
 
 template<typename GeneratorState, typename SobolType>
-__global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel(
-    GeneratorState* states, SobolType* directions, SobolType offset)
+__global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
+void init_sobol_kernel(GeneratorState* states, SobolType* directions, SobolType offset)
 {
     const unsigned int dimension = blockIdx.y;
     const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -228,8 +240,11 @@ __global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_ker
 }
 
 template<typename GeneratorState, typename SobolType>
-__global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel(
-    GeneratorState* states, SobolType* directions, SobolType* scramble_constants, SobolType offset)
+__global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
+void init_scrambled_sobol_kernel(GeneratorState* states,
+                                 SobolType*      directions,
+                                 SobolType*      scramble_constants,
+                                 SobolType       offset)
 {
     const unsigned int dimension = blockIdx.y;
     const unsigned int state_id  = blockIdx.x * blockDim.x + threadIdx.x;
@@ -243,12 +258,12 @@ __global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled
 
 // generate_kernel for the sobol generators
 template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
-__global__ __launch_bounds__(CUPRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel(
-    GeneratorState* states,
-    T*              data,
-    const size_t    size,
-    GenerateFunc    generate_func,
-    const Extra     extra)
+__global__ __launch_bounds__(CURAND_DEFAULT_MAX_BLOCK_SIZE)
+void generate_sobol_kernel(GeneratorState* states,
+                           T*              data,
+                           const size_t    size,
+                           GenerateFunc    generate_func,
+                           const Extra     extra)
 {
     const unsigned int dimension = blockIdx.y;
     const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -282,11 +297,13 @@ struct runner<curandStateSobol32_t>
     {
         this->dimensions = dimensions;
 
+        curandDirectionVectors32_t* h_directions;
+        CURAND_CALL(
+            curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
+
         const size_t states_size = blocks * threads * dimensions;
         CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol32_t)));
 
-        curandDirectionVectors32_t * h_directions;
-        CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
         unsigned int* directions;
         const size_t  size = dimensions * sizeof(unsigned int) * 32;
         CUDA_CALL(cudaMalloc(&directions, size));
@@ -298,7 +315,7 @@ struct runner<curandStateSobol32_t>
             directions,
             static_cast<unsigned int>(offset));
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -315,19 +332,20 @@ struct runner<curandStateSobol32_t>
     }
 
     template<typename T, typename GenerateFunc, typename Extra>
-    void generate(const size_t blocks,
-                  const size_t threads,
-                  T * data,
-                  const size_t size,
+    void generate(const size_t        blocks,
+                  const size_t        threads,
+                  cudaStream_t        stream,
+                  T*                  data,
+                  const size_t        size,
                   const GenerateFunc& generate_func,
-                  const Extra extra)
+                  const Extra         extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
-                                                                       data,
-                                                                       size / dimensions,
-                                                                       generate_func,
-                                                                       extra);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads, 0, stream>>>(states,
+                                                                                  data,
+                                                                                  size / dimensions,
+                                                                                  generate_func,
+                                                                                  extra);
     }
 };
 
@@ -345,26 +363,26 @@ struct runner<curandStateScrambledSobol32_t>
     {
         this->dimensions = dimensions;
 
-        const size_t states_size = blocks * threads * dimensions;
-        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
-
         curandDirectionVectors32_t* h_directions;
+        unsigned int*               h_constants;
+
         CURAND_CALL(
             curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
+        CURAND_CALL(curandGetScrambleConstants32(&h_constants));
+
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol32_t)));
+
         unsigned int* directions;
-        const size_t  size = dimensions * sizeof(unsigned int) * 32;
-        CUDA_CALL(cudaMalloc(&directions, size));
-        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
+        const size_t  directions_size = dimensions * sizeof(unsigned int) * 32;
+        CUDA_CALL(cudaMalloc(&directions, directions_size));
+        CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
 
-        unsigned int* h_scramble_constants;
-        CURAND_CALL(curandGetScrambleConstants32(&h_scramble_constants));
         unsigned int* scramble_constants;
         const size_t  constants_size = dimensions * sizeof(unsigned int);
         CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
-        CUDA_CALL(cudaMemcpy(scramble_constants,
-                             h_scramble_constants,
-                             constants_size,
-                             cudaMemcpyHostToDevice));
+        CUDA_CALL(
+            cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(
@@ -373,7 +391,7 @@ struct runner<curandStateScrambledSobol32_t>
             scramble_constants,
             static_cast<unsigned int>(offset));
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -393,17 +411,18 @@ struct runner<curandStateScrambledSobol32_t>
     template<typename T, typename GenerateFunc, typename Extra>
     void generate(const size_t        blocks,
                   const size_t        threads,
+                  cudaStream_t        stream,
                   T*                  data,
                   const size_t        size,
                   const GenerateFunc& generate_func,
                   const Extra         extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
-                                                                       data,
-                                                                       size / dimensions,
-                                                                       generate_func,
-                                                                       extra);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads, 0, stream>>>(states,
+                                                                                  data,
+                                                                                  size / dimensions,
+                                                                                  generate_func,
+                                                                                  extra);
     }
 };
 
@@ -421,11 +440,13 @@ struct runner<curandStateSobol64_t>
     {
         this->dimensions = dimensions;
 
+        curandDirectionVectors64_t* h_directions;
+        CURAND_CALL(
+            curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
+
         const size_t states_size = blocks * threads * dimensions;
         CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateSobol64_t)));
 
-        curandDirectionVectors64_t * h_directions;
-        CURAND_CALL(curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
         unsigned long long int* directions;
         const size_t            size = dimensions * sizeof(unsigned long long) * 64;
         CUDA_CALL(cudaMalloc(&directions, size));
@@ -434,7 +455,7 @@ struct runner<curandStateSobol64_t>
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         init_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -453,6 +474,7 @@ struct runner<curandStateSobol64_t>
     template<typename T, typename GenerateFunc, typename Extra>
     void generate(const size_t        blocks,
                   const size_t        threads,
+                  cudaStream_t        stream,
                   T*                  data,
                   const size_t        size,
                   const GenerateFunc& generate_func,
@@ -481,26 +503,26 @@ struct runner<curandStateScrambledSobol64_t>
     {
         this->dimensions = dimensions;
 
-        const size_t states_size = blocks * threads * dimensions;
-        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
-
         curandDirectionVectors64_t* h_directions;
+        unsigned long long*         h_constants;
+
         CURAND_CALL(
             curandGetDirectionVectors64(&h_directions, CURAND_DIRECTION_VECTORS_64_JOEKUO6));
+        CURAND_CALL(curandGetScrambleConstants64(&h_constants));
+
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc(&states, states_size * sizeof(curandStateScrambledSobol64_t)));
+
         unsigned long long* directions;
-        const size_t        size = dimensions * sizeof(unsigned long long) * 64;
-        CUDA_CALL(cudaMalloc(&directions, size));
-        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
+        const size_t        directions_size = dimensions * sizeof(unsigned long long) * 64;
+        CUDA_CALL(cudaMalloc(&directions, directions_size));
+        CUDA_CALL(cudaMemcpy(directions, h_directions, directions_size, cudaMemcpyHostToDevice));
 
-        unsigned long long* h_scramble_constants;
-        CURAND_CALL(curandGetScrambleConstants64(&h_scramble_constants));
         unsigned long long* scramble_constants;
         const size_t        constants_size = dimensions * sizeof(unsigned long long);
         CUDA_CALL(cudaMalloc(&scramble_constants, constants_size));
-        CUDA_CALL(cudaMemcpy(scramble_constants,
-                             h_scramble_constants,
-                             constants_size,
-                             cudaMemcpyHostToDevice));
+        CUDA_CALL(
+            cudaMemcpy(scramble_constants, h_constants, constants_size, cudaMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
@@ -508,7 +530,7 @@ struct runner<curandStateScrambledSobol64_t>
                                                                              scramble_constants,
                                                                              offset);
 
-        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
 
         CUDA_CALL(cudaFree(directions));
@@ -526,12 +548,13 @@ struct runner<curandStateScrambledSobol64_t>
     }
 
     template<typename T, typename GenerateFunc, typename Extra>
-    void generate(const size_t blocks,
-                  const size_t threads,
-                  T * data,
-                  const size_t size,
+    void generate(const size_t        blocks,
+                  const size_t        threads,
+                  cudaStream_t        stream,
+                  T*                  data,
+                  const size_t        size,
                   const GenerateFunc& generate_func,
-                  const Extra extra)
+                  const Extra         extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
         generate_sobol_kernel<<<dim3(blocks_x, dimensions), threads>>>(states,
@@ -543,9 +566,13 @@ struct runner<curandStateScrambledSobol64_t>
 };
 
 template<typename T, typename GeneratorState, typename GenerateFunc, typename Extra>
-void run_benchmark(const cli::Parser& parser,
+void run_benchmark(const cli::Parser&  parser,
+                   cudaStream_t        stream,
                    const GenerateFunc& generate_func,
-                   const Extra extra)
+                   const Extra         extra,
+                   const std::string&  distribution,
+                   const std::string&  engine,
+                   const double        lambda = 0.f)
 {
     const size_t size = parser.get<size_t>("size");
     const size_t dimensions = parser.get<size_t>("dimensions");
@@ -554,6 +581,8 @@ void run_benchmark(const cli::Parser& parser,
     const size_t blocks = parser.get<size_t>("blocks");
     const size_t threads = parser.get<size_t>("threads");
 
+    const std::string format = parser.get<std::string>("format");
+
     T * data;
     CUDA_CALL(cudaMalloc(&data, size * sizeof(T)));
 
@@ -562,129 +591,170 @@ void run_benchmark(const cli::Parser& parser,
     // Warm-up
     for (size_t i = 0; i < 5; i++)
     {
-        r.generate(blocks, threads, data, size, generate_func, extra);
-        CUDA_CALL(cudaPeekAtLastError());
+        r.generate(blocks, threads, stream, data, size, generate_func, extra);
+        CUDA_CALL(cudaGetLastError());
         CUDA_CALL(cudaDeviceSynchronize());
     }
     CUDA_CALL(cudaDeviceSynchronize());
 
     // Measurement
-    auto start = std::chrono::high_resolution_clock::now();
+    cudaEvent_t start, stop;
+    CUDA_CALL(cudaEventCreate(&start));
+    CUDA_CALL(cudaEventCreate(&stop));
+    CUDA_CALL(cudaEventRecord(start, stream));
     for (size_t i = 0; i < trials; i++)
     {
-        r.generate(blocks, threads, data, size, generate_func, extra);
+        r.generate(blocks, threads, stream, data, size, generate_func, extra);
+    }
+    CUDA_CALL(cudaEventRecord(stop, stream));
+    CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsed;
+    CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
+    CUDA_CALL(cudaEventDestroy(start));
+    CUDA_CALL(cudaEventDestroy(stop));
+
+    if(format.compare("csv") == 0)
+    {
+        std::cout << std::fixed << std::setprecision(3) << engine << "," << distribution << ","
+                  << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30)) << ","
+                  << (trials * size) / (elapsed / 1e3 * (1 << 30)) << "," << elapsed / trials << ","
+                  << elapsed << "," << size << ",";
+        if(distribution.compare("poisson") == 0 || distribution.compare("discrete-poisson") == 0)
+        {
+            std::cout << lambda;
+        }
+        std::cout << std::endl;
+    }
+    else
+    {
+        if(format.compare("console") != 0)
+        {
+            std::cout << "Unknown format specified (must be either console or csv).  Defaulting to "
+                         "console output."
+                      << std::endl;
+        }
+        std::cout << std::fixed << std::setprecision(3) << "      "
+                  << "Throughput = " << std::setw(8)
+                  << (trials * size * sizeof(T)) / (elapsed / 1e3 * (1 << 30))
+                  << " GB/s, Samples = " << std::setw(8)
+                  << (trials * size) / (elapsed / 1e3 * (1 << 30))
+                  << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed / trials
+                  << " ms, Time (all) = " << std::setw(8) << elapsed << " ms, Size = " << size
+                  << std::endl;
     }
-    CUDA_CALL(cudaPeekAtLastError());
-    CUDA_CALL(cudaDeviceSynchronize());
-    auto end = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::milli> elapsed = end - start;
-
-    std::cout << std::fixed << std::setprecision(3)
-              << "      "
-              << "Throughput = "
-              << std::setw(8) << (trials * size * sizeof(T)) /
-                    (elapsed.count() / 1e3 * (1 << 30))
-              << " GB/s, Samples = "
-              << std::setw(8) << (trials * size) /
-                    (elapsed.count() / 1e3 * (1 << 30))
-              << " GSample/s, AvgTime (1 trial) = "
-              << std::setw(8) << elapsed.count() / trials
-              << " ms, Time (all) = "
-              << std::setw(8) << elapsed.count()
-              << " ms, Size = " << size
-              << std::endl;
 
     CUDA_CALL(cudaFree(data));
 }
 
 template<typename GeneratorState>
 void run_benchmarks(const cli::Parser& parser,
-                    const std::string& distribution)
+                    const std::string& distribution,
+                    const std::string& engine,
+                    cudaStream_t       stream)
 {
+    const std::string format = parser.get<std::string>("format");
     if (distribution == "uniform-uint")
     {
-        if (!std::is_same<GeneratorState, curandStateSobol64_t>::value &&
-            !std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
-        {
-            run_benchmark<unsigned int, GeneratorState>(parser,
-                [] __device__ (GeneratorState * state, int) {
-                    return curand(state);
-                }, 0
-            );
-        }
+        run_benchmark<unsigned int, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand(state); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "uniform-long-long")
     {
         if (std::is_same<GeneratorState, curandStateSobol64_t>::value ||
             std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
         {
-            run_benchmark<unsigned long long, GeneratorState>(parser,
-                [] __device__ (GeneratorState * state, int) {
-                    return curand(state);
-                }, 0
-            );
+            run_benchmark<unsigned long long, GeneratorState>(
+                parser,
+                stream,
+                [] __device__ ( GeneratorState* state, int) { return curand(state); },
+                0,
+                distribution,
+                engine);
         }
     }
     if (distribution == "uniform-float")
     {
-        run_benchmark<float, GeneratorState>(parser,
-            [] __device__ (GeneratorState * state, int) {
-                return curand_uniform(state);
-            }, 0
-        );
+        run_benchmark<float, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand_uniform(state); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "uniform-double")
     {
-        run_benchmark<double, GeneratorState>(parser,
-            [] __device__ (GeneratorState * state, int) {
-                return curand_uniform_double(state);
-            }, 0
-        );
+        run_benchmark<double, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand_uniform_double(state); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "normal-float")
     {
-        run_benchmark<float, GeneratorState>(parser,
-            [] __device__ (GeneratorState * state, int) {
-                return curand_normal(state);
-            }, 0
-        );
+        run_benchmark<float, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand_normal(state); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "normal-double")
     {
-        run_benchmark<double, GeneratorState>(parser,
-            [] __device__ (GeneratorState * state, int) {
-                return curand_normal_double(state);
-            }, 0
-        );
+        run_benchmark<double, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand_normal_double(state); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "log-normal-float")
     {
-        run_benchmark<float, GeneratorState>(parser,
-            [] __device__ (GeneratorState * state, int) {
-                return curand_log_normal(state, 0.0f, 1.0f);
-            }, 0
-        );
+        run_benchmark<float, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand_log_normal(state, 0.0f, 1.0f); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "log-normal-double")
     {
-        run_benchmark<double, GeneratorState>(parser,
-            [] __device__ (GeneratorState * state, int) {
-                return curand_log_normal_double(state, 0.0, 1.0);
-            }, 0
-        );
+        run_benchmark<double, GeneratorState>(
+            parser,
+            stream,
+            [] __device__ ( GeneratorState* state, int) { return curand_log_normal_double(state, 0.0, 1.0); },
+            0,
+            distribution,
+            engine);
     }
     if (distribution == "poisson")
     {
         const auto lambdas = parser.get<std::vector<double>>("lambda");
         for (double lambda : lambdas)
         {
-            std::cout << "    " << "lambda "
-                 << std::fixed << std::setprecision(1) << lambda << std::endl;
-            run_benchmark<unsigned int, GeneratorState>(parser,
-                [] __device__ (GeneratorState * state, double lambda) {
-                    return curand_poisson(state, lambda);
-                }, lambda
-            );
+            if(format.compare("console") == 0)
+            {
+                std::cout << "    "
+                          << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl;
+            }
+            run_benchmark<unsigned int, GeneratorState>(
+                parser,
+                stream,
+                [] __device__ ( GeneratorState* state, double lambda) { return curand_poisson(state, lambda); },
+                lambda,
+                distribution,
+                engine,
+                lambda);
         }
     }
     if (distribution == "discrete-poisson")
@@ -692,15 +762,22 @@ void run_benchmarks(const cli::Parser& parser,
         const auto lambdas = parser.get<std::vector<double>>("lambda");
         for (double lambda : lambdas)
         {
-            std::cout << "    " << "lambda "
-                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            if(format.compare("console") == 0)
+            {
+                std::cout << "    "
+                          << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl;
+            }
             curandDiscreteDistribution_t discrete_distribution;
             CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution));
-            run_benchmark<unsigned int, GeneratorState>(parser,
-                [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) {
-                    return curand_discrete(state, discrete_distribution);
-                }, discrete_distribution
-            );
+            run_benchmark<unsigned int, GeneratorState>(
+                parser,
+                stream,
+                [] __device__ ( GeneratorState* state, curandDiscreteDistribution_t discrete_distribution)
+                { return curand_discrete(state, discrete_distribution); },
+                discrete_distribution,
+                distribution,
+                engine,
+                lambda);
             CURAND_CALL(curandDestroyDistribution(discrete_distribution));
         }
     }
@@ -758,6 +835,10 @@ int main(int argc, char *argv[])
     parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
     parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
     parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
+    parser.set_optional<std::string>("format",
+                                     "format",
+                                     {"console"},
+                                     "output format: console or csv");
     parser.run_and_exit_if_error();
 
     std::vector<std::string> engines;
@@ -803,53 +884,75 @@ int main(int argc, char *argv[])
     cudaDeviceProp props;
     CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
 
+    std::cout << "benchmark_curand_kernel" << std::endl;
     std::cout << "cuRAND: " << version << " ";
     std::cout << "Runtime: " << runtime_version << " ";
     std::cout << "Device: " << props.name;
     std::cout << std::endl << std::endl;
 
+    cudaStream_t stream;
+    CUDA_CALL(cudaStreamCreate(&stream));
+
+    const std::string format         = parser.get<std::string>("format");
+    const bool        console_output = format != "csv"; // unknown formats fall back to console
+
+    if(!console_output)
+    {
+        std::cout // CSV column names
+            << "Engine,Distribution,Throughput,Samples,AvgTime (1 Trial),Time(all),Size,Lambda"
+            << std::endl;
+        std::cout << ",,GB/s,GSample/s,ms,ms,values," << std::endl; // units, one per column
+    }
     for (auto engine : engines)
     {
-        std::cout << engine << ":" << std::endl;
+        if(console_output)
+        {
+            std::cout << engine << ":" << std::endl;
+        }
         for (auto distribution : distributions)
         {
-            std::cout << "  " << distribution << ":" << std::endl;
+            if(console_output)
+            {
+                std::cout << "  " << distribution << ":" << std::endl; // sub-heading under engine
+            }
             const std::string plot_name = engine + "-" + distribution;
             if (engine == "xorwow")
             {
-                run_benchmarks<curandStateXORWOW_t>(parser, distribution);
+                run_benchmarks<curandStateXORWOW_t>(parser, distribution, engine, stream);
             }
             else if (engine == "mrg32k3a")
             {
-                run_benchmarks<curandStateMRG32k3a_t>(parser, distribution);
+                run_benchmarks<curandStateMRG32k3a_t>(parser, distribution, engine, stream);
             }
             else if (engine == "philox")
             {
-                run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution);
+                run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution, engine, stream);
             }
             else if (engine == "sobol32")
             {
-                run_benchmarks<curandStateSobol32_t>(parser, distribution);
+                run_benchmarks<curandStateSobol32_t>(parser, distribution, engine, stream);
             }
             else if(engine == "scrambled_sobol32")
             {
-                run_benchmarks<curandStateScrambledSobol32_t>(parser, distribution);
+                run_benchmarks<curandStateScrambledSobol32_t>(parser, distribution, engine, stream);
             }
             else if (engine == "sobol64")
             {
-                run_benchmarks<curandStateSobol64_t>(parser, distribution);
+                run_benchmarks<curandStateSobol64_t>(parser, distribution, engine, stream);
             }
             else if(engine == "scrambled_sobol64")
             {
-                run_benchmarks<curandStateScrambledSobol64_t>(parser, distribution);
+                run_benchmarks<curandStateScrambledSobol64_t>(parser, distribution, engine, stream);
             }
             else if (engine == "mtgp32")
             {
-                run_benchmarks<curandStateMtgp32_t>(parser, distribution);
+                run_benchmarks<curandStateMtgp32_t>(parser, distribution, engine, stream);
             }
         }
         std::cout << std::endl;
     }
 
+    CUDA_CALL(cudaStreamDestroy(stream));
+
     return 0;
 }
diff --git a/benchmark/benchmark_curand_utils.hpp b/benchmark/benchmark_curand_utils.hpp
index 3a0ff3750..8eef353c6 100644
--- a/benchmark/benchmark_curand_utils.hpp
+++ b/benchmark/benchmark_curand_utils.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -42,6 +42,19 @@
     }                                                                                         \
     while(0)
 
+#define CUDA_CALL(condition)                                                               \
+    do                                                                                     \
+    {                                                                                      \
+        cudaError_t error_ = condition;                                                    \
+        if(error_ != cudaSuccess)                                                          \
+        {                                                                                  \
+            std::cout << "CUDA error: " << error_ << " at " << __FILE__ << ":" << __LINE__ \
+                      << std::endl;                                                        \
+            exit(error_);                                                                  \
+        }                                                                                  \
+    }                                                                                      \
+    while(0)
+
 inline void add_common_benchmark_curand_info()
 {
     int version;
diff --git a/benchmark/benchmark_rocrand_device_api.cpp b/benchmark/benchmark_rocrand_device_api.cpp
index b38430efc..cdf1237ec 100644
--- a/benchmark/benchmark_rocrand_device_api.cpp
+++ b/benchmark/benchmark_rocrand_device_api.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -85,14 +85,7 @@ struct runner
         const size_t states_size = blocks * threads;
         HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState)));
 
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel),
-                           dim3(blocks),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           seed,
-                           offset);
+        init_kernel<<<dim3(blocks), dim3(threads)>>>(states, seed, offset);
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -111,15 +104,7 @@ struct runner
                   const size_t     size,
                   const Generator& generator)
     {
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel),
-                           dim3(blocks),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size,
-                           generator);
+        generate_kernel<<<dim3(blocks), dim3(threads), 0, stream>>>(states, data, size, generator);
     }
 };
 
@@ -185,15 +170,10 @@ struct runner<rocrand_state_mtgp32>
                   const size_t     size,
                   const Generator& generator)
     {
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel),
-                           dim3(std::min((size_t)200, blocks)),
-                           dim3(256),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size,
-                           generator);
+        generate_kernel<<<dim3(std::min((size_t)200, blocks)), dim3(256), 0, stream>>>(states,
+                                                                                       data,
+                                                                                       size,
+                                                                                       generator);
     }
 };
 
@@ -341,14 +321,10 @@ struct runner<rocrand_state_sobol32>
         HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           static_cast<unsigned int>(offset));
+        init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
+            states,
+            directions,
+            static_cast<unsigned int>(offset));
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -370,15 +346,11 @@ struct runner<rocrand_state_sobol32>
                   const Generator& generator)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generator);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generator);
     }
 };
 
@@ -419,15 +391,11 @@ struct runner<rocrand_state_scrambled_sobol32>
             hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           scramble_constants,
-                           static_cast<unsigned int>(offset));
+        init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
+            states,
+            directions,
+            scramble_constants,
+            static_cast<unsigned int>(offset));
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -450,15 +418,11 @@ struct runner<rocrand_state_scrambled_sobol32>
                   const Generator& generator)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generator);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generator);
     }
 };
 
@@ -477,7 +441,8 @@ struct runner<rocrand_state_sobol64>
         this->dimensions = dimensions;
 
         const unsigned long long* h_directions;
-        rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
+        ROCRAND_CHECK(
+            rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6));
 
         const size_t states_size = blocks * threads * dimensions;
         HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64)));
@@ -488,14 +453,9 @@ struct runner<rocrand_state_sobol64>
         HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           offset);
+        init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(states,
+                                                                         directions,
+                                                                         offset);
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -517,15 +477,11 @@ struct runner<rocrand_state_sobol64>
                   const Generator& generator)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generator);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generator);
     }
 };
 
@@ -546,9 +502,10 @@ struct runner<rocrand_state_scrambled_sobol64>
         const unsigned long long* h_directions;
         const unsigned long long* h_constants;
 
-        rocrand_get_direction_vectors64(&h_directions,
-                                        ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6);
-        rocrand_get_scramble_constants64(&h_constants);
+        ROCRAND_CHECK(
+            rocrand_get_direction_vectors64(&h_directions,
+                                            ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6));
+        ROCRAND_CHECK(rocrand_get_scramble_constants64(&h_constants));
 
         const size_t states_size = blocks * threads * dimensions;
         HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol64)));
@@ -565,15 +522,11 @@ struct runner<rocrand_state_scrambled_sobol64>
             hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           scramble_constants,
-                           offset);
+        init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
+            states,
+            directions,
+            scramble_constants,
+            offset);
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -596,15 +549,11 @@ struct runner<rocrand_state_scrambled_sobol64>
                   const Generator& generator)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generator);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generator);
     }
 };
 
diff --git a/benchmark/benchmark_rocrand_kernel.cpp b/benchmark/benchmark_rocrand_kernel.cpp
index 1e4c38991..8e5009489 100644
--- a/benchmark/benchmark_rocrand_kernel.cpp
+++ b/benchmark/benchmark_rocrand_kernel.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -114,11 +114,7 @@ struct runner
         const size_t states_size = blocks * threads;
         HIP_CHECK(hipMalloc(&states, states_size * sizeof(GeneratorState)));
 
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(init_kernel),
-            dim3(blocks), dim3(threads), 0, 0,
-            states, seed, offset
-        );
+        init_kernel<<<dim3(blocks), dim3(threads)>>>(states, seed, offset);
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -143,11 +139,11 @@ struct runner
                   const GenerateFunc& generate_func,
                   const Extra extra)
     {
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(generate_kernel),
-            dim3(blocks), dim3(threads), 0, stream,
-            states, data, size, generate_func, extra
-        );
+        generate_kernel<<<dim3(blocks), dim3(threads), 0, stream>>>(states,
+                                                                    data,
+                                                                    size,
+                                                                    generate_func,
+                                                                    extra);
     }
 };
 
@@ -222,11 +218,12 @@ struct runner<rocrand_state_mtgp32>
                   const GenerateFunc& generate_func,
                   const Extra extra)
     {
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(generate_kernel),
-            dim3(std::min((size_t)200, blocks)), dim3(256), 0, stream,
-            states, data, size, generate_func, extra
-        );
+        generate_kernel<<<dim3(std::min((size_t)200, blocks)), dim3(256), 0, stream>>>(
+            states,
+            data,
+            size,
+            generate_func,
+            extra);
     }
 };
 
@@ -255,16 +252,11 @@ struct runner<rocrand_state_lfsr113>
         const size_t states_size = blocks * threads;
         HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113)));
 
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel),
-                           dim3(blocks),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           uint4{ROCRAND_LFSR113_DEFAULT_SEED_X,
-                                 ROCRAND_LFSR113_DEFAULT_SEED_Y,
-                                 ROCRAND_LFSR113_DEFAULT_SEED_Z,
-                                 ROCRAND_LFSR113_DEFAULT_SEED_W});
+        init_kernel<<<dim3(blocks), dim3(threads)>>>(states,
+                                                     uint4{ROCRAND_LFSR113_DEFAULT_SEED_X,
+                                                           ROCRAND_LFSR113_DEFAULT_SEED_Y,
+                                                           ROCRAND_LFSR113_DEFAULT_SEED_Z,
+                                                           ROCRAND_LFSR113_DEFAULT_SEED_W});
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -289,16 +281,11 @@ struct runner<rocrand_state_lfsr113>
                   const GenerateFunc& generate_func,
                   const Extra         extra)
     {
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel),
-                           dim3(blocks),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size,
-                           generate_func,
-                           extra);
+        generate_kernel<<<dim3(blocks), dim3(threads), 0, stream>>>(states,
+                                                                    data,
+                                                                    size,
+                                                                    generate_func,
+                                                                    extra);
     }
 };
 
@@ -381,14 +368,10 @@ struct runner<rocrand_state_sobol32>
         HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           static_cast<unsigned int>(offset));
+        init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
+            states,
+            directions,
+            static_cast<unsigned int>(offset));
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -416,16 +399,12 @@ struct runner<rocrand_state_sobol32>
                   const Extra extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generate_func,
-                           extra);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generate_func,
+            extra);
     }
 };
 
@@ -466,15 +445,11 @@ struct runner<rocrand_state_scrambled_sobol32>
             hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           scramble_constants,
-                           static_cast<unsigned int>(offset));
+        init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
+            states,
+            directions,
+            scramble_constants,
+            static_cast<unsigned int>(offset));
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -503,16 +478,12 @@ struct runner<rocrand_state_scrambled_sobol32>
                   const Extra         extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generate_func,
-                           extra);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generate_func,
+            extra);
     }
 };
 
@@ -542,14 +513,9 @@ struct runner<rocrand_state_sobol64>
         HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           offset);
+        init_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(states,
+                                                                         directions,
+                                                                         offset);
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -577,16 +543,12 @@ struct runner<rocrand_state_sobol64>
                   const Extra extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generate_func,
-                           extra);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generate_func,
+            extra);
     }
 };
 
@@ -626,15 +588,11 @@ struct runner<rocrand_state_scrambled_sobol64>
             hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice));
 
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           0,
-                           states,
-                           directions,
-                           scramble_constants,
-                           offset);
+        init_scrambled_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads)>>>(
+            states,
+            directions,
+            scramble_constants,
+            offset);
 
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
@@ -663,16 +621,12 @@ struct runner<rocrand_state_scrambled_sobol64>
                   const Extra         extra)
     {
         const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel),
-                           dim3(blocks_x, dimensions),
-                           dim3(threads),
-                           0,
-                           stream,
-                           states,
-                           data,
-                           size / dimensions,
-                           generate_func,
-                           extra);
+        generate_sobol_kernel<<<dim3(blocks_x, dimensions), dim3(threads), 0, stream>>>(
+            states,
+            data,
+            size / dimensions,
+            generate_func,
+            extra);
     }
 };
 

From a0a1f337163095b98b32e7ec056e5d1db091bc13 Mon Sep 17 00:00:00 2001
From: Anton Gorenko <anton@streamhpc.com>
Date: Thu, 23 Jan 2025 09:25:06 +0000
Subject: [PATCH 13/17] Replace download_project with FetchContent

---
 .gitlab-ci.yml                            |   2 -
 benchmark/custom_csv_formater.hpp         |   9 +-
 cmake/Dependencies.cmake                  | 115 ++++++++-------
 cmake/DownloadProject.CMakeLists.cmake.in |  14 --
 cmake/DownloadProject.cmake               | 170 ----------------------
 test/cpp_wrapper/CMakeLists.txt           |   3 +-
 test/cpp_wrapper/cmake/Dependencies.cmake |  45 ++----
 7 files changed, 86 insertions(+), 272 deletions(-)
 delete mode 100644 cmake/DownloadProject.CMakeLists.cmake.in
 delete mode 100644 cmake/DownloadProject.cmake

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ca999bf45..31511b9b3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -202,7 +202,6 @@ copyright-date:
       - $CI_PROJECT_DIR/build/test/test_*
       - $CI_PROJECT_DIR/build/**/CTestTestfile.cmake
       - $CI_PROJECT_DIR/build/benchmark/benchmark_*
-      - $CI_PROJECT_DIR/build/deps/googlebenchmark/
       - $CI_PROJECT_DIR/build/CMakeCache.txt
       - $CI_PROJECT_DIR/build/*.deb
       - $CI_PROJECT_DIR/build/*.zip
@@ -339,7 +338,6 @@ benchmark:benchmark-tuning:
     paths:
       - $CI_PROJECT_DIR/build/library/
       - $CI_PROJECT_DIR/build/benchmark/
-      - $CI_PROJECT_DIR/build/deps/googlebenchmark/
       - $CI_PROJECT_DIR/build/CMakeCache.txt
       - $CI_PROJECT_DIR/build/*.json
     expire_in: 2 weeks
diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp
index 204512026..6c3c52e8f 100644
--- a/benchmark/custom_csv_formater.hpp
+++ b/benchmark/custom_csv_formater.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -178,6 +178,13 @@ inline void customCSVReporter::PrintRunData(const Run& run)
 
     Out << engineName << "," << disName << "," << mode << ",";
     Out << CsvEscape(run.benchmark_name()) << ",";
+    if(run.skipped) // finish the row already started on Out: comma padding, flag, message
+    {
+        Out << std::string(elements.size() - 3, ',');
+        Out << "true,";
+        Out << CsvEscape(run.skip_message) << "\n";
+        return;
+    }
 
     // Do not print iteration on bigO and RMS report
     if(!run.report_big_o && !run.report_rms)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 36c604440..348ee7c2b 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,22 +20,47 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-cmake_minimum_required(VERSION 3.16)
+# Dependencies
 
-# find_package() uses upper-case <PACKAGENAME>_ROOT variables.
-# altough we use GTEST_ROOT for our purposes, it is actually even benefecial for
-# find_package() to look for it there (that's where we are going to put it anyway)
-if(POLICY CMP0144)
-  cmake_policy(SET CMP0144 NEW)
+# Save global state
+# NOTE1: the reason we don't scope global state meddling using add_subdirectory
+#        is because CMake < 3.24 lacks CMAKE_FIND_PACKAGE_TARGETS_GLOBAL which
+#        would promote IMPORTED targets of find_package(CONFIG) to be visible
+#        by other parts of the build. So we save and restore global state.
+#
+# NOTE2: We disable the ROCMChecks.cmake warning noting that we meddle with
+#        global state. This is a consequence of abusing the CMake CXX language
+#        which HIP piggybacks on top of. This kind of HIP support has one chance
+#        at observing the global flags, at the find_package(HIP) invocation.
+#        The device compiler won't be able to pick up changes after that, hence
+#        the warning.
+set(USER_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+if(DEFINED BUILD_SHARED_LIBS)
+  set(USER_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
 endif()
+set(USER_ROCM_WARN_TOOLCHAIN_VAR ${ROCM_WARN_TOOLCHAIN_VAR})
 
-# Dependencies
+# Change variables before configuring dependencies
+set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
+# Strip warning/-Werror flags from CMAKE_CXX_FLAGS and silence warnings for the dependencies
+separate_arguments(CXX_FLAGS_LIST NATIVE_COMMAND "${CMAKE_CXX_FLAGS}")
+list(REMOVE_ITEM CXX_FLAGS_LIST /WX -Werror -Werror=pedantic -pedantic-errors)
+if(MSVC)
+  list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "/[Ww]([0-4]?)(all)?") # Remove MSVC warning flags
+  list(APPEND CXX_FLAGS_LIST /w)
+else()
+  list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "-W(all|extra|everything)") # Remove GCC/LLVM flags
+  list(APPEND CXX_FLAGS_LIST -w)
+endif()
+list(JOIN CXX_FLAGS_LIST " " CMAKE_CXX_FLAGS) # back to a single space-separated flag string
+# Don't build client dependencies as shared
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Global flag to cause add_library() to create shared libraries if on." FORCE)
 
 # HIP dependency is handled earlier in the project cmake file
 # when VerifyCompiler.cmake is included.
 
-# For downloading, building, and installing required dependencies
-include(cmake/DownloadProject.cmake)
+# For downloading and building required dependencies
+include(FetchContent)
 
 # Fortran Wrapper
 if(BUILD_FORTRAN_WRAPPER)
@@ -44,6 +69,7 @@ endif()
 
 # Test dependencies
 if(BUILD_TEST)
+  # Google Test (https://github.com/google/googletest)
   # NOTE: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake
   #
   # FindGTest.cmake defines:   GTest::GTest, GTest::Main, GTEST_FOUND
@@ -53,65 +79,43 @@ if(BUILD_TEST)
   # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets
   #        will be duplicately defined.
   if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
-    # Google Test (https://github.com/google/googletest)
     find_package(GTest QUIET)
   endif()
 
   if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest)
-    message(STATUS "GTest not found or force download GTest on. Downloading and building GTest.")
-    set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/gtest CACHE PATH "")
-    if(DEFINED CMAKE_CXX_COMPILER)
-      set(CXX_COMPILER_OPTION "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
-    endif()
-    download_project(
-      PROJ                googletest
-      GIT_REPOSITORY      https://github.com/google/googletest.git
-      GIT_TAG             release-1.11.0
-      INSTALL_DIR         ${GTEST_ROOT}
-      CMAKE_ARGS          -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${CXX_COMPILER_OPTION} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-      LOG_DOWNLOAD        TRUE
-      LOG_CONFIGURE       TRUE
-      LOG_BUILD           TRUE
-      LOG_INSTALL         TRUE
-      BUILD_PROJECT       TRUE
-      UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository
+    message(STATUS "Google Test not found or force download on. Fetching...")
+    option(BUILD_GTEST "Builds the googletest subproject" ON)
+    option(BUILD_GMOCK "Builds the googlemock subproject" OFF)
+    option(INSTALL_GTEST "Enable installation of googletest" OFF)
+    FetchContent_Declare(
+      googletest
+      GIT_REPOSITORY https://github.com/google/googletest.git
+      GIT_TAG        v1.15.2
     )
-    find_package(GTest CONFIG REQUIRED PATHS ${GTEST_ROOT} NO_DEFAULT_PATH)
+    FetchContent_MakeAvailable(googletest)
   endif()
 endif()
 
 # Benchmark dependencies
 if(BUILD_BENCHMARK)
+  # Google Benchmark (https://github.com/google/benchmark)
   if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
-    # Google Benchmark (https://github.com/google/benchmark.git)
-    find_package(benchmark QUIET)
+    find_package(benchmark 1.9.1 QUIET)
   endif()
 
-  if(NOT benchmark_FOUND)
-    message(STATUS "Google Benchmark not found or force download Google Benchmark on. Downloading and building Google Benchmark.")
-    if(CMAKE_CONFIGURATION_TYPES)
-      message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.")
-    endif()
-    set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/googlebenchmark CACHE PATH "")
-    option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." OFF)
-    option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark." OFF)
-    download_project(
-      PROJ           googlebenchmark
+  if(NOT TARGET benchmark::benchmark)
+    message(STATUS "Google Benchmark not found or force download on. Fetching...")
+    option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library" OFF)
+    option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark" OFF)
+    FetchContent_Declare(
+      googlebenchmark
       GIT_REPOSITORY https://github.com/google/benchmark.git
-      GIT_TAG        v1.8.0
-      INSTALL_DIR    ${GOOGLEBENCHMARK_ROOT}
-      CMAKE_ARGS     -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_CXX_STANDARD=14 ${COMPILER_OVERRIDE}
-      LOG_DOWNLOAD   TRUE
-      LOG_CONFIGURE  TRUE
-      LOG_BUILD      TRUE
-      LOG_INSTALL    TRUE
-      BUILD_PROJECT  TRUE
-      UPDATE_DISCONNECTED TRUE
+      GIT_TAG        v1.9.1
     )
     set(HAVE_STD_REGEX ON)
     set(RUN_HAVE_STD_REGEX 1)
+    FetchContent_MakeAvailable(googlebenchmark)
   endif()
-  find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT} NO_DEFAULT_PATH)
 endif()
 
 set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
@@ -153,6 +157,15 @@ if(NOT ROCM_FOUND)
   find_package(ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR} NO_DEFAULT_PATH)
 endif()
 
+# Restore user global state (quoted so that an empty saved value still resets the variable)
+set(CMAKE_CXX_FLAGS "${USER_CXX_FLAGS}")
+if(DEFINED USER_BUILD_SHARED_LIBS)
+  set(BUILD_SHARED_LIBS "${USER_BUILD_SHARED_LIBS}")
+else()
+  unset(BUILD_SHARED_LIBS CACHE)
+endif()
+set(ROCM_WARN_TOOLCHAIN_VAR "${USER_ROCM_WARN_TOOLCHAIN_VAR}" CACHE BOOL "")
+
 include(ROCMSetupVersion)
 include(ROCMCreatePackage)
 include(ROCMInstallTargets)
diff --git a/cmake/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject.CMakeLists.cmake.in
deleted file mode 100644
index d6e544f55..000000000
--- a/cmake/DownloadProject.CMakeLists.cmake.in
+++ /dev/null
@@ -1,14 +0,0 @@
-# Distributed under the OSI-approved MIT License.  See accompanying
-# file LICENSE or https://github.com/Crascit/DownloadProject for details.
-
-cmake_minimum_required(VERSION 2.8.2)
-
-project(${DL_ARGS_PROJ}-download NONE)
-
-include(ExternalProject)
-ExternalProject_Add(${DL_ARGS_PROJ}-download
-                    ${DL_ARGS_UNPARSED_ARGUMENTS}
-                    SOURCE_DIR          "${DL_ARGS_SOURCE_DIR}"
-                    BUILD_IN_SOURCE     TRUE
-                    TEST_COMMAND        ""
-)
\ No newline at end of file
diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject.cmake
deleted file mode 100644
index 110bbd5c4..000000000
--- a/cmake/DownloadProject.cmake
+++ /dev/null
@@ -1,170 +0,0 @@
-# Distributed under the OSI-approved MIT License.  See accompanying
-# file LICENSE or https://github.com/Crascit/DownloadProject for details.
-#
-# MODULE:   DownloadProject
-#
-# PROVIDES:
-#   download_project( PROJ projectName
-#                    [PREFIX prefixDir]
-#                    [DOWNLOAD_DIR downloadDir]
-#                    [SOURCE_DIR srcDir]
-#                    [BINARY_DIR binDir]
-#                    [QUIET]
-#                    ...
-#   )
-#
-#       Provides the ability to download and unpack a tarball, zip file, git repository,
-#       etc. at configure time (i.e. when the cmake command is run). How the downloaded
-#       and unpacked contents are used is up to the caller, but the motivating case is
-#       to download source code which can then be included directly in the build with
-#       add_subdirectory() after the call to download_project(). Source and build
-#       directories are set up with this in mind.
-#
-#       The PROJ argument is required. The projectName value will be used to construct
-#       the following variables upon exit (obviously replace projectName with its actual
-#       value):
-#
-#           projectName_SOURCE_DIR
-#           projectName_BINARY_DIR
-#
-#       The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically
-#       need to be provided. They can be specified if you want the downloaded source
-#       and build directories to be located in a specific place. The contents of
-#       projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the
-#       locations used whether you provide SOURCE_DIR/BINARY_DIR or not.
-#
-#       The DOWNLOAD_DIR argument does not normally need to be set. It controls the
-#       location of the temporary CMake build used to perform the download.
-#
-#       The PREFIX argument can be provided to change the base location of the default
-#       values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments
-#       are provided, then PREFIX will have no effect. The default value for PREFIX is
-#       CMAKE_BINARY_DIR.
-#
-#       The QUIET option can be given if you do not want to show the output associated
-#       with downloading the specified project.
-#
-#       In addition to the above, any other options are passed through unmodified to
-#       ExternalProject_Add() to perform the actual download, patch and update steps.
-#
-#       Only those ExternalProject_Add() arguments which relate to downloading, patching
-#       and updating of the project sources are intended to be used. Also note that at
-#       least one set of download-related arguments are required.
-#
-#       If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
-#       prevent a check at the remote end for changes every time CMake is run
-#       after the first successful download. See the documentation of the ExternalProject
-#       module for more information. It is likely you will want to use this option if it
-#       is available to you. Note, however, that the ExternalProject implementation contains
-#       bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
-#       using the URL download method or when specifying a SOURCE_DIR with no download
-#       method. Fixes for these have been created, the last of which is scheduled for
-#       inclusion in CMake 3.8.0. Details can be found here:
-#
-#           https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
-#           https://gitlab.kitware.com/cmake/cmake/issues/16428
-#
-#       If you experience build errors related to the update step, consider avoiding
-#       the use of UPDATE_DISCONNECTED.
-#
-# EXAMPLE USAGE:
-#
-#   include(DownloadProject)
-#   download_project(PROJ                googletest
-#                    GIT_REPOSITORY      https://github.com/google/googletest.git
-#                    GIT_TAG             master
-#                    UPDATE_DISCONNECTED 1
-#                    QUIET
-#   )
-#
-#   add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
-#
-#========================================================================================
-
-
-set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")
-
-include(CMakeParseArguments)
-
-function(download_project)
-
-    set(options QUIET)
-    set(oneValueArgs
-        PROJ
-        PREFIX
-        DOWNLOAD_DIR
-        SOURCE_DIR
-        BINARY_DIR
-    )
-    set(multiValueArgs "")
-
-    cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    # Hide output if requested
-    if (DL_ARGS_QUIET)
-        set(OUTPUT_QUIET "OUTPUT_QUIET")
-    else()
-        unset(OUTPUT_QUIET)
-        message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
-    endif()
-
-    # Set up where we will put our temporary CMakeLists.txt file and also
-    # the base point below which the default source and binary dirs will be.
-    # The prefix must always be an absolute path.
-    if (NOT DL_ARGS_PREFIX)
-        set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
-    else()
-        get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
-                               BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    endif()
-    if (NOT DL_ARGS_DOWNLOAD_DIR)
-        set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
-    endif()
-
-    # Ensure the caller can know where to find the source and build directories
-    if (NOT DL_ARGS_SOURCE_DIR)
-        set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
-    endif()
-    if (NOT DL_ARGS_BINARY_DIR)
-        set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
-    endif()
-    set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
-    set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)
-
-    # The way that CLion manages multiple configurations, it causes a copy of
-    # the CMakeCache.txt to be copied across due to it not expecting there to
-    # be a project within a project.  This causes the hard-coded paths in the
-    # cache to be copied and builds to fail.  To mitigate this, we simply
-    # remove the cache if it exists before we configure the new project.  It
-    # is safe to do so because it will be re-generated.  Since this is only
-    # executed at the configure step, it should not cause additional builds or
-    # downloads.
-    file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")
-
-    # Create and build a separate CMake project to carry out the download.
-    # If we've already previously done these steps, they will not cause
-    # anything to be updated, so extra rebuilds of the project won't occur.
-    # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project
-    # has this set to something not findable on the PATH.
-    configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
-                   "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")
-    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
-                        -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
-                        .
-                    RESULT_VARIABLE result
-                    ${OUTPUT_QUIET}
-                    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
-    )
-    if(result)
-        message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}")
-    endif()
-    execute_process(COMMAND ${CMAKE_COMMAND} --build .
-                    RESULT_VARIABLE result
-                    ${OUTPUT_QUIET}
-                    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
-    )
-    if(result)
-        message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}")
-    endif()
-
-endfunction()
\ No newline at end of file
diff --git a/test/cpp_wrapper/CMakeLists.txt b/test/cpp_wrapper/CMakeLists.txt
index 8043696ad..3f678b297 100644
--- a/test/cpp_wrapper/CMakeLists.txt
+++ b/test/cpp_wrapper/CMakeLists.txt
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -32,7 +32,6 @@ list(APPEND CMAKE_MODULE_PATH
     ${HIP_PATH}/cmake $ENV{ROCM_PATH}/hip/cmake # FindHIP.cmake
 )
 
-set(CMAKE_CXX_COMPILER g++)
 if (NOT DEFINED CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
diff --git a/test/cpp_wrapper/cmake/Dependencies.cmake b/test/cpp_wrapper/cmake/Dependencies.cmake
index 4caa3a839..d78373ba4 100644
--- a/test/cpp_wrapper/cmake/Dependencies.cmake
+++ b/test/cpp_wrapper/cmake/Dependencies.cmake
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -22,21 +22,12 @@
 
 cmake_minimum_required(VERSION 3.16)
 
-# find_package() uses upper-case <PACKAGENAME>_ROOT variables.
-# altough we use GTEST_ROOT for our purposes, it is actually even benefecial for
-# find_package() to look for it there (that's where we are going to put it anyway)
-if(POLICY CMP0144)
-  cmake_policy(SET CMP0144 NEW)
-endif()
-
 # Dependencies
 
-# HIP dependency is handled earlier in the project cmake file
-# when VerifyCompiler.cmake is included.
-
-# For downloading, building, and installing required dependencies
-include(../../cmake/DownloadProject.cmake)
+# For downloading and building required dependencies
+include(FetchContent)
 
+# Google Test (https://github.com/google/googletest)
 # NOTE: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake
 #
 # FindGTest.cmake defines:   GTest::GTest, GTest::Main, GTEST_FOUND
@@ -46,28 +37,18 @@ include(../../cmake/DownloadProject.cmake)
 # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets
 #        will be duplicately defined.
 if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
-  # Google Test (https://github.com/google/googletest)
   find_package(GTest QUIET)
 endif()
 
 if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest)
-  message(STATUS "GTest not found or force download GTest on. Downloading and building GTest.")
-  set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/gtest CACHE PATH "")
-  if(DEFINED CMAKE_CXX_COMPILER)
-    set(CXX_COMPILER_OPTION "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
-  endif()
-  download_project(
-    PROJ                googletest
-    GIT_REPOSITORY      https://github.com/google/googletest.git
-    GIT_TAG             release-1.11.0
-    INSTALL_DIR         ${GTEST_ROOT}
-    CMAKE_ARGS          -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${CXX_COMPILER_OPTION} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-    LOG_DOWNLOAD        TRUE
-    LOG_CONFIGURE       TRUE
-    LOG_BUILD           TRUE
-    LOG_INSTALL         TRUE
-    BUILD_PROJECT       TRUE
-    UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository
+  message(STATUS "Google Test not found or force download on. Fetching...")
+  option(BUILD_GTEST "Builds the googletest subproject" ON)
+  option(BUILD_GMOCK "Builds the googlemock subproject" OFF)
+  option(INSTALL_GTEST "Enable installation of googletest" OFF)
+  FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG        v1.15.2
   )
-  find_package(GTest CONFIG REQUIRED PATHS ${GTEST_ROOT} NO_DEFAULT_PATH)
+  FetchContent_MakeAvailable(googletest)
 endif()

From ecfd7488c3f68f425210d420f5513b54b22837a3 Mon Sep 17 00:00:00 2001
From: Nara Prasetya <nara@streamhpc.com>
Date: Tue, 28 Jan 2025 14:18:40 +0000
Subject: [PATCH 14/17] fix: missing err stream

---
 benchmark/custom_csv_formater.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp
index 6c3c52e8f..d31550e48 100644
--- a/benchmark/custom_csv_formater.hpp
+++ b/benchmark/custom_csv_formater.hpp
@@ -152,6 +152,7 @@ inline void customCSVReporter::ReportRuns(const std::vector<Run>& reports)
 inline void customCSVReporter::PrintRunData(const Run& run)
 {
     std::ostream& Out = GetOutputStream();
+    std::ostream& Err = GetErrorStream();
 
     //get the name of the engine and distribution:
     std::string temp = run.benchmark_name();

From 358f3ec60c8efa63bc2d322bc56a5b866edde8b4 Mon Sep 17 00:00:00 2001
From: Anton Gorenko <anton@streamhpc.com>
Date: Tue, 28 Jan 2025 13:08:10 +0000
Subject: [PATCH 15/17] Resolve "Some tests create discrete_distribution but do
 not use it"

---
 test/test_rocrand_host.cpp                    |  10 +-
 test/test_rocrand_kernel_lfsr113.cpp          |  42 +--
 test/test_rocrand_kernel_mrg.cpp              |  42 +--
 test/test_rocrand_kernel_mtgp32.cpp           | 113 ++++++-
 test/test_rocrand_kernel_philox4x32_10.cpp    | 260 +++++++---------
 .../test_rocrand_kernel_scrambled_sobol32.cpp | 127 ++++++--
 .../test_rocrand_kernel_scrambled_sobol64.cpp | 123 ++++++--
 test/test_rocrand_kernel_sobol32.cpp          | 288 ++++++++++-------
 test/test_rocrand_kernel_sobol64.cpp          | 292 ++++++++++++------
 test/test_rocrand_kernel_threefry2x32_20.cpp  |  42 +--
 test/test_rocrand_kernel_threefry2x64_20.cpp  |  46 +--
 test/test_rocrand_kernel_threefry4x32_20.cpp  |  42 +--
 test/test_rocrand_kernel_threefry4x64_20.cpp  |  42 +--
 test/test_rocrand_kernel_xorwow.cpp           | 253 +++++++--------
 14 files changed, 1072 insertions(+), 650 deletions(-)

diff --git a/test/test_rocrand_host.cpp b/test/test_rocrand_host.cpp
index f6cdcdf00..d4844d15c 100644
--- a/test/test_rocrand_host.cpp
+++ b/test/test_rocrand_host.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -62,6 +62,8 @@ struct host_test_params
     }
 };
 
+// Check all combinations of blocking_host_generator and use_default_stream for one PRNG (Philox)
+// and one QRNG (Sobol32); the other generators work the same way.
 constexpr host_test_params host_test_params_array[] = {
     {   ROCRAND_RNG_PSEUDO_PHILOX4_32_10, false,  true},
     {         ROCRAND_RNG_PSEUDO_LFSR113, false,  true},
@@ -79,9 +81,9 @@ constexpr host_test_params host_test_params_array[] = {
     {          ROCRAND_RNG_QUASI_SOBOL32, false,  true},
     {          ROCRAND_RNG_QUASI_SOBOL64, false,  true},
 
-    {          ROCRAND_RNG_PSEUDO_XORWOW, false, false},
-    {          ROCRAND_RNG_PSEUDO_XORWOW,  true, false},
-    {          ROCRAND_RNG_PSEUDO_XORWOW,  true,  true},
+    {   ROCRAND_RNG_PSEUDO_PHILOX4_32_10, false, false},
+    {   ROCRAND_RNG_PSEUDO_PHILOX4_32_10,  true, false},
+    {   ROCRAND_RNG_PSEUDO_PHILOX4_32_10,  true,  true},
 
     {          ROCRAND_RNG_QUASI_SOBOL32, false, false},
     {          ROCRAND_RNG_QUASI_SOBOL32,  true, false},
diff --git a/test/test_rocrand_kernel_lfsr113.cpp b/test/test_rocrand_kernel_lfsr113.cpp
index 6ddf88cdc..f876e86a2 100644
--- a/test/test_rocrand_kernel_lfsr113.cpp
+++ b/test/test_rocrand_kernel_lfsr113.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -34,10 +34,11 @@
 #include "test_rocrand_common.hpp"
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    states,
-                                                          const size_t       states_size,
-                                                          uint4              seed,
-                                                          unsigned long long offset)
+__global__
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         uint4              seed,
+                         unsigned long long offset)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int subsequence = state_id;
@@ -50,7 +51,8 @@ __global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    sta
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const size_t size)
+__global__
+void rocrand_kernel(unsigned int* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -74,7 +76,8 @@ __global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, const size_t size)
+__global__
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -98,8 +101,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, cons
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*      output,
-                                                                    const size_t size)
+__global__
+void rocrand_uniform_double_kernel(double* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -123,7 +126,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -144,7 +148,8 @@ __global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -165,9 +170,8 @@ __global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, c
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* output,
-                                                             const size_t  size,
-                                                             double        lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -185,8 +189,10 @@ __global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* outpu
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_discrete_kernel(
-    unsigned int* output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -526,14 +532,14 @@ TEST_P(rocrand_kernel_lfsr113_poisson, rocrand_discrete)
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
                        dim3(4),
                        dim3(64),
                        0,
                        0,
                        output,
                        output_size,
-                       lambda);
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
diff --git a/test/test_rocrand_kernel_mrg.cpp b/test/test_rocrand_kernel_mrg.cpp
index 6f896bceb..6bee0b2f4 100644
--- a/test/test_rocrand_kernel_mrg.cpp
+++ b/test/test_rocrand_kernel_mrg.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -44,10 +44,11 @@ typedef ::testing::Types<rocrand_state_mrg31k3p, rocrand_state_mrg32k3a> rocrand
 TYPED_TEST_SUITE(rocrand_kernel_mrg, rocrand_kernel_mrg_types);
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    states,
-                                                          const size_t       states_size,
-                                                          unsigned long long seed,
-                                                          unsigned long long offset)
+__global__
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         unsigned long long seed,
+                         unsigned long long offset)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int subsequence = state_id;
@@ -60,7 +61,8 @@ __global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    sta
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const size_t size)
+__global__
+void rocrand_kernel(unsigned int* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -78,7 +80,8 @@ __global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, const size_t size)
+__global__
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -96,8 +99,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, cons
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*      output,
-                                                                    const size_t size)
+__global__
+void rocrand_uniform_double_kernel(double* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -115,7 +118,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -136,7 +140,8 @@ __global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -157,9 +162,8 @@ __global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, c
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* output,
-                                                             const size_t  size,
-                                                             double        lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -177,8 +181,10 @@ __global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* outpu
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_discrete_kernel(
-    unsigned int* output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -521,14 +527,14 @@ TYPED_TEST(rocrand_kernel_mrg, rocrand_discrete)
         rocrand_discrete_distribution discrete_distribution;
         ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
                            dim3(4),
                            dim3(64),
                            0,
                            0,
                            output,
                            output_size,
-                           lambda);
+                           discrete_distribution);
         HIP_CHECK(hipGetLastError());
 
         std::vector<unsigned int> output_host(output_size);
diff --git a/test/test_rocrand_kernel_mtgp32.cpp b/test/test_rocrand_kernel_mtgp32.cpp
index 4837b477d..7328bee6e 100644
--- a/test/test_rocrand_kernel_mtgp32.cpp
+++ b/test/test_rocrand_kernel_mtgp32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -35,8 +35,8 @@
 #include "test_rocrand_common.hpp"
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_kernel(
-    GeneratorState* states, unsigned int* output, const size_t size)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_kernel(GeneratorState* states, unsigned int* output, const size_t size)
 {
     const unsigned int state_id = blockIdx.x;
     unsigned int       index    = blockIdx.x * blockDim.x + threadIdx.x;
@@ -61,8 +61,8 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_kernel
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_uniform_kernel(
-    GeneratorState* states, float* output, const size_t size)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_uniform_kernel(GeneratorState* states, float* output, const size_t size)
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -91,8 +91,8 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_unifor
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_normal_kernel(
-    GeneratorState* states, float* output, const size_t size)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_normal_kernel(GeneratorState* states, float* output, const size_t size)
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -124,13 +124,13 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_normal
     }
 
     // Save engine with its state
-    if (thread_id == 0)
+    if(thread_id == 0)
         states[state_id] = state;
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_normal_double_kernel(
-    GeneratorState* states, double* output, const size_t size)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_normal_double_kernel(GeneratorState* states, double* output, const size_t size)
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -167,8 +167,8 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_normal
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_log_normal_kernel(
-    GeneratorState* states, float* output, const size_t size)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_log_normal_kernel(GeneratorState* states, float* output, const size_t size)
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -205,8 +205,8 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_log_no
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_log_normal_double_kernel(
-    GeneratorState* states, double* output, const size_t size)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_log_normal_double_kernel(GeneratorState* states, double* output, const size_t size)
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -243,8 +243,11 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_log_no
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_poisson_kernel(
-    GeneratorState* states, unsigned int* output, const size_t size, double lambda)
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_poisson_kernel(GeneratorState* states,
+                            unsigned int*   output,
+                            const size_t    size,
+                            double          lambda)
 {
     const unsigned int state_id  = blockIdx.x;
     const unsigned int thread_id = threadIdx.x;
@@ -272,6 +275,39 @@ __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void rocrand_poisso
         states[state_id] = state;
 }
 
+template<class GeneratorState> // Test kernel: stores rocrand_discrete() samples in output.
+__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE)
+void rocrand_discrete_kernel(GeneratorState*               states,
+                             unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
+{
+    const unsigned int state_id  = blockIdx.x; // each block works on states[blockIdx.x]
+    const unsigned int thread_id = threadIdx.x;
+    unsigned int       index     = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int stride          = gridDim.x * blockDim.x;
+    // Thread 0 loads the block's state into shared memory; all threads wait at the barrier.
+    __shared__ GeneratorState state;
+    if(thread_id == 0)
+        state = states[state_id];
+    __syncthreads();
+    // Round the bound up to a multiple of blockDim.x so a block's threads iterate equally often.
+    const size_t r               = size % blockDim.x;
+    const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
+    while(index < size_rounded_up)
+    {
+        // NOTE(review): presumably the shared state needs the whole block to call this - confirm.
+        auto value = rocrand_discrete(&state, discrete_distribution);
+        if(index < size) // padding iterations generate a value but do not store it
+            output[index] = value;
+        // Next position: advance by the total number of threads in the grid
+        index += stride;
+    }
+
+    // Thread 0 writes the block's updated state back to states[state_id]
+    if(thread_id == 0)
+        states[state_id] = state;
+}
+
 TEST(rocrand_kernel_mtgp32, rocrand_state_mtgp32_type)
 {
     typedef rocrand_state_mtgp32 state_type;
@@ -542,6 +578,51 @@ TEST_P(rocrand_kernel_mtgp32_poisson, rocrand_poisson)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
+TEST_P(rocrand_kernel_mtgp32_poisson, rocrand_discrete)
+{
+    typedef rocrand_state_mtgp32 state_type;
+    // Expects rocrand_discrete() samples for this lambda to have mean and variance near lambda.
+    const double lambda = GetParam();
+    // Eight generator states, matching the eight blocks launched below.
+    state_type* states;
+    HIP_CHECK(hipMallocHelper(&states, sizeof(state_type) * 8));
+
+    ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, 8, 0));
+    // The distribution is created from lambda here and destroyed after the copy back.
+    rocrand_discrete_distribution discrete_distribution;
+    ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
+
+    const size_t  output_size = 8192;
+    unsigned int* output;
+    HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
+    HIP_CHECK(hipDeviceSynchronize());
+    rocrand_discrete_kernel<state_type>
+        <<<dim3(8), dim3(256), 0, 0>>>(states, output, output_size, discrete_distribution);
+    HIP_CHECK(hipGetLastError());
+    // Copy the samples to the host, then release every resource acquired above.
+    std::vector<unsigned int> output_host(output_size);
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipFree(output));
+    HIP_CHECK(hipFree(states));
+    ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
+    // Sample mean, accumulated in double (the 0.0 initial value selects the accumulator type).
+    double mean = std::accumulate(output_host.begin(), output_host.end(), 0.0) / output_size;
+
+    double variance = 0; // population variance: squared deviations divided by output_size
+    for(auto v : output_host)
+    {
+        variance += std::pow(v - mean, 2);
+    }
+    variance = variance / output_size;
+    // Both statistics must be within 10% of lambda, with a minimum tolerance of 1.0.
+    EXPECT_NEAR(mean, lambda, std::max(1.0, lambda * 1e-1));
+    EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
+}
+
 const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0};
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_mtgp32_poisson,
diff --git a/test/test_rocrand_kernel_philox4x32_10.cpp b/test/test_rocrand_kernel_philox4x32_10.cpp
index 496a9bd0f..0789bd8d2 100644
--- a/test/test_rocrand_kernel_philox4x32_10.cpp
+++ b/test/test_rocrand_kernel_philox4x32_10.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,26 +18,25 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include <stdio.h>
 #include <gtest/gtest.h>
+#include <stdio.h>
 
-#include <vector>
 #include <cmath>
 #include <type_traits>
+#include <vector>
 
 #include <hip/hip_runtime.h>
 
-#include <rocrand/rocrand_kernel.h>
 #include <rocrand/rocrand.h>
+#include <rocrand/rocrand_kernel.h>
 
 #include "test_common.hpp"
 #include "test_rocrand_common.hpp"
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_init_kernel(GeneratorState * states,
-                         const size_t states_size,
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
                          unsigned long long seed,
                          unsigned long long offset)
 {
@@ -51,15 +50,14 @@ void rocrand_init_kernel(GeneratorState * states,
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_kernel(unsigned int * output, const size_t size)
+void rocrand_kernel(unsigned int* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 0, &state);
 
@@ -71,15 +69,14 @@ void rocrand_kernel(unsigned int * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_uniform_kernel(float * output, const size_t size)
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 0, &state);
 
@@ -96,15 +93,14 @@ void rocrand_uniform_kernel(float * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_normal_kernel(float * output, const size_t size)
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 0, &state);
 
@@ -121,15 +117,14 @@ void rocrand_normal_kernel(float * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_log_normal_kernel(float * output, const size_t size)
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 0, &state);
 
@@ -146,15 +141,14 @@ void rocrand_log_normal_kernel(float * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(64)
-void rocrand_poisson_kernel(unsigned int * output, const size_t size, double lambda)
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(456, subsequence, 234ULL, &state);
 
@@ -166,15 +160,16 @@ void rocrand_poisson_kernel(unsigned int * output, const size_t size, double lam
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(64)
-void rocrand_discrete_kernel(unsigned int * output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(456, subsequence, 234ULL, &state);
 
@@ -212,42 +207,41 @@ TEST(rocrand_kernel_philox4x32_10, rocrand_init)
         typedef rocrand_state_philox4x32_10::philox4x32_10_state internal_state_type;
 
     public:
+        rocrand_state_philox4x32_10_test() {}
 
-        __host__ rocrand_state_philox4x32_10_test() {}
-
-        __host__ internal_state_type internal_state() const
+        internal_state_type internal_state() const
         {
             return m_state;
         }
     };
 
-    typedef rocrand_state_philox4x32_10 state_type;
+    typedef rocrand_state_philox4x32_10      state_type;
     typedef rocrand_state_philox4x32_10_test state_type_test;
 
-    unsigned long long seed = 0xdeadbeefbeefdeadULL;
+    unsigned long long seed   = 0xdeadbeefbeefdeadULL;
     unsigned long long offset = 4 * ((UINT_MAX * 17ULL) + 17);
 
     const size_t states_size = 256;
-    state_type * states;
+    state_type*  states;
     HIP_CHECK(hipMallocHelper(&states, states_size * sizeof(state_type)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_init_kernel),
-        dim3(8), dim3(32), 0, 0,
-        states, states_size,
-        seed, offset
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_init_kernel),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       states,
+                       states_size,
+                       seed,
+                       offset);
     HIP_CHECK(hipGetLastError());
 
     std::vector<state_type_test> states_host(states_size);
-    HIP_CHECK(
-        hipMemcpy(
-            states_host.data(), states,
-            states_size * sizeof(state_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(states_host.data(),
+                        states,
+                        states_size * sizeof(state_type),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(states));
 
@@ -263,12 +257,7 @@ TEST(rocrand_kernel_philox4x32_10, rocrand_init)
         EXPECT_EQ(s.counter.z, subsequence);
         EXPECT_EQ(s.counter.w, 0U);
 
-        EXPECT_TRUE(
-            s.result.x != 0U
-            || s.result.y != 0U
-            || s.result.z != 0U
-            || s.result.w
-        );
+        EXPECT_TRUE(s.result.x != 0U || s.result.y != 0U || s.result.z != 0U || s.result.w);
 
         EXPECT_EQ(s.substate, 0U);
 
@@ -280,26 +269,25 @@ TEST(rocrand_kernel_philox4x32_10, rocrand)
 {
     typedef rocrand_state_philox4x32_10 state_type;
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -317,25 +305,22 @@ TEST(rocrand_kernel_philox4x32_10, rocrand_uniform)
     typedef rocrand_state_philox4x32_10 state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -353,25 +338,22 @@ TEST(rocrand_kernel_philox4x32_10, rocrand_normal)
     typedef rocrand_state_philox4x32_10 state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -397,25 +379,22 @@ TEST(rocrand_kernel_philox4x32_10, rocrand_log_normal)
     typedef rocrand_state_philox4x32_10 state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -434,13 +413,14 @@ TEST(rocrand_kernel_philox4x32_10, rocrand_log_normal)
     stddev = std::sqrt(stddev / output_size);
 
     double logmean = std::log(mean * mean / std::sqrt(stddev + mean * mean));
-    double logstd = std::sqrt(std::log(1.0f + stddev/(mean * mean)));
+    double logstd  = std::sqrt(std::log(1.0f + stddev / (mean * mean)));
 
     EXPECT_NEAR(1.6, logmean, 1.6 * 0.2);
     EXPECT_NEAR(0.25, logstd, 0.25 * 0.2);
 }
 
-class rocrand_kernel_philox4x32_10_poisson : public ::testing::TestWithParam<double> { };
+class rocrand_kernel_philox4x32_10_poisson : public ::testing::TestWithParam<double>
+{};
 
 TEST_P(rocrand_kernel_philox4x32_10_poisson, rocrand_poisson)
 {
@@ -448,26 +428,26 @@ TEST_P(rocrand_kernel_philox4x32_10_poisson, rocrand_poisson)
 
     const double lambda = GetParam();
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size, lambda
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size,
+                       lambda);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -495,29 +475,29 @@ TEST_P(rocrand_kernel_philox4x32_10_poisson, rocrand_discrete)
 
     const double lambda = GetParam();
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size, discrete_distribution
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size,
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
@@ -540,8 +520,8 @@ TEST_P(rocrand_kernel_philox4x32_10_poisson, rocrand_discrete)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
-const double lambdas[] = { 1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0 };
+const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0};
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_philox4x32_10_poisson,
-                        rocrand_kernel_philox4x32_10_poisson,
-                        ::testing::ValuesIn(lambdas));
+                         rocrand_kernel_philox4x32_10_poisson,
+                         ::testing::ValuesIn(lambdas));
diff --git a/test/test_rocrand_kernel_scrambled_sobol32.cpp b/test/test_rocrand_kernel_scrambled_sobol32.cpp
index c4c070b4a..6b90aa541 100644
--- a/test/test_rocrand_kernel_scrambled_sobol32.cpp
+++ b/test/test_rocrand_kernel_scrambled_sobol32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
 #include "test_common.hpp"
 #include "test_rocrand_common.hpp"
 
+#include <rocrand/rocrand_discrete.h>
 #include <rocrand/rocrand_log_normal.h>
 #include <rocrand/rocrand_normal.h>
 #include <rocrand/rocrand_poisson.h>
@@ -39,7 +40,9 @@
 
 struct rocrand_f
 {
-    __device__ __forceinline__ unsigned int operator()(rocrand_state_scrambled_sobol32* state_ptr)
+    __device__ __forceinline__
+    unsigned int
+        operator()(rocrand_state_scrambled_sobol32* state_ptr)
     {
         return rocrand(state_ptr);
     }
@@ -47,7 +50,9 @@ struct rocrand_f
 
 struct rocrand_uniform_f
 {
-    __device__ __forceinline__ float operator()(rocrand_state_scrambled_sobol32* state_ptr)
+    __device__ __forceinline__
+    float
+        operator()(rocrand_state_scrambled_sobol32* state_ptr)
     {
         return rocrand_uniform(state_ptr);
     }
@@ -55,7 +60,9 @@ struct rocrand_uniform_f
 
 struct rocrand_uniform_double_f
 {
-    __device__ __forceinline__ double operator()(rocrand_state_scrambled_sobol32* state_ptr)
+    __device__ __forceinline__
+    double
+        operator()(rocrand_state_scrambled_sobol32* state_ptr)
     {
         return rocrand_uniform_double(state_ptr);
     }
@@ -63,7 +70,9 @@ struct rocrand_uniform_double_f
 
 struct rocrand_normal_f
 {
-    __device__ __forceinline__ float operator()(rocrand_state_scrambled_sobol32* state_ptr)
+    __device__ __forceinline__
+    float
+        operator()(rocrand_state_scrambled_sobol32* state_ptr)
     {
         return rocrand_normal(state_ptr);
     }
@@ -71,7 +80,9 @@ struct rocrand_normal_f
 
 struct rocrand_normal_double_f
 {
-    __device__ __forceinline__ double operator()(rocrand_state_scrambled_sobol32* state_ptr)
+    __device__ __forceinline__
+    double
+        operator()(rocrand_state_scrambled_sobol32* state_ptr)
     {
         return rocrand_normal_double(state_ptr);
     }
@@ -79,7 +90,8 @@ struct rocrand_normal_double_f
 
 struct rocrand_log_normal_f
 {
-    __device__ __forceinline__ float
+    __device__ __forceinline__
+    float
         operator()(rocrand_state_scrambled_sobol32* state_ptr, float mean, float std)
     {
         return rocrand_log_normal(state_ptr, mean, std);
@@ -88,8 +100,9 @@ struct rocrand_log_normal_f
 
 struct rocrand_log_normal_double_f
 {
-    __device__ __forceinline__ float
-        operator()(rocrand_state_scrambled_sobol32* state_ptr, float mean, float std)
+    __device__ __forceinline__
+    double
+        operator()(rocrand_state_scrambled_sobol32* state_ptr, double mean, double std)
     {
         return rocrand_log_normal_double(state_ptr, mean, std);
     }
@@ -97,19 +110,32 @@ struct rocrand_log_normal_double_f
 
 struct rocrand_poisson_f
 {
-    __device__ __forceinline__ unsigned int operator()(rocrand_state_scrambled_sobol32* state_ptr,
-                                                       double                           lambda)
+    __device__ __forceinline__
+    unsigned int
+        operator()(rocrand_state_scrambled_sobol32* state_ptr, double lambda)
     {
         return rocrand_poisson(state_ptr, lambda);
     }
 };
 
+struct rocrand_discrete_f // device functor that forwards its arguments to rocrand_discrete()
+{
+    __device__ __forceinline__
+    unsigned int
+        operator()(rocrand_state_scrambled_sobol32* state_ptr,
+                   rocrand_discrete_distribution    discrete_distribution)
+    {
+        return rocrand_discrete(state_ptr, discrete_distribution); // result returned unchanged
+    }
+};
+
 template<class Distribution, typename OutputType, typename... Args>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(OutputType*         output,
-                                                          const unsigned int* vectors,
-                                                          const unsigned int* scramble_constants,
-                                                          const size_t        size_per_dimension,
-                                                          Args... args)
+__global__
+void rocrand_kernel(OutputType*         output,
+                    const unsigned int* vectors,
+                    const unsigned int* scramble_constants,
+                    const size_t        size_per_dimension,
+                    Args... args)
 {
     const unsigned int state_id  = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int dimension = blockIdx.y;
@@ -178,7 +204,7 @@ void call_rocrand_kernel(std::vector<ResultType>& output_host,
     unsigned int* m_scramble_constants;
     load_scrambled_sobol32_constants_to_gpu(dimensions, &m_vector, &m_scramble_constants);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_init_kernel<Distribution>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<Distribution>),
                        dim3(8, dimensions),
                        dim3(32),
                        0,
@@ -426,14 +452,14 @@ TEST_P(rocrand_kernel_scrambled_sobol32_poisson, rocrand_poisson)
     constexpr size_t output_size = dimensions * size_per_dimension;
 
     ResultType* output;
-    HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
+    HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(ResultType)));
     HIP_CHECK(hipDeviceSynchronize());
 
     unsigned int* m_vector;
     unsigned int* m_scramble_constants;
     load_scrambled_sobol32_constants_to_gpu(dimensions, &m_vector, &m_scramble_constants);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_init_kernel<rocrand_poisson_f>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<rocrand_poisson_f>),
                        dim3(8, dimensions),
                        dim3(32),
                        0,
@@ -473,6 +499,69 @@ TEST_P(rocrand_kernel_scrambled_sobol32_poisson, rocrand_poisson)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
+TEST_P(rocrand_kernel_scrambled_sobol32_poisson, rocrand_discrete)
+{
+    const double lambda = GetParam();
+    // Expects rocrand_discrete_f samples for this lambda to have mean and variance near lambda.
+    using ResultType = unsigned int;
+
+    constexpr size_t       size_per_dimension = 8192;
+    constexpr unsigned int dimensions         = 8;
+    // output_size has to be a multiple of the dimensions for sobol
+    constexpr size_t output_size = dimensions * size_per_dimension;
+
+    ResultType* output;
+    HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(ResultType)));
+    HIP_CHECK(hipDeviceSynchronize());
+    // Pointers set by the helper; passed to the kernel and released with hipFree() below.
+    unsigned int* m_vector;
+    unsigned int* m_scramble_constants;
+    load_scrambled_sobol32_constants_to_gpu(dimensions, &m_vector, &m_scramble_constants);
+    // The distribution is created from lambda here and destroyed after the copy back.
+    rocrand_discrete_distribution discrete_distribution;
+    ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
+    // Launch a grid of (8, dimensions) blocks with 32 threads each.
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<rocrand_discrete_f>),
+                       dim3(8, dimensions),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       m_scramble_constants,
+                       size_per_dimension,
+                       discrete_distribution);
+    HIP_CHECK(hipGetLastError());
+    // Copy the samples to the host, then release every resource acquired above.
+    std::vector<ResultType> output_host(output_size);
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(ResultType),
+                        hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipFree(output));
+    HIP_CHECK(hipFree(m_vector));
+    HIP_CHECK(hipFree(m_scramble_constants));
+    ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
+    // Mean and population variance are taken over all samples of all dimensions together.
+    double mean = 0;
+    for(ResultType v : output_host)
+    {
+        mean += static_cast<double>(v);
+    }
+    mean = mean / output_size;
+
+    double variance = 0;
+    for(ResultType v : output_host)
+    {
+        variance += std::pow(v - mean, 2);
+    }
+    variance = variance / output_size;
+    // Both statistics must be within 10% of lambda, with a minimum tolerance of 1.0.
+    EXPECT_NEAR(mean, lambda, std::max(1.0, lambda * 1e-1));
+    EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
+}
+
 const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0};
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_scrambled_sobol32_poisson,
diff --git a/test/test_rocrand_kernel_scrambled_sobol64.cpp b/test/test_rocrand_kernel_scrambled_sobol64.cpp
index dc67c0aaa..a72826268 100644
--- a/test/test_rocrand_kernel_scrambled_sobol64.cpp
+++ b/test/test_rocrand_kernel_scrambled_sobol64.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
 #include "test_common.hpp"
 #include "test_rocrand_common.hpp"
 
+#include <rocrand/rocrand_discrete.h>
 #include <rocrand/rocrand_log_normal.h>
 #include <rocrand/rocrand_normal.h>
 #include <rocrand/rocrand_poisson.h>
@@ -39,7 +40,8 @@
 
 struct rocrand_f
 {
-    __device__ __forceinline__ unsigned long long int
+    __device__ __forceinline__
+    unsigned long long int
         operator()(rocrand_state_scrambled_sobol64* state_ptr)
     {
         return rocrand(state_ptr);
@@ -48,7 +50,9 @@ struct rocrand_f
 
 struct rocrand_uniform_f
 {
-    __device__ __forceinline__ float operator()(rocrand_state_scrambled_sobol64* state_ptr)
+    __device__ __forceinline__
+    float
+        operator()(rocrand_state_scrambled_sobol64* state_ptr)
     {
         return rocrand_uniform(state_ptr);
     }
@@ -56,7 +60,9 @@ struct rocrand_uniform_f
 
 struct rocrand_uniform_double_f
 {
-    __device__ __forceinline__ double operator()(rocrand_state_scrambled_sobol64* state_ptr)
+    __device__ __forceinline__
+    double
+        operator()(rocrand_state_scrambled_sobol64* state_ptr)
     {
         return rocrand_uniform_double(state_ptr);
     }
@@ -64,7 +70,9 @@ struct rocrand_uniform_double_f
 
 struct rocrand_normal_f
 {
-    __device__ __forceinline__ float operator()(rocrand_state_scrambled_sobol64* state_ptr)
+    __device__ __forceinline__
+    float
+        operator()(rocrand_state_scrambled_sobol64* state_ptr)
     {
         return rocrand_normal(state_ptr);
     }
@@ -72,7 +80,9 @@ struct rocrand_normal_f
 
 struct rocrand_normal_double_f
 {
-    __device__ __forceinline__ double operator()(rocrand_state_scrambled_sobol64* state_ptr)
+    __device__ __forceinline__
+    double
+        operator()(rocrand_state_scrambled_sobol64* state_ptr)
     {
         return rocrand_normal_double(state_ptr);
     }
@@ -80,7 +90,8 @@ struct rocrand_normal_double_f
 
 struct rocrand_log_normal_f
 {
-    __device__ __forceinline__ float
+    __device__ __forceinline__
+    float
         operator()(rocrand_state_scrambled_sobol64* state_ptr, float mean, float std)
     {
         return rocrand_log_normal(state_ptr, mean, std);
@@ -89,8 +100,9 @@ struct rocrand_log_normal_f
 
 struct rocrand_log_normal_double_f
 {
-    __device__ __forceinline__ float
-        operator()(rocrand_state_scrambled_sobol64* state_ptr, float mean, float std)
+    __device__ __forceinline__
+    double
+        operator()(rocrand_state_scrambled_sobol64* state_ptr, double mean, double std)
     {
         return rocrand_log_normal_double(state_ptr, mean, std);
     }
@@ -98,20 +110,32 @@ struct rocrand_log_normal_double_f
 
 struct rocrand_poisson_f
 {
-    __device__ __forceinline__ unsigned long long int
+    __device__ __forceinline__
+    unsigned int
         operator()(rocrand_state_scrambled_sobol64* state_ptr, double lambda)
     {
         return rocrand_poisson(state_ptr, lambda);
     }
 };
 
+struct rocrand_discrete_f // device functor: one rocrand_discrete() draw per call
+{
+    __device__ __forceinline__
+    unsigned int operator()(rocrand_state_scrambled_sobol64* state_ptr,
+                            rocrand_discrete_distribution    discrete_distribution)
+    {
+        const unsigned int sample = rocrand_discrete(state_ptr, discrete_distribution);
+        return sample;
+    }
+};
+
 template<class Distribution, typename OutputType, typename... Args>
 __global__
-    __launch_bounds__(32) void rocrand_init_kernel(OutputType*                   output,
-                                                   const unsigned long long int* vectors,
-                                                   const unsigned long long int* scramble_constants,
-                                                   const size_t                  size_per_dimension,
-                                                   Args... args)
+void rocrand_kernel(OutputType*                   output,
+                    const unsigned long long int* vectors,
+                    const unsigned long long int* scramble_constants,
+                    const size_t                  size_per_dimension,
+                    Args... args)
 {
     const unsigned int state_id  = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int dimension = blockIdx.y;
@@ -179,7 +203,7 @@ void call_rocrand_kernel(std::vector<ResultType>& output_host,
     unsigned long long int* m_scramble_constants;
     load_scrambled_sobol64_constants_to_gpu(dimensions, &m_vector, &m_scramble_constants);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_init_kernel<Distribution>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<Distribution>),
                        dim3(8, dimensions),
                        dim3(32),
                        0,
@@ -420,7 +444,7 @@ TEST_P(rocrand_kernel_scrambled_sobol64_poisson, rocrand_poisson)
 {
     const double lambda = GetParam();
 
-    using ResultType = unsigned long long int;
+    using ResultType = unsigned int;
 
     constexpr size_t       size_per_dimension = 8192;
     constexpr unsigned int dimensions         = 8;
@@ -435,7 +459,7 @@ TEST_P(rocrand_kernel_scrambled_sobol64_poisson, rocrand_poisson)
     unsigned long long int* m_scramble_constants;
     load_scrambled_sobol64_constants_to_gpu(dimensions, &m_vector, &m_scramble_constants);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_init_kernel<rocrand_poisson_f>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<rocrand_poisson_f>),
                        dim3(8, dimensions),
                        dim3(32),
                        0,
@@ -475,6 +499,69 @@ TEST_P(rocrand_kernel_scrambled_sobol64_poisson, rocrand_poisson)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
+TEST_P(rocrand_kernel_scrambled_sobol64_poisson, rocrand_discrete) // mean/variance check
+{
+    const double lambda = GetParam(); // test parameter, used below as the Poisson rate
+
+    using ResultType = unsigned int; // element type written by rocrand_discrete_f
+
+    constexpr size_t       size_per_dimension = 8192;
+    constexpr unsigned int dimensions         = 8;
+    // output_size has to be a multiple of the dimensions for sobol
+    constexpr size_t output_size = dimensions * size_per_dimension;
+
+    ResultType* output; // device buffer that receives the generated values
+    HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(ResultType)));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    unsigned long long int* m_vector;
+    unsigned long long int* m_scramble_constants;
+    load_scrambled_sobol64_constants_to_gpu(dimensions, &m_vector, &m_scramble_constants); // helper defined outside this hunk; both pointers are hipFree'd below
+
+    rocrand_discrete_distribution discrete_distribution; // created from lambda on the next line
+    ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<rocrand_discrete_f>),
+                       dim3(8, dimensions), // grid: 8 blocks in x, one y index per dimension
+                       dim3(32), // 32 threads per block
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       m_scramble_constants,
+                       size_per_dimension,
+                       discrete_distribution); // passed as the kernel's trailing Args... argument
+    HIP_CHECK(hipGetLastError()); // surfaces any error recorded by the launch
+
+    std::vector<ResultType> output_host(output_size);
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(ResultType),
+                        hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipFree(output));
+    HIP_CHECK(hipFree(m_vector));
+    HIP_CHECK(hipFree(m_scramble_constants));
+    ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
+
+    double mean = 0; // becomes the sample mean of all generated values
+    for(ResultType v : output_host)
+    {
+        mean += static_cast<double>(v);
+    }
+    mean = mean / output_size;
+
+    double variance = 0; // population variance: divides by output_size, not output_size - 1
+    for(ResultType v : output_host)
+    {
+        variance += std::pow(v - mean, 2);
+    }
+    variance = variance / output_size;
+
+    EXPECT_NEAR(mean, lambda, std::max(1.0, lambda * 1e-1)); // Poisson: mean == variance == lambda
+    EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1)); // tolerance: max(1.0, 10% of lambda)
+}
+
 const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0};
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_scrambled_sobol64_poisson,
diff --git a/test/test_rocrand_kernel_sobol32.cpp b/test/test_rocrand_kernel_sobol32.cpp
index d50c83475..3fdd1ff1a 100644
--- a/test/test_rocrand_kernel_sobol32.cpp
+++ b/test/test_rocrand_kernel_sobol32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,12 +18,12 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include <stdio.h>
 #include <gtest/gtest.h>
+#include <stdio.h>
 
-#include <vector>
 #include <cmath>
 #include <type_traits>
+#include <vector>
 
 #include <hip/hip_runtime.h>
 
@@ -33,12 +33,11 @@
 #include "test_common.hpp"
 #include "test_rocrand_common.hpp"
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_init_kernel(GeneratorState * states,
-                         const size_t states_size,
-                         unsigned int * vectors,
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         unsigned int*      vectors,
                          unsigned long long offset)
 {
     const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -50,96 +49,114 @@ void rocrand_init_kernel(GeneratorState * states,
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_kernel(unsigned int * output, unsigned int * vectors, const size_t size)
+void rocrand_kernel(unsigned int* output, const unsigned int* vectors, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand(&state);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_uniform_kernel(float * output, unsigned int * vectors, const size_t size)
+void rocrand_uniform_kernel(float* output, const unsigned int* vectors, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_uniform(&state);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_normal_kernel(float * output, unsigned int * vectors, const size_t size)
+void rocrand_normal_kernel(float* output, const unsigned int* vectors, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_normal(&state);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_log_normal_kernel(float * output, unsigned int * vectors, const size_t size)
+void rocrand_log_normal_kernel(float* output, const unsigned int* vectors, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_log_normal(&state, 1.6f, 0.25f);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_poisson_kernel(unsigned int * output, unsigned int * vectors, const size_t size, double lambda)
+void rocrand_poisson_kernel(unsigned int*       output,
+                            const unsigned int* vectors,
+                            const size_t        size,
+                            double              lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_poisson(&state, lambda);
     }
 }
 
+template<class GeneratorState>
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const unsigned int*           vectors,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
+{
+    // Each thread writes `per_thread` consecutive outputs starting at index `first`.
+    const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int thread_count = gridDim.x * blockDim.x;
+    const unsigned int per_thread   = size / thread_count;
+    const unsigned int first        = thread_index * per_thread;
+    GeneratorState     engine;
+    rocrand_init(vectors, 1234 + first, &engine);
+    for(unsigned int k = 0; k < per_thread; ++k)
+    {
+        output[first + k] = rocrand_discrete(&engine, discrete_distribution);
+    }
+}
+
 TEST(rocrand_kernel_sobol32, rocrand_state_sobol32_type)
 {
     typedef rocrand_state_sobol32 state_type;
@@ -153,8 +170,8 @@ TEST(rocrand_kernel_sobol32, rocrand)
 {
     typedef rocrand_state_sobol32 state_type;
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -162,27 +179,27 @@ TEST(rocrand_kernel_sobol32, rocrand)
     ROCRAND_CHECK(
         rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
 
-    unsigned int * m_vector;
+    unsigned int* m_vector;
     HIP_CHECK(hipMallocHelper(&m_vector, sizeof(unsigned int) * 8 * 32));
     HIP_CHECK(
         hipMemcpy(m_vector, h_directions, sizeof(unsigned int) * 8 * 32, hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -201,7 +218,7 @@ TEST(rocrand_kernel_sobol32, rocrand_uniform)
     typedef rocrand_state_sobol32 state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -209,27 +226,25 @@ TEST(rocrand_kernel_sobol32, rocrand_uniform)
     ROCRAND_CHECK(
         rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
 
-    unsigned int * m_vector;
+    unsigned int* m_vector;
     HIP_CHECK(hipMallocHelper(&m_vector, sizeof(unsigned int) * 8 * 32));
     HIP_CHECK(
         hipMemcpy(m_vector, h_directions, sizeof(unsigned int) * 8 * 32, hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -248,7 +263,7 @@ TEST(rocrand_kernel_sobol32, rocrand_normal)
     typedef rocrand_state_sobol32 state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -256,27 +271,25 @@ TEST(rocrand_kernel_sobol32, rocrand_normal)
     ROCRAND_CHECK(
         rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
 
-    unsigned int * m_vector;
+    unsigned int* m_vector;
     HIP_CHECK(hipMallocHelper(&m_vector, sizeof(unsigned int) * 8 * 32));
     HIP_CHECK(
         hipMemcpy(m_vector, h_directions, sizeof(unsigned int) * 8 * 32, hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -303,7 +316,7 @@ TEST(rocrand_kernel_sobol32, rocrand_log_normal)
     typedef rocrand_state_sobol32 state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -311,27 +324,25 @@ TEST(rocrand_kernel_sobol32, rocrand_log_normal)
     ROCRAND_CHECK(
         rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
 
-    unsigned int * m_vector;
+    unsigned int* m_vector;
     HIP_CHECK(hipMallocHelper(&m_vector, sizeof(unsigned int) * 8 * 32));
     HIP_CHECK(
         hipMemcpy(m_vector, h_directions, sizeof(unsigned int) * 8 * 32, hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -351,13 +362,14 @@ TEST(rocrand_kernel_sobol32, rocrand_log_normal)
     stddev = std::sqrt(stddev / output_size);
 
     double logmean = std::log(mean * mean / std::sqrt(stddev + mean * mean));
-    double logstd = std::sqrt(std::log(1.0f + stddev/(mean * mean)));
+    double logstd  = std::sqrt(std::log(1.0f + stddev / (mean * mean)));
 
     EXPECT_NEAR(1.6, logmean, 1.6 * 0.2);
     EXPECT_NEAR(0.25, logstd, 0.25 * 0.2);
 }
 
-class rocrand_kernel_sobol32_poisson : public ::testing::TestWithParam<double> { };
+class rocrand_kernel_sobol32_poisson : public ::testing::TestWithParam<double>
+{};
 
 TEST_P(rocrand_kernel_sobol32_poisson, rocrand_poisson)
 {
@@ -369,35 +381,99 @@ TEST_P(rocrand_kernel_sobol32_poisson, rocrand_poisson)
     ROCRAND_CHECK(
         rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
 
-    unsigned int * m_vector;
+    unsigned int* m_vector;
     HIP_CHECK(hipMallocHelper(&m_vector, sizeof(unsigned int) * 8 * 32));
     HIP_CHECK(
         hipMemcpy(m_vector, h_directions, sizeof(unsigned int) * 8 * 32, hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size, lambda
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size,
+                       lambda);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipFree(output));
+    HIP_CHECK(hipFree(m_vector));
+
+    double mean = 0;
+    for(auto v : output_host)
+    {
+        mean += static_cast<double>(v);
+    }
+    mean = mean / output_size;
+
+    double variance = 0;
+    for(auto v : output_host)
+    {
+        variance += std::pow(v - mean, 2);
+    }
+    variance = variance / output_size;
+
+    EXPECT_NEAR(mean, lambda, std::max(1.0, lambda * 1e-1));
+    EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
+}
+
+TEST_P(rocrand_kernel_sobol32_poisson, rocrand_discrete)
+{
+    typedef rocrand_state_sobol32 state_type;
+
+    const double lambda = GetParam();
+
+    const unsigned int* h_directions;
+    ROCRAND_CHECK(
+        rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6));
+
+    unsigned int* m_vector;
+    HIP_CHECK(hipMallocHelper(&m_vector, sizeof(unsigned int) * 8 * 32));
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(m_vector, h_directions, sizeof(unsigned int) * 8 * 32, hipMemcpyHostToDevice));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    const size_t  output_size = 8192;
+    unsigned int* output;
+    HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    rocrand_discrete_distribution discrete_distribution;
+    ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size,
+                       discrete_distribution);
+    HIP_CHECK(hipGetLastError());
+
+    std::vector<unsigned int> output_host(output_size);
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
+    ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
 
     double mean = 0;
     for(auto v : output_host)
@@ -417,8 +493,8 @@ TEST_P(rocrand_kernel_sobol32_poisson, rocrand_poisson)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
-const double lambdas[] = { 1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0 };
+const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0};
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_sobol32_poisson,
-                        rocrand_kernel_sobol32_poisson,
-                        ::testing::ValuesIn(lambdas));
+                         rocrand_kernel_sobol32_poisson,
+                         ::testing::ValuesIn(lambdas));
diff --git a/test/test_rocrand_kernel_sobol64.cpp b/test/test_rocrand_kernel_sobol64.cpp
index 3334dcc7e..5779ce2f5 100644
--- a/test/test_rocrand_kernel_sobol64.cpp
+++ b/test/test_rocrand_kernel_sobol64.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,12 +18,12 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include <stdio.h>
 #include <gtest/gtest.h>
+#include <stdio.h>
 
-#include <vector>
 #include <cmath>
 #include <type_traits>
+#include <vector>
 
 #include <hip/hip_runtime.h>
 
@@ -32,13 +32,12 @@
 #define HIP_CHECK(state) ASSERT_EQ(state, hipSuccess)
 #define ROCRAND_CHECK(state) ASSERT_EQ(state, ROCRAND_STATUS_SUCCESS)
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_init_kernel(GeneratorState * states,
-                         const size_t states_size,
-                         unsigned long long int * vectors,
-                         unsigned long long int offset)
+void rocrand_init_kernel(GeneratorState*               states,
+                         const size_t                  states_size,
+                         const unsigned long long int* vectors,
+                         unsigned long long int        offset)
 {
     const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
     if(state_id < states_size)
@@ -49,97 +48,120 @@ void rocrand_init_kernel(GeneratorState * states,
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_kernel(unsigned long long int * output, unsigned long long int * vectors, const size_t size)
+void rocrand_kernel(unsigned long long int*       output,
+                    const unsigned long long int* vectors,
+                    const size_t                  size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand(&state);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_uniform_kernel(double * output, unsigned long long int * vectors, const size_t size)
+void rocrand_uniform_kernel(double*                       output,
+                            const unsigned long long int* vectors,
+                            const size_t                  size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_uniform_double(&state);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_normal_kernel(double * output, unsigned long long int * vectors, const size_t size)
+void rocrand_normal_kernel(double* output, const unsigned long long int* vectors, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_normal_double(&state);
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-__launch_bounds__(32)
-void rocrand_log_normal_kernel(double * output, unsigned long long int * vectors, const size_t size)
+void rocrand_log_normal_kernel(double*                       output,
+                               const unsigned long long int* vectors,
+                               const size_t                  size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_log_normal_double(&state, 1.6f, 0.25f);
     }
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_poisson_kernel(unsigned long long int* output,
-                                                             unsigned long long int* vectors,
-                                                             const size_t            size,
-                                                             double                  lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int*                 output,
+                            const unsigned long long int* vectors,
+                            const size_t                  size,
+                            double                        lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
     const unsigned int n = size / global_size;
-    GeneratorState state;
+    GeneratorState     state;
     rocrand_init(vectors, 1234 + state_id * n, &state);
 
-    for (unsigned int i = 0; i < n; i++)
+    for(unsigned int i = 0; i < n; i++)
     {
         output[state_id * n + i] = rocrand_poisson(&state, lambda);
     }
 }
 
+template<class GeneratorState>
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const unsigned long long int* vectors,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
+{
+    // Each thread writes `per_thread` consecutive outputs starting at index `first`.
+    const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int thread_count = gridDim.x * blockDim.x;
+    const unsigned int per_thread   = size / thread_count;
+    const unsigned int first        = thread_index * per_thread;
+    GeneratorState     engine;
+    rocrand_init(vectors, 1234 + first, &engine);
+    for(unsigned int k = 0; k < per_thread; ++k)
+    {
+        output[first + k] = rocrand_discrete(&engine, discrete_distribution);
+    }
+}
+
 TEST(rocrand_kernel_sobol64, rocrand_state_sobol64_type)
 {
     typedef rocrand_state_sobol64 state_type;
@@ -155,33 +177,31 @@ TEST(rocrand_kernel_sobol64, rocrand)
     using Type = unsigned long long int;
 
     const size_t output_size = 8192;
-    Type * output;
+    Type*        output;
     HIP_CHECK(hipMalloc(&output, output_size * sizeof(Type)));
     HIP_CHECK(hipDeviceSynchronize());
 
     const unsigned long long* h_directions;
     rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
 
-    Type * m_vector;
+    Type* m_vector;
     HIP_CHECK(hipMalloc(&m_vector, sizeof(Type) * 8 * 64));
     HIP_CHECK(hipMemcpy(m_vector, h_directions, sizeof(Type) * 8 * 64, hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<Type> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(Type),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(Type), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -198,10 +218,10 @@ TEST(rocrand_kernel_sobol64, rocrand)
 TEST(rocrand_kernel_sobol64, rocrand_uniform)
 {
     typedef rocrand_state_sobol64 state_type;
-    typedef double Type;
+    typedef double                Type;
 
     const size_t output_size = 256;
-    Type * output;
+    Type*        output;
     HIP_CHECK(hipMalloc(&output, output_size * sizeof(Type)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -209,7 +229,7 @@ TEST(rocrand_kernel_sobol64, rocrand_uniform)
     const DirectionVectorType*     h_directions;
     rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
 
-    DirectionVectorType * m_vector;
+    DirectionVectorType* m_vector;
     HIP_CHECK(hipMalloc(&m_vector, sizeof(DirectionVectorType) * 8 * 64));
     HIP_CHECK(hipMemcpy(m_vector,
                         h_directions,
@@ -217,21 +237,19 @@ TEST(rocrand_kernel_sobol64, rocrand_uniform)
                         hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<Type> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(Type),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(Type), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -248,10 +266,10 @@ TEST(rocrand_kernel_sobol64, rocrand_uniform)
 TEST(rocrand_kernel_sobol64, rocrand_normal)
 {
     typedef rocrand_state_sobol64 state_type;
-    typedef double Type;
+    typedef double                Type;
 
     const size_t output_size = 8192;
-    Type * output;
+    Type*        output;
     HIP_CHECK(hipMalloc(&output, output_size * sizeof(Type)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -259,7 +277,7 @@ TEST(rocrand_kernel_sobol64, rocrand_normal)
     const DirectionVectorType*     h_directions;
     rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
 
-    DirectionVectorType * m_vector;
+    DirectionVectorType* m_vector;
     HIP_CHECK(hipMalloc(&m_vector, sizeof(DirectionVectorType) * 8 * 64));
     HIP_CHECK(hipMemcpy(m_vector,
                         h_directions,
@@ -267,21 +285,19 @@ TEST(rocrand_kernel_sobol64, rocrand_normal)
                         hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<Type> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(Type),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(Type), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -306,10 +322,10 @@ TEST(rocrand_kernel_sobol64, rocrand_normal)
 TEST(rocrand_kernel_sobol64, rocrand_log_normal)
 {
     typedef rocrand_state_sobol64 state_type;
-    typedef double Type;
+    typedef double                Type;
 
     const size_t output_size = 8192;
-    Type * output;
+    Type*        output;
     HIP_CHECK(hipMalloc(&output, output_size * sizeof(Type)));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -317,7 +333,7 @@ TEST(rocrand_kernel_sobol64, rocrand_log_normal)
     const DirectionVectorType*     h_directions;
     rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
 
-    DirectionVectorType * m_vector;
+    DirectionVectorType* m_vector;
     HIP_CHECK(hipMalloc(&m_vector, sizeof(DirectionVectorType) * 8 * 64));
     HIP_CHECK(hipMemcpy(m_vector,
                         h_directions,
@@ -325,21 +341,19 @@ TEST(rocrand_kernel_sobol64, rocrand_log_normal)
                         hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<Type> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(Type),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(Type), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
@@ -359,19 +373,83 @@ TEST(rocrand_kernel_sobol64, rocrand_log_normal)
     stddev = std::sqrt(stddev / output_size);
 
     Type logmean = std::log(mean * mean / std::sqrt(stddev + mean * mean));
-    Type logstd = std::sqrt(std::log(1.0f + stddev/(mean * mean)));
+    Type logstd  = std::sqrt(std::log(1.0f + stddev / (mean * mean)));
 
     EXPECT_NEAR(1.6, logmean, 1.6 * 0.2);
     EXPECT_NEAR(0.25, logstd, 0.25 * 0.2);
 }
 
-class rocrand_kernel_sobol64_poisson : public ::testing::TestWithParam<double> { };
+class rocrand_kernel_sobol64_poisson : public ::testing::TestWithParam<double>
+{}; // empty fixture; its tests read the double parameter through GetParam()
 
 TEST_P(rocrand_kernel_sobol64_poisson, rocrand_poisson)
 {
     typedef rocrand_state_sobol64 state_type;
-    typedef double Type;
-    typedef unsigned long long int ResultType;
+    typedef double                Type; // type of lambda and of the mean/variance accumulators
+    typedef unsigned int          ResultType; // element type of the device output buffer
+    // Runs rocrand_poisson_kernel on the device, then compares sample mean and variance with lambda.
+    const Type lambda = GetParam(); // value passed to the kernel and expected as mean and variance
+
+    typedef unsigned long long int DirectionVectorType;
+    const DirectionVectorType*     h_directions;
+    rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
+
+    DirectionVectorType* m_vector; // device buffer receiving 8 * 64 entries copied from h_directions
+    HIP_CHECK(hipMalloc(&m_vector, sizeof(DirectionVectorType) * 8 * 64));
+    HIP_CHECK(hipMemcpy(m_vector,
+                        h_directions,
+                        sizeof(DirectionVectorType) * 8 * 64,
+                        hipMemcpyHostToDevice));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    const size_t output_size = 8192; // with dim3(8) x dim3(32) below: 8192 / 256 = 32 values per thread
+    ResultType*  output;
+    HIP_CHECK(hipMalloc(&output, output_size * sizeof(ResultType)));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size,
+                       lambda);
+    HIP_CHECK(hipGetLastError()); // checked right after the launch
+
+    std::vector<ResultType> output_host(output_size);
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(ResultType),
+                        hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipFree(output));
+    HIP_CHECK(hipFree(m_vector));
+
+    Type mean = 0; // accumulates the sum, then becomes the sample mean
+    for(auto v : output_host)
+    {
+        mean += static_cast<Type>(v);
+    }
+    mean = mean / output_size;
+
+    Type variance = 0; // sum of squared deviations, divided by output_size (not output_size - 1)
+    for(auto v : output_host)
+    {
+        variance += std::pow(v - mean, 2);
+    }
+    variance = variance / output_size;
+
+    EXPECT_NEAR(mean, lambda, std::max(1.0, lambda * 1e-1)); // tolerance: larger of 1.0 and 10% of lambda
+    EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
+}
+
+TEST_P(rocrand_kernel_sobol64_poisson, rocrand_discrete)
+{
+    typedef rocrand_state_sobol64 state_type;
+    typedef double                Type;
+    typedef unsigned int          ResultType;
 
     const Type lambda = GetParam();
 
@@ -379,7 +457,7 @@ TEST_P(rocrand_kernel_sobol64_poisson, rocrand_poisson)
     const DirectionVectorType*     h_directions;
     rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6);
 
-    DirectionVectorType * m_vector;
+    DirectionVectorType* m_vector;
     HIP_CHECK(hipMalloc(&m_vector, sizeof(DirectionVectorType) * 8 * 64));
     HIP_CHECK(hipMemcpy(m_vector,
                         h_directions,
@@ -392,11 +470,18 @@ TEST_P(rocrand_kernel_sobol64_poisson, rocrand_poisson)
     HIP_CHECK(hipMalloc(&output, output_size * sizeof(ResultType)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
-        dim3(8), dim3(32), 0, 0,
-        output, m_vector, output_size, lambda
-    );
+    rocrand_discrete_distribution discrete_distribution;
+    ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
+                       dim3(8),
+                       dim3(32),
+                       0,
+                       0,
+                       output,
+                       m_vector,
+                       output_size,
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<ResultType> output_host(output_size);
@@ -407,6 +492,7 @@ TEST_P(rocrand_kernel_sobol64_poisson, rocrand_poisson)
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     HIP_CHECK(hipFree(m_vector));
+    ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
 
     Type mean = 0;
     for(auto v : output_host)
@@ -426,8 +512,8 @@ TEST_P(rocrand_kernel_sobol64_poisson, rocrand_poisson)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
-const double lambdas[] = { 1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0 };
+const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0}; // parameter values fed to the suite below
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_sobol64_poisson,
-                        rocrand_kernel_sobol64_poisson,
-                        ::testing::ValuesIn(lambdas));
+                         rocrand_kernel_sobol64_poisson,
+                         ::testing::ValuesIn(lambdas));
diff --git a/test/test_rocrand_kernel_threefry2x32_20.cpp b/test/test_rocrand_kernel_threefry2x32_20.cpp
index 2efe8e916..46ef621f3 100644
--- a/test/test_rocrand_kernel_threefry2x32_20.cpp
+++ b/test/test_rocrand_kernel_threefry2x32_20.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -34,10 +34,11 @@
 #include "test_rocrand_common.hpp"
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    states,
-                                                          const size_t       states_size,
-                                                          unsigned long long seed,
-                                                          unsigned long long offset)
+__global__
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         unsigned long long seed,
+                         unsigned long long offset)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int subsequence = state_id;
@@ -50,7 +51,8 @@ __global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    sta
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const size_t size)
+__global__
+void rocrand_kernel(unsigned int* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -68,7 +70,8 @@ __global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, const size_t size)
+__global__
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -86,8 +89,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, cons
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*      output,
-                                                                    const size_t size)
+__global__
+void rocrand_uniform_double_kernel(double* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -105,7 +108,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -126,7 +130,8 @@ __global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -147,9 +152,8 @@ __global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, c
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* output,
-                                                             const size_t  size,
-                                                             double        lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -167,8 +171,10 @@ __global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* outpu
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_discrete_kernel(
-    unsigned int* output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -508,14 +514,14 @@ TEST_P(rocrand_kernel_threefry2x32_20_poisson, rocrand_discrete)
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
                        dim3(4),
                        dim3(64),
                        0,
                        0,
                        output,
                        output_size,
-                       lambda);
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
diff --git a/test/test_rocrand_kernel_threefry2x64_20.cpp b/test/test_rocrand_kernel_threefry2x64_20.cpp
index 70af6616a..dc10db821 100644
--- a/test/test_rocrand_kernel_threefry2x64_20.cpp
+++ b/test/test_rocrand_kernel_threefry2x64_20.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -34,10 +34,11 @@
 #include "test_rocrand_common.hpp"
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    states,
-                                                          const size_t       states_size,
-                                                          unsigned long long seed,
-                                                          unsigned long long offset)
+__global__
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         unsigned long long seed,
+                         unsigned long long offset)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int subsequence = state_id;
@@ -50,7 +51,8 @@ __global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    sta
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_kernel(unsigned long long* output, const size_t size)
+__global__
+void rocrand_kernel(unsigned long long* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -68,7 +70,8 @@ __global__ __launch_bounds__(32) void rocrand_kernel(unsigned long long* output,
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, const size_t size)
+__global__
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -86,8 +89,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, cons
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*      output,
-                                                                    const size_t size)
+__global__
+void rocrand_uniform_double_kernel(double* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -105,7 +108,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -126,7 +130,8 @@ __global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -147,9 +152,8 @@ __global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, c
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* output,
-                                                             const size_t  size,
-                                                             double        lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -167,8 +171,10 @@ __global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* outpu
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_discrete_kernel(
-    unsigned int* output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -460,7 +466,7 @@ TEST_P(rocrand_kernel_threefry2x64_20_poisson, rocrand_poisson)
 
     hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
                        dim3(4),
-                       dim3(64),
+                       dim3(32),
                        0,
                        0,
                        output,
@@ -508,14 +514,14 @@ TEST_P(rocrand_kernel_threefry2x64_20_poisson, rocrand_discrete)
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
                        dim3(4),
-                       dim3(64),
+                       dim3(32),
                        0,
                        0,
                        output,
                        output_size,
-                       lambda);
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
diff --git a/test/test_rocrand_kernel_threefry4x32_20.cpp b/test/test_rocrand_kernel_threefry4x32_20.cpp
index ee1fb82e6..81e1f81d0 100644
--- a/test/test_rocrand_kernel_threefry4x32_20.cpp
+++ b/test/test_rocrand_kernel_threefry4x32_20.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -34,10 +34,11 @@
 #include "test_rocrand_common.hpp"
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    states,
-                                                          const size_t       states_size,
-                                                          unsigned long long seed,
-                                                          unsigned long long offset)
+__global__
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         unsigned long long seed,
+                         unsigned long long offset)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int subsequence = state_id;
@@ -50,7 +51,8 @@ __global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    sta
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const size_t size)
+__global__
+void rocrand_kernel(unsigned int* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -68,7 +70,8 @@ __global__ __launch_bounds__(32) void rocrand_kernel(unsigned int* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, const size_t size)
+__global__
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -86,8 +89,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, cons
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*      output,
-                                                                    const size_t size)
+__global__
+void rocrand_uniform_double_kernel(double* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -105,7 +108,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -126,7 +130,8 @@ __global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -147,9 +152,8 @@ __global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, c
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* output,
-                                                             const size_t  size,
-                                                             double        lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -167,8 +171,10 @@ __global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* outpu
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_discrete_kernel(
-    unsigned int* output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -508,14 +514,14 @@ TEST_P(rocrand_kernel_threefry4x32_20_poisson, rocrand_discrete)
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
                        dim3(4),
                        dim3(64),
                        0,
                        0,
                        output,
                        output_size,
-                       lambda);
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
diff --git a/test/test_rocrand_kernel_threefry4x64_20.cpp b/test/test_rocrand_kernel_threefry4x64_20.cpp
index deeaefd5e..caf117b50 100644
--- a/test/test_rocrand_kernel_threefry4x64_20.cpp
+++ b/test/test_rocrand_kernel_threefry4x64_20.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -34,10 +34,11 @@
 #include "test_rocrand_common.hpp"
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    states,
-                                                          const size_t       states_size,
-                                                          unsigned long long seed,
-                                                          unsigned long long offset)
+__global__
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
+                         unsigned long long seed,
+                         unsigned long long offset)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int subsequence = state_id;
@@ -50,7 +51,8 @@ __global__ __launch_bounds__(32) void rocrand_init_kernel(GeneratorState*    sta
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_kernel(unsigned long long* output, const size_t size)
+__global__
+void rocrand_kernel(unsigned long long* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -68,7 +70,8 @@ __global__ __launch_bounds__(32) void rocrand_kernel(unsigned long long* output,
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, const size_t size)
+__global__
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -86,8 +89,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_kernel(float* output, cons
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*      output,
-                                                                    const size_t size)
+__global__
+void rocrand_uniform_double_kernel(double* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -105,7 +108,8 @@ __global__ __launch_bounds__(32) void rocrand_uniform_double_kernel(double*
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -126,7 +130,8 @@ __global__ __launch_bounds__(32) void rocrand_normal_kernel(float* output, const
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, const size_t size)
+__global__
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -147,9 +152,8 @@ __global__ __launch_bounds__(32) void rocrand_log_normal_kernel(float* output, c
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* output,
-                                                             const size_t  size,
-                                                             double        lambda)
+__global__
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -167,8 +171,10 @@ __global__ __launch_bounds__(64) void rocrand_poisson_kernel(unsigned int* outpu
 }
 
 template<class GeneratorState>
-__global__ __launch_bounds__(32) void rocrand_discrete_kernel(
-    unsigned int* output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+__global__
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
@@ -508,14 +514,14 @@ TEST_P(rocrand_kernel_threefry4x64_20_poisson, rocrand_discrete)
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
                        dim3(4),
                        dim3(64),
                        0,
                        0,
                        output,
                        output_size,
-                       lambda);
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
diff --git a/test/test_rocrand_kernel_xorwow.cpp b/test/test_rocrand_kernel_xorwow.cpp
index 6614b3427..2beb0633b 100644
--- a/test/test_rocrand_kernel_xorwow.cpp
+++ b/test/test_rocrand_kernel_xorwow.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,26 +18,25 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include <stdio.h>
 #include <gtest/gtest.h>
+#include <stdio.h>
 
-#include <vector>
 #include <cmath>
 #include <type_traits>
+#include <vector>
 
 #include <hip/hip_runtime.h>
 
-#include <rocrand/rocrand_kernel.h>
 #include <rocrand/rocrand.h>
+#include <rocrand/rocrand_kernel.h>
 
 #include "test_common.hpp"
 #include "test_rocrand_common.hpp"
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_init_kernel(GeneratorState * states,
-                         const size_t states_size,
+void rocrand_init_kernel(GeneratorState*    states,
+                         const size_t       states_size,
                          unsigned long long seed,
                          unsigned long long offset)
 {
@@ -51,15 +50,14 @@ void rocrand_init_kernel(GeneratorState * states,
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_kernel(unsigned int * output, const size_t size)
+void rocrand_kernel(unsigned int* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 123ULL, &state);
 
@@ -71,15 +69,14 @@ void rocrand_kernel(unsigned int * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_uniform_kernel(float * output, const size_t size)
+void rocrand_uniform_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 234ULL, &state);
 
@@ -91,15 +88,14 @@ void rocrand_uniform_kernel(float * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_normal_kernel(float * output, const size_t size)
+void rocrand_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 345ULL, &state);
 
@@ -116,15 +112,14 @@ void rocrand_normal_kernel(float * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_log_normal_kernel(float * output, const size_t size)
+void rocrand_log_normal_kernel(float* output, const size_t size)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 456ULL, &state);
 
@@ -141,15 +136,14 @@ void rocrand_log_normal_kernel(float * output, const size_t size)
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_poisson_kernel(unsigned int * output, const size_t size, double lambda)
+void rocrand_poisson_kernel(unsigned int* output, const size_t size, double lambda)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 234ULL, &state);
 
@@ -161,15 +155,16 @@ void rocrand_poisson_kernel(unsigned int * output, const size_t size, double lam
     }
 }
 
-template <class GeneratorState>
+template<class GeneratorState>
 __global__
-// __launch_bounds__(64) // Causes errors on MI200/HIP on Windows gfx1030
-void rocrand_discrete_kernel(unsigned int * output, const size_t size, rocrand_discrete_distribution discrete_distribution)
+void rocrand_discrete_kernel(unsigned int*                 output,
+                             const size_t                  size,
+                             rocrand_discrete_distribution discrete_distribution)
 {
     const unsigned int state_id    = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int global_size = gridDim.x * blockDim.x;
 
-    GeneratorState state;
+    GeneratorState     state;
     const unsigned int subsequence = state_id;
     rocrand_init(0, subsequence, 234ULL, &state);
 
@@ -198,42 +193,41 @@ TEST(rocrand_kernel_xorwow, rocrand_init)
         typedef rocrand_state_xorwow::xorwow_state internal_state_type;
 
     public:
+        rocrand_state_xorwow_test() {}
 
-        __host__ rocrand_state_xorwow_test() {}
-
-        __host__ internal_state_type internal_state() const
+        internal_state_type internal_state() const
         {
             return m_state;
         }
     };
 
-    typedef rocrand_state_xorwow state_type;
+    typedef rocrand_state_xorwow      state_type;
     typedef rocrand_state_xorwow_test state_type_test;
 
-    unsigned long long seed = 0xdeadbeefbeefdeadULL;
+    unsigned long long seed   = 0xdeadbeefbeefdeadULL;
     unsigned long long offset = 345678ULL;
 
     const size_t states_size = 256;
-    state_type * states;
+    state_type*  states;
     HIP_CHECK(hipMallocHelper(&states, states_size * sizeof(state_type)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_init_kernel),
-        dim3(4), dim3(64), 0, 0,
-        states, states_size,
-        seed, offset
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_init_kernel),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       states,
+                       states_size,
+                       seed,
+                       offset);
     HIP_CHECK(hipGetLastError());
 
     std::vector<state_type_test> states_host(states_size);
-    HIP_CHECK(
-        hipMemcpy(
-            states_host.data(), states,
-            states_size * sizeof(state_type),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(states_host.data(),
+                        states,
+                        states_size * sizeof(state_type),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(states));
 
@@ -256,26 +250,25 @@ TEST(rocrand_kernel_xorwow, rocrand)
 {
     typedef rocrand_state_xorwow state_type;
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -293,25 +286,22 @@ TEST(rocrand_kernel_xorwow, rocrand_uniform)
     typedef rocrand_state_xorwow state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_uniform_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -329,25 +319,22 @@ TEST(rocrand_kernel_xorwow, rocrand_normal)
     typedef rocrand_state_xorwow state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_normal_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -373,25 +360,22 @@ TEST(rocrand_kernel_xorwow, rocrand_log_normal)
     typedef rocrand_state_xorwow state_type;
 
     const size_t output_size = 8192;
-    float * output;
+    float*       output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(float)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_log_normal_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size);
     HIP_CHECK(hipGetLastError());
 
     std::vector<float> output_host(output_size);
     HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(float),
-            hipMemcpyDeviceToHost
-        )
-    );
+        hipMemcpy(output_host.data(), output, output_size * sizeof(float), hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -410,13 +394,14 @@ TEST(rocrand_kernel_xorwow, rocrand_log_normal)
     stddev = std::sqrt(stddev / output_size);
 
     double logmean = std::log(mean * mean / std::sqrt(stddev + mean * mean));
-    double logstd = std::sqrt(std::log(1.0f + stddev/(mean * mean)));
+    double logstd  = std::sqrt(std::log(1.0f + stddev / (mean * mean)));
 
     EXPECT_NEAR(1.6, logmean, 1.6 * 0.2);
     EXPECT_NEAR(0.25, logstd, 0.25 * 0.2);
 }
 
-class rocrand_kernel_xorwow_poisson : public ::testing::TestWithParam<double> { };
+class rocrand_kernel_xorwow_poisson : public ::testing::TestWithParam<double>
+{};
 
 TEST_P(rocrand_kernel_xorwow_poisson, rocrand_poisson)
 {
@@ -424,26 +409,26 @@ TEST_P(rocrand_kernel_xorwow_poisson, rocrand_poisson)
 
     const double lambda = GetParam();
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size, lambda
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_poisson_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size,
+                       lambda);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
 
@@ -471,29 +456,29 @@ TEST_P(rocrand_kernel_xorwow_poisson, rocrand_discrete)
 
     const double lambda = GetParam();
 
-    const size_t output_size = 8192;
-    unsigned int * output;
+    const size_t  output_size = 8192;
+    unsigned int* output;
     HIP_CHECK(hipMallocHelper(&output, output_size * sizeof(unsigned int)));
     HIP_CHECK(hipDeviceSynchronize());
 
     rocrand_discrete_distribution discrete_distribution;
     ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution));
 
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
-        dim3(4), dim3(64), 0, 0,
-        output, output_size, discrete_distribution
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(rocrand_discrete_kernel<state_type>),
+                       dim3(4),
+                       dim3(64),
+                       0,
+                       0,
+                       output,
+                       output_size,
+                       discrete_distribution);
     HIP_CHECK(hipGetLastError());
 
     std::vector<unsigned int> output_host(output_size);
-    HIP_CHECK(
-        hipMemcpy(
-            output_host.data(), output,
-            output_size * sizeof(unsigned int),
-            hipMemcpyDeviceToHost
-        )
-    );
+    HIP_CHECK(hipMemcpy(output_host.data(),
+                        output,
+                        output_size * sizeof(unsigned int),
+                        hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
     HIP_CHECK(hipFree(output));
     ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution));
@@ -516,8 +501,8 @@ TEST_P(rocrand_kernel_xorwow_poisson, rocrand_discrete)
     EXPECT_NEAR(variance, lambda, std::max(1.0, lambda * 1e-1));
 }
 
-const double lambdas[] = { 1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0 };
+const double lambdas[] = {1.0, 5.5, 20.0, 100.0, 1234.5, 5000.0};
 
 INSTANTIATE_TEST_SUITE_P(rocrand_kernel_xorwow_poisson,
-                        rocrand_kernel_xorwow_poisson,
-                        ::testing::ValuesIn(lambdas));
+                         rocrand_kernel_xorwow_poisson,
+                         ::testing::ValuesIn(lambdas));

From 6c6e6990ddc0c56af70bb80f77290f7b29e5ef71 Mon Sep 17 00:00:00 2001
From: Nara Prasetya <nara@streamhpc.com>
Date: Wed, 29 Jan 2025 14:45:49 +0000
Subject: [PATCH 16/17] docs(changelog): move c++14 deprecation notice to
 upcoming changes

---
 CHANGELOG.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d0d25cc0..89bd37f4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,11 @@ Documentation for rocRAND is available at
 
 ### Changed
 * Updated several `gfx942` auto tuning parameters.
-* Changed the C++ version from 14 to 17. C++14 will be deprecated in the next major release.
+* Deprecated C++14 and set the default target to C++17.
+
+### Upcoming changes
+* C++14 will be removed in the next major release.
+
 
 ### Fixed
 * Fixed an issue where `mt19937.hpp` would cause kernel errors during auto tuning.

From ce3e3cbefaf2ecb8c1cb18a6856c6291fd02141f Mon Sep 17 00:00:00 2001
From: Nara Prasetya <nara@streamhpc.com>
Date: Thu, 30 Jan 2025 16:05:53 +0000
Subject: [PATCH 17/17] Deprecate sobol constants and vectors

---
 CHANGELOG.md                                       |  8 ++++++++
 .../rocrand/rocrand_scrambled_sobol32_constants.h  |  5 ++++-
 .../rocrand_scrambled_sobol32_precomputed.h        |  5 +++--
 .../rocrand/rocrand_scrambled_sobol64_constants.h  |  5 ++++-
 .../rocrand_scrambled_sobol64_precomputed.h        |  5 +++--
 .../include/rocrand/rocrand_sobol32_precomputed.h  |  6 ++++--
 .../include/rocrand/rocrand_sobol64_precomputed.h  |  6 ++++--
 library/src/rng/sobol.hpp                          |  8 +++++++-
 library/src/rocrand.cpp                            | 14 +++++++++++++-
 9 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 89bd37f4a..d5dda1c07 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,9 +8,17 @@ Documentation for rocRAND is available at
 ### Changed
 * Updated several `gfx942` auto tuning parameters.
 * Deprecated C++14 and set the default target to C++17.
+* Directly accessing the (scrambled) sobol32 and sobol64 constants and direction vectors is deprecated:
+  * `h_scrambled_sobol32_constants`, use `rocrand_get_scramble_constants32` instead.
+  * `h_scrambled_sobol64_constants`, use `rocrand_get_scramble_constants64` instead.
+  * `rocrand_h_sobol32_direction_vectors`, use `rocrand_get_direction_vectors32` instead.
+  * `rocrand_h_sobol64_direction_vectors`, use `rocrand_get_direction_vectors64` instead.
+  * `rocrand_h_scrambled_sobol32_direction_vectors`, use `rocrand_get_direction_vectors32` instead.
+  * `rocrand_h_scrambled_sobol64_direction_vectors`, use `rocrand_get_direction_vectors64` instead.
 
 ### Upcoming changes
 * C++14 will be removed in the next major release.
+* Direct access to the (scrambled) sobol32 and sobol64 constants and direction vectors will be removed in the next major release.
 
 
 ### Fixed
diff --git a/library/include/rocrand/rocrand_scrambled_sobol32_constants.h b/library/include/rocrand/rocrand_scrambled_sobol32_constants.h
index 4316d43c5..ce919162e 100644
--- a/library/include/rocrand/rocrand_scrambled_sobol32_constants.h
+++ b/library/include/rocrand/rocrand_scrambled_sobol32_constants.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,8 @@
     #define SCRAMBLED_SOBOL_DIM 20000
 #endif
 
+// clang-format off
+[[deprecated("Use 'rocrand_get_scramble_constants32(...)' instead.")]]
 static const unsigned int h_scrambled_sobol32_constants[SCRAMBLED_SOBOL_DIM] = {
     0xd091bb5c, 0x22ae9ef6, 0xe7e1faee, 0xd5c31f79, 0x2082352c, 0xf807b7df, 0xe9d30005, 0x3895afe1,
     0xa1e24bba, 0x4ee4092b, 0x18f86863, 0x8c16a625, 0x474ba8c4, 0x3039cd1a, 0x8c006d5f, 0xfe2d7810,
@@ -2531,5 +2533,6 @@ static const unsigned int h_scrambled_sobol32_constants[SCRAMBLED_SOBOL_DIM] = {
     0xbb12e127, 0x35d59e37, 0xe6a00d27, 0xae71c378, 0x76cb8bc3, 0xac7d2f07, 0x78283cae, 0xaaaa2bc1,
 
 };
+// clang-format on
 
 #endif // ROCRAND_SCRAMBLED_SOBOL32_CONSTANTS_H_
diff --git a/library/include/rocrand/rocrand_scrambled_sobol32_precomputed.h b/library/include/rocrand/rocrand_scrambled_sobol32_precomputed.h
index c5d275086..7b161eab8 100644
--- a/library/include/rocrand/rocrand_scrambled_sobol32_precomputed.h
+++ b/library/include/rocrand/rocrand_scrambled_sobol32_precomputed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -31,7 +31,8 @@
 #endif // SCRAMBLED_SOBOL_DIM
 #define SCRAMBLED_SOBOL32_N 640000
 
-extern "C" ROCRANDAPI const unsigned int
+extern "C" [[deprecated(
+    "Use 'rocrand_get_direction_vectors32(...)' instead.")]] ROCRANDAPI const unsigned int
     rocrand_h_scrambled_sobol32_direction_vectors[SCRAMBLED_SOBOL32_N];
 
 #endif // ROCRAND_SCRAMBLED_SOBOL32_PRECOMPUTED_H_
diff --git a/library/include/rocrand/rocrand_scrambled_sobol64_constants.h b/library/include/rocrand/rocrand_scrambled_sobol64_constants.h
index 1f676fe4b..ac7f33777 100644
--- a/library/include/rocrand/rocrand_scrambled_sobol64_constants.h
+++ b/library/include/rocrand/rocrand_scrambled_sobol64_constants.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,8 @@
     #define SCRAMBLED_SOBOL_DIM 20000
 #endif
 
+// clang-format off
+[[deprecated("Use 'rocrand_get_scramble_constants64(...)' instead.")]]
 static const unsigned long long int h_scrambled_sobol64_constants[SCRAMBLED_SOBOL_DIM] = {
     0xd091bb5c22ae9ef6, 0xe7e1faeed5c31f79, 0x2082352cf807b7df, 0xe9d300053895afe1,
     0xa1e24bba4ee4092b, 0x18f868638c16a625, 0x474ba8c43039cd1a, 0x8c006d5ffe2d7810,
@@ -5031,5 +5033,6 @@ static const unsigned long long int h_scrambled_sobol64_constants[SCRAMBLED_SOBO
     0xbd2b0c36a656a3a0, 0x0e00fce8b03b2622, 0x3e45ff730078396e, 0xbaa57b945eb14241,
 
 };
+// clang-format on
 
 #endif // ROCRAND_SCRAMBLED_SOBOL64_CONSTANTS_H_
diff --git a/library/include/rocrand/rocrand_scrambled_sobol64_precomputed.h b/library/include/rocrand/rocrand_scrambled_sobol64_precomputed.h
index 050a5c635..359788dd0 100644
--- a/library/include/rocrand/rocrand_scrambled_sobol64_precomputed.h
+++ b/library/include/rocrand/rocrand_scrambled_sobol64_precomputed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -31,7 +31,8 @@
 #endif // SCRAMBLED_SOBOL_DIM
 #define SCRAMBLED_SOBOL64_N 1280000
 
-extern "C" ROCRANDAPI const unsigned long long
+extern "C" [[deprecated(
+    "Use 'rocrand_get_direction_vectors64(...)' instead.")]] ROCRANDAPI const unsigned long long
     rocrand_h_scrambled_sobol64_direction_vectors[SCRAMBLED_SOBOL64_N];
 
 #endif // ROCRAND_SCRAMBLED_SOBOL64_PRECOMPUTED_H_
diff --git a/library/include/rocrand/rocrand_sobol32_precomputed.h b/library/include/rocrand/rocrand_sobol32_precomputed.h
index 4e73732be..8740c58f2 100644
--- a/library/include/rocrand/rocrand_sobol32_precomputed.h
+++ b/library/include/rocrand/rocrand_sobol32_precomputed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +31,8 @@
 #endif // SOBOL_DIM
 #define SOBOL32_N 640000
 
-extern "C" ROCRANDAPI const unsigned int rocrand_h_sobol32_direction_vectors[SOBOL32_N];
+extern "C" [[deprecated(
+    "Use 'rocrand_get_direction_vectors32(...)' instead.")]] ROCRANDAPI const unsigned int
+    rocrand_h_sobol32_direction_vectors[SOBOL32_N];
 
 #endif // ROCRAND_SOBOL32_PRECOMPUTED_H_
diff --git a/library/include/rocrand/rocrand_sobol64_precomputed.h b/library/include/rocrand/rocrand_sobol64_precomputed.h
index 5c1d619b2..462b2a968 100644
--- a/library/include/rocrand/rocrand_sobol64_precomputed.h
+++ b/library/include/rocrand/rocrand_sobol64_precomputed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +31,8 @@
 #endif // SOBOL_DIM
 #define SOBOL64_N 1280000
 
-extern "C" ROCRANDAPI const unsigned long long rocrand_h_sobol64_direction_vectors[SOBOL64_N];
+extern "C" [[deprecated(
+    "Use 'rocrand_get_direction_vectors64(...)' instead.")]] ROCRANDAPI const unsigned long long
+    rocrand_h_sobol64_direction_vectors[SOBOL64_N];
 
 #endif // ROCRAND_SOBOL64_PRECOMPUTED_H_
diff --git a/library/src/rng/sobol.hpp b/library/src/rng/sobol.hpp
index 65aff0bde..cbb343269 100644
--- a/library/src/rng/sobol.hpp
+++ b/library/src/rng/sobol.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -355,6 +355,8 @@ class sobol_constant_accessor
 
     static const constant_type* get_direction_vectors_ptr()
     {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
         if constexpr(Is64)
         {
             if constexpr(Scrambled)
@@ -377,6 +379,7 @@ class sobol_constant_accessor
                 return rocrand_h_sobol32_direction_vectors;
             }
         }
+#pragma clang diagnostic pop
     }
 
     // Device
@@ -415,6 +418,8 @@ class sobol_constant_accessor
 
     static const constant_type* get_scramble_constants_ptr()
     {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
         if constexpr(Is64)
         {
             return h_scrambled_sobol64_constants;
@@ -423,6 +428,7 @@ class sobol_constant_accessor
         {
             return h_scrambled_sobol32_constants;
         }
+#pragma clang diagnostic pop
     }
 
     // Not scrambled
diff --git a/library/src/rocrand.cpp b/library/src/rocrand.cpp
index 76b669b66..698b2f0fc 100644
--- a/library/src/rocrand.cpp
+++ b/library/src/rocrand.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -590,6 +590,8 @@ rocrand_status ROCRANDAPI
 rocrand_status ROCRANDAPI rocrand_get_direction_vectors32(const unsigned int**         vectors,
                                                           rocrand_direction_vector_set set)
 {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
     switch(set)
     {
         case ROCRAND_DIRECTION_VECTORS_32_JOEKUO6:
@@ -600,11 +602,14 @@ rocrand_status ROCRANDAPI rocrand_get_direction_vectors32(const unsigned int**
             return ROCRAND_STATUS_SUCCESS;
         default: return ROCRAND_STATUS_OUT_OF_RANGE;
     }
+#pragma clang diagnostic pop
 }
 
 rocrand_status ROCRANDAPI rocrand_get_direction_vectors64(const unsigned long long**   vectors,
                                                           rocrand_direction_vector_set set)
 {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
     switch(set)
     {
         case ROCRAND_DIRECTION_VECTORS_64_JOEKUO6:
@@ -615,18 +620,25 @@ rocrand_status ROCRANDAPI rocrand_get_direction_vectors64(const unsigned long lo
             return ROCRAND_STATUS_SUCCESS;
         default: return ROCRAND_STATUS_OUT_OF_RANGE;
     }
+#pragma clang diagnostic pop
 }
 
 rocrand_status ROCRANDAPI rocrand_get_scramble_constants32(const unsigned int** constants)
 {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
     *constants = h_scrambled_sobol32_constants;
     return ROCRAND_STATUS_SUCCESS;
+#pragma clang diagnostic pop
 }
 
 rocrand_status ROCRANDAPI rocrand_get_scramble_constants64(const unsigned long long** constants)
 {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
     *constants = h_scrambled_sobol64_constants;
     return ROCRAND_STATUS_SUCCESS;
+#pragma clang diagnostic pop
 }
 
 #if defined(__cplusplus)