Skip to content

Commit

Permalink
use fast_scan for non-pod data types on CUDA 3.1 and newer
Browse files Browse the repository at this point in the history
resolves issue NVIDIA#203

--HG--
rename : performance/inclusive_segmented_scan.test => performance/inclusive_scan_by_key.test
  • Loading branch information
wnbell committed Sep 3, 2010
1 parent 8b223d8 commit f413760
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 44 deletions.
3 changes: 2 additions & 1 deletion performance/inclusive_scan.test
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ FINALIZE = \
"""
RECORD_TIME();
RECORD_THROUGHPUT(double($InputSize));
RECORD_BANDWIDTH(4*sizeof($InputType)*double($InputSize));
"""

InputTypes = SignedIntegerTypes
InputSizes = StandardSizes
InputSizes = [2**24] #StandardSizes

TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]

47 changes: 47 additions & 0 deletions performance/inclusive_scan_by_key.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Performance test description for thrust::inclusive_scan_by_key.
#
# Each section below is a C++ source fragment stored in a Python string.  The
# $KeyType, $ValueType and $InputSize placeholders correspond to the variable
# names listed in TestVariables at the bottom of this file.
# NOTE(review): how the harness substitutes the placeholders and stitches the
# fragments together is not visible from this file -- confirm against the
# perf-test driver.

# Include lines for the generated test; only <thrust/scan.h> is listed here.
# NOTE(review): host_vector, device_vector, unittest:: and the RECORD_*/ASSERT_*
# macros used below are presumably supplied by the harness -- confirm.
PREAMBLE = \
"""
#include <thrust/scan.h>
"""

# Setup fragment:
#   * fills h_values via unittest::random_integers and copies it to d_values;
#   * allocates host and device output vectors of $InputSize elements;
#   * builds h_keys as a non-decreasing sequence: the counter k is written to
#     every element and incremented whenever rand() % 50 == 0, so runs of equal
#     keys form the segments.  srand(13) fixes the seed, making the key layout
#     repeatable between runs;
#   * runs inclusive_scan_by_key once on the host data and once on the device
#     data and checks the two outputs with ASSERT_EQUAL.
INITIALIZE = \
"""
thrust::host_vector<$ValueType> h_values = unittest::random_integers<$ValueType>($InputSize);
thrust::device_vector<$ValueType> d_values = h_values;

thrust::host_vector<$ValueType> h_output($InputSize);
thrust::device_vector<$ValueType> d_output($InputSize);

srand(13);
thrust::host_vector<$KeyType> h_keys($InputSize);
for(size_t i = 0, k = 0; i < $InputSize; i++)
{
h_keys[i] = k;
if (rand() % 50 == 0)
k++;
}
thrust::device_vector<$KeyType> d_keys = h_keys;

thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), h_output.begin());
thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_output.begin());

ASSERT_EQUAL(h_output, d_output);
"""

# The device-side scan call on its own; presumably this is the region the
# harness times (TODO confirm).
TIME = \
"""
thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_output.begin());
"""

# Reporting fragment: records the time, a throughput figure based on
# $InputSize, and a bandwidth figure of
# 4 * (sizeof($KeyType) + sizeof($ValueType)) bytes per element.
# NOTE(review): the basis for the factor of 4 is not documented here -- confirm
# it reflects the intended memory-traffic model.
FINALIZE = \
"""
RECORD_TIME();
RECORD_THROUGHPUT(double($InputSize));
RECORD_BANDWIDTH(4*(sizeof($KeyType) + sizeof($ValueType))*double($InputSize));
"""

# Parameter sweep.  Keys are restricted to 'int' and the input size to 2**24
# elements; the names in the trailing comments (SignedIntegerTypes,
# StandardSizes) are the alternatives that are currently commented out.
KeyTypes = ['int'] #SignedIntegerTypes
ValueTypes = SignedIntegerTypes
InputSizes = [2**24] #StandardSizes

# (placeholder name, list of values) pairs, one per placeholder used above.
TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes), ('InputSize', InputSizes)]

43 changes: 0 additions & 43 deletions performance/inclusive_segmented_scan.test

This file was deleted.

12 changes: 12 additions & 0 deletions thrust/detail/device/cuda/scan.inl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,13 @@ template<typename InputIterator,

// whether to use fast_scan or safe_scan
// TODO profile this threshold
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && CUDA_VERSION >= 3010
// CUDA 3.1 and higher support non-pod types in statically-allocated __shared__ memory
static const bool use_fast_scan = sizeof(OutputType) <= 16;
#else
// CUDA 3.0 and earlier must use safe_scan for non-pod types
static const bool use_fast_scan = sizeof(OutputType) <= 16 && thrust::detail::is_pod<OutputType>::value;
#endif

// XXX WAR nvcc unused variable warning
(void) use_fast_scan;
Expand Down Expand Up @@ -82,7 +88,13 @@ template<typename InputIterator,

// whether to use fast_scan or safe_scan
// TODO profile this threshold
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && CUDA_VERSION >= 3010
// CUDA 3.1 and higher support non-pod types in statically-allocated __shared__ memory
static const bool use_fast_scan = sizeof(OutputType) <= 16;
#else
// CUDA 3.0 and earlier must use safe_scan for non-pod types
static const bool use_fast_scan = sizeof(OutputType) <= 16 && thrust::detail::is_pod<OutputType>::value;
#endif

// XXX WAR nvcc 3.0 unused variable warning
(void) use_fast_scan;
Expand Down

0 comments on commit f413760

Please sign in to comment.