From f4137600de2153fd66221128efee3a53c8147c15 Mon Sep 17 00:00:00 2001 From: Nathan Bell Date: Fri, 3 Sep 2010 07:15:29 -0700 Subject: [PATCH] use fast_scan for non-pod data types on CUDA 3.1 and newer resolves issue #203 --HG-- rename : performance/inclusive_segmented_scan.test => performance/inclusive_scan_by_key.test --- performance/inclusive_scan.test | 3 +- performance/inclusive_scan_by_key.test | 47 +++++++++++++++++++++++ performance/inclusive_segmented_scan.test | 43 --------------------- thrust/detail/device/cuda/scan.inl | 12 ++++++ 4 files changed, 61 insertions(+), 44 deletions(-) create mode 100644 performance/inclusive_scan_by_key.test delete mode 100644 performance/inclusive_segmented_scan.test diff --git a/performance/inclusive_scan.test b/performance/inclusive_scan.test index 98b9a5b02..c4d2c53f9 100644 --- a/performance/inclusive_scan.test +++ b/performance/inclusive_scan.test @@ -26,10 +26,11 @@ FINALIZE = \ """ RECORD_TIME(); RECORD_THROUGHPUT(double($InputSize)); + RECORD_BANDWIDTH(4*sizeof($InputType)*double($InputSize)); """ InputTypes = SignedIntegerTypes -InputSizes = StandardSizes +InputSizes = [2**24] #StandardSizes TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)] diff --git a/performance/inclusive_scan_by_key.test b/performance/inclusive_scan_by_key.test new file mode 100644 index 000000000..8843d5e0c --- /dev/null +++ b/performance/inclusive_scan_by_key.test @@ -0,0 +1,47 @@ +PREAMBLE = \ + """ + #include + """ + +INITIALIZE = \ + """ + thrust::host_vector<$ValueType> h_values = unittest::random_integers<$ValueType>($InputSize); + thrust::device_vector<$ValueType> d_values = h_values; + + thrust::host_vector<$ValueType> h_output($InputSize); + thrust::device_vector<$ValueType> d_output($InputSize); + + srand(13); + thrust::host_vector<$KeyType> h_keys($InputSize); + for(size_t i = 0, k = 0; i < $InputSize; i++) + { + h_keys[i] = k; + if (rand() % 50 == 0) + k++; + } + thrust::device_vector<$KeyType> d_keys = h_keys; + + thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), h_output.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_output.begin()); + + ASSERT_EQUAL(h_output, d_output); + """ + +TIME = \ + """ + thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_output.begin()); + """ + +FINALIZE = \ + """ + RECORD_TIME(); + RECORD_THROUGHPUT(double($InputSize)); + RECORD_BANDWIDTH(4*(sizeof($KeyType) + sizeof($ValueType))*double($InputSize)); + """ + +KeyTypes = ['int'] #SignedIntegerTypes +ValueTypes = SignedIntegerTypes +InputSizes = [2**24] #StandardSizes + +TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes), ('InputSize', InputSizes)] + diff --git a/performance/inclusive_segmented_scan.test b/performance/inclusive_segmented_scan.test deleted file mode 100644 index 3343ac47b..000000000 --- a/performance/inclusive_segmented_scan.test +++ /dev/null @@ -1,43 +0,0 @@ -PREAMBLE = \ - """ - #include - """ - -INITIALIZE = \ - """ - thrust::host_vector<$InputType> h_input = unittest::random_integers<$InputType>($InputSize); - thrust::device_vector<$InputType> d_input = h_input; - - thrust::host_vector<$InputType> h_output($InputSize); - thrust::device_vector<$InputType> d_output($InputSize); - - thrust::host_vector h_keys($InputSize); - for(size_t i = 0, k = 0; i < $InputSize; i++){ - h_keys[i] = k; - if (rand() % 50 == 0) - k++; - } - thrust::device_vector d_keys = h_keys; - - thrust::experimental::inclusive_segmented_scan(h_input.begin(), h_input.end(), h_keys.begin(), h_output.begin()); - thrust::experimental::inclusive_segmented_scan(d_input.begin(), d_input.end(), d_keys.begin(), d_output.begin()); - - ASSERT_EQUAL(h_output, d_output); - """ - -TIME = \ - """ - thrust::experimental::inclusive_segmented_scan(d_input.begin(), d_input.end(), d_keys.begin(), d_output.begin()); - """ - -FINALIZE = \ - """ - RECORD_TIME(); - RECORD_THROUGHPUT(double($InputSize)); - """ - -InputTypes = SignedIntegerTypes -InputSizes = StandardSizes - -TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)] - diff --git a/thrust/detail/device/cuda/scan.inl b/thrust/detail/device/cuda/scan.inl index 711fb0df9..59f915664 100644 --- a/thrust/detail/device/cuda/scan.inl +++ b/thrust/detail/device/cuda/scan.inl @@ -51,7 +51,13 @@ template= 3010 + // CUDA 3.1 and higher support non-pod types in statically-allocated __shared__ memory + static const bool use_fast_scan = sizeof(OutputType) <= 16; +#else + // CUDA 3.0 and earlier must use safe_scan for non-pod types static const bool use_fast_scan = sizeof(OutputType) <= 16 && thrust::detail::is_pod::value; +#endif // XXX WAR nvcc unused variable warning (void) use_fast_scan; @@ -82,7 +88,13 @@ template= 3010 + // CUDA 3.1 and higher support non-pod types in statically-allocated __shared__ memory + static const bool use_fast_scan = sizeof(OutputType) <= 16; +#else + // CUDA 3.0 and earlier must use safe_scan for non-pod types static const bool use_fast_scan = sizeof(OutputType) <= 16 && thrust::detail::is_pod::value; +#endif // XXX WAR nvcc 3.0 unused variable warning (void) use_fast_scan;