From 6031df5642b2088a12271682e08fa5b17f75ffba Mon Sep 17 00:00:00 2001 From: Jared Hoberock Date: Fri, 5 Nov 2010 17:39:19 -0700 Subject: [PATCH] First cut at improved CUDA set_intersection & perf test. --- performance/set_intersection.test | 45 ++ testing/set_intersection.cu | 51 +- .../detail/device/cuda/block/inclusive_scan.h | 26 + .../set_intersection.h} | 18 +- .../device/cuda/block/set_intersection.inl | 141 +++++ .../get_set_operation_splitter_ranks.inl | 5 +- thrust/detail/device/cuda/merge.inl | 2 + thrust/detail/device/cuda/set_intersection.h | 49 ++ .../detail/device/cuda/set_intersection.inl | 485 +++++++++++++++ thrust/detail/device/cuda/set_operations.inl | 550 ------------------ .../detail/device/dispatch/set_operations.h | 2 +- 11 files changed, 776 insertions(+), 598 deletions(-) create mode 100644 performance/set_intersection.test rename thrust/detail/device/cuda/{set_operations.h => block/set_intersection.h} (76%) create mode 100644 thrust/detail/device/cuda/block/set_intersection.inl create mode 100644 thrust/detail/device/cuda/set_intersection.h create mode 100644 thrust/detail/device/cuda/set_intersection.inl delete mode 100644 thrust/detail/device/cuda/set_operations.inl diff --git a/performance/set_intersection.test b/performance/set_intersection.test new file mode 100644 index 000000000..2316fc36a --- /dev/null +++ b/performance/set_intersection.test @@ -0,0 +1,45 @@ +PREAMBLE = \ + """ + #include + #include + #include + """ + +INITIALIZE = \ + """ + thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize); + thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize); + thrust::sort(h_a.begin(), h_a.end()); + thrust::sort(h_b.begin(), h_b.end()); + + thrust::host_vector<$InputType> h_result(h_a.size()); + thrust::host_vector<$InputType>::iterator new_end = + thrust::set_intersection(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin()); + h_result.resize(new_end - h_result.begin()); + + thrust::device_vector<$InputType> d_a = h_a, d_b = h_b; + + thrust::device_vector<$InputType> d_result(h_result.size()); + thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin()); + + ASSERT_EQUAL(h_result, d_result); + """ + +TIME = \ + """ + thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin()); + """ + +FINALIZE = \ + """ + RECORD_TIME(); + RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType)); + RECORD_SORTING_RATE(2 * double($InputSize)) + """ + + +InputTypes = ['char', 'short', 'int', 'long', 'float', 'double'] +InputSizes = [2**N for N in range(10, 25)] + +TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)] + diff --git a/testing/set_intersection.cu b/testing/set_intersection.cu index c72fb0e13..f32423d45 100644 --- a/testing/set_intersection.cu +++ b/testing/set_intersection.cu @@ -141,48 +141,26 @@ void TestSetIntersectionMultiset(const size_t n) DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset); -struct non_arithmetic +template + void TestSetIntersectionKeyValue(size_t n) { - __host__ __device__ - non_arithmetic(void) - {} + typedef key_value T; - __host__ __device__ - non_arithmetic(const non_arithmetic &x) - : key(x.key) {} + thrust::host_vector h_keys_a = unittest::random_integers(n); + thrust::host_vector h_values_a = unittest::random_integers(n); - __host__ __device__ - non_arithmetic(const int k) - : key(k) {} + thrust::host_vector h_keys_b = unittest::random_integers(n); + thrust::host_vector h_values_b = unittest::random_integers(n); - __host__ __device__ - bool operator<(const non_arithmetic &rhs) const + thrust::host_vector h_a(n), h_b(n); + for(size_t i = 0; i < n; ++i) { - return key < rhs.key; + h_a[i] = T(h_keys_a[i], h_values_a[i]); + h_b[i] = T(h_keys_b[i], h_values_b[i]); } - __host__ __device__ - bool operator==(const non_arithmetic &rhs) const - { - return key == rhs.key; - } - - int key; -}; - - -void TestSetIntersectionNonArithmetic(void) -{ - const unsigned int n = 12345; - - typedef non_arithmetic T; - - thrust::host_vector temp = unittest::random_integers(2 * n); - thrust::host_vector h_a(temp.begin(), temp.begin() + n); - thrust::host_vector h_b(temp.begin() + n, temp.end()); - - thrust::sort(h_a.begin(), h_a.end()); - thrust::sort(h_b.begin(), h_b.end()); + thrust::stable_sort(h_a.begin(), h_a.end()); + thrust::stable_sort(h_b.begin(), h_b.end()); thrust::device_vector d_a = h_a; thrust::device_vector d_b = h_b; @@ -206,6 +184,5 @@ void TestSetIntersectionNonArithmetic(void) ASSERT_EQUAL_QUIET(h_result, d_result); } -DECLARE_UNITTEST(TestSetIntersectionNonArithmetic); - +DECLARE_VARIABLE_UNITTEST(TestSetIntersectionKeyValue); diff --git a/thrust/detail/device/cuda/block/inclusive_scan.h b/thrust/detail/device/cuda/block/inclusive_scan.h index 1235c5654..583237898 100644 --- a/thrust/detail/device/cuda/block/inclusive_scan.h +++ b/thrust/detail/device/cuda/block/inclusive_scan.h @@ -51,6 +51,32 @@ __device__ if(block_size > 512) { if (threadIdx.x >= 512) { val = binary_op(first[threadIdx.x - 512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } } // end inplace_inclusive_scan() + +template +__device__ +void inplace_inclusive_scan_n(RandomAccessIterator first, + Size n, + BinaryFunction binary_op) +{ + typename thrust::iterator_value::type val = first[threadIdx.x]; + __syncthreads(); + + // assume n <= 2048 + if(n > 1) { if (threadIdx.x < n && threadIdx.x >= 1) { val = binary_op(first[threadIdx.x - 1], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 2) { if (threadIdx.x < n && threadIdx.x >= 2) { val = binary_op(first[threadIdx.x - 2], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 4) { if (threadIdx.x < n && threadIdx.x >= 4) { val = binary_op(first[threadIdx.x - 4], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 8) { if (threadIdx.x < n && threadIdx.x >= 8) { val = binary_op(first[threadIdx.x - 8], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 16) { if (threadIdx.x < n && threadIdx.x >= 16) { val = binary_op(first[threadIdx.x - 16], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 32) { if (threadIdx.x < n && threadIdx.x >= 32) { val = binary_op(first[threadIdx.x - 32], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 64) { if (threadIdx.x < n && threadIdx.x >= 64) { val = binary_op(first[threadIdx.x - 64], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 128) { if (threadIdx.x < n && threadIdx.x >= 128) { val = binary_op(first[threadIdx.x - 128], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 256) { if (threadIdx.x < n && threadIdx.x >= 256) { val = binary_op(first[threadIdx.x - 256], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 512) { if (threadIdx.x < n && threadIdx.x >= 512) { val = binary_op(first[threadIdx.x - 512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } + if(n > 1024) { if (threadIdx.x < n && threadIdx.x >= 1024) { val = binary_op(first[threadIdx.x - 1024], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } +} // end inplace_inclusive_scan() + } // end namespace block } // end namespace cuda } // end namespace device diff --git a/thrust/detail/device/cuda/set_operations.h b/thrust/detail/device/cuda/block/set_intersection.h similarity index 76% rename from thrust/detail/device/cuda/set_operations.h rename to thrust/detail/device/cuda/block/set_intersection.h index da9ec0356..7e8c68e42 100644 --- a/thrust/detail/device/cuda/set_operations.h +++ b/thrust/detail/device/cuda/block/set_intersection.h @@ -18,34 +18,34 @@ namespace thrust { - namespace detail { - namespace device { - namespace cuda { +namespace block +{ template - RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1, +__device__ __forceinline__ + RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1, RandomAccessIterator1 last1, RandomAccessIterator2 first2, RandomAccessIterator2 last2, - RandomAccessIterator3 result, + RandomAccessIterator3 temporary, + RandomAccessIterator4 result, StrictWeakOrdering comp); +} // end block } // end cuda - } // end device - } // end detail - } // end thrust -#include +#include diff --git a/thrust/detail/device/cuda/block/set_intersection.inl b/thrust/detail/device/cuda/block/set_intersection.inl new file mode 100644 index 000000000..d1c272ab7 --- /dev/null +++ b/thrust/detail/device/cuda/block/set_intersection.inl @@ -0,0 +1,141 @@ +/* + * Copyright 2008-2010 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace device +{ +namespace cuda +{ +namespace block +{ + +namespace set_intersection_detail +{ + +// this predicate tests two two-element tuples +// we first use a Compare for the first element +// if the first elements are equivalent, we use +// < for the second elements +// XXX merge duplicates this +// move it some place common +template + struct compare_first_less_second +{ + __host__ __device__ + compare_first_less_second(Compare c) + : comp(c) {} + + template + __host__ __device__ + bool operator()(T1 lhs, T2 rhs) + { + return comp(lhs.get<0>(), rhs.get<0>()) || (!comp(rhs.get<0>(), lhs.get<0>()) && lhs.get<1>() < rhs.get<1>()); + } + + Compare comp; +}; // end compare_first_less_second + +} // end set_intersection_detail + + +template +__device__ __forceinline__ + RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 temporary, + RandomAccessIterator4 result, + StrictWeakOrdering comp) +{ + using namespace set_intersection_detail; + + typedef typename thrust::iterator_difference::type difference1; + typedef typename thrust::iterator_difference::type difference2; + + difference1 n1 = last1 - first1; + + // search for all matches in the second range for each element in the first + bool found = false; + if(threadIdx.x < n1) + { + RandomAccessIterator1 x = first1; + x += threadIdx.x; + + // count the number of previous occurrances of x the first range + difference1 rank = x - thrust::detail::device::generic::scalar::lower_bound(first1,x,dereference(x),comp); + + // count the number of equivalent elements of x in the second range + thrust::pair matches = + thrust::detail::device::generic::scalar::equal_range(first2,last2,dereference(x),comp); + + difference2 num_matches = matches.second - matches.first; + + // the element is "found" if its rank is less than the number of matches + found = rank < num_matches; + } // end if + + // mark whether my element was found or not in the scratch array + RandomAccessIterator3 temp = temporary; + temp += threadIdx.x; + dereference(temp) = found; + + block::inplace_inclusive_scan_n(temporary, n1, thrust::plus()); + + // copy_if + if(found) + { + // find the index to write our element + unsigned int output_index = 0; + if(threadIdx.x > 0) + { + RandomAccessIterator3 src = temporary; + src += threadIdx.x - 1; + output_index = dereference(src); + } // end if + + RandomAccessIterator1 x = first1; + x += threadIdx.x; + + RandomAccessIterator4 dst = result; + dst += output_index; + dereference(dst) = dereference(x); + } // end if + + return result + temporary[n1-1]; +} // end set_intersection + +} // end block +} // end cuda +} // end device +} // end detail +} // end thrust + diff --git a/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl b/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl index dffc1b5cd..d0ef66e12 100644 --- a/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl +++ b/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl @@ -186,7 +186,10 @@ template struct compare_first_less_second { diff --git a/thrust/detail/device/cuda/set_intersection.h b/thrust/detail/device/cuda/set_intersection.h new file mode 100644 index 000000000..e64dc0e21 --- /dev/null +++ b/thrust/detail/device/cuda/set_intersection.h @@ -0,0 +1,49 @@ +/* + * Copyright 2008-2010 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#pragma once + +namespace thrust +{ +namespace detail +{ +namespace device +{ +namespace cuda +{ + + +template +RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp); + + +} // end cuda +} // end device +} // end detail +} // end thrust + +#include + diff --git a/thrust/detail/device/cuda/set_intersection.inl b/thrust/detail/device/cuda/set_intersection.inl new file mode 100644 index 000000000..bb6b63b18 --- /dev/null +++ b/thrust/detail/device/cuda/set_intersection.inl @@ -0,0 +1,485 @@ +/* + * Copyright 2008-2010 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace device +{ +namespace cuda +{ + +namespace set_intersection_detail +{ + +template +T1 ceil_div(T1 up, T2 down) +{ + T1 div = up / down; + T1 rem = up % down; + return (rem != 0) ? div + 1 : div; +} + +__host__ __device__ +inline unsigned int align_size_to_int(unsigned int N) +{ + return (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0); +} + +template +unsigned int get_set_intersection_kernel_per_block_dynamic_smem_usage(unsigned int block_size) +{ + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_value::type value_type2; + + // set_intersection_kernel allocates memory aligned to int + const unsigned int array_size1 = align_size_to_int(block_size * sizeof(value_type1)); + const unsigned int array_size2 = align_size_to_int(block_size * sizeof(value_type2)); + const unsigned int array_size3 = align_size_to_int(block_size * sizeof(int)); + const unsigned int array_size4 = align_size_to_int(block_size * sizeof(value_type1)); + + return sizeof(int) * (array_size1 + array_size2 + array_size3 + array_size4); +} // end get_set_intersection_kernel_per_block_dynamic_smem_usage() + +template +__global__ void set_intersection_kernel(const RandomAccessIterator1 first1, + const RandomAccessIterator1 last1, + const RandomAccessIterator2 first2, + const RandomAccessIterator2 last2, + RandomAccessIterator3 splitter_ranks1, + RandomAccessIterator4 splitter_ranks2, + const RandomAccessIterator5 result, + RandomAccessIterator6 result_partition_sizes, + StrictWeakOrdering comp, + Size num_partitions) +{ + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_value::type value_type2; + + // allocate shared storage + const unsigned int array_size1 = align_size_to_int(blockDim.x * sizeof(value_type1)); + const unsigned int array_size2 = align_size_to_int(blockDim.x * sizeof(value_type2)); + const unsigned int array_size3 = align_size_to_int(blockDim.x * sizeof(int)); + const unsigned int array_size4 = align_size_to_int(blockDim.x * sizeof(value_type1)); + int *_shared1 = extern_shared_ptr(); + int *_shared2 = _shared1 + array_size1; + int *s_scratch = _shared2 + array_size2; + int *_shared3 = s_scratch + array_size3; + + value_type1 *s_input1 = reinterpret_cast(_shared1); + value_type2 *s_input2 = reinterpret_cast(_shared2); + value_type1 *s_result = reinterpret_cast(_shared3); + + // advance per-partition iterators + splitter_ranks1 += blockIdx.x; + splitter_ranks2 += blockIdx.x; + result_partition_sizes += blockIdx.x; + + for(Size partition_idx = blockIdx.x; + partition_idx < num_partitions; + partition_idx += gridDim.x, + splitter_ranks1 += gridDim.x, + splitter_ranks2 += gridDim.x, + result_partition_sizes += gridDim.x) + { + RandomAccessIterator1 input_begin1 = first1; + RandomAccessIterator1 input_end1 = last1; + RandomAccessIterator2 input_begin2 = first2; + RandomAccessIterator2 input_end2 = last2; + + RandomAccessIterator5 output_begin = result; + + // find the end of the input if this is not the last block + // the end of merged partition i is at splitter_ranks1[i] + splitter_ranks2[i] + if(partition_idx != num_partitions - 1) + { + RandomAccessIterator3 rank1 = splitter_ranks1; + RandomAccessIterator4 rank2 = splitter_ranks2; + + input_end1 = first1 + dereference(rank1); + input_end2 = first2 + dereference(rank2); + } + + // find the beginning of the input and output if this is not the first partition + // merged partition i begins at splitter_ranks1[i-1] + if(partition_idx != 0) + { + RandomAccessIterator3 rank1 = splitter_ranks1; + --rank1; + RandomAccessIterator4 rank2 = splitter_ranks2; + --rank2; + + // advance the input to point to the beginning + input_begin1 += dereference(rank1); + input_begin2 += dereference(rank2); + + // advance the result to point to the beginning of the output + output_begin += dereference(rank1); + } + + // the result is empty to begin with + value_type1 *s_result_end = s_result; + + if(input_begin1 < input_end1 && input_begin2 < input_end2) + { + typedef typename thrust::iterator_difference::type difference1; + + typedef typename thrust::iterator_difference::type difference2; + + // load the first segment + difference1 s_input1_size = thrust::min(blockDim.x, input_end1 - input_begin1); + + block::copy(input_begin1, input_begin1 + s_input1_size, s_input1); + input_begin1 += s_input1_size; + + // load the second segment + difference2 s_input2_size = thrust::min(blockDim.x, input_end2 - input_begin2); + + block::copy(input_begin2, input_begin2 + s_input2_size, s_input2); + input_begin2 += s_input2_size; + + __syncthreads(); + + s_result_end = block::set_intersection(s_input1, s_input1 + s_input1_size, + s_input2, s_input2 + s_input2_size, + s_scratch, + s_result, + comp); + + __syncthreads(); + + // store to gmem + output_begin = block::copy(s_result, s_result_end, output_begin); + } // end if + + // store size of the result + if(threadIdx.x == 0) + { + dereference(result_partition_sizes) = s_result_end - s_result; + } // end if + } // end for partition +} // end set_intersection_kernel + +template +__global__ +void grouped_gather(const RandomAccessIterator1 result, + const RandomAccessIterator2 first, + RandomAccessIterator3 splitter_ranks1, + RandomAccessIterator4 size_of_result_before_and_including_each_partition, + Size num_partitions) +{ + using namespace thrust::detail::device; + + // advance iterators + splitter_ranks1 += blockIdx.x; + size_of_result_before_and_including_each_partition += blockIdx.x; + + for(Size partition_idx = blockIdx.x; + partition_idx < num_partitions; + partition_idx += gridDim.x, + splitter_ranks1 += gridDim.x, + size_of_result_before_and_including_each_partition += gridDim.x) + { + RandomAccessIterator1 output_begin = result; + RandomAccessIterator2 input_begin = first; + + // find the location of the input and output if this is not the first partition + typename thrust::iterator_value::type partition_size + = dereference(size_of_result_before_and_including_each_partition); + if(partition_idx != 0) + { + // advance iterator to the beginning of this partition's input + RandomAccessIterator3 rank = splitter_ranks1; + --rank; + input_begin += dereference(rank); + + // subtract away the previous element of size_of_result_preceding_each_partition + // resulting in this size of this partition + --size_of_result_before_and_including_each_partition; + typename thrust::iterator_value::type beginning_of_output + = dereference(size_of_result_before_and_including_each_partition); + + output_begin += beginning_of_output; + partition_size -= dereference(size_of_result_before_and_including_each_partition); + } // end if + + thrust::detail::device::cuda::block::copy(input_begin, input_begin + partition_size, output_begin); + } // end for partition +} // end grouped_gather + + +template + struct equivalence_from_compare +{ + equivalence_from_compare(Compare c) + : comp(c) {} + + template + __host__ __device__ + bool operator()(T1 &lhs, T2 &rhs) + { + // two elements are equivalent if neither sorts before the other + return !comp(lhs, rhs) && !comp(rhs, lhs); + } + + Compare comp; +}; // end equivalence_from_compare + + +// this predicate tests two two-element tuples +// we first use a Compare for the first element +// if the first elements are equivalent, we use +// < for the second elements +// XXX set_intersection duplicates this +// move it some place common +template + struct compare_first_less_second +{ + compare_first_less_second(Compare c) + : comp(c) {} + + template + __host__ __device__ + bool operator()(T1 lhs, T2 rhs) + { + return comp(lhs.get<0>(), rhs.get<0>()) || (!comp(rhs.get<0>(), lhs.get<0>()) && lhs.get<1>() < rhs.get<1>()); + } + + Compare comp; +}; // end compare_first_less_second + + +template + void get_set_intersection_splitter_ranks(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 splitter_ranks1, + RandomAccessIterator4 splitter_ranks2, + Compare comp, + Size1 partition_size, + Size2 num_splitters_from_each_range) +{ + using namespace thrust::detail; + + typedef typename thrust::iterator_difference::type difference1; + typedef typename thrust::iterator_difference::type difference2; + + const difference1 num_elements1 = last1 - first1; + const difference2 num_elements2 = last2 - first2; + + // enumerate each key within its sub-segment of equivalent keys + // XXX replace these explicit ranges with fancy iterators + raw_buffer key_ranks1(num_elements1); + raw_buffer key_ranks2(num_elements2); + + thrust::exclusive_scan_by_key(first1, last1, + thrust::make_constant_iterator(1), + key_ranks1.begin(), + difference1(0), + equivalence_from_compare(comp)); + + thrust::exclusive_scan_by_key(first2, last2, + thrust::make_constant_iterator(1), + key_ranks2.begin(), + difference2(0), + equivalence_from_compare(comp)); + + // zip up the keys with their ranks to disambiguate repeated elements during rank-finding + typedef typename raw_buffer::iterator RankIterator1; + typedef typename raw_buffer::iterator RankIterator2; + typedef thrust::tuple iterator_tuple1; + typedef thrust::tuple iterator_tuple2; + typedef thrust::zip_iterator iterator_and_rank1; + typedef thrust::zip_iterator iterator_and_rank2; + + iterator_and_rank1 first_and_rank1 = + thrust::make_zip_iterator(thrust::make_tuple(first1, key_ranks1.begin())); + iterator_and_rank1 last_and_rank1 = first_and_rank1 + num_elements1; + + iterator_and_rank2 first_and_rank2 = + thrust::make_zip_iterator(thrust::make_tuple(first2, key_ranks2.begin())); + iterator_and_rank2 last_and_rank2 = first_and_rank2 + num_elements2; + + // take into account the tuples when comparing + typedef compare_first_less_second splitter_compare; + + using namespace thrust::detail::device::cuda::detail; + return get_set_operation_splitter_ranks(first_and_rank1, last_and_rank1, + first_and_rank2, last_and_rank2, + splitter_ranks1, + splitter_ranks2, + splitter_compare(comp), + partition_size, + num_splitters_from_each_range); +} // end get_set_intersection_splitter_ranks() + + +} // end namespace set_intersection_detail + + +template +RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp) +{ + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_difference::type difference1; + typedef typename thrust::iterator_difference::type difference2; + + const difference1 num_elements1 = last1 - first1; + const difference2 num_elements2 = last2 - first2; + + // check for trivial problem + if(num_elements1 == 0 || num_elements2 == 0) + return result; + + using namespace set_intersection_detail; + using namespace thrust::detail; + using namespace thrust::detail::device::cuda::detail; + + typedef typename thrust::iterator_value::type value_type; + + // prefer large blocks to keep the partitions as large as possible + const size_t block_size = + arch::max_blocksize_subject_to_smem_usage(set_intersection_kernel< + RandomAccessIterator1, + RandomAccessIterator2, + typename raw_buffer::iterator, + typename raw_buffer::iterator, + typename raw_buffer::iterator, + typename raw_buffer::iterator, + Compare, + size_t + >, + get_set_intersection_kernel_per_block_dynamic_smem_usage< + RandomAccessIterator1, + RandomAccessIterator2, + typename raw_buffer::iterator + >); + + const size_t partition_size = block_size; + const difference1 num_partitions = ceil_div(num_elements1, partition_size); + const difference1 num_splitters_from_each_range = num_partitions - 1; + const size_t num_merged_partitions = 2 * num_splitters_from_each_range + 1; + + // allocate storage for splitter ranks + raw_buffer splitter_ranks1(2 * num_splitters_from_each_range); + raw_buffer splitter_ranks2(2 * num_splitters_from_each_range); + + // select some splitters and find the rank of each splitter in the other range + // XXX it's possible to fuse rank-finding with the merge_kernel below + // this eliminates the temporary buffers splitter_ranks1 & splitter_ranks2 + // but this spills to lmem and causes a 10x speeddown + get_set_intersection_splitter_ranks(first1,last1, + first2,last2, + splitter_ranks1.begin(), + splitter_ranks2.begin(), + comp, + partition_size, + num_splitters_from_each_range); + + // allocate storage to store each intersected partition's size + raw_buffer result_partition_sizes(num_merged_partitions); + + // allocate storage to store the largest possible intersection + raw_buffer::type, cuda_device_space_tag> temporary_results(num_elements1); + + // maximize the number of blocks we can launch + const size_t max_blocks = thrust::detail::device::cuda::arch::max_grid_dimensions().x; + const size_t num_blocks = thrust::min(num_merged_partitions, max_blocks); + const size_t dynamic_smem_size = get_set_intersection_kernel_per_block_dynamic_smem_usage(block_size); + + set_intersection_detail::set_intersection_kernel<<(block_size), static_cast(dynamic_smem_size)>>>( + first1, last1, + first2, last2, + splitter_ranks1.begin(), + splitter_ranks2.begin(), + temporary_results.begin(), + result_partition_sizes.begin(), + comp, + num_merged_partitions); + synchronize_if_enabled("set_intersection_kernel"); + + // inclusive scan the element counts to get the number of elements occurring before and including each partition + thrust::inclusive_scan(result_partition_sizes.begin(), + result_partition_sizes.end(), + result_partition_sizes.begin()); + + // gather from temporary_results to result + // XXX use a different heuristic for this kernel config + grouped_gather<<(block_size)>>>( + result, + temporary_results.begin(), + splitter_ranks1.begin(), + result_partition_sizes.begin(), + num_merged_partitions); + synchronize_if_enabled("grouped_gather"); + + return result + result_partition_sizes[num_merged_partitions - 1]; +} // end merge + +} // end namespace cuda +} // end namespace device +} // end namespace detail +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + diff --git a/thrust/detail/device/cuda/set_operations.inl b/thrust/detail/device/cuda/set_operations.inl deleted file mode 100644 index 5131d7cce..000000000 --- a/thrust/detail/device/cuda/set_operations.inl +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright 2008-2010 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file set_operations.inl - * \brief CUDA implementation of set_intersection, - * based on Gregory Diamos' original implementation. - */ - -#include - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - -// TODO use thrust/detail/device/ where possible -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include - -namespace thrust -{ -namespace detail -{ -namespace device -{ -namespace cuda -{ - -namespace block -{ - -template -__device__ -thrust::tuple< - RandomAccessIterator1, - RandomAccessIterator2, - RandomAccessIterator3 -> - - set_intersection(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - StrictWeakOrdering comp) -{ - namespace gs = thrust::detail::device::generic::scalar; - - bool do_output_element = false; - - // these variables are needed outside of the branch for the last part of the algorithm - // so declare them here - thrust::pair matches2 = thrust::make_pair(first2,first2); - - typename thrust::iterator_difference::type size_of_partial_intersection(0); - - // for each element in the first range, search the second range, looking for a match - if(first1 + threadIdx.x < last1) - { - // not only must we figure out if our element exists in the second range, we need to - // 1. figure out how many m copies of our element exists in the first range - // 2. figure out how many n copies of our element exists in the second range - // 3. keep the element if its rank < min(m,n) - - thrust::pair matches1 = gs::equal_range(first1, last1, first1[threadIdx.x], comp); - matches2 = gs::equal_range(first2, last2, first1[threadIdx.x], comp); - - size_of_partial_intersection = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (matches1.second - matches1.first, - matches2.second - matches2.first); - - // compute the rank of the element in the first range - unsigned int element_rank = threadIdx.x - (matches1.first - first1); - - do_output_element = (element_rank < size_of_partial_intersection); - } - - // copy matches to the result - - // record, for each thread, whether we should output its element - __shared__ unsigned int shared[block_size]; - shared[threadIdx.x] = do_output_element; - - // a shared copy_if operation follows: - - block::inplace_inclusive_scan(shared, thrust::plus()); - - unsigned int size_of_total_intersection = shared[block_size-1]; - - // find the index to write our element - unsigned int output_index = 0; - if(threadIdx.x > 0) - { - output_index = shared[threadIdx.x-1]; - } - - if(do_output_element) - { - result[output_index] = first1[threadIdx.x]; - } - - // find the last element of both ranges to be consumed, and point to the next element - __shared__ unsigned int one_past_last_element_consumed[2]; - if(threadIdx.x == 0) - { - one_past_last_element_consumed[0] = 0; - one_past_last_element_consumed[1] = 0; - } - __syncthreads(); - - if(do_output_element && shared[threadIdx.x] == size_of_total_intersection) - { - one_past_last_element_consumed[0] = threadIdx.x+1; - one_past_last_element_consumed[1] = (matches2.first + size_of_partial_intersection) - first2; - } - __syncthreads(); - - return thrust::make_tuple(first1 + one_past_last_element_consumed[0], - first2 + one_past_last_element_consumed[1], - result + size_of_total_intersection); -} // end set_intersection - -} // end block - -namespace set_intersection_detail -{ - -template -T1 ceil_div(T1 up, T2 down) -{ - T1 div = up / down; - T1 rem = up % down; - return (rem != 0) ? div + 1 : div; -} - -template - struct align_size_to_int -{ - static const unsigned int value = (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0); -}; - -// limited form of block-synchronous rotate which assumes distance(first, last) <= blockDim.x -template -__device__ - void small_rotate(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last) -{ - typedef typename thrust::iterator_value::type value_type; - - typedef typename thrust::iterator_difference::type difference; - - // XXX these should use scalar::distance - difference k = middle - first; - difference l = last - middle; - - // advance first - // XXX this should use scalar::advance - first += threadIdx.x; - - value_type temp = dereference(first); - - __syncthreads(); - - if(threadIdx.x < k) - { - // move front half to back - // advance first to point to the back - // XXX this should use scalar::advance - first += l; - - dereference(first) = temp; - } - else - { - // move back half to front - // advance first to point to the front - // XXX this should use scalar::advance - first -= k; - - dereference(first) = temp; - } -} - -template -__device__ - unsigned int shift_and_fetch(RandomAccessIterator first, - RandomAccessIterator last, - T *s_storage, - Range s_range) -{ - // push remaining input from the previous iteration to the front of the storage - if(s_range.first != s_range.second) - { - // note that rotate is actually redundant -- we don't need to keep [first,middle) - small_rotate(s_storage, s_range.first, s_range.second); - - // note that the block is convergent at this barrier - __syncthreads(); - } - - // compute the number of new elements to copy - unsigned int n = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (s_range.first - s_storage, last - first); - - // copy from the input into shared mem - thrust::detail::device::cuda::block::copy(first, first + n, s_storage + (s_range.second - s_range.first)); - - __syncthreads(); - - return n; -} - - -template -__launch_bounds__(block_size, 1) -__global__ void set_intersection_kernel(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - RandomAccessIterator4 partition_begin_indices1, - RandomAccessIterator5 partition_begin_indices2, - RandomAccessIterator6 result_partition_sizes, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type value_type; - - const unsigned int block_idx = blockIdx.x; - const unsigned int thread_idx = threadIdx.x; - - // advance iterators - partition_begin_indices1 += block_idx; - partition_begin_indices2 += block_idx; - - // find the ends of our partition if this is not the last block - if(block_idx != gridDim.x - 1) - { - RandomAccessIterator4 temp1 = partition_begin_indices1 + 1; - RandomAccessIterator5 temp2 = partition_begin_indices2 + 1; - - last1 = first1 + dereference(temp1); - last2 = first2 + dereference(temp2); - } - - // point to the beginning of our partition in each range - first1 += dereference(partition_begin_indices1); - first2 += dereference(partition_begin_indices2); - result += dereference(partition_begin_indices1); - - // allocate shared backing store - const unsigned int array_size = align_size_to_int::value; - __shared__ int _shared1[array_size]; - __shared__ int _shared2[array_size]; - __shared__ int _result[array_size]; - - value_type *s_storage1 = reinterpret_cast(_shared1); - value_type *s_storage2 = reinterpret_cast(_shared2); - value_type *s_result = reinterpret_cast(_result); - - // keep two ranges in shared memory - // these ranges begin empty, pointing to the end of their backing store - pair s_range1 = make_pair(s_storage1 + block_size, s_storage1 + block_size); - pair s_range2 = make_pair(s_storage2 + block_size, s_storage2 + block_size); - - typename thrust::iterator_value::type result_partition_size(0); - - unsigned int round = 0; - - // continuously bring input into our shared storage over multiple rounds until - // we find we can accomodation input but have none left - // the loop termination condition is somewhat complicated, so we just break at the bottom - while(true) - { - // fetch into the first segment if there's input left and we have room - if((first1 < last1) && (s_range1.first > s_storage1)) - { - unsigned int num_new_elements = shift_and_fetch(first1,last1,s_storage1,s_range1); - - // advance first1 - first1 += num_new_elements; - - // adjust the bounds of range1 - unsigned int num_old_elements = s_range1.second - s_range1.first; - s_range1.first = s_storage1; - s_range1.second = s_storage1 + num_old_elements + num_new_elements; - } - - // fetch into the second segment if we have room - if((first2 < last2) && (s_range2.first > s_storage2)) - { - unsigned int num_new_elements = shift_and_fetch(first2,last2,s_storage2,s_range2); - - // advance first2 - first2 += num_new_elements; - - // adjust the bounds of range2 - unsigned int num_old_elements = s_range2.second - s_range2.first; - s_range2.first = s_storage2; - s_range2.second = s_storage2 + num_old_elements + num_new_elements; - } - - // XXX bring these into registers rather than do multiple shared loads? - bool range2_has_strictly_lesser_bound = comp(s_range2.second[-1], s_range1.second[-1]); - bool range1_has_strictly_lesser_bound = comp(s_range1.second[-1], s_range2.second[-1]); - - pair s_range_with_greater_bound = range2_has_strictly_lesser_bound ? s_range1 : s_range2; - pair s_range_with_lesser_bound = range2_has_strictly_lesser_bound ? s_range2 : s_range1; - - // XXX this spot is probably the point of parameterization for other related algorithms - value_type *s_result_end; - tie(s_range_with_lesser_bound.first, s_range_with_greater_bound.first, s_result_end) - = block::set_intersection(s_range_with_lesser_bound.first, s_range_with_lesser_bound.second, - s_range_with_greater_bound.first, s_range_with_greater_bound.second, - s_result, comp); - - // when s_range_with_greater_bound.second[-1] is not equivalent to s_range_with_lesser_bound.second[-1], - // (i.e., s_range_with_greater_bound truly has a greater bound) - // it's always safe to eject all of s_range_with_lesser_bound - // in this case, collapse s_range_with_lesser_bound to an empty range - // else, we need to retain elements that were not intersected. - // this is a special case that occurs in the presense of multisets. - if(range1_has_strictly_lesser_bound || range2_has_strictly_lesser_bound) - { - // ranges' bounds are not equivalent, it's safe to eject all of the range with lesser bound - s_range_with_lesser_bound.first = s_range_with_lesser_bound.second; - } - - // write the updated ranges back - s_range1 = range2_has_strictly_lesser_bound ? s_range_with_greater_bound : s_range_with_lesser_bound; - s_range2 = range2_has_strictly_lesser_bound ? s_range_with_lesser_bound : s_range_with_greater_bound; - - // copy out to result, advance iterator - result = thrust::detail::device::cuda::block::copy(s_result, s_result_end, result); - - __syncthreads(); - - // increment size of the output - result_partition_size += (s_result_end - s_result); - - ++round; - - // if we require more input but have run out, break the loop - if(s_range1.first == s_range1.second && first1 == last1) break; - if(s_range2.first == s_range2.second && first2 == last2) break; - } - - if(thread_idx == 0) - { - result_partition_sizes += block_idx; - dereference(result_partition_sizes) = result_partition_size; - } -} - - -template -__launch_bounds__(block_size, 1) -__global__ -void grouped_gather(RandomAccessIterator1 result, - RandomAccessIterator2 first, - RandomAccessIterator3 input_segment_begin_indices, - RandomAccessIterator4 output_segment_end_indices) -{ - using namespace thrust::detail::device; - - // advance iterators - input_segment_begin_indices += blockIdx.x; - output_segment_end_indices += blockIdx.x; - - // point at the beginning of this block's input - first += dereference(input_segment_begin_indices); - - // initialize the size of the segment to read - typename thrust::iterator_value::type size = dereference(output_segment_end_indices); - - if(blockIdx.x != 0) - { - RandomAccessIterator4 ptr_to_previous_segment_end_index = output_segment_end_indices; - --ptr_to_previous_segment_end_index; - - // get the end of the previous segment - typename thrust::iterator_value::type previous_segment_end_index = dereference(ptr_to_previous_segment_end_index); - - // point result at our segment's begin index - result += previous_segment_end_index; - - // compute our segment's size: the difference between our end index and the previous segment's end index - size -= previous_segment_end_index; - } - - thrust::detail::device::cuda::block::copy(first, first + size, result); -} - - -template -struct mult_by - : thrust::unary_function -{ - T _value; - - mult_by(const T& v):_value(v){} - - __host__ __device__ - T operator()(const T& v) const - { - return _value * v; - } -}; - -} // end namespace set_intersection_detail - - -template -RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp) -{ - typedef typename thrust::iterator_difference::type difference1; - typedef typename thrust::iterator_difference::type difference2; - - const difference1 num_elements1 = last1 - first1; - const difference2 num_elements2 = last2 - first2; - - // check for trivial problem - if(num_elements1 == 0 || num_elements2 == 0) - return result; - - using namespace set_intersection_detail; - using namespace thrust::detail; - - typedef typename thrust::iterator_value::type value_type; - - // XXX makes more sense to create a number of partitions equal to the number of SMs - const size_t block_size = 128; - const size_t partition_size = 1024; - - difference1 num_partitions = ceil_div(num_elements1, partition_size); - - // create the range [0, partition_size, 2*partition_size, 3*partition_size, ...] - typedef thrust::counting_iterator counter1; - thrust::transform_iterator< mult_by, counter1> - partition_begin_indices1_guess - = thrust::make_transform_iterator(counter1(0), mult_by(partition_size)); - - // XXX we could encapsulate this gather in a permutation_iterator, - // but the call to unique before confounds us a little - raw_buffer partition_values(num_partitions); - thrust::gather(partition_begin_indices1_guess, partition_begin_indices1_guess + partition_values.size(), - first1, - partition_values.begin()); - - // we require the partition splitters to be unique - typename raw_buffer::iterator end = - thrust::unique(partition_values.begin(), partition_values.end()); - num_partitions = end - partition_values.begin(); - - raw_buffer partition_begin_indices1(num_partitions); - raw_buffer partition_begin_indices2(num_partitions); - - thrust::lower_bound(first1, last1, - partition_values.begin(), partition_values.end(), - partition_begin_indices1.begin(), comp); - - thrust::lower_bound(first2, last2, - partition_values.begin(), partition_values.end(), - partition_begin_indices2.begin(), comp); - - raw_buffer result_partition_sizes(num_partitions); - raw_buffer< value_type, cuda_device_space_tag> temp_result(num_elements1); - - set_intersection_detail::set_intersection_kernel<<<(unsigned int) num_partitions, (unsigned int) block_size >>>( - first1, last1, - first2, last2, - temp_result.begin(), - partition_begin_indices1.begin(), - partition_begin_indices2.begin(), - result_partition_sizes.begin(), - comp); - synchronize_if_enabled("set_intersection_kernel"); - - thrust::inclusive_scan(result_partition_sizes.begin(), result_partition_sizes.end(), result_partition_sizes.begin()); - - // after the inclusive scan, we have the end of each segment - raw_buffer &output_segment_end_indices = result_partition_sizes; - - set_intersection_detail::grouped_gather<<<(unsigned int) num_partitions, (unsigned int) block_size >>>( - result, - temp_result.begin(), - partition_begin_indices1.begin(), - output_segment_end_indices.begin()); - synchronize_if_enabled("grouped_gather"); - - return result + result_partition_sizes[num_partitions - 1]; -} // end set_intersection - -} // end namespace cuda -} // end namespace device -} // end namespace detail -} // end namespace thrust - -#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - diff --git a/thrust/detail/device/dispatch/set_operations.h b/thrust/detail/device/dispatch/set_operations.h index 201961516..6d6c3599b 100644 --- a/thrust/detail/device/dispatch/set_operations.h +++ b/thrust/detail/device/dispatch/set_operations.h @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include namespace thrust