First cut at improved CUDA set_intersection & perf test.

STEllAR-GROUP · Nov 6, 2010 · 6031df5 · 6031df5
1 parent b6233e2
commit 6031df5
Show file tree

Hide file tree

Showing 11 changed files with 776 additions and 598 deletions.
diff --git a/performance/set_intersection.test b/performance/set_intersection.test
@@ -0,0 +1,45 @@
+PREAMBLE = \
+    """
+    #include <thrust/set_operations.h>
+    #include <thrust/sort.h>
+    #include <algorithm>
+    """
+
+INITIALIZE = \
+    """
+    thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
+    thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
+    thrust::sort(h_a.begin(), h_a.end());
+    thrust::sort(h_b.begin(), h_b.end());
+
+    thrust::host_vector<$InputType> h_result(h_a.size());
+    thrust::host_vector<$InputType>::iterator new_end = 
+      thrust::set_intersection(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
+    h_result.resize(new_end - h_result.begin());
+
+    thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;
+
+    thrust::device_vector<$InputType> d_result(h_result.size());
+    thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+    """
+
+TIME = \
+    """
+    thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
+    """
+
+FINALIZE = \
+    """
+    RECORD_TIME();
+    RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType));
+    RECORD_SORTING_RATE(2 * double($InputSize))
+    """
+
+
+InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
+InputSizes = [2**N for N in range(10, 25)]
+
+TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
+
diff --git a/testing/set_intersection.cu b/testing/set_intersection.cu
@@ -141,48 +141,26 @@ void TestSetIntersectionMultiset(const size_t n)
 DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset);
 
 
-struct non_arithmetic
+template<typename U>
+  void TestSetIntersectionKeyValue(size_t n)
 {
-  __host__ __device__
-  non_arithmetic(void)
-    {}
+  typedef key_value<U,U> T;
 
-  __host__ __device__
-  non_arithmetic(const non_arithmetic &x)
-    : key(x.key) {}
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
 
-  __host__ __device__
-  non_arithmetic(const int k)
-    : key(k) {}
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
 
-  __host__ __device__
-  bool operator<(const non_arithmetic &rhs) const
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
   {
-    return key < rhs.key;
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
   }
 
-  __host__ __device__
-  bool operator==(const non_arithmetic &rhs) const
-  {
-    return key == rhs.key;
-  }
-
-  int key;
-};
-
-
-void TestSetIntersectionNonArithmetic(void)
-{
-  const unsigned int n = 12345;
-
-  typedef non_arithmetic T;
-
-  thrust::host_vector<T> temp = unittest::random_integers<int>(2 * n);
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
-
-  thrust::sort(h_a.begin(), h_a.end());
-  thrust::sort(h_b.begin(), h_b.end());
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
 
   thrust::device_vector<T> d_a = h_a;
   thrust::device_vector<T> d_b = h_b;
@@ -206,6 +184,5 @@ void TestSetIntersectionNonArithmetic(void)
 
   ASSERT_EQUAL_QUIET(h_result, d_result);
 }
-DECLARE_UNITTEST(TestSetIntersectionNonArithmetic);
-
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionKeyValue);
 
diff --git a/thrust/detail/device/cuda/block/inclusive_scan.h b/thrust/detail/device/cuda/block/inclusive_scan.h
@@ -51,6 +51,32 @@ __device__
   if(block_size > 512) { if (threadIdx.x >= 512) { val = binary_op(first[threadIdx.x - 512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
 } // end inplace_inclusive_scan()
 
+
+template<typename RandomAccessIterator,
+         typename Size,
+         typename BinaryFunction>
+__device__
+void inplace_inclusive_scan_n(RandomAccessIterator first,
+                              Size n,
+                              BinaryFunction binary_op)
+{
+  typename thrust::iterator_value<RandomAccessIterator>::type val = first[threadIdx.x];
+  __syncthreads();
+
+  // assume n <= 2048
+  if(n >    1) { if (threadIdx.x < n && threadIdx.x >=    1) { val = binary_op(first[threadIdx.x -    1], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >    2) { if (threadIdx.x < n && threadIdx.x >=    2) { val = binary_op(first[threadIdx.x -    2], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } 
+  if(n >    4) { if (threadIdx.x < n && threadIdx.x >=    4) { val = binary_op(first[threadIdx.x -    4], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >    8) { if (threadIdx.x < n && threadIdx.x >=    8) { val = binary_op(first[threadIdx.x -    8], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >   16) { if (threadIdx.x < n && threadIdx.x >=   16) { val = binary_op(first[threadIdx.x -   16], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >   32) { if (threadIdx.x < n && threadIdx.x >=   32) { val = binary_op(first[threadIdx.x -   32], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >   64) { if (threadIdx.x < n && threadIdx.x >=   64) { val = binary_op(first[threadIdx.x -   64], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >  128) { if (threadIdx.x < n && threadIdx.x >=  128) { val = binary_op(first[threadIdx.x -  128], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >  256) { if (threadIdx.x < n && threadIdx.x >=  256) { val = binary_op(first[threadIdx.x -  256], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >  512) { if (threadIdx.x < n && threadIdx.x >=  512) { val = binary_op(first[threadIdx.x -  512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n > 1024) { if (threadIdx.x < n && threadIdx.x >= 1024) { val = binary_op(first[threadIdx.x - 1024], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+} // end inplace_inclusive_scan()
+
 } // end namespace block
 } // end namespace cuda
 } // end namespace device

diff --git a/thrust/detail/device/cuda/set_operations.h → ...tail/device/cuda/block/set_intersection.h b/thrust/detail/device/cuda/set_operations.h → ...tail/device/cuda/block/set_intersection.h
@@ -18,34 +18,34 @@
 
 namespace thrust
 {
-
 namespace detail
 {
-
 namespace device
 {
-
 namespace cuda
 {
+namespace block
+{
 
 template<typename RandomAccessIterator1,
          typename RandomAccessIterator2,
          typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
          typename StrictWeakOrdering>
-  RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1,
+__device__ __forceinline__
+  RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1,
                                          RandomAccessIterator1 last1,
                                          RandomAccessIterator2 first2,
                                          RandomAccessIterator2 last2,
-                                         RandomAccessIterator3 result,
+                                         RandomAccessIterator3 temporary,
+                                         RandomAccessIterator4 result,
                                          StrictWeakOrdering comp);
 
+} // end block
 } // end cuda
-
 } // end device
-
 } // end detail
-
 } // end thrust
 
-#include <thrust/detail/device/cuda/set_operations.inl>
+#include <thrust/detail/device/cuda/block/set_intersection.inl>
 
diff --git a/thrust/detail/device/cuda/block/set_intersection.inl b/thrust/detail/device/cuda/block/set_intersection.inl
@@ -0,0 +1,141 @@
+/*
+ *  Copyright 2008-2010 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/device/cuda/block/merge.h>
+#include <thrust/detail/device/generic/scalar/binary_search.h>
+#include <thrust/detail/device/dereference.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace device
+{
+namespace cuda
+{
+namespace block
+{
+
+namespace set_intersection_detail
+{
+
+// this predicate tests two two-element tuples
+// we first use a Compare for the first element
+// if the first elements are equivalent, we use
+// < for the second elements
+// XXX merge duplicates this
+//     move it some place common
+template<typename Compare>
+  struct compare_first_less_second
+{
+  __host__ __device__
+  compare_first_less_second(Compare c)
+    : comp(c) {}
+
+  template<typename T1, typename T2>
+  __host__ __device__
+  bool operator()(T1 lhs, T2 rhs)
+  {
+    return comp(lhs.get<0>(), rhs.get<0>()) || (!comp(rhs.get<0>(), lhs.get<0>()) && lhs.get<1>() < rhs.get<1>());
+  }
+
+  Compare comp;
+}; // end compare_first_less_second
+
+} // end set_intersection_detail
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename StrictWeakOrdering>
+__device__ __forceinline__
+  RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1,
+                                         RandomAccessIterator1 last1,
+                                         RandomAccessIterator2 first2,
+                                         RandomAccessIterator2 last2,
+                                         RandomAccessIterator3 temporary,
+                                         RandomAccessIterator4 result,
+                                         StrictWeakOrdering comp)
+{
+  using namespace set_intersection_detail;
+
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
+  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
+
+  difference1 n1 = last1 - first1;
+
+  // search for all matches in the second range for each element in the first
+  bool found = false;
+  if(threadIdx.x < n1)
+  {
+    RandomAccessIterator1 x = first1;
+    x += threadIdx.x;
+
+    // count the number of previous occurrances of x the first range
+    difference1 rank = x - thrust::detail::device::generic::scalar::lower_bound(first1,x,dereference(x),comp);
+
+    // count the number of equivalent elements of x in the second range
+    thrust::pair<RandomAccessIterator2,RandomAccessIterator2> matches = 
+      thrust::detail::device::generic::scalar::equal_range(first2,last2,dereference(x),comp);
+
+    difference2 num_matches = matches.second - matches.first;
+
+    // the element is "found" if its rank is less than the number of matches
+    found = rank < num_matches;
+  } // end if
+
+  // mark whether my element was found or not in the scratch array
+  RandomAccessIterator3 temp = temporary;
+  temp += threadIdx.x;
+  dereference(temp) = found;
+
+  block::inplace_inclusive_scan_n(temporary, n1, thrust::plus<int>());
+
+  // copy_if
+  if(found)
+  {
+    // find the index to write our element
+    unsigned int output_index = 0;
+    if(threadIdx.x > 0)
+    {
+      RandomAccessIterator3 src = temporary;
+      src += threadIdx.x - 1;
+      output_index = dereference(src);
+    } // end if
+
+    RandomAccessIterator1 x = first1;
+    x += threadIdx.x;
+
+    RandomAccessIterator4 dst = result;
+    dst += output_index;
+    dereference(dst) = dereference(x);
+  } // end if
+
+  return result + temporary[n1-1];
+} // end set_intersection
+
+} // end block
+} // end cuda
+} // end device
+} // end detail
+} // end thrust
+
diff --git a/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl b/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl
@@ -186,7 +186,10 @@ template<typename RandomAccessIterator1,
                       splitters_begin, splitters_end, 
                       splitter_ranks2, comp);
 
-  thrust::upper_bound(first1, last1,
+//  thrust::upper_bound(first1, last1,
+//                      splitters_begin, splitters_end,
+//                      splitter_ranks1, comp);
+  thrust::lower_bound(first1, last1,
                       splitters_begin, splitters_end,
                       splitter_ranks1, comp);
 } // end get_set_operation_splitter_ranks()

diff --git a/thrust/detail/device/cuda/merge.inl b/thrust/detail/device/cuda/merge.inl
@@ -200,6 +200,8 @@ __global__ void merge_kernel(const RandomAccessIterator1 first1,
 // we first use a Compare for the first element
 // if the first elements are equivalent, we use
 // < for the second elements
+// XXX set_intersection duplicates this
+//     move it some place common
 template<typename Compare>
   struct compare_first_less_second
 {