From 6031df5642b2088a12271682e08fa5b17f75ffba Mon Sep 17 00:00:00 2001
From: Jared Hoberock <jaredhoberock@gmail.com>
Date: Fri, 5 Nov 2010 17:39:19 -0700
Subject: [PATCH] First cut at improved CUDA set_intersection & perf test.

---
 performance/set_intersection.test             |  45 ++
 testing/set_intersection.cu                   |  51 +-
 .../detail/device/cuda/block/inclusive_scan.h |  26 +
 .../set_intersection.h}                       |  18 +-
 .../device/cuda/block/set_intersection.inl    | 141 +++++
 .../get_set_operation_splitter_ranks.inl      |   5 +-
 thrust/detail/device/cuda/merge.inl           |   2 +
 thrust/detail/device/cuda/set_intersection.h  |  49 ++
 .../detail/device/cuda/set_intersection.inl   | 485 +++++++++++++++
 thrust/detail/device/cuda/set_operations.inl  | 550 ------------------
 .../detail/device/dispatch/set_operations.h   |   2 +-
 11 files changed, 776 insertions(+), 598 deletions(-)
 create mode 100644 performance/set_intersection.test
 rename thrust/detail/device/cuda/{set_operations.h => block/set_intersection.h} (76%)
 create mode 100644 thrust/detail/device/cuda/block/set_intersection.inl
 create mode 100644 thrust/detail/device/cuda/set_intersection.h
 create mode 100644 thrust/detail/device/cuda/set_intersection.inl
 delete mode 100644 thrust/detail/device/cuda/set_operations.inl
diff --git a/performance/set_intersection.test b/performance/set_intersection.test
new file mode 100644
index 000000000..2316fc36a
--- /dev/null
+++ b/performance/set_intersection.test
@@ -0,0 +1,45 @@
+PREAMBLE = \
+    """
+    #include <thrust/set_operations.h>
+    #include <thrust/sort.h>
+    #include <algorithm>
+    """
+
+INITIALIZE = \
+    """
+    thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
+    thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
+    thrust::sort(h_a.begin(), h_a.end());
+    thrust::sort(h_b.begin(), h_b.end());
+
+    thrust::host_vector<$InputType> h_result(h_a.size());
+    thrust::host_vector<$InputType>::iterator new_end = 
+      thrust::set_intersection(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
+    h_result.resize(new_end - h_result.begin());
+
+    thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;
+
+    thrust::device_vector<$InputType> d_result(h_result.size());
+    thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
+
+    ASSERT_EQUAL(h_result, d_result);
+    """
+
+TIME = \
+    """
+    thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
+    """
+
+FINALIZE = \
+    """
+    RECORD_TIME();
+    RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType));
+    RECORD_SORTING_RATE(2 * double($InputSize))
+    """
+
+
+InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
+InputSizes = [2**N for N in range(10, 25)]
+
+TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
+
diff --git a/testing/set_intersection.cu b/testing/set_intersection.cu
index c72fb0e13..f32423d45 100644
--- a/testing/set_intersection.cu
+++ b/testing/set_intersection.cu
@@ -141,48 +141,26 @@ void TestSetIntersectionMultiset(const size_t n)
 DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset);
 
 
-struct non_arithmetic
+template<typename U>
+  void TestSetIntersectionKeyValue(size_t n)
 {
-  __host__ __device__
-  non_arithmetic(void)
-    {}
+  typedef key_value<U,U> T;
 
-  __host__ __device__
-  non_arithmetic(const non_arithmetic &x)
-    : key(x.key) {}
+  thrust::host_vector<U> h_keys_a   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);
 
-  __host__ __device__
-  non_arithmetic(const int k)
-    : key(k) {}
+  thrust::host_vector<U> h_keys_b   = unittest::random_integers<U>(n);
+  thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);
 
-  __host__ __device__
-  bool operator<(const non_arithmetic &rhs) const
+  thrust::host_vector<T> h_a(n), h_b(n);
+  for(size_t i = 0; i < n; ++i)
   {
-    return key < rhs.key;
+    h_a[i] = T(h_keys_a[i], h_values_a[i]);
+    h_b[i] = T(h_keys_b[i], h_values_b[i]);
   }
 
-  __host__ __device__
-  bool operator==(const non_arithmetic &rhs) const
-  {
-    return key == rhs.key;
-  }
-
-  int key;
-};
-
-
-void TestSetIntersectionNonArithmetic(void)
-{
-  const unsigned int n = 12345;
-
-  typedef non_arithmetic T;
-
-  thrust::host_vector<T> temp = unittest::random_integers<int>(2 * n);
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
-
-  thrust::sort(h_a.begin(), h_a.end());
-  thrust::sort(h_b.begin(), h_b.end());
+  thrust::stable_sort(h_a.begin(), h_a.end());
+  thrust::stable_sort(h_b.begin(), h_b.end());
 
   thrust::device_vector<T> d_a = h_a;
   thrust::device_vector<T> d_b = h_b;
@@ -206,6 +184,5 @@ void TestSetIntersectionNonArithmetic(void)
 
   ASSERT_EQUAL_QUIET(h_result, d_result);
 }
-DECLARE_UNITTEST(TestSetIntersectionNonArithmetic);
-
+DECLARE_VARIABLE_UNITTEST(TestSetIntersectionKeyValue);
 
diff --git a/thrust/detail/device/cuda/block/inclusive_scan.h b/thrust/detail/device/cuda/block/inclusive_scan.h
index 1235c5654..583237898 100644
--- a/thrust/detail/device/cuda/block/inclusive_scan.h
+++ b/thrust/detail/device/cuda/block/inclusive_scan.h
@@ -51,6 +51,32 @@ __device__
   if(block_size > 512) { if (threadIdx.x >= 512) { val = binary_op(first[threadIdx.x - 512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
 } // end inplace_inclusive_scan()
 
+
+template<typename RandomAccessIterator,
+         typename Size,
+         typename BinaryFunction>
+__device__
+void inplace_inclusive_scan_n(RandomAccessIterator first,
+                              Size n,
+                              BinaryFunction binary_op)
+{
+  typename thrust::iterator_value<RandomAccessIterator>::type val = first[threadIdx.x];
+  __syncthreads();
+
+  // assume n <= 2048
+  if(n >    1) { if (threadIdx.x < n && threadIdx.x >=    1) { val = binary_op(first[threadIdx.x -    1], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >    2) { if (threadIdx.x < n && threadIdx.x >=    2) { val = binary_op(first[threadIdx.x -    2], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); } 
+  if(n >    4) { if (threadIdx.x < n && threadIdx.x >=    4) { val = binary_op(first[threadIdx.x -    4], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >    8) { if (threadIdx.x < n && threadIdx.x >=    8) { val = binary_op(first[threadIdx.x -    8], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >   16) { if (threadIdx.x < n && threadIdx.x >=   16) { val = binary_op(first[threadIdx.x -   16], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >   32) { if (threadIdx.x < n && threadIdx.x >=   32) { val = binary_op(first[threadIdx.x -   32], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >   64) { if (threadIdx.x < n && threadIdx.x >=   64) { val = binary_op(first[threadIdx.x -   64], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >  128) { if (threadIdx.x < n && threadIdx.x >=  128) { val = binary_op(first[threadIdx.x -  128], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >  256) { if (threadIdx.x < n && threadIdx.x >=  256) { val = binary_op(first[threadIdx.x -  256], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n >  512) { if (threadIdx.x < n && threadIdx.x >=  512) { val = binary_op(first[threadIdx.x -  512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+  if(n > 1024) { if (threadIdx.x < n && threadIdx.x >= 1024) { val = binary_op(first[threadIdx.x - 1024], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
+} // end inplace_inclusive_scan()
+
 } // end namespace block
 } // end namespace cuda
 } // end namespace device
diff --git a/thrust/detail/device/cuda/set_operations.h b/thrust/detail/device/cuda/block/set_intersection.h
similarity index 76%
rename from thrust/detail/device/cuda/set_operations.h
rename to thrust/detail/device/cuda/block/set_intersection.h
index da9ec0356..7e8c68e42 100644
--- a/thrust/detail/device/cuda/set_operations.h
+++ b/thrust/detail/device/cuda/block/set_intersection.h
@@ -18,34 +18,34 @@
 
 namespace thrust
 {
-
 namespace detail
 {
-
 namespace device
 {
-
 namespace cuda
 {
+namespace block
+{
 
 template<typename RandomAccessIterator1,
          typename RandomAccessIterator2,
          typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
          typename StrictWeakOrdering>
-  RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1,
+__device__ __forceinline__
+  RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1,
                                          RandomAccessIterator1 last1,
                                          RandomAccessIterator2 first2,
                                          RandomAccessIterator2 last2,
-                                         RandomAccessIterator3 result,
+                                         RandomAccessIterator3 temporary,
+                                         RandomAccessIterator4 result,
                                          StrictWeakOrdering comp);
 
+} // end block
 } // end cuda
-
 } // end device
-
 } // end detail
-
 } // end thrust
 
-#include <thrust/detail/device/cuda/set_operations.inl>
+#include <thrust/detail/device/cuda/block/set_intersection.inl>
 
diff --git a/thrust/detail/device/cuda/block/set_intersection.inl b/thrust/detail/device/cuda/block/set_intersection.inl
new file mode 100644
index 000000000..d1c272ab7
--- /dev/null
+++ b/thrust/detail/device/cuda/block/set_intersection.inl
@@ -0,0 +1,141 @@
+/*
+ *  Copyright 2008-2010 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/device/cuda/block/merge.h>
+#include <thrust/detail/device/generic/scalar/binary_search.h>
+#include <thrust/detail/device/dereference.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace device
+{
+namespace cuda
+{
+namespace block
+{
+
+namespace set_intersection_detail
+{
+
+// this predicate tests two two-element tuples
+// we first use a Compare for the first element
+// if the first elements are equivalent, we use
+// < for the second elements
+// XXX merge duplicates this
+//     move it some place common
+template<typename Compare>
+  struct compare_first_less_second
+{
+  __host__ __device__
+  compare_first_less_second(Compare c)
+    : comp(c) {}
+
+  template<typename T1, typename T2>
+  __host__ __device__
+  bool operator()(T1 lhs, T2 rhs)
+  {
+    return comp(lhs.get<0>(), rhs.get<0>()) || (!comp(rhs.get<0>(), lhs.get<0>()) && lhs.get<1>() < rhs.get<1>());
+  }
+
+  Compare comp;
+}; // end compare_first_less_second
+
+} // end set_intersection_detail
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename StrictWeakOrdering>
+__device__ __forceinline__
+  RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1,
+                                         RandomAccessIterator1 last1,
+                                         RandomAccessIterator2 first2,
+                                         RandomAccessIterator2 last2,
+                                         RandomAccessIterator3 temporary,
+                                         RandomAccessIterator4 result,
+                                         StrictWeakOrdering comp)
+{
+  using namespace set_intersection_detail;
+
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
+  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
+
+  difference1 n1 = last1 - first1;
+
+  // search for all matches in the second range for each element in the first
+  bool found = false;
+  if(threadIdx.x < n1)
+  {
+    RandomAccessIterator1 x = first1;
+    x += threadIdx.x;
+
+    // count the number of previous occurrances of x the first range
+    difference1 rank = x - thrust::detail::device::generic::scalar::lower_bound(first1,x,dereference(x),comp);
+
+    // count the number of equivalent elements of x in the second range
+    thrust::pair<RandomAccessIterator2,RandomAccessIterator2> matches = 
+      thrust::detail::device::generic::scalar::equal_range(first2,last2,dereference(x),comp);
+
+    difference2 num_matches = matches.second - matches.first;
+
+    // the element is "found" if its rank is less than the number of matches
+    found = rank < num_matches;
+  } // end if
+
+  // mark whether my element was found or not in the scratch array
+  RandomAccessIterator3 temp = temporary;
+  temp += threadIdx.x;
+  dereference(temp) = found;
+
+  block::inplace_inclusive_scan_n(temporary, n1, thrust::plus<int>());
+
+  // copy_if
+  if(found)
+  {
+    // find the index to write our element
+    unsigned int output_index = 0;
+    if(threadIdx.x > 0)
+    {
+      RandomAccessIterator3 src = temporary;
+      src += threadIdx.x - 1;
+      output_index = dereference(src);
+    } // end if
+
+    RandomAccessIterator1 x = first1;
+    x += threadIdx.x;
+
+    RandomAccessIterator4 dst = result;
+    dst += output_index;
+    dereference(dst) = dereference(x);
+  } // end if
+
+  return result + temporary[n1-1];
+} // end set_intersection
+
+} // end block
+} // end cuda
+} // end device
+} // end detail
+} // end thrust
+
diff --git a/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl b/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl
index dffc1b5cd..d0ef66e12 100644
--- a/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl
+++ b/thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.inl
@@ -186,7 +186,10 @@ template<typename RandomAccessIterator1,
                       splitters_begin, splitters_end, 
                       splitter_ranks2, comp);
 
-  thrust::upper_bound(first1, last1,
+//  thrust::upper_bound(first1, last1,
+//                      splitters_begin, splitters_end,
+//                      splitter_ranks1, comp);
+  thrust::lower_bound(first1, last1,
                       splitters_begin, splitters_end,
                       splitter_ranks1, comp);
 } // end get_set_operation_splitter_ranks()
diff --git a/thrust/detail/device/cuda/merge.inl b/thrust/detail/device/cuda/merge.inl
index a842ad059..ecdbd8d19 100644
--- a/thrust/detail/device/cuda/merge.inl
+++ b/thrust/detail/device/cuda/merge.inl
@@ -200,6 +200,8 @@ __global__ void merge_kernel(const RandomAccessIterator1 first1,
 // we first use a Compare for the first element
 // if the first elements are equivalent, we use
 // < for the second elements
+// XXX set_intersection duplicates this
+//     move it some place common
 template<typename Compare>
   struct compare_first_less_second
 {
diff --git a/thrust/detail/device/cuda/set_intersection.h b/thrust/detail/device/cuda/set_intersection.h
new file mode 100644
index 000000000..e64dc0e21
--- /dev/null
+++ b/thrust/detail/device/cuda/set_intersection.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright 2008-2010 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+
+#pragma once
+
+namespace thrust
+{
+namespace detail
+{
+namespace device
+{
+namespace cuda
+{
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2, 
+	 typename RandomAccessIterator3,
+         typename Compare>
+RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1,
+                                       RandomAccessIterator1 last1,
+                                       RandomAccessIterator2 first2,
+                                       RandomAccessIterator2 last2,
+                                       RandomAccessIterator3 result,
+                                       Compare comp);
+
+
+} // end cuda
+} // end device
+} // end detail
+} // end thrust
+
+#include <thrust/detail/device/cuda/set_intersection.inl>
+
diff --git a/thrust/detail/device/cuda/set_intersection.inl b/thrust/detail/device/cuda/set_intersection.inl
new file mode 100644
index 000000000..bb6b63b18
--- /dev/null
+++ b/thrust/detail/device/cuda/set_intersection.inl
@@ -0,0 +1,485 @@
+/*
+ *  Copyright 2008-2010 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/device/dereference.h>
+#include <thrust/extrema.h>
+#include <thrust/detail/device/cuda/arch.h>
+#include <thrust/detail/device/cuda/block/copy.h>
+#include <thrust/detail/device/cuda/block/set_intersection.h>
+#include <thrust/detail/device/cuda/synchronize.h>
+#include <thrust/detail/device/cuda/arch.h>
+#include <thrust/detail/device/cuda/extern_shared_ptr.h>
+#include <thrust/detail/device/cuda/detail/get_set_operation_splitter_ranks.h>
+#include <thrust/scan.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
+
+namespace thrust
+{
+namespace detail
+{
+namespace device
+{
+namespace cuda
+{
+
+namespace set_intersection_detail
+{
+
+template<typename T1, typename T2>
+T1 ceil_div(T1 up, T2 down)
+{
+  T1 div = up / down;
+  T1 rem = up % down;
+  return (rem != 0) ? div + 1 : div;
+}
+
+__host__ __device__
+inline unsigned int align_size_to_int(unsigned int N)
+{
+  return (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0);
+}
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator5>
+unsigned int get_set_intersection_kernel_per_block_dynamic_smem_usage(unsigned int block_size)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
+
+  // set_intersection_kernel allocates memory aligned to int
+  const unsigned int array_size1 = align_size_to_int(block_size * sizeof(value_type1));
+  const unsigned int array_size2 = align_size_to_int(block_size * sizeof(value_type2));
+  const unsigned int array_size3 = align_size_to_int(block_size * sizeof(int));
+  const unsigned int array_size4 = align_size_to_int(block_size * sizeof(value_type1));
+
+  return sizeof(int) * (array_size1 + array_size2 + array_size3 + array_size4);
+} // end get_set_intersection_kernel_per_block_dynamic_smem_usage()
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2, 
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename RandomAccessIterator5,
+         typename RandomAccessIterator6,
+         typename StrictWeakOrdering,
+         typename Size>
+__global__ void set_intersection_kernel(const RandomAccessIterator1 first1, 
+                                        const RandomAccessIterator1 last1,
+                                        const RandomAccessIterator2 first2,
+                                        const RandomAccessIterator2 last2,
+                                        RandomAccessIterator3 splitter_ranks1,
+                                        RandomAccessIterator4 splitter_ranks2,
+                                        const RandomAccessIterator5 result,
+                                        RandomAccessIterator6 result_partition_sizes,
+                                        StrictWeakOrdering comp,
+                                        Size num_partitions)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
+  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
+
+  // allocate shared storage
+  const unsigned int array_size1 = align_size_to_int(blockDim.x * sizeof(value_type1));
+  const unsigned int array_size2 = align_size_to_int(blockDim.x * sizeof(value_type2));
+  const unsigned int array_size3 = align_size_to_int(blockDim.x * sizeof(int));
+  const unsigned int array_size4 = align_size_to_int(blockDim.x * sizeof(value_type1));
+  int *_shared1  = extern_shared_ptr<int>();
+  int *_shared2  = _shared1 + array_size1;
+  int *s_scratch = _shared2 + array_size2;
+  int *_shared3  = s_scratch + array_size3;
+
+  value_type1 *s_input1 = reinterpret_cast<value_type1*>(_shared1);
+  value_type2 *s_input2 = reinterpret_cast<value_type2*>(_shared2);
+  value_type1 *s_result = reinterpret_cast<value_type1*>(_shared3);
+
+  // advance per-partition iterators
+  splitter_ranks1        += blockIdx.x;
+  splitter_ranks2        += blockIdx.x;
+  result_partition_sizes += blockIdx.x;
+
+  for(Size partition_idx     = blockIdx.x;
+      partition_idx          < num_partitions;
+      partition_idx          += gridDim.x,
+      splitter_ranks1        += gridDim.x,
+      splitter_ranks2        += gridDim.x,
+      result_partition_sizes += gridDim.x)
+  {
+    RandomAccessIterator1 input_begin1 = first1;
+    RandomAccessIterator1 input_end1   = last1;
+    RandomAccessIterator2 input_begin2 = first2;
+    RandomAccessIterator2 input_end2   = last2;
+
+    RandomAccessIterator5 output_begin = result;
+
+    // find the end of the input if this is not the last block
+    // the end of merged partition i is at splitter_ranks1[i] + splitter_ranks2[i]
+    if(partition_idx != num_partitions - 1)
+    {
+      RandomAccessIterator3 rank1 = splitter_ranks1;
+      RandomAccessIterator4 rank2 = splitter_ranks2;
+
+      input_end1 = first1 + dereference(rank1);
+      input_end2 = first2 + dereference(rank2);
+    }
+
+    // find the beginning of the input and output if this is not the first partition
+    // merged partition i begins at splitter_ranks1[i-1]
+    if(partition_idx != 0)
+    {
+      RandomAccessIterator3 rank1 = splitter_ranks1;
+      --rank1;
+      RandomAccessIterator4 rank2 = splitter_ranks2;
+      --rank2;
+
+      // advance the input to point to the beginning
+      input_begin1 += dereference(rank1);
+      input_begin2 += dereference(rank2);
+
+      // advance the result to point to the beginning of the output
+      output_begin += dereference(rank1);
+    }
+
+    // the result is empty to begin with
+    value_type1 *s_result_end = s_result;
+
+    if(input_begin1 < input_end1 && input_begin2 < input_end2)
+    {
+      typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
+
+      typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
+
+      // load the first segment
+      difference1 s_input1_size = thrust::min<difference1>(blockDim.x, input_end1 - input_begin1);
+
+      block::copy(input_begin1, input_begin1 + s_input1_size, s_input1);
+      input_begin1 += s_input1_size;
+
+      // load the second segment
+      difference2 s_input2_size = thrust::min<difference2>(blockDim.x, input_end2 - input_begin2);
+
+      block::copy(input_begin2, input_begin2 + s_input2_size, s_input2);
+      input_begin2 += s_input2_size;
+
+      __syncthreads();
+
+      s_result_end = block::set_intersection(s_input1, s_input1 + s_input1_size,
+                                             s_input2, s_input2 + s_input2_size,
+                                             s_scratch,
+                                             s_result,
+                                             comp);
+
+      __syncthreads();
+
+      // store to gmem
+      output_begin = block::copy(s_result, s_result_end, output_begin);
+    } // end if
+
+    // store size of the result
+    if(threadIdx.x == 0)
+    {
+      dereference(result_partition_sizes) = s_result_end - s_result;
+    } // end if
+  } // end for partition
+} // end set_intersection_kernel
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename Size>
+__global__
+void grouped_gather(const RandomAccessIterator1 result,
+                    const RandomAccessIterator2 first,
+                    RandomAccessIterator3 splitter_ranks1,
+                    RandomAccessIterator4 size_of_result_before_and_including_each_partition,
+                    Size num_partitions)
+{
+  using namespace thrust::detail::device;
+
+  // advance iterators
+  splitter_ranks1                                     += blockIdx.x;
+  size_of_result_before_and_including_each_partition  += blockIdx.x;
+
+  for(Size partition_idx                                 = blockIdx.x;
+      partition_idx                                      < num_partitions;
+      partition_idx                                      += gridDim.x,
+      splitter_ranks1                                    += gridDim.x,
+      size_of_result_before_and_including_each_partition += gridDim.x)
+  {
+    RandomAccessIterator1 output_begin = result;
+    RandomAccessIterator2 input_begin = first;
+
+    // find the location of the input and output if this is not the first partition
+    typename thrust::iterator_value<RandomAccessIterator4>::type partition_size
+      = dereference(size_of_result_before_and_including_each_partition);
+    if(partition_idx != 0)
+    {
+      // advance iterator to the beginning of this partition's input
+      RandomAccessIterator3 rank = splitter_ranks1;
+      --rank;
+      input_begin += dereference(rank);
+
+      // subtract away the previous element of size_of_result_preceding_each_partition
+      // resulting in this size of this partition
+      --size_of_result_before_and_including_each_partition;
+      typename thrust::iterator_value<RandomAccessIterator4>::type beginning_of_output
+        = dereference(size_of_result_before_and_including_each_partition);
+
+      output_begin += beginning_of_output;
+      partition_size -= dereference(size_of_result_before_and_including_each_partition);
+    } // end if
+
+    thrust::detail::device::cuda::block::copy(input_begin, input_begin + partition_size, output_begin);
+  } // end for partition
+} // end grouped_gather
+
+
+template<typename Compare>
+  struct equivalence_from_compare
+{
+  equivalence_from_compare(Compare c)
+    : comp(c) {}
+
+  template<typename T1, typename T2>
+  __host__ __device__
+  bool operator()(T1 &lhs, T2 &rhs)
+  {
+    // two elements are equivalent if neither sorts before the other
+    return !comp(lhs, rhs) && !comp(rhs, lhs);
+  }
+  
+  Compare comp;
+}; // end equivalence_from_compare
+
+
+// this predicate tests two two-element tuples
+// we first use a Compare for the first element
+// if the first elements are equivalent, we use
+// < for the second elements
+// XXX set_intersection duplicates this
+//     move it some place common
+template<typename Compare>
+  struct compare_first_less_second
+{
+  compare_first_less_second(Compare c)
+    : comp(c) {}
+
+  template<typename T1, typename T2>
+  __host__ __device__
+  bool operator()(T1 lhs, T2 rhs)
+  {
+    return comp(lhs.get<0>(), rhs.get<0>()) || (!comp(rhs.get<0>(), lhs.get<0>()) && lhs.get<1>() < rhs.get<1>());
+  }
+
+  Compare comp;
+}; // end compare_first_less_second
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename RandomAccessIterator4,
+         typename Compare,
+         typename Size1,
+         typename Size2>
+  void get_set_intersection_splitter_ranks(RandomAccessIterator1 first1,
+                                           RandomAccessIterator1 last1,
+                                           RandomAccessIterator2 first2,
+                                           RandomAccessIterator2 last2,
+                                           RandomAccessIterator3 splitter_ranks1,
+                                           RandomAccessIterator4 splitter_ranks2,
+                                           Compare comp,
+                                           Size1 partition_size,
+                                           Size2 num_splitters_from_each_range)
+{
+  using namespace thrust::detail;
+
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
+  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
+
+  const difference1 num_elements1 = last1 - first1;
+  const difference2 num_elements2 = last2 - first2;
+
+  // enumerate each key within its sub-segment of equivalent keys
+  // XXX replace these explicit ranges with fancy iterators
+  raw_buffer<difference1, cuda_device_space_tag> key_ranks1(num_elements1);
+  raw_buffer<difference2, cuda_device_space_tag> key_ranks2(num_elements2);
+
+  thrust::exclusive_scan_by_key(first1, last1,
+                                thrust::make_constant_iterator<difference1>(1),
+                                key_ranks1.begin(),
+                                difference1(0),
+                                equivalence_from_compare<Compare>(comp));
+
+  thrust::exclusive_scan_by_key(first2, last2,
+                                thrust::make_constant_iterator<difference2>(1),
+                                key_ranks2.begin(),
+                                difference2(0),
+                                equivalence_from_compare<Compare>(comp));
+
+  // zip up the keys with their ranks to disambiguate repeated elements during rank-finding
+  typedef typename raw_buffer<difference1, cuda_device_space_tag>::iterator RankIterator1;
+  typedef typename raw_buffer<difference2, cuda_device_space_tag>::iterator RankIterator2;
+  typedef thrust::tuple<RandomAccessIterator1,RankIterator1> iterator_tuple1;
+  typedef thrust::tuple<RandomAccessIterator2,RankIterator2> iterator_tuple2;
+  typedef thrust::zip_iterator<iterator_tuple1> iterator_and_rank1;
+  typedef thrust::zip_iterator<iterator_tuple2> iterator_and_rank2;
+
+  iterator_and_rank1 first_and_rank1 =
+    thrust::make_zip_iterator(thrust::make_tuple(first1, key_ranks1.begin()));
+  iterator_and_rank1 last_and_rank1 = first_and_rank1 + num_elements1;
+
+  iterator_and_rank2 first_and_rank2 =
+    thrust::make_zip_iterator(thrust::make_tuple(first2, key_ranks2.begin()));
+  iterator_and_rank2 last_and_rank2 = first_and_rank2 + num_elements2;
+
+  // take into account the tuples when comparing
+  typedef compare_first_less_second<Compare> splitter_compare;
+
+  using namespace thrust::detail::device::cuda::detail;
+  return get_set_operation_splitter_ranks(first_and_rank1, last_and_rank1,
+                                          first_and_rank2, last_and_rank2,
+                                          splitter_ranks1,
+                                          splitter_ranks2,
+                                          splitter_compare(comp),
+                                          partition_size,
+                                          num_splitters_from_each_range);
+} // end get_set_intersection_splitter_ranks()
+
+
+} // end namespace set_intersection_detail
+
+
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2, 
+	 typename RandomAccessIterator3,
+         typename Compare>
+RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1,
+                                       RandomAccessIterator1 last1,
+                                       RandomAccessIterator2 first2,
+                                       RandomAccessIterator2 last2,
+                                       RandomAccessIterator3 result,
+                                       Compare comp)
+{
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type      value_type1;
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
+  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
+
+  const difference1 num_elements1 = last1 - first1;
+  const difference2 num_elements2 = last2 - first2;
+
+  // check for trivial problem
+  if(num_elements1 == 0 || num_elements2 == 0)
+    return result;
+
+  using namespace set_intersection_detail;
+  using namespace thrust::detail;
+  using namespace thrust::detail::device::cuda::detail;
+
+  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
+  
+  // prefer large blocks to keep the partitions as large as possible
+  const size_t block_size =
+    arch::max_blocksize_subject_to_smem_usage(set_intersection_kernel<
+                                                RandomAccessIterator1,
+                                                RandomAccessIterator2,
+                                                typename raw_buffer<difference1,cuda_device_space_tag>::iterator,
+                                                typename raw_buffer<difference2,cuda_device_space_tag>::iterator,
+                                                typename raw_buffer<value_type1,cuda_device_space_tag>::iterator,
+                                                typename raw_buffer<difference1,cuda_device_space_tag>::iterator,
+                                                Compare,
+                                                size_t
+                                              >,
+                                              get_set_intersection_kernel_per_block_dynamic_smem_usage<
+                                                RandomAccessIterator1,
+                                                RandomAccessIterator2,
+                                                typename raw_buffer<value_type1,cuda_device_space_tag>::iterator
+                                              >);
+
+  const size_t partition_size = block_size;
+  const difference1 num_partitions = ceil_div(num_elements1, partition_size);
+  const difference1 num_splitters_from_each_range  = num_partitions - 1;
+  const size_t num_merged_partitions = 2 * num_splitters_from_each_range + 1;
+
+  // allocate storage for splitter ranks
+  raw_buffer<difference1, cuda_device_space_tag> splitter_ranks1(2 * num_splitters_from_each_range);
+  raw_buffer<difference2, cuda_device_space_tag> splitter_ranks2(2 * num_splitters_from_each_range);
+
+  // select some splitters and find the rank of each splitter in the other range
+  // XXX it's possible to fuse rank-finding with the merge_kernel below
+  //     this eliminates the temporary buffers splitter_ranks1 & splitter_ranks2
+  //     but this spills to lmem and causes a 10x speeddown
+  get_set_intersection_splitter_ranks(first1,last1,
+                                      first2,last2,
+                                      splitter_ranks1.begin(),
+                                      splitter_ranks2.begin(),
+                                      comp,
+                                      partition_size,
+                                      num_splitters_from_each_range);
+
+  // allocate storage to store each intersected partition's size
+  raw_buffer<difference1, cuda_device_space_tag> result_partition_sizes(num_merged_partitions);
+
+  // allocate storage to store the largest possible intersection
+  raw_buffer<typename thrust::iterator_value<RandomAccessIterator1>::type, cuda_device_space_tag> temporary_results(num_elements1);
+
+  // maximize the number of blocks we can launch
+  const size_t max_blocks = thrust::detail::device::cuda::arch::max_grid_dimensions().x;
+  const size_t num_blocks = thrust::min(num_merged_partitions, max_blocks);
+  const size_t dynamic_smem_size = get_set_intersection_kernel_per_block_dynamic_smem_usage<RandomAccessIterator1,RandomAccessIterator2,RandomAccessIterator3>(block_size);
+
+  set_intersection_detail::set_intersection_kernel<<<num_blocks, static_cast<unsigned int>(block_size), static_cast<unsigned int>(dynamic_smem_size)>>>( 
+  	first1, last1,
+        first2, last2,
+  	splitter_ranks1.begin(),
+  	splitter_ranks2.begin(),
+  	temporary_results.begin(), 
+        result_partition_sizes.begin(),
+  	comp,
+        num_merged_partitions);
+  synchronize_if_enabled("set_intersection_kernel");
+
+  // inclusive scan the element counts to get the number of elements occurring before and including each partition
+  thrust::inclusive_scan(result_partition_sizes.begin(),
+                         result_partition_sizes.end(),
+                         result_partition_sizes.begin());
+
+  // gather from temporary_results to result
+  // XXX use a different heuristic for this kernel config
+  grouped_gather<<<num_blocks, static_cast<unsigned int>(block_size)>>>(
+                 result,
+                 temporary_results.begin(),
+                 splitter_ranks1.begin(),
+                 result_partition_sizes.begin(),
+                 num_merged_partitions);
+  synchronize_if_enabled("grouped_gather");
+
+  return result + result_partition_sizes[num_merged_partitions - 1];
+} // end merge
+
+} // end namespace cuda
+} // end namespace device
+} // end namespace detail
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
diff --git a/thrust/detail/device/cuda/set_operations.inl b/thrust/detail/device/cuda/set_operations.inl
deleted file mode 100644
index 5131d7cce..000000000
--- a/thrust/detail/device/cuda/set_operations.inl
+++ /dev/null
@@ -1,550 +0,0 @@
-/*
- *  Copyright 2008-2010 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-/*! \file set_operations.inl
- *  \brief CUDA implementation of set_intersection,
- *         based on Gregory Diamos' original implementation.
- */
-
-#include <thrust/detail/config.h>
-
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
-// TODO use thrust/detail/device/ where possible
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/binary_search.h>
-#include <thrust/gather.h>
-#include <thrust/scan.h>
-#include <thrust/distance.h>
-#include <thrust/extrema.h>
-#include <thrust/unique.h>
-
-#include <thrust/detail/device/dereference.h>
-
-#include <thrust/detail/device/cuda/block/copy.h>
-#include <thrust/detail/device/cuda/block/inclusive_scan.h>
-#include <thrust/detail/device/cuda/scalar/rotate.h>
-#include <thrust/pair.h>
-#include <thrust/extrema.h>
-#include <thrust/detail/device/cuda/synchronize.h>
-
-#include <thrust/detail/device/generic/scalar/binary_search.h>
-
-namespace thrust
-{
-namespace detail
-{
-namespace device
-{
-namespace cuda
-{
-
-namespace block
-{
-
-template<unsigned int block_size,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename StrictWeakOrdering>
-__device__
-thrust::tuple<
-  RandomAccessIterator1,
-  RandomAccessIterator2,
-  RandomAccessIterator3
->
-
-  set_intersection(RandomAccessIterator1 first1,
-                   RandomAccessIterator1 last1,
-                   RandomAccessIterator2 first2,
-                   RandomAccessIterator2 last2,
-                   RandomAccessIterator3 result,
-                   StrictWeakOrdering comp)
-{
-  namespace gs = thrust::detail::device::generic::scalar;
-
-  bool do_output_element = false;
-
-  // these variables are needed outside of the branch for the last part of the algorithm
-  // so declare them here
-  thrust::pair<RandomAccessIterator2,RandomAccessIterator2> matches2 = thrust::make_pair(first2,first2);
-
-  typename thrust::iterator_difference<RandomAccessIterator1>::type size_of_partial_intersection(0);
-
-  // for each element in the first range, search the second range, looking for a match
-  if(first1 + threadIdx.x < last1)
-  {
-    // not only must we figure out if our element exists in the second range, we need to
-    // 1. figure out how many m copies of our element exists in the first range
-    // 2. figure out how many n copies of our element exists in the second range
-    // 3. keep the element if its rank < min(m,n)
-
-    thrust::pair<RandomAccessIterator1,RandomAccessIterator1> matches1 = gs::equal_range(first1, last1, first1[threadIdx.x], comp);
-    matches2 = gs::equal_range(first2, last2, first1[threadIdx.x], comp);
-
-    size_of_partial_intersection = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (matches1.second - matches1.first,
-                                                                                  matches2.second - matches2.first);
-
-    // compute the rank of the element in the first range
-    unsigned int element_rank = threadIdx.x - (matches1.first - first1);
-
-    do_output_element = (element_rank < size_of_partial_intersection);
-  }
-
-  // copy matches to the result
-
-  // record, for each thread, whether we should output its element
-  __shared__ unsigned int shared[block_size];
-  shared[threadIdx.x] = do_output_element;
-
-  // a shared copy_if operation follows:
-
-  block::inplace_inclusive_scan<block_size>(shared, thrust::plus<unsigned int>());
-
-  unsigned int size_of_total_intersection = shared[block_size-1];
-
-  // find the index to write our element
-  unsigned int output_index = 0;
-  if(threadIdx.x > 0)
-  {
-    output_index = shared[threadIdx.x-1];
-  }
-
-  if(do_output_element)
-  {
-    result[output_index] = first1[threadIdx.x];
-  }
-
-  // find the last element of both ranges to be consumed, and point to the next element
-  __shared__ unsigned int one_past_last_element_consumed[2];
-  if(threadIdx.x == 0)
-  {
-    one_past_last_element_consumed[0] = 0;
-    one_past_last_element_consumed[1] = 0;
-  }
-  __syncthreads();
-
-  if(do_output_element && shared[threadIdx.x] == size_of_total_intersection)
-  {
-    one_past_last_element_consumed[0] = threadIdx.x+1;
-    one_past_last_element_consumed[1] = (matches2.first + size_of_partial_intersection) - first2;
-  }
-  __syncthreads();
-
-  return thrust::make_tuple(first1 + one_past_last_element_consumed[0],
-                            first2 + one_past_last_element_consumed[1],
-                            result + size_of_total_intersection);
-} // end set_intersection
-
-} // end block
-
-namespace set_intersection_detail
-{
-
-template<typename T1, typename T2>
-T1 ceil_div(T1 up, T2 down)
-{
-  T1 div = up / down;
-  T1 rem = up % down;
-  return (rem != 0) ? div + 1 : div;
-}
-
-template<unsigned int N>
-  struct align_size_to_int
-{
-  static const unsigned int value = (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0);
-};
-
-// limited form of block-synchronous rotate which assumes distance(first, last) <= blockDim.x
-template<typename RandomAccessIterator>
-__device__
-  void small_rotate(RandomAccessIterator first,
-                    RandomAccessIterator middle,
-                    RandomAccessIterator last)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
-
-  typedef typename thrust::iterator_difference<RandomAccessIterator>::type difference;
-
-  // XXX these should use scalar::distance
-  difference k = middle - first;
-  difference l = last - middle;
-
-  // advance first
-  // XXX this should use scalar::advance
-  first += threadIdx.x;
-
-  value_type temp = dereference(first);
-
-  __syncthreads();
-
-  if(threadIdx.x < k)
-  {
-    // move front half to back
-    // advance first to point to the back
-    // XXX this should use scalar::advance
-    first += l;
-
-    dereference(first) = temp;
-  }
-  else
-  {
-    // move back half to front
-    // advance first to point to the front
-    // XXX this should use scalar::advance
-    first -= k;
-
-    dereference(first) = temp;
-  }
-}
-
-template<unsigned int block_size,
-         typename RandomAccessIterator,
-         typename T,
-         typename Range>
-__device__
-  unsigned int shift_and_fetch(RandomAccessIterator first,
-                               RandomAccessIterator last,
-                               T *s_storage,
-                               Range s_range)
-{
-  // push remaining input from the previous iteration to the front of the storage
-  if(s_range.first != s_range.second)
-  {
-    // note that rotate is actually redundant -- we don't need to keep [first,middle)
-    small_rotate(s_storage, s_range.first, s_range.second);
-
-    // note that the block is convergent at this barrier
-    __syncthreads();
-  }
-
-  // compute the number of new elements to copy
-  unsigned int n = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (s_range.first - s_storage, last - first);
-
-  // copy from the input into shared mem
-  thrust::detail::device::cuda::block::copy(first, first + n, s_storage + (s_range.second - s_range.first));
-
-  __syncthreads();
-
-  return n;
-}
-
-
-template<unsigned int block_size,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename RandomAccessIterator5,
-         typename RandomAccessIterator6,
-         typename StrictWeakOrdering>
-__launch_bounds__(block_size, 1)         
-__global__ void set_intersection_kernel(RandomAccessIterator1 first1, 
-                                        RandomAccessIterator1 last1,
-                                        RandomAccessIterator2 first2,
-                                        RandomAccessIterator2 last2,
-                                        RandomAccessIterator3 result,
-                                        RandomAccessIterator4 partition_begin_indices1, 
-                                        RandomAccessIterator5 partition_begin_indices2, 
-                                        RandomAccessIterator6 result_partition_sizes, 
-                                        StrictWeakOrdering comp)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-  
-  const unsigned int block_idx  = blockIdx.x;
-  const unsigned int thread_idx = threadIdx.x;
-
-  // advance iterators
-  partition_begin_indices1 += block_idx;
-  partition_begin_indices2 += block_idx;
-  
-  // find the ends of our partition if this is not the last block
-  if(block_idx != gridDim.x - 1)
-  {
-    RandomAccessIterator4 temp1 = partition_begin_indices1 + 1;
-    RandomAccessIterator5 temp2 = partition_begin_indices2 + 1;
-
-    last1 = first1 + dereference(temp1);
-    last2 = first2 + dereference(temp2);
-  }
-  
-  // point to the beginning of our partition in each range
-  first1 += dereference(partition_begin_indices1);
-  first2 += dereference(partition_begin_indices2);
-  result += dereference(partition_begin_indices1);
-
-  // allocate shared backing store
-  const unsigned int array_size = align_size_to_int<block_size * sizeof(value_type)>::value;
-  __shared__ int _shared1[array_size];		
-  __shared__ int _shared2[array_size];
-  __shared__ int _result[array_size];
-
-  value_type *s_storage1 = reinterpret_cast<value_type*>(_shared1);
-  value_type *s_storage2 = reinterpret_cast<value_type*>(_shared2);
-  value_type *s_result   = reinterpret_cast<value_type*>(_result);
-
-  // keep two ranges in shared memory
-  // these ranges begin empty, pointing to the end of their backing store
-  pair<value_type*,value_type*> s_range1 = make_pair(s_storage1 + block_size, s_storage1 + block_size);
-  pair<value_type*,value_type*> s_range2 = make_pair(s_storage2 + block_size, s_storage2 + block_size);
-  
-  typename thrust::iterator_value<RandomAccessIterator6>::type result_partition_size(0);
-
-  unsigned int round = 0;
-  	
-  // continuously bring input into our shared storage over multiple rounds until
-  // we find we can accomodation input but have none left
-  // the loop termination condition is somewhat complicated, so we just break at the bottom
-  while(true)
-  {
-    // fetch into the first segment if there's input left and we have room
-    if((first1 < last1) && (s_range1.first > s_storage1))
-    {
-      unsigned int num_new_elements = shift_and_fetch<block_size>(first1,last1,s_storage1,s_range1);
-
-      // advance first1
-      first1 += num_new_elements;
-
-      // adjust the bounds of range1
-      unsigned int num_old_elements = s_range1.second - s_range1.first;
-      s_range1.first  = s_storage1;
-      s_range1.second = s_storage1 + num_old_elements + num_new_elements;
-    }
-
-    // fetch into the second segment if we have room
-    if((first2 < last2) && (s_range2.first > s_storage2))
-    {
-      unsigned int num_new_elements = shift_and_fetch<block_size>(first2,last2,s_storage2,s_range2);
-
-      // advance first2
-      first2 += num_new_elements;
-
-      // adjust the bounds of range2
-      unsigned int num_old_elements = s_range2.second - s_range2.first;
-      s_range2.first  = s_storage2;
-      s_range2.second = s_storage2 + num_old_elements + num_new_elements;
-    }
-
-    // XXX bring these into registers rather than do multiple shared loads?
-    bool range2_has_strictly_lesser_bound = comp(s_range2.second[-1], s_range1.second[-1]);
-    bool range1_has_strictly_lesser_bound = comp(s_range1.second[-1], s_range2.second[-1]);
-    
-    pair<value_type*,value_type*> s_range_with_greater_bound = range2_has_strictly_lesser_bound ? s_range1 : s_range2;
-    pair<value_type*,value_type*> s_range_with_lesser_bound  = range2_has_strictly_lesser_bound ? s_range2 : s_range1;
-
-    // XXX this spot is probably the point of parameterization for other related algorithms
-    value_type *s_result_end;
-    tie(s_range_with_lesser_bound.first, s_range_with_greater_bound.first, s_result_end)
-      = block::set_intersection<block_size>(s_range_with_lesser_bound.first,  s_range_with_lesser_bound.second,
-                                            s_range_with_greater_bound.first, s_range_with_greater_bound.second,
-                                            s_result, comp);
-
-    // when s_range_with_greater_bound.second[-1] is not equivalent to s_range_with_lesser_bound.second[-1],
-    // (i.e., s_range_with_greater_bound truly has a greater bound)
-    // it's always safe to eject all of s_range_with_lesser_bound
-    // in this case, collapse s_range_with_lesser_bound to an empty range
-    // else, we need to retain elements that were not intersected.
-    // this is a special case that occurs in the presense of multisets.
-    if(range1_has_strictly_lesser_bound || range2_has_strictly_lesser_bound)
-    {
-      // ranges' bounds are not equivalent, it's safe to eject all of the range with lesser bound
-      s_range_with_lesser_bound.first = s_range_with_lesser_bound.second;
-    }
-
-    // write the updated ranges back
-    s_range1 = range2_has_strictly_lesser_bound ? s_range_with_greater_bound : s_range_with_lesser_bound;
-    s_range2 = range2_has_strictly_lesser_bound ? s_range_with_lesser_bound  : s_range_with_greater_bound;
-    
-    // copy out to result, advance iterator
-    result = thrust::detail::device::cuda::block::copy(s_result, s_result_end, result);
-
-    __syncthreads();
-
-    // increment size of the output
-    result_partition_size += (s_result_end - s_result);
- 
-    ++round;
-
-    // if we require more input but have run out, break the loop
-    if(s_range1.first == s_range1.second && first1 == last1) break;
-    if(s_range2.first == s_range2.second && first2 == last2) break;
-  }
-  
-  if(thread_idx == 0)
-  {
-    result_partition_sizes += block_idx;
-    dereference(result_partition_sizes) = result_partition_size;
-  }
-}
-
-
-template<unsigned int block_size,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4>
-__launch_bounds__(block_size, 1)         
-__global__
-void grouped_gather(RandomAccessIterator1 result,
-                    RandomAccessIterator2 first,
-                    RandomAccessIterator3 input_segment_begin_indices,
-                    RandomAccessIterator4 output_segment_end_indices)
-{
-  using namespace thrust::detail::device;
-
-  // advance iterators
-  input_segment_begin_indices += blockIdx.x;
-  output_segment_end_indices  += blockIdx.x;
-
-  // point at the beginning of this block's input
-  first += dereference(input_segment_begin_indices);
-  
-  // initialize the size of the segment to read
-  typename thrust::iterator_value<RandomAccessIterator4>::type size = dereference(output_segment_end_indices);
-  
-  if(blockIdx.x != 0)
-  {
-    RandomAccessIterator4 ptr_to_previous_segment_end_index = output_segment_end_indices;
-    --ptr_to_previous_segment_end_index;
-
-    // get the end of the previous segment
-    typename thrust::iterator_value<RandomAccessIterator4>::type previous_segment_end_index = dereference(ptr_to_previous_segment_end_index);
-
-    // point result at our segment's begin index
-    result += previous_segment_end_index;
-
-    // compute our segment's size: the difference between our end index and the previous segment's end index
-    size -= previous_segment_end_index;
-  }
-  
-  thrust::detail::device::cuda::block::copy(first, first + size, result);
-}
-
-
-template<typename T>
-struct mult_by
-  : thrust::unary_function<T,T>
-{
-  T _value;
-  
-  mult_by(const T& v):_value(v){}
-  
-  __host__ __device__
-  T operator()(const T& v) const
-  {
-    return _value * v;
-  }
-};
-
-} // end namespace set_intersection_detail
-
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1,
-                                       RandomAccessIterator1 last1,
-                                       RandomAccessIterator2 first2,
-                                       RandomAccessIterator2 last2,
-                                       RandomAccessIterator3 result,
-                                       Compare comp)
-{
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-  const difference1 num_elements1 = last1 - first1;
-  const difference2 num_elements2 = last2 - first2;
-
-  // check for trivial problem
-  if(num_elements1 == 0 || num_elements2 == 0)
-    return result;
-
-  using namespace set_intersection_detail;
-  using namespace thrust::detail;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-  
-  // XXX makes more sense to create a number of partitions equal to the number of SMs
-  const size_t block_size = 128;
-  const size_t partition_size = 1024;
-  
-  difference1 num_partitions = ceil_div(num_elements1, partition_size);
-
-  // create the range [0, partition_size, 2*partition_size, 3*partition_size, ...]
-  typedef thrust::counting_iterator<difference1, cuda_device_space_tag> counter1;
-  thrust::transform_iterator< mult_by<difference1>, counter1>
-    partition_begin_indices1_guess
-      = thrust::make_transform_iterator(counter1(0), mult_by<difference1>(partition_size));
-
-  // XXX we could encapsulate this gather in a permutation_iterator,
-  //     but the call to unique before confounds us a little
-  raw_buffer<value_type, cuda_device_space_tag> partition_values(num_partitions);
-  thrust::gather(partition_begin_indices1_guess, partition_begin_indices1_guess + partition_values.size(),
-                 first1,
-                 partition_values.begin());
-
-  // we require the partition splitters to be unique
-  typename raw_buffer<value_type, cuda_device_space_tag>::iterator end = 
-    thrust::unique(partition_values.begin(), partition_values.end());
-  num_partitions = end - partition_values.begin();
-
-  raw_buffer<difference1, cuda_device_space_tag> partition_begin_indices1(num_partitions);
-  raw_buffer<difference2, cuda_device_space_tag> partition_begin_indices2(num_partitions);
-
-  thrust::lower_bound(first1, last1,
-                      partition_values.begin(), partition_values.end(), 
-                      partition_begin_indices1.begin(), comp);
-
-  thrust::lower_bound(first2, last2,
-                      partition_values.begin(), partition_values.end(), 
-                      partition_begin_indices2.begin(), comp);
-
-  raw_buffer<difference1, cuda_device_space_tag> result_partition_sizes(num_partitions);
-  raw_buffer< value_type, cuda_device_space_tag> temp_result(num_elements1);
-  
-  set_intersection_detail::set_intersection_kernel<block_size><<<(unsigned int) num_partitions, (unsigned int) block_size >>>( 
-  	first1, last1,
-        first2, last2,
-        temp_result.begin(), 
-  	partition_begin_indices1.begin(),
-  	partition_begin_indices2.begin(),
-  	result_partition_sizes.begin(), 
-  	comp);
-  synchronize_if_enabled("set_intersection_kernel");
-
-  thrust::inclusive_scan(result_partition_sizes.begin(), result_partition_sizes.end(), result_partition_sizes.begin());
-
-  // after the inclusive scan, we have the end of each segment
-  raw_buffer<difference1, cuda_device_space_tag> &output_segment_end_indices = result_partition_sizes;
-  
-  set_intersection_detail::grouped_gather<block_size><<<(unsigned int) num_partitions, (unsigned int) block_size >>>( 
-  	result,
-  	temp_result.begin(),
-  	partition_begin_indices1.begin(),
-  	output_segment_end_indices.begin());
-  synchronize_if_enabled("grouped_gather");
-
-  return result + result_partition_sizes[num_partitions - 1];
-} // end set_intersection
-
-} // end namespace cuda
-} // end namespace device
-} // end namespace detail
-} // end namespace thrust
-
-#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
diff --git a/thrust/detail/device/dispatch/set_operations.h b/thrust/detail/device/dispatch/set_operations.h
index 201961516..6d6c3599b 100644
--- a/thrust/detail/device/dispatch/set_operations.h
+++ b/thrust/detail/device/dispatch/set_operations.h
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/device/cuda/set_operations.h>
+#include <thrust/detail/device/cuda/set_intersection.h>
 #include <thrust/detail/device/generic/set_operations.h>
 
 namespace thrust