Skip to content

Commit

Permalink
First cut at improved CUDA set_intersection & perf test.
Browse files Browse the repository at this point in the history
  • Loading branch information
jaredhoberock committed Nov 6, 2010
1 parent b6233e2 commit 6031df5
Show file tree
Hide file tree
Showing 11 changed files with 776 additions and 598 deletions.
45 changes: 45 additions & 0 deletions performance/set_intersection.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
PREAMBLE = \
"""
#include <thrust/set_operations.h>
#include <thrust/sort.h>
#include <algorithm>
"""

INITIALIZE = \
"""
thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
thrust::sort(h_a.begin(), h_a.end());
thrust::sort(h_b.begin(), h_b.end());

thrust::host_vector<$InputType> h_result(h_a.size());
thrust::host_vector<$InputType>::iterator new_end =
thrust::set_intersection(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
h_result.resize(new_end - h_result.begin());

thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;

thrust::device_vector<$InputType> d_result(h_result.size());
thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());

ASSERT_EQUAL(h_result, d_result);
"""

TIME = \
"""
thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
"""

FINALIZE = \
"""
RECORD_TIME();
RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType));
RECORD_SORTING_RATE(2 * double($InputSize))
"""


InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
InputSizes = [2**N for N in range(10, 25)]

TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]

51 changes: 14 additions & 37 deletions testing/set_intersection.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,48 +141,26 @@ void TestSetIntersectionMultiset(const size_t n)
DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset);


struct non_arithmetic
template<typename U>
void TestSetIntersectionKeyValue(size_t n)
{
__host__ __device__
non_arithmetic(void)
{}
typedef key_value<U,U> T;

__host__ __device__
non_arithmetic(const non_arithmetic &x)
: key(x.key) {}
thrust::host_vector<U> h_keys_a = unittest::random_integers<U>(n);
thrust::host_vector<U> h_values_a = unittest::random_integers<U>(n);

__host__ __device__
non_arithmetic(const int k)
: key(k) {}
thrust::host_vector<U> h_keys_b = unittest::random_integers<U>(n);
thrust::host_vector<U> h_values_b = unittest::random_integers<U>(n);

__host__ __device__
bool operator<(const non_arithmetic &rhs) const
thrust::host_vector<T> h_a(n), h_b(n);
for(size_t i = 0; i < n; ++i)
{
return key < rhs.key;
h_a[i] = T(h_keys_a[i], h_values_a[i]);
h_b[i] = T(h_keys_b[i], h_values_b[i]);
}

__host__ __device__
bool operator==(const non_arithmetic &rhs) const
{
return key == rhs.key;
}

int key;
};


void TestSetIntersectionNonArithmetic(void)
{
const unsigned int n = 12345;

typedef non_arithmetic T;

thrust::host_vector<T> temp = unittest::random_integers<int>(2 * n);
thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
thrust::host_vector<T> h_b(temp.begin() + n, temp.end());

thrust::sort(h_a.begin(), h_a.end());
thrust::sort(h_b.begin(), h_b.end());
thrust::stable_sort(h_a.begin(), h_a.end());
thrust::stable_sort(h_b.begin(), h_b.end());

thrust::device_vector<T> d_a = h_a;
thrust::device_vector<T> d_b = h_b;
Expand All @@ -206,6 +184,5 @@ void TestSetIntersectionNonArithmetic(void)

ASSERT_EQUAL_QUIET(h_result, d_result);
}
DECLARE_UNITTEST(TestSetIntersectionNonArithmetic);

DECLARE_VARIABLE_UNITTEST(TestSetIntersectionKeyValue);

26 changes: 26 additions & 0 deletions thrust/detail/device/cuda/block/inclusive_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,32 @@ __device__
if(block_size > 512) { if (threadIdx.x >= 512) { val = binary_op(first[threadIdx.x - 512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
} // end inplace_inclusive_scan()


template<typename RandomAccessIterator,
typename Size,
typename BinaryFunction>
__device__
void inplace_inclusive_scan_n(RandomAccessIterator first,
Size n,
BinaryFunction binary_op)
{
typename thrust::iterator_value<RandomAccessIterator>::type val = first[threadIdx.x];
__syncthreads();

// assume n <= 2048
if(n > 1) { if (threadIdx.x < n && threadIdx.x >= 1) { val = binary_op(first[threadIdx.x - 1], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 2) { if (threadIdx.x < n && threadIdx.x >= 2) { val = binary_op(first[threadIdx.x - 2], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 4) { if (threadIdx.x < n && threadIdx.x >= 4) { val = binary_op(first[threadIdx.x - 4], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 8) { if (threadIdx.x < n && threadIdx.x >= 8) { val = binary_op(first[threadIdx.x - 8], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 16) { if (threadIdx.x < n && threadIdx.x >= 16) { val = binary_op(first[threadIdx.x - 16], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 32) { if (threadIdx.x < n && threadIdx.x >= 32) { val = binary_op(first[threadIdx.x - 32], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 64) { if (threadIdx.x < n && threadIdx.x >= 64) { val = binary_op(first[threadIdx.x - 64], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 128) { if (threadIdx.x < n && threadIdx.x >= 128) { val = binary_op(first[threadIdx.x - 128], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 256) { if (threadIdx.x < n && threadIdx.x >= 256) { val = binary_op(first[threadIdx.x - 256], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 512) { if (threadIdx.x < n && threadIdx.x >= 512) { val = binary_op(first[threadIdx.x - 512], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
if(n > 1024) { if (threadIdx.x < n && threadIdx.x >= 1024) { val = binary_op(first[threadIdx.x - 1024], val); } __syncthreads(); first[threadIdx.x] = val; __syncthreads(); }
} // end inplace_inclusive_scan()

} // end namespace block
} // end namespace cuda
} // end namespace device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,34 @@

namespace thrust
{

namespace detail
{

namespace device
{

namespace cuda
{
namespace block
{

template<typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename RandomAccessIterator4,
typename StrictWeakOrdering>
RandomAccessIterator3 set_intersection(RandomAccessIterator1 first1,
__device__ __forceinline__
RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1,
RandomAccessIterator1 last1,
RandomAccessIterator2 first2,
RandomAccessIterator2 last2,
RandomAccessIterator3 result,
RandomAccessIterator3 temporary,
RandomAccessIterator4 result,
StrictWeakOrdering comp);

} // end block
} // end cuda

} // end device

} // end detail

} // end thrust

#include <thrust/detail/device/cuda/set_operations.inl>
#include <thrust/detail/device/cuda/block/set_intersection.inl>

141 changes: 141 additions & 0 deletions thrust/detail/device/cuda/block/set_intersection.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Copyright 2008-2010 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/device/cuda/block/merge.h>
#include <thrust/detail/device/generic/scalar/binary_search.h>
#include <thrust/detail/device/dereference.h>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>

namespace thrust
{
namespace detail
{
namespace device
{
namespace cuda
{
namespace block
{

namespace set_intersection_detail
{

// this predicate tests two two-element tuples
// we first use a Compare for the first element
// if the first elements are equivalent, we use
// < for the second elements
// XXX merge duplicates this
// move it some place common
template<typename Compare>
struct compare_first_less_second
{
__host__ __device__
compare_first_less_second(Compare c)
: comp(c) {}

template<typename T1, typename T2>
__host__ __device__
bool operator()(T1 lhs, T2 rhs)
{
return comp(lhs.get<0>(), rhs.get<0>()) || (!comp(rhs.get<0>(), lhs.get<0>()) && lhs.get<1>() < rhs.get<1>());
}

Compare comp;
}; // end compare_first_less_second

} // end set_intersection_detail


template<typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename RandomAccessIterator4,
typename StrictWeakOrdering>
__device__ __forceinline__
RandomAccessIterator4 set_intersection(RandomAccessIterator1 first1,
RandomAccessIterator1 last1,
RandomAccessIterator2 first2,
RandomAccessIterator2 last2,
RandomAccessIterator3 temporary,
RandomAccessIterator4 result,
StrictWeakOrdering comp)
{
using namespace set_intersection_detail;

typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;

difference1 n1 = last1 - first1;

// search for all matches in the second range for each element in the first
bool found = false;
if(threadIdx.x < n1)
{
RandomAccessIterator1 x = first1;
x += threadIdx.x;

// count the number of previous occurrances of x the first range
difference1 rank = x - thrust::detail::device::generic::scalar::lower_bound(first1,x,dereference(x),comp);

// count the number of equivalent elements of x in the second range
thrust::pair<RandomAccessIterator2,RandomAccessIterator2> matches =
thrust::detail::device::generic::scalar::equal_range(first2,last2,dereference(x),comp);

difference2 num_matches = matches.second - matches.first;

// the element is "found" if its rank is less than the number of matches
found = rank < num_matches;
} // end if

// mark whether my element was found or not in the scratch array
RandomAccessIterator3 temp = temporary;
temp += threadIdx.x;
dereference(temp) = found;

block::inplace_inclusive_scan_n(temporary, n1, thrust::plus<int>());

// copy_if
if(found)
{
// find the index to write our element
unsigned int output_index = 0;
if(threadIdx.x > 0)
{
RandomAccessIterator3 src = temporary;
src += threadIdx.x - 1;
output_index = dereference(src);
} // end if

RandomAccessIterator1 x = first1;
x += threadIdx.x;

RandomAccessIterator4 dst = result;
dst += output_index;
dereference(dst) = dereference(x);
} // end if

return result + temporary[n1-1];
} // end set_intersection

} // end block
} // end cuda
} // end device
} // end detail
} // end thrust

Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,10 @@ template<typename RandomAccessIterator1,
splitters_begin, splitters_end,
splitter_ranks2, comp);

thrust::upper_bound(first1, last1,
// thrust::upper_bound(first1, last1,
// splitters_begin, splitters_end,
// splitter_ranks1, comp);
thrust::lower_bound(first1, last1,
splitters_begin, splitters_end,
splitter_ranks1, comp);
} // end get_set_operation_splitter_ranks()
Expand Down
2 changes: 2 additions & 0 deletions thrust/detail/device/cuda/merge.inl
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ __global__ void merge_kernel(const RandomAccessIterator1 first1,
// we first use a Compare for the first element
// if the first elements are equivalent, we use
// < for the second elements
// XXX set_intersection duplicates this
// move it some place common
template<typename Compare>
struct compare_first_less_second
{
Expand Down
Loading

0 comments on commit 6031df5

Please sign in to comment.