Skip to content

Commit

Permalink
implemented arch-specific tuning for cuda::fill()
Browse files Browse the repository at this point in the history
resolves issue NVIDIA#286

P4 Change 8755144 on 2011/02/08 08:05:22 by nbell
  • Loading branch information
wnbell committed Feb 8, 2011
1 parent 120bd20 commit b64b0ae
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .LATEST_P4_CHANGELIST
Original file line number Diff line number Diff line change
@@ -1 +1 @@
8750234
8755144
1 change: 1 addition & 0 deletions performance/fill_optimization.test
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ PREAMBLE = \
T x;

constant_functor(T x) : x(x) {}
__host__ __device__
T operator()(void) const {return x;}
};

Expand Down
22 changes: 22 additions & 0 deletions testing/arch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,28 @@ void set_func_attributes(cudaFuncAttributes& attributes,
attributes.sharedSizeBytes = sharedSizeBytes;
}

/*! Unit test: verify that compute_capability() encodes a device's
 *  (major, minor) revision pair as the two-digit integer 10*major + minor,
 *  e.g. sm_13 -> 13 and sm_21 -> 21.
 */
void TestComputeCapability(void)
{
  cudaDeviceProp properties;

  // table of (major, minor, expected encoding) triples covering sm_10 .. sm_21
  const int cases[][3] = { {1, 0, 10},
                           {1, 1, 11},
                           {1, 3, 13},
                           {2, 0, 20},
                           {2, 1, 21} };

  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i)
  {
    set_compute_capability(properties, cases[i][0], cases[i][1]);
    ASSERT_EQUAL(compute_capability(properties), cases[i][2]);
  }
}
DECLARE_UNITTEST(TestComputeCapability);


void TestMaxActiveThreads(void)
{
cudaDeviceProp properties;
Expand Down
9 changes: 9 additions & 0 deletions thrust/detail/device/cuda/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ namespace cuda
namespace arch
{


/*! These functions return the compute capability of a device encoded as
 * a two-digit integer: 10 * major + minor. For example, sm_10 yields 10
 * and sm_21 yields 21.
 * \param properties the cudaDeviceProp describing the device of interest;
 *        the zero-argument overload queries the current device instead.
 * \return The compute capability as an integer
 */
inline size_t compute_capability(const cudaDeviceProp &properties);
inline size_t compute_capability(void);

/*! This function returns the number of streaming
* multiprocessors available for processing.
* \return The number of SMs available.
Expand Down
19 changes: 15 additions & 4 deletions thrust/detail/device/cuda/arch.inl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace arch
namespace detail
{

inline void checked_get_current_device_properties(cudaDeviceProp &props)
inline void checked_get_current_device_properties(cudaDeviceProp &properties)
{
int current_device = -1;

Expand All @@ -66,20 +66,20 @@ inline void checked_get_current_device_properties(cudaDeviceProp &props)
if(iter == properties_map.end())
{
// the properties weren't found, ask the runtime to generate them
error = cudaGetDeviceProperties(&props, current_device);
error = cudaGetDeviceProperties(&properties, current_device);

if(error)
{
throw thrust::system_error(error, thrust::cuda_category());
}

// insert the new entry
properties_map[current_device] = props;
properties_map[current_device] = properties;
} // end if
else
{
// use the cached value
props = iter->second;
properties = iter->second;
} // end else
} // end checked_get_current_device_properties()

Expand Down Expand Up @@ -119,6 +119,10 @@ void checked_get_function_attributes(cudaFuncAttributes& attributes, KernelFunct

} // end detail

/*! Encodes a device's compute capability as a two-digit integer.
 *  For example, sm_10 -> 10 and sm_21 -> 21.
 *  \param properties device properties previously obtained from the runtime
 *  \return 10 * major revision + minor revision
 */
size_t compute_capability(const cudaDeviceProp &properties)
{
  const size_t major = properties.major;
  const size_t minor = properties.minor;
  return major * 10 + minor;
} // end compute_capability()

size_t num_multiprocessors(const cudaDeviceProp& properties)
{
Expand Down Expand Up @@ -187,6 +191,13 @@ size_t max_active_blocks_per_multiprocessor(const cudaDeviceProp& properties,


// Functions that query the runtime for device properties
/*! Queries the runtime for the current device's compute capability.
 *  Device properties are fetched through the cached
 *  checked_get_current_device_properties() helper, then encoded via the
 *  single-argument compute_capability() overload.
 *  \return the compute capability of the current device, e.g. 20 for sm_20
 */
size_t compute_capability(void)
{
  cudaDeviceProp device_properties;
  detail::checked_get_current_device_properties(device_properties);

  return compute_capability(device_properties);
} // end compute_capability()


size_t num_multiprocessors(void)
{
Expand Down
27 changes: 20 additions & 7 deletions thrust/detail/device/cuda/fill.inl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include <thrust/extrema.h>
#include <thrust/detail/internal_functional.h>

#include <thrust/detail/device/cuda/arch.h>

namespace thrust
{
namespace detail
Expand All @@ -39,7 +41,7 @@ namespace cuda
namespace detail
{

template<typename Pointer, typename Size, typename T>
template<typename WideType, typename Pointer, typename Size, typename T>
Pointer wide_fill_n(Pointer first,
Size n,
const T &value)
Expand All @@ -48,9 +50,6 @@ template<typename Pointer, typename Size, typename T>

size_t ALIGNMENT_BOUNDARY = 128; // begin copying blocks at this byte boundary

// type used to pack the OutputTypes
typedef unsigned long long WideType;

WideType wide_exemplar;
OutputType narrow_exemplars[sizeof(WideType) / sizeof(OutputType)];

Expand Down Expand Up @@ -98,11 +97,25 @@ template<typename OutputIterator, typename Size, typename T>

if ( thrust::detail::util::is_aligned<OutputType>(thrust::raw_pointer_cast(&*first)) )
{
wide_fill_n(&*first, n, value);
if (arch::compute_capability() < 20)
{
// 32-bit writes are faster on G80 and GT200
typedef unsigned int WideType;
wide_fill_n<WideType>(&*first, n, value);
}
else
{
// 64-bit writes are faster on Fermi
typedef unsigned long long WideType;
wide_fill_n<WideType>(&*first, n, value);
}

return first + n;
}

return fill_n(first, n, value, thrust::detail::false_type());
else
{
return fill_n(first, n, value, thrust::detail::false_type());
}
}

} // end detail
Expand Down

0 comments on commit b64b0ae

Please sign in to comment.