Skip to content

Commit

Permalink
implemented arch-specific tuning for cuda::fill()
Browse files Browse the repository at this point in the history
resolves issue NVIDIA#286

P4 Change 8755144 on 2011/02/08 08:05:22 by nbell
  • Loading branch information
wnbell committed Feb 8, 2011
1 parent 120bd20 commit b64b0ae
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .LATEST_P4_CHANGELIST
Original file line number Diff line number Diff line change
@@ -1 +1 @@
8750234
8755144
1 change: 1 addition & 0 deletions performance/fill_optimization.test
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ PREAMBLE = \
T x;

constant_functor(T x) : x(x) {}
__host__ __device__
T operator()(void) const {return x;}
};

Expand Down
22 changes: 22 additions & 0 deletions testing/arch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,28 @@ void set_func_attributes(cudaFuncAttributes& attributes,
attributes.sharedSizeBytes = sharedSizeBytes;
}

/*! Unit test: verify that compute_capability() encodes a device's
 *  (major, minor) revision pair as the two-digit integer 10*major + minor,
 *  e.g. sm_13 -> 13 and sm_21 -> 21.
 */
void TestComputeCapability(void)
{
  cudaDeviceProp properties;

  // table of (major, minor, expected encoding) triples covering sm_10 .. sm_21
  const int cases[][3] = { {1, 0, 10},
                           {1, 1, 11},
                           {1, 3, 13},
                           {2, 0, 20},
                           {2, 1, 21} };

  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i)
  {
    set_compute_capability(properties, cases[i][0], cases[i][1]);
    ASSERT_EQUAL(compute_capability(properties), cases[i][2]);
  }
}
DECLARE_UNITTEST(TestComputeCapability);


void TestMaxActiveThreads(void)
{
cudaDeviceProp properties;
Expand Down
9 changes: 9 additions & 0 deletions thrust/detail/device/cuda/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ namespace cuda
namespace arch
{


/*! These functions return the compute capability of a device encoded as
 * a two-digit integer: 10 * major + minor. For example, sm_10 yields 10
 * and sm_21 yields 21.
 * \param properties the cudaDeviceProp describing the device of interest;
 *        the zero-argument overload queries the current device instead.
 * \return The compute capability as an integer
 */
inline size_t compute_capability(const cudaDeviceProp &properties);
inline size_t compute_capability(void);

/*! This function returns the number of streaming
* multiprocessors available for processing.
* \return The number of SMs available.
Expand Down
19 changes: 15 additions & 4 deletions thrust/detail/device/cuda/arch.inl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace arch
namespace detail
{

inline void checked_get_current_device_properties(cudaDeviceProp &props)
inline void checked_get_current_device_properties(cudaDeviceProp &properties)
{
int current_device = -1;

Expand All @@ -66,20 +66,20 @@ inline void checked_get_current_device_properties(cudaDeviceProp &props)
if(iter == properties_map.end())
{
// the properties weren't found, ask the runtime to generate them
error = cudaGetDeviceProperties(&props, current_device);
error = cudaGetDeviceProperties(&properties, current_device);

if(error)
{
throw thrust::system_error(error, thrust::cuda_category());
}

// insert the new entry
properties_map[current_device] = props;
properties_map[current_device] = properties;
} // end if
else
{
// use the cached value
props = iter->second;
properties = iter->second;
} // end else
} // end checked_get_current_device_properties()

Expand Down Expand Up @@ -119,6 +119,10 @@ void checked_get_function_attributes(cudaFuncAttributes& attributes, KernelFunct

} // end detail

/*! Encodes a device's compute capability as a two-digit integer.
 *  For example, sm_10 -> 10 and sm_21 -> 21.
 *  \param properties device properties previously obtained from the runtime
 *  \return 10 * major revision + minor revision
 */
size_t compute_capability(const cudaDeviceProp &properties)
{
  const size_t major = properties.major;
  const size_t minor = properties.minor;
  return major * 10 + minor;
} // end compute_capability()

size_t num_multiprocessors(const cudaDeviceProp& properties)
{
Expand Down Expand Up @@ -187,6 +191,13 @@ size_t max_active_blocks_per_multiprocessor(const cudaDeviceProp& properties,


// Functions that query the runtime for device properties
/*! Queries the runtime for the current device's compute capability.
 *  Device properties are fetched through the cached
 *  checked_get_current_device_properties() helper, then encoded via the
 *  single-argument compute_capability() overload.
 *  \return the compute capability of the current device, e.g. 20 for sm_20
 */
size_t compute_capability(void)
{
  cudaDeviceProp device_properties;
  detail::checked_get_current_device_properties(device_properties);

  return compute_capability(device_properties);
} // end compute_capability()


size_t num_multiprocessors(void)
{
Expand Down
27 changes: 20 additions & 7 deletions thrust/detail/device/cuda/fill.inl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include <thrust/extrema.h>
#include <thrust/detail/internal_functional.h>

#include <thrust/detail/device/cuda/arch.h>

namespace thrust
{
namespace detail
Expand All @@ -39,7 +41,7 @@ namespace cuda
namespace detail
{

template<typename Pointer, typename Size, typename T>
template<typename WideType, typename Pointer, typename Size, typename T>
Pointer wide_fill_n(Pointer first,
Size n,
const T &value)
Expand All @@ -48,9 +50,6 @@ template<typename Pointer, typename Size, typename T>

size_t ALIGNMENT_BOUNDARY = 128; // begin copying blocks at this byte boundary

// type used to pack the OutputTypes
typedef unsigned long long WideType;

WideType wide_exemplar;
OutputType narrow_exemplars[sizeof(WideType) / sizeof(OutputType)];

Expand Down Expand Up @@ -98,11 +97,25 @@ template<typename OutputIterator, typename Size, typename T>

if ( thrust::detail::util::is_aligned<OutputType>(thrust::raw_pointer_cast(&*first)) )
{
wide_fill_n(&*first, n, value);
if (arch::compute_capability() < 20)
{
// 32-bit writes are faster on G80 and GT200
typedef unsigned int WideType;
wide_fill_n<WideType>(&*first, n, value);
}
else
{
// 64-bit writes are faster on Fermi
typedef unsigned long long WideType;
wide_fill_n<WideType>(&*first, n, value);
}

return first + n;
}

return fill_n(first, n, value, thrust::detail::false_type());
else
{
return fill_n(first, n, value, thrust::detail::false_type());
}
}

} // end detail
Expand Down

0 comments on commit b64b0ae

Please sign in to comment.