Allow using cpu_serial_kernel with a void lambda (pytorch#27370)

ifedan · facebook-github-bot · commit 50b3f9d815ef · 2019-10-04T10:04:44.000-07:00
Summary: pytorch#27271 Pull Request resolved: pytorch#27370 Differential Revision: D17763265 Pulled By: ifedan fbshipit-source-id: d670560dfc555db529b18c01aa42f0ccb2127889
diff --git a/aten/src/ATen/native/cpu/IsContiguous.h b/aten/src/ATen/native/cpu/IsContiguous.h
@@ -5,34 +5,58 @@ namespace at { namespace native { namespace {
 // n: number of function arguments (arity)
 // traits: function_traits (see FunctionTraits.h)
 // s: index of scalar argument or -1
-template <int n, typename traits, int s=-1>
+template <int n, int stride_index, typename traits, int s=-1>
 struct IsContiguous {
   static bool eval(const int64_t* strides) {
     using type = typename traits::template arg<n - 1>::type;
-    return strides[n] == (s == n ? 0 : sizeof(type)) &&
-        IsContiguous<n - 1, traits, s>::eval(strides);
+    return strides[stride_index] == (s == n ? 0 : sizeof(type)) &&
+           IsContiguous<n - 1, stride_index - 1, traits, s>::eval(strides);
   }
 };
 
+// will be called when an output exists
 template <typename traits, int s>
-struct IsContiguous<0, traits, s> {
+struct IsContiguous<0, 0, traits, s> {
   static bool eval(const int64_t* strides) {
     return strides[0] == sizeof(typename traits::result_type);
   }
 };
 
+// will be called when there is no output
+template <typename traits, int s>
+struct IsContiguous<0, -1, traits, s> {
+  static bool eval(const int64_t* strides) {
+    return true;
+  }
+};
+
 // output and all inputs are contiguous
-template <typename traits>
+template <typename traits,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
 static inline bool is_contiguous(const int64_t* strides) {
-  return IsContiguous<traits::arity, traits>::eval(strides);
+  return IsContiguous<traits::arity, traits::arity - 1, traits>::eval(strides);
+}
+
+template <typename traits,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous<traits::arity, traits::arity, traits>::eval(strides);
 }
 
 // input at `s` is scalar (stride 0); output and other inputs are contiguous
 // NB: output is typically at strides[0] so first input corresponds to s=1
-template <typename traits, int s>
+template <typename traits, int s,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
+  return IsContiguous<traits::arity, traits::arity - 1, traits, s>::eval(strides);
+}
+
+template <typename traits, int s,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
 static inline bool is_contiguous_scalar(const int64_t* strides) {
   static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
-  return IsContiguous<traits::arity, traits, s>::eval(strides);
+  return IsContiguous<traits::arity, traits::arity, traits, s>::eval(strides);
 }
 
 }}}
diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h
@@ -80,13 +80,40 @@ dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& o
   return dereference_vec_impl<traits>(data, opt_scalar, S, i, Indices{});
 }
 
+template <typename func_t,
+    typename std::enable_if<!std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
+static inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) {
+  using traits = function_traits<func_t>;
+  using result_type = typename traits::result_type;
+  for (; i < n; i++) {
+    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
+    *out_ptr = c10::guts::apply(op, dereference<traits>(
+        &data[1],
+        &strides[1],
+        i));
+  }
+}
+
+template <typename func_t,
+    typename std::enable_if<std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
+static inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) {
+  using traits = function_traits<func_t>;
+  for (; i < n; i++) {
+    c10::guts::apply(op, dereference<traits>(
+        &data[0],
+        &strides[0],
+        i));
+  }
+}
+
 // Basic loop operation (one output, N inputs). May be auto-vectorized
 // by the compiler. Supports inputs and outputs of different types.
 template <typename func_t>
 static inline void
 basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t op) {
   using traits = function_traits<func_t>;
-  using result_type = typename traits::result_type;
   constexpr int ntensors = traits::arity + 1;
 
   // Copying strides to temporary array helps auto vectorization in older GCC
@@ -96,13 +123,7 @@ basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_
     strides[arg] = strides_[arg];
   }
 
-  for (; i < n; i++) {
-    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
-    *out_ptr = c10::guts::apply(op, dereference<traits>(
-      &data[1],
-      &strides[1],
-      i));
-  }
+  execute_op(data, strides, i, n, op);
 }
 
 // Explicitly vectorized loop implementation. All inputs and outputs must be
@@ -205,7 +226,8 @@ void cpu_kernel_vec(TensorIterator& iter, func_t op, vec_func_t vop) {
 template <typename func_t>
 void cpu_serial_kernel(TensorIterator& iter, func_t op) {
   using traits = function_traits<func_t>;
-  TORCH_INTERNAL_ASSERT(iter.ntensors() >= traits::arity + 1);
+  TORCH_INTERNAL_ASSERT((std::is_void<typename traits::result_type>::value &&
+    iter.noutputs() == 0 && iter.ntensors() == traits::arity) || (iter.ntensors() >= traits::arity + 1));
 
   iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
     if (is_contiguous<traits>(strides)) {
@@ -217,6 +239,7 @@ void cpu_serial_kernel(TensorIterator& iter, func_t op) {
       });
     }
   }, {0, iter.numel()});
+  iter.cast_outputs();
 }
 
 }}}  // namespace at::native::<anonymous>
diff --git a/aten/src/ATen/test/tensor_iterator_test.cpp b/aten/src/ATen/test/tensor_iterator_test.cpp
@@ -61,17 +61,41 @@ TEST(TensorIteratorTest, SerialLoopUnary_##name) {                            \
   ASSERT_ANY_THROW(out.equal(expected));                                      \
 }
 
+#define NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE(ctype,name)                         \
+TEST(TensorIteratorTest, SerialLoopUnaryNoOutput_##name) {                     \
+  auto in = random_tensor_for_type(k##name);                                   \
+  auto iter = at::TensorIterator();                                            \
+  iter.add_input(in);                                                          \
+  iter.build();                                                                \
+  int64_t acc = 0;                                                             \
+  at::native::cpu_serial_kernel(iter, [&](ctype a) -> void { acc++; }); \
+  EXPECT_TRUE(acc == in.numel());                                              \
+}
+
 #define BINARY_TEST_ITER_FOR_TYPE(ctype,name)                                          \
 TEST(TensorIteratorTest, SerialLoopBinary_##name) {                                    \
   Tensor out;                                                                          \
   auto in1 = random_tensor_for_type(k##name);                                          \
   auto in2 = random_tensor_for_type(k##name);                                          \
   auto expected = in1.add(in2);                                                        \
-  auto iter = TensorIterator::binary_op(out, in1, in2);                           \
+  auto iter = TensorIterator::binary_op(out, in1, in2);                                \
   at::native::cpu_serial_kernel(iter, [=](ctype a, ctype b) -> int { return a + b; }); \
   ASSERT_ANY_THROW(out.equal(expected));                                               \
 }
 
+#define NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE(ctype,name)                          \
+TEST(TensorIteratorTest, SerialLoopBinaryNoOutput_##name) {                      \
+  auto in1 = random_tensor_for_type(k##name);                                    \
+  auto in2 = random_tensor_for_type(k##name);                                    \
+  auto iter = at::TensorIterator();                                              \
+  iter.add_input(in1);                                                           \
+  iter.add_input(in2);                                                           \
+  iter.build();                                                                  \
+  int64_t acc = 0;                                                               \
+  at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b) -> void { acc++; }); \
+  EXPECT_TRUE(acc == in1.numel());                                               \
+}
+
 #define POINTWISE_TEST_ITER_FOR_TYPE(ctype,name)                                                    \
 TEST(TensorIteratorTest, SerialLoopPointwise_##name) {                                              \
   Tensor out;                                                                                       \
@@ -89,6 +113,21 @@ TEST(TensorIteratorTest, SerialLoopPointwise_##name) {
   ASSERT_ANY_THROW(out.equal(expected));                                                            \
 }
 
+#define NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE(ctype,name)                                \
+TEST(TensorIteratorTest, SerialLoopPoinwiseNoOutput_##name) {                             \
+  auto in1 = random_tensor_for_type(k##name);                                             \
+  auto in2 = random_tensor_for_type(k##name);                                             \
+  auto in3 = random_tensor_for_type(k##name);                                             \
+  auto iter = at::TensorIterator();                                                       \
+  iter.add_input(in1);                                                                    \
+  iter.add_input(in2);                                                                    \
+  iter.add_input(in3);                                                                    \
+  iter.build();                                                                           \
+  int64_t acc = 0;                                                                        \
+  at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b, ctype c) -> void { acc++; }); \
+  EXPECT_TRUE(acc == in1.numel());                                                        \
+}
+
 // The alternative way to calculate a < b is (b - a).clamp(0).toBool()
 // To prevent an overflow in subtraction (b - a) for unsigned types(unit, bool)
 // we will convert in to int first
@@ -112,6 +151,9 @@ TEST(TensorIteratorTest, ComparisonLoopBinary_##name) {
 AT_FORALL_SCALAR_TYPES(UNARY_TEST_ITER_FOR_TYPE)
 AT_FORALL_SCALAR_TYPES(BINARY_TEST_ITER_FOR_TYPE)
 AT_FORALL_SCALAR_TYPES(POINTWISE_TEST_ITER_FOR_TYPE)
+AT_FORALL_SCALAR_TYPES(NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE)
+AT_FORALL_SCALAR_TYPES(NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE)
+AT_FORALL_SCALAR_TYPES(NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE)
 AT_FORALL_SCALAR_TYPES_AND(Bool, COMPARISON_TEST_ITER_FOR_TYPE)
 
 TEST(TensorIteratorTest, SerialLoopSingleThread) {
@@ -172,3 +214,4 @@ TEST(TensorIteratorTest, DoNotComputeCommonDTypeIfOutputIsUndefined) {
   iter.compute_common_dtype_only_for_inputs();
   ASSERT_ANY_THROW(iter.build());
 }
+