Skip to content

Commit b847171

Browse files
Luke Iwanski authored and benoitsteiner committed
Sycl improvements (#44)
- Eigen version bump - Extends Cast and Cwise ops benchmark to cover Sycl device - Extends device_lib_test.py to cover Sycl device - Registers int32, string and ResourceHandle to run on host for Enter and RefEnter Sycl Ops - Enables ReduceMax op for Sycl since Eigen implementation is ready - Registers Less op for Sycl device
1 parent 91a06a9 commit b847171

File tree

8 files changed

+105
-5
lines changed

8 files changed

+105
-5
lines changed

tensorflow/core/BUILD

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ load(
111111
"//tensorflow/core:platform/default/build_config_root.bzl",
112112
"tf_cuda_tests_tags",
113113
)
114+
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
114115

115116
# -----------------------------------------------------------------------------
116117
# Public targets
@@ -712,7 +713,7 @@ cc_library(
712713
"//tensorflow/core/kernels:ops_testutil",
713714
"//tensorflow/core/kernels:ops_util",
714715
"//tensorflow/core/platform/default/build_config:gtest",
715-
],
716+
] + if_sycl([":sycl_runtime"]),
716717
)
717718

718719
# This is a link-only library to provide a DirectSession

tensorflow/core/kernels/cast_op_test.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) {
105105
testing::BytesProcessed(static_cast<int64>(iters) * num *
106106
(sizeof(float) + sizeof(int64)));
107107
testing::UseRealTime();
108+
#if GOOGLE_CUDA
108109
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
110+
#endif // GOOGLE_CUDA
111+
#ifdef TENSORFLOW_USE_SYCL
112+
test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
113+
#endif // TENSORFLOW_USE_SYCL
109114
}
110115
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
111116

@@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) {
123128
testing::BytesProcessed(static_cast<int64>(iters) * num *
124129
(sizeof(bool) + sizeof(float)));
125130
testing::UseRealTime();
131+
#if GOOGLE_CUDA
126132
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
133+
#endif // GOOGLE_CUDA
134+
#ifdef TENSORFLOW_USE_SYCL
135+
test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
136+
#endif // TENSORFLOW_USE_SYCL
127137
}
128138
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
129139

@@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) {
168178
testing::BytesProcessed(static_cast<int64>(iters) * num *
169179
(sizeof(float) + sizeof(Eigen::half)));
170180
testing::UseRealTime();
181+
#if GOOGLE_CUDA
171182
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
183+
#endif // GOOGLE_CUDA
172184
}
173185
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
174186

@@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) {
177189
testing::BytesProcessed(static_cast<int64>(iters) * num *
178190
(sizeof(float) + sizeof(Eigen::half)));
179191
testing::UseRealTime();
192+
#if GOOGLE_CUDA
180193
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
194+
#endif // GOOGLE_CUDA
181195
}
182196
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
183197

tensorflow/core/kernels/control_flow_ops.cc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
321321

322322
#undef REGISTER_SYCL_KERNEL
323323
#undef REGISTER_SYCL_REF_KERNEL
324+
#define REGISTER_SYCL_HOST_KERNEL(type) \
325+
REGISTER_KERNEL_BUILDER(Name("Enter") \
326+
.Device(DEVICE_SYCL) \
327+
.HostMemory("data") \
328+
.HostMemory("output") \
329+
.TypeConstraint<type>("T"), \
330+
EnterOp)
331+
332+
#define REGISTER_SYCL_HOST_REF_KERNEL(type) \
333+
REGISTER_KERNEL_BUILDER(Name("RefEnter") \
334+
.Device(DEVICE_SYCL) \
335+
.HostMemory("data") \
336+
.HostMemory("output") \
337+
.TypeConstraint<type>("T"), \
338+
EnterOp)
339+
340+
REGISTER_SYCL_HOST_KERNEL(int32);
341+
REGISTER_SYCL_HOST_REF_KERNEL(int32);
342+
REGISTER_SYCL_HOST_KERNEL(string);
343+
REGISTER_SYCL_HOST_REF_KERNEL(string);
344+
REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
345+
346+
#undef REGISTER_SYCL_HOST_KERNEL
347+
#undef REGISTER_SYCL_HOST_REF_KERNEL
324348
#endif
325349

326350
// Special GPU kernels for int32 and string.

tensorflow/core/kernels/cwise_op_less.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less")
3333
.TypeConstraint<int32>("T"),
3434
BinaryOp<CPUDevice, functor::less<int32>>);
3535
#endif
36+
#ifdef TENSORFLOW_USE_SYCL
37+
REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);
3638

39+
REGISTER_KERNEL_BUILDER(Name("Less")
40+
.Device(DEVICE_SYCL)
41+
.HostMemory("x")
42+
.HostMemory("y")
43+
.HostMemory("z")
44+
.TypeConstraint<int32>("T"),
45+
BinaryOp<CPUDevice, functor::less<int32>>);
46+
#endif // TENSORFLOW_USE_SYCL
3747
} // namespace tensorflow

tensorflow/core/kernels/cwise_ops_test.cc

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); }
5151
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
5252

5353
BM_UNARY(cpu, Floor, float, DT_FLOAT);
54+
#if GOOGLE_CUDA
5455
BM_UNARY(gpu, Floor, float, DT_FLOAT);
56+
#endif // GOOGLE_CUDA
57+
#ifdef TENSORFLOW_USE_SYCL
58+
BM_UNARY(sycl, Floor, float, DT_FLOAT);
59+
#endif // TENSORFLOW_USE_SYCL
60+
5561
BM_UNARY(cpu, Floor, double, DT_DOUBLE);
62+
#if GOOGLE_CUDA
5663
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
64+
#endif // GOOGLE_CUDA
65+
#ifdef TENSORFLOW_USE_SYCL
66+
BM_UNARY(sycl, Floor, double, DT_DOUBLE);
67+
#endif // TENSORFLOW_USE_SYCL
68+
5769
BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
70+
#if GOOGLE_CUDA
5871
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
72+
#endif // GOOGLE_CUDA
5973
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
74+
#if GOOGLE_CUDA
6075
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
76+
#endif // GOOGLE_CUDA
6177

6278
BM_UNARY(cpu, Rint, double, DT_DOUBLE);
79+
#if GOOGLE_CUDA
6380
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
81+
#endif // GOOGLE_CUDA
6482
BM_UNARY(cpu, Rint, float, DT_FLOAT);
83+
#if GOOGLE_CUDA
6584
BM_UNARY(gpu, Rint, float, DT_FLOAT);
85+
#endif // GOOGLE_CUDA
6686

6787
// data func scalar.
6888
static Graph* BinaryScalar(int num, const string& func) {
@@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) {
90110
->Arg(1048576);
91111

92112
BM_BINARY_SCALAR(cpu, Less);
113+
#if GOOGLE_CUDA
93114
BM_BINARY_SCALAR(gpu, Less);
115+
#endif // GOOGLE_CUDA
116+
#ifdef TENSORFLOW_USE_SYCL
117+
BM_BINARY_SCALAR(sycl, Less);
118+
#endif // TENSORFLOW_USE_SYCL
119+
94120
BM_BINARY_SCALAR(cpu, Add);
121+
#if GOOGLE_CUDA
95122
BM_BINARY_SCALAR(gpu, Add);
123+
#endif // GOOGLE_CUDA
124+
#ifdef TENSORFLOW_USE_SYCL
125+
BM_BINARY_SCALAR(sycl, Add);
126+
#endif // TENSORFLOW_USE_SYCL
96127
#undef BM_BINARY_SCALAR
97128

98129
template <class T>
@@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) {
130161

131162
using Eigen::half;
132163
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
164+
#if GOOGLE_CUDA
133165
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
166+
#endif // GOOGLE_CUDA
134167
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
168+
#if GOOGLE_CUDA
135169
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
170+
#endif // GOOGLE_CUDA
136171
#undef BM_BIAS_ADD_ALL
137172
#undef BM_BIAS_ADD
138173

@@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
180215
BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
181216

182217
using Eigen::half;
218+
#if GOOGLE_CUDA
183219
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
184220
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
221+
#endif // GOOGLE_CUDA
185222
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
223+
#if GOOGLE_CUDA
186224
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
225+
#endif // GOOGLE_CUDA
187226
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
227+
#if GOOGLE_CUDA
188228
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
229+
#endif // GOOGLE_CUDA
189230
#undef BM_BIAS_ADD_GRAD_ALL
190231
#undef BM_BIAS_ADD_GRAD
191232

@@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) {
223264
BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
224265
BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
225266
BM_BCAST_ADD_ROW_ALL(cpu);
267+
#if GOOGLE_CUDA
226268
BM_BCAST_ADD_ROW_ALL(gpu);
269+
#endif // GOOGLE_CUDA
270+
#ifdef TENSORFLOW_USE_SYCL
271+
BM_BCAST_ADD_ROW_ALL(sycl);
272+
#endif // TENSORFLOW_USE_SYCL
227273
#undef BM_BCAST_ADD_ROW_ALL
228274
#undef BM_BCAST_ADD_ROW
229275

@@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu);
244290
BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
245291
BM_BCAST_ADD_COL(DEVICE, 4096, 512);
246292
BM_BCAST_ADD_COL_ALL(cpu);
293+
#if GOOGLE_CUDA
247294
BM_BCAST_ADD_COL_ALL(gpu);
295+
#endif // GOOGLE_CUDA
296+
#ifdef TENSORFLOW_USE_SYCL
297+
BM_BCAST_ADD_COL_ALL(sycl);
298+
#endif // TENSORFLOW_USE_SYCL
248299
#undef BM_BCAST_ADD_COL_ALL
249300
#undef BM_BCAST_ADD_COL
250301

tensorflow/core/kernels/reduction_ops_max.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(
6666
.TypeConstraint<int32>("Tidx") \
6767
.HostMemory("reduction_indices"), \
6868
ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
69-
// REGISTER_SYCL_KERNELS(float);
69+
REGISTER_SYCL_KERNELS(float);
7070
#undef REGISTER_SYCL_KERNELS
7171

7272
REGISTER_KERNEL_BUILDER(

tensorflow/python/client/device_lib_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def testListLocalDevices(self):
3434
# GPU test
3535
if test.is_gpu_available():
3636
self.assertGreater(len(devices), 1)
37-
self.assertTrue("GPU" in [d.device_type for d in devices])
37+
self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
3838

3939

4040
if __name__ == "__main__":

tensorflow/workspace.bzl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
6666
name = "eigen_archive",
6767
urls = [
6868
#"http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/60578b474802.tar.gz",
69-
"https://bitbucket.org/benoitsteiner/opencl/get/5c067614e3e1.tar.gz",
69+
"https://bitbucket.org/benoitsteiner/opencl/get/796628790f36.tar.gz",
7070
],
7171
#sha256 = "7527cda827aff351981ebd910012e16be4d899c28a9ae7f143ae60e7f3f7b83d",
72-
strip_prefix = "benoitsteiner-opencl-5c067614e3e1",
72+
strip_prefix = "benoitsteiner-opencl-796628790f36",
7373
build_file = str(Label("//third_party:eigen.BUILD")),
7474
)
7575

0 commit comments

Comments (0)