Skip to content

Commit b847171

Browse files
Luke Iwanski authored and benoitsteiner committed
Sycl improvements (#44)
- Eigen version bump - Extends Cast and Cwise ops benchmark to cover Sycl device - Extends device_lib_test.py to cover Sycl device - Registers int32, string and ResourceHandle to run on host for Enter and RefEnter Sycl Ops - Enables ReduceMax op for Sycl since Eigen implementation is ready - Registers Less op for Sycl device
1 parent 91a06a9 commit b847171

File tree

8 files changed

+105
-5
lines changed

8 files changed

+105
-5
lines changed

tensorflow/core/BUILD

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ load(
111111
"//tensorflow/core:platform/default/build_config_root.bzl",
112112
"tf_cuda_tests_tags",
113113
)
114+
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
114115

115116
# -----------------------------------------------------------------------------
116117
# Public targets
@@ -712,7 +713,7 @@ cc_library(
712713
"//tensorflow/core/kernels:ops_testutil",
713714
"//tensorflow/core/kernels:ops_util",
714715
"//tensorflow/core/platform/default/build_config:gtest",
715-
],
716+
] + if_sycl([":sycl_runtime"]),
716717
)
717718

718719
# This is a link-only library to provide a DirectSession

tensorflow/core/kernels/cast_op_test.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) {
105105
testing::BytesProcessed(static_cast<int64>(iters) * num *
106106
(sizeof(float) + sizeof(int64)));
107107
testing::UseRealTime();
108+
#if GOOGLE_CUDA
108109
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
110+
#endif // GOOGLE_CUDA
111+
#ifdef TENSORFLOW_USE_SYCL
112+
test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
113+
#endif // TENSORFLOW_USE_SYCL
109114
}
110115
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
111116

@@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) {
123128
testing::BytesProcessed(static_cast<int64>(iters) * num *
124129
(sizeof(bool) + sizeof(float)));
125130
testing::UseRealTime();
131+
#if GOOGLE_CUDA
126132
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
133+
#endif // GOOGLE_CUDA
134+
#ifdef TENSORFLOW_USE_SYCL
135+
test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
136+
#endif // TENSORFLOW_USE_SYCL
127137
}
128138
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
129139

@@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) {
168178
testing::BytesProcessed(static_cast<int64>(iters) * num *
169179
(sizeof(float) + sizeof(Eigen::half)));
170180
testing::UseRealTime();
181+
#if GOOGLE_CUDA
171182
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
183+
#endif // GOOGLE_CUDA
172184
}
173185
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
174186

@@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) {
177189
testing::BytesProcessed(static_cast<int64>(iters) * num *
178190
(sizeof(float) + sizeof(Eigen::half)));
179191
testing::UseRealTime();
192+
#if GOOGLE_CUDA
180193
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
194+
#endif // GOOGLE_CUDA
181195
}
182196
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
183197

tensorflow/core/kernels/control_flow_ops.cc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
321321

322322
#undef REGISTER_SYCL_KERNEL
323323
#undef REGISTER_SYCL_REF_KERNEL
324+
#define REGISTER_SYCL_HOST_KERNEL(type) \
325+
REGISTER_KERNEL_BUILDER(Name("Enter") \
326+
.Device(DEVICE_SYCL) \
327+
.HostMemory("data") \
328+
.HostMemory("output") \
329+
.TypeConstraint<type>("T"), \
330+
EnterOp)
331+
332+
#define REGISTER_SYCL_HOST_REF_KERNEL(type) \
333+
REGISTER_KERNEL_BUILDER(Name("RefEnter") \
334+
.Device(DEVICE_SYCL) \
335+
.HostMemory("data") \
336+
.HostMemory("output") \
337+
.TypeConstraint<type>("T"), \
338+
EnterOp)
339+
340+
REGISTER_SYCL_HOST_KERNEL(int32);
341+
REGISTER_SYCL_HOST_REF_KERNEL(int32);
342+
REGISTER_SYCL_HOST_KERNEL(string);
343+
REGISTER_SYCL_HOST_REF_KERNEL(string);
344+
REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
345+
346+
#undef REGISTER_SYCL_HOST_KERNEL
347+
#undef REGISTER_SYCL_HOST_REF_KERNEL
324348
#endif
325349

326350
// Special GPU kernels for int32 and string.

tensorflow/core/kernels/cwise_op_less.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less")
3333
.TypeConstraint<int32>("T"),
3434
BinaryOp<CPUDevice, functor::less<int32>>);
3535
#endif
36+
#ifdef TENSORFLOW_USE_SYCL
37+
REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);
3638

39+
REGISTER_KERNEL_BUILDER(Name("Less")
40+
.Device(DEVICE_SYCL)
41+
.HostMemory("x")
42+
.HostMemory("y")
43+
.HostMemory("z")
44+
.TypeConstraint<int32>("T"),
45+
BinaryOp<CPUDevice, functor::less<int32>>);
46+
#endif // TENSORFLOW_USE_SYCL
3747
} // namespace tensorflow

tensorflow/core/kernels/cwise_ops_test.cc

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); }
5151
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
5252

5353
BM_UNARY(cpu, Floor, float, DT_FLOAT);
54+
#if GOOGLE_CUDA
5455
BM_UNARY(gpu, Floor, float, DT_FLOAT);
56+
#endif // GOOGLE_CUDA
57+
#ifdef TENSORFLOW_USE_SYCL
58+
BM_UNARY(sycl, Floor, float, DT_FLOAT);
59+
#endif // TENSORFLOW_USE_SYCL
60+
5561
BM_UNARY(cpu, Floor, double, DT_DOUBLE);
62+
#if GOOGLE_CUDA
5663
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
64+
#endif // GOOGLE_CUDA
65+
#ifdef TENSORFLOW_USE_SYCL
66+
BM_UNARY(sycl, Floor, double, DT_DOUBLE);
67+
#endif // TENSORFLOW_USE_SYCL
68+
5769
BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
70+
#if GOOGLE_CUDA
5871
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
72+
#endif // GOOGLE_CUDA
5973
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
74+
#if GOOGLE_CUDA
6075
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
76+
#endif // GOOGLE_CUDA
6177

6278
BM_UNARY(cpu, Rint, double, DT_DOUBLE);
79+
#if GOOGLE_CUDA
6380
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
81+
#endif // GOOGLE_CUDA
6482
BM_UNARY(cpu, Rint, float, DT_FLOAT);
83+
#if GOOGLE_CUDA
6584
BM_UNARY(gpu, Rint, float, DT_FLOAT);
85+
#endif // GOOGLE_CUDA
6686

6787
// data func scalar.
6888
static Graph* BinaryScalar(int num, const string& func) {
@@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) {
90110
->Arg(1048576);
91111

92112
BM_BINARY_SCALAR(cpu, Less);
113+
#if GOOGLE_CUDA
93114
BM_BINARY_SCALAR(gpu, Less);
115+
#endif // GOOGLE_CUDA
116+
#ifdef TENSORFLOW_USE_SYCL
117+
BM_BINARY_SCALAR(sycl, Less);
118+
#endif // TENSORFLOW_USE_SYCL
119+
94120
BM_BINARY_SCALAR(cpu, Add);
121+
#if GOOGLE_CUDA
95122
BM_BINARY_SCALAR(gpu, Add);
123+
#endif // GOOGLE_CUDA
124+
#ifdef TENSORFLOW_USE_SYCL
125+
BM_BINARY_SCALAR(sycl, Add);
126+
#endif // TENSORFLOW_USE_SYCL
96127
#undef BM_BINARY_SCALAR
97128

98129
template <class T>
@@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) {
130161

131162
using Eigen::half;
132163
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
164+
#if GOOGLE_CUDA
133165
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
166+
#endif // GOOGLE_CUDA
134167
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
168+
#if GOOGLE_CUDA
135169
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
170+
#endif // GOOGLE_CUDA
136171
#undef BM_BIAS_ADD_ALL
137172
#undef BM_BIAS_ADD
138173

@@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
180215
BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
181216

182217
using Eigen::half;
218+
#if GOOGLE_CUDA
183219
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
184220
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
221+
#endif // GOOGLE_CUDA
185222
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
223+
#if GOOGLE_CUDA
186224
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
225+
#endif // GOOGLE_CUDA
187226
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
227+
#if GOOGLE_CUDA
188228
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
229+
#endif // GOOGLE_CUDA
189230
#undef BM_BIAS_ADD_GRAD_ALL
190231
#undef BM_BIAS_ADD_GRAD
191232

@@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) {
223264
BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
224265
BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
225266
BM_BCAST_ADD_ROW_ALL(cpu);
267+
#if GOOGLE_CUDA
226268
BM_BCAST_ADD_ROW_ALL(gpu);
269+
#endif // GOOGLE_CUDA
270+
#ifdef TENSORFLOW_USE_SYCL
271+
BM_BCAST_ADD_ROW_ALL(sycl);
272+
#endif // TENSORFLOW_USE_SYCL
227273
#undef BM_BCAST_ADD_ROW_ALL
228274
#undef BM_BCAST_ADD_ROW
229275

@@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu);
244290
BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
245291
BM_BCAST_ADD_COL(DEVICE, 4096, 512);
246292
BM_BCAST_ADD_COL_ALL(cpu);
293+
#if GOOGLE_CUDA
247294
BM_BCAST_ADD_COL_ALL(gpu);
295+
#endif // GOOGLE_CUDA
296+
#ifdef TENSORFLOW_USE_SYCL
297+
BM_BCAST_ADD_COL_ALL(sycl);
298+
#endif // TENSORFLOW_USE_SYCL
248299
#undef BM_BCAST_ADD_COL_ALL
249300
#undef BM_BCAST_ADD_COL
250301

tensorflow/core/kernels/reduction_ops_max.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(
6666
.TypeConstraint<int32>("Tidx") \
6767
.HostMemory("reduction_indices"), \
6868
ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
69-
// REGISTER_SYCL_KERNELS(float);
69+
REGISTER_SYCL_KERNELS(float);
7070
#undef REGISTER_SYCL_KERNELS
7171

7272
REGISTER_KERNEL_BUILDER(

tensorflow/python/client/device_lib_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def testListLocalDevices(self):
3434
# GPU test
3535
if test.is_gpu_available():
3636
self.assertGreater(len(devices), 1)
37-
self.assertTrue("GPU" in [d.device_type for d in devices])
37+
self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
3838

3939

4040
if __name__ == "__main__":

tensorflow/workspace.bzl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
6666
name = "eigen_archive",
6767
urls = [
6868
#"http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/60578b474802.tar.gz",
69-
"https://bitbucket.org/benoitsteiner/opencl/get/5c067614e3e1.tar.gz",
69+
"https://bitbucket.org/benoitsteiner/opencl/get/796628790f36.tar.gz",
7070
],
7171
#sha256 = "7527cda827aff351981ebd910012e16be4d899c28a9ae7f143ae60e7f3f7b83d",
72-
strip_prefix = "benoitsteiner-opencl-5c067614e3e1",
72+
strip_prefix = "benoitsteiner-opencl-796628790f36",
7373
build_file = str(Label("//third_party:eigen.BUILD")),
7474
)
7575

0 commit comments

Comments (0)