diff --git a/README.md b/README.md index bbff7c3e0f8..0a24ce6c436 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Hello, TensorFlow! >>> ``` -##For more information +## For more information * [TensorFlow website](http://tensorflow.org) * [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf) diff --git a/configure b/configure index 5a9a7c0d30a..95e9fb64d8f 100755 --- a/configure +++ b/configure @@ -504,7 +504,7 @@ if [ "$TF_NEED_OPENCL" == "1" ]; then while true; do fromuser="" if [ -z "$HOST_CXX_COMPILER" ]; then - default_cxx_host_compiler=$(which clang++-3.6 || true) + default_cxx_host_compiler=$(which g++-4.8 || true) read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER fromuser="1" if [ -z "$HOST_CXX_COMPILER" ]; then @@ -527,7 +527,7 @@ done while true; do fromuser="" if [ -z "$HOST_C_COMPILER" ]; then - default_c_host_compiler=$(which clang-3.6 || true) + default_c_host_compiler=$(which gcc-4.8 || true) read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER fromuser="1" if [ -z "$HOST_C_COMPILER" ]; then diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md index c7ff4a2921e..47d2c862ec0 100644 --- a/tensorflow/contrib/tfprof/README.md +++ b/tensorflow/contrib/tfprof/README.md @@ -6,7 +6,7 @@ Author: Xin Pan (xpan@google.com, github: panyx0718) Consultants: Jon Shlens, Pete Warden -###Major Features +### Major Features 1. Measure model parameters, float operations, tensor shapes. 2. Measure op execution times, requested memory size and device placement. diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 145d2a838bf..8087a377b8a 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1597,6 +1597,7 @@ cc_library( hdrs = if_not_windows([ "common_runtime/sycl/sycl_allocator.h", "common_runtime/sycl/sycl_device.h", + "common_runtime/sycl/sycl_util.h", "common_runtime/sycl/sycl_device_context.h", ]), copts = tf_copts(), diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 76576b64b3b..05103f01f17 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -871,8 +871,6 @@ class BlockingOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp); REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc(""); -REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_SYCL), BlockingOp); - static void TestSessionInterOpThreadsImpl(bool use_function_lib) { FunctionDefLibrary library_graph_def; if (use_function_lib) { @@ -910,6 +908,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) { ->set_opt_level(OptimizerOptions_Level_L0); (*options.config.mutable_device_count())["CPU"] = 2; (*options.config.mutable_device_count())["GPU"] = 0; + (*options.config.mutable_device_count())["SYCL"] = 0; options.config.add_session_inter_op_thread_pool(); auto* p = options.config.add_session_inter_op_thread_pool(); diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 6f92cd09d3b..0e2e3644217 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -138,7 +138,8 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelWarmup) { DirectSession* ds = static_cast(session.get()); CostModelManager::CostModelMap cost_models; ds->ExportCostModels(&cost_models); - CHECK_EQ(cost_models.size(), 1); + ASSERT_GE(2, cost_models.size()); + ASSERT_LE(1, cost_models.size()); const CostModel* cm = (*cost_models.begin()).second; EXPECT_EQ(measure_steps, cm->GetUpdateTimes()); } @@ -155,10 +156,16 @@ static void TestHWAccelerator(bool enableHWTrace) { test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); +#ifdef TENSORFLOW_USE_SYCL + x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); +#endif // TENSORFLOW_USE_SYCL // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); +#ifdef TENSORFLOW_USE_SYCL +y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); +#endif // TENSORFLOW_USE_SYCL Node* y_neg = test::graph::Unary(&graph, "Neg", y); y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); @@ -169,6 +176,9 @@ static void TestHWAccelerator(bool enableHWTrace) { SessionOptions options; (*options.config.mutable_device_count())["CPU"] = 1; (*options.config.mutable_device_count())["GPU"] = 1; +#ifdef TENSORFLOW_USE_SYCL + (*options.config.mutable_device_count())["SYCL"] = 1; +#endif // TENSORFLOW_USE_SYCL options.config.set_allow_soft_placement(true); options.config.mutable_graph_options()->set_build_cost_model(1); std::unique_ptr session(NewSession(options)); diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc index 80c483e70b0..a755fbe4c4d 100644 --- a/tensorflow/core/common_runtime/memory_types.cc +++ b/tensorflow/core/common_runtime/memory_types.cc @@ -45,12 +45,12 @@ struct EndpointEq { static Status ProcessMemoryTypes( DeviceType device_type, const Graph* g, std::function fn) { - if (device_type != DEVICE_GPU) { - // On non-GPU devices, HOST_MEMORY and DEVICE_MEMORY are always + if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL ) { + // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always // compatible. return Status::OK(); } - // For GPU device, HOST_MEMORY and DEVICE_MEMORY is not + // For GPU and SYCL device, HOST_MEMORY and DEVICE_MEMORY is not // compatible. I.e., a conversion/transfer must be done. // // {node id, slot id} -> memory type. diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc index 06d7daea9cd..55eade0566c 100644 --- a/tensorflow/core/common_runtime/memory_types_test.cc +++ b/tensorflow/core/common_runtime/memory_types_test.cc @@ -34,6 +34,9 @@ TEST(MemoryTypeChecker, Int32OK) { // There is a kernel for adding two int32s on host memory. TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g)); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g)); +#endif // TENSORFLOW_USE_SYCL delete g; } @@ -53,6 +56,15 @@ TEST(MemoryTypeChecker, Int32NotOk) { TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/gpu:0", g)); TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g)); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + // There is no kernel for casting int32/host memory to float/device + // memory. + EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_SYCL, g))); + + // But we can insert _HostSend/_HostRecv to ensure the invariant. + TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g)); + TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g)); +#endif // TENSORFLOW_USE_SYCL delete g; } @@ -74,6 +86,12 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) { // int Switch's output on GPU has HOST_MEMORY constraint. EXPECT_EQ(memory_type, HOST_MEMORY); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + auto si = test::graph::Switch(g, test::graph::Constant(g, vi), pred); + TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type)); + // int Switch's output on GPU has HOST_MEMORY constraint. + EXPECT_EQ(memory_type, HOST_MEMORY); +#endif // TENSORFLOW_USE_SYCL delete g; } diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index a643fc72580..594c582ff30 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -18,6 +18,8 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/sycl/sycl_device.h" +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" + namespace tensorflow { class SYCLDeviceFactory : public DeviceFactory { @@ -34,7 +36,7 @@ class SYCLDeviceFactory : public DeviceFactory { devices->push_back( new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(), SYCLDevice::GetShortDeviceDescription(), - cl::sycl::gpu_selector(), cpu_allocator())); + GetSYCLDevice(), cpu_allocator())); } return Status::OK(); } diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h new file mode 100644 index 00000000000..dc334394730 --- /dev/null +++ b/tensorflow/core/common_runtime/sycl/sycl_util.h @@ -0,0 +1,57 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !TENSORFLOW_USE_SYCL +#error This file must only be included when building TensorFlow with SYCL support +#endif + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ + +// For DMA helper +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + inline void* GetBase(const Tensor* src) { + return const_cast(DMAHelper::base(src)); + } + + inline void* GetBase(Tensor* dst) { return DMAHelper::base(dst); } + + inline cl::sycl::device GetSYCLDevice() { + // Obtain list of supported devices from Eigen + for (const auto& device :Eigen::get_sycl_supported_devices()) { + if(device.is_gpu()) { + // returns first found GPU + return device; + } + } + + // Currently Intel GPU is not supported + LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, trying OpenCL CPU"; + + for (const auto& device :Eigen::get_sycl_supported_devices()) { + if(device.is_cpu()) { + // returns first found CPU + return device; + } + } + // Currently Intel GPU is not supported + LOG(FATAL) << "No OpenCL GPU nor CPU found that is supported by ComputeCpp"; + } +} + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc index 24b9dd799aa..d5aec89f52b 100644 --- a/tensorflow/core/debug/debug_gateway.cc +++ b/tensorflow/core/debug/debug_gateway.cc @@ -84,7 +84,7 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot, // Determine if the tensor is on device (GPU) or host (CPU). // The second part of the check is necessary because even an OpKernel on // may have output tensors allocated on CPU. - if (device->name().find("gpu:") != string::npos && + if ((device->name().find("gpu:") != string::npos || device->name().find("SYCL:") != string::npos) && !ctx->output_alloc_attr(output_slot).on_host()) { // GPU tensors: Copy it to host (CPU). DeviceContext* device_ctxt = ctx->op_device_context(); diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc index d6f656c5ca3..b50f6465be0 100644 --- a/tensorflow/core/debug/debug_gateway_test.cc +++ b/tensorflow/core/debug/debug_gateway_test.cc @@ -45,6 +45,8 @@ class SessionDebugMinusAXTest : public ::testing::Test { #if GOOGLE_CUDA const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; +#elif defined(TENSORFLOW_USE_SYCL) + const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #else const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; #endif @@ -302,6 +304,8 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) { // through RunMetadata, given whether GPU is involved. #if GOOGLE_CUDA ASSERT_EQ(2, run_metadata.partition_graphs().size()); +#elif defined(TENSORFLOW_USE_SYCL) + ASSERT_EQ(2, run_metadata.partition_graphs().size()); #else ASSERT_EQ(1, run_metadata.partition_graphs().size()); #endif @@ -336,7 +340,7 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) { ASSERT_EQ(1, debug_nan_count_tensor_vals[0].scalar()()); } -#ifndef GOOGLE_CUDA +#if !defined(GOOGLE_CUDA) && !defined(TENSORFLOW_USE_SYCL) // TODO(cais): Reinstate the following test for concurrent debugged runs on // a GPU once the root cause of the ~0.5% flakiness has been addressed. // (b/34081273) @@ -499,6 +503,8 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test { #if GOOGLE_CUDA const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; +#elif defined(TENSORFLOW_USE_SYCL) + const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #else const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; #endif @@ -599,6 +605,8 @@ class SessionDebugVariableTest : public ::testing::Test { #if GOOGLE_CUDA const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; +#elif defined(TENSORFLOW_USE_SYCL) + const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #else const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; #endif @@ -818,6 +826,8 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) { #if GOOGLE_CUDA ASSERT_EQ(2, run_metadata.partition_graphs().size()); +#elif defined(TENSORFLOW_USE_SYCL) + ASSERT_EQ(2, run_metadata.partition_graphs().size()); #else ASSERT_EQ(1, run_metadata.partition_graphs().size()); #endif @@ -855,13 +865,17 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) { ASSERT_EQ(2, debug_nan_count_tensor_vals[0].scalar()()); } -#if GOOGLE_CUDA +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_SYCL) class SessionDebugGPUSwitchTest : public ::testing::Test { public: void Initialize() { Graph graph(OpRegistry::Global()); +#ifdef GOOGLE_CUDA const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; +#elif TENSORFLOW_USE_SYCL + const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; +#endif Tensor vb(DT_BOOL, TensorShape({})); vb.scalar()() = true; diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index a56b8cb4b39..9d9b0afcbaa 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -94,9 +94,9 @@ OpKernel::OpKernel(OpKernelConstruction* context) OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(), context->graph_def_version())); - // Kernels executing on GPU tie very few resources on the CPU where the + // Kernels executing on GPU/SYCL tie very few resources on the CPU where the // scheduler runs: we consider them as inexpensive. - expensive_ = context->device_type() != DeviceType(DEVICE_GPU); + expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL); } OpKernel::~OpKernel() {} diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index f0ab5520f11..f80530b5c01 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -36,6 +36,10 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), HostConstantOp); REGISTER_KERNEL_BUILDER( Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), HostConstantOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp); +#endif // TENSORFLOW_USE_SYCL // Register the HostConst Op // Returns a constant tensor on the host. Useful for writing C++ tests diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 708b65a6fe9..fff501e17e7 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -32,6 +32,7 @@ load( "tf_kernel_library", "cc_header_only_library", ) +load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests") load( @@ -433,7 +434,7 @@ ARRAY_DEPS = [ "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", "//third_party/eigen3", -] +] + if_sycl(["//tensorflow/core:sycl_runtime"]) cc_library( name = "array_not_windows", diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc index 0f5186eb077..661f4784d37 100644 --- a/tensorflow/core/kernels/aggregate_ops.cc +++ b/tensorflow/core/kernels/aggregate_ops.cc @@ -165,9 +165,6 @@ REGISTER_KERNEL_BUILDER(Name("AddN") REGISTER_ADDN(float, SYCL); REGISTER_ADDN(double, SYCL); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("AddN") .Device(DEVICE_SYCL) .TypeConstraint("T") diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc index 7c95d4dd20f..e9d3811b9d3 100644 --- a/tensorflow/core/kernels/batch_norm_op.cc +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -28,6 +28,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL template class BatchNormOp : public OpKernel { @@ -207,6 +210,18 @@ TF_CALL_float(REGISTER_GPU_KERNEL); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_SYCL +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T"), \ + BatchNormOp); + +TF_CALL_float(REGISTER_KERNEL); +TF_CALL_double(REGISTER_KERNEL); +#undef REGISTER_KERNEL +#endif // TENSORFLOW_USE_SYCL + #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \ .Device(DEVICE_CPU) \ @@ -254,4 +269,17 @@ TF_CALL_float(REGISTER_GPU_KERNEL); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_SYCL +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T"), \ + BatchNormGradOp); + +TF_CALL_float(REGISTER_KERNEL); +TF_CALL_double(REGISTER_KERNEL); +#undef REGISTER_KERNEL + +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index 562934ed63b..c9021becfe8 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -239,12 +239,11 @@ class SyclCastOp : public CastOpBase { }; #define REGISTER_CAST_SYCL(srctype, dsttype) \ - REGISTER_KERNEL_BUILDER(Name("Cast") \ - .TypeConstraint("SrcT") \ - .TypeConstraint("DstT") \ + REGISTER_KERNEL_BUILDER(Name("Cast") \ + .TypeConstraint("SrcT") \ + .TypeConstraint("DstT") \ .Device(DEVICE_SYCL), \ SyclCastOp) - CURRY_TYPES2(REGISTER_CAST_SYCL, bool); CURRY_TYPES2(REGISTER_CAST_SYCL, int32); CURRY_TYPES2(REGISTER_CAST_SYCL, int64); @@ -268,4 +267,3 @@ REGISTER_KERNEL_BUILDER( CpuCastOp); #endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h index 1ee0796ac14..60ca2d281e1 100644 --- a/tensorflow/core/kernels/cast_op_impl.h +++ b/tensorflow/core/kernels/cast_op_impl.h @@ -171,4 +171,3 @@ GetSyclCastFromDouble(DataType dst_dtype); } // namespace tensorflow #endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_ - diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc index fca9cd60ec1..69ed7604558 100644 --- a/tensorflow/core/kernels/cast_op_impl_int32.cc +++ b/tensorflow/core/kernels/cast_op_impl_int32.cc @@ -38,10 +38,9 @@ GetGpuCastFromInt32(DataType dst_dtype) { typedef Eigen::SyclDevice SYCLDevice; std::function GetSyclCastFromInt32(DataType dst_dtype) { - CURRY_TYPES3(CAST_CASE, CPUDevice, int32); + CURRY_TYPES3(CAST_CASE, SYCLDevice, int32); return nullptr; } #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow - diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc index c0a543708d3..7a8363ca39c 100644 --- a/tensorflow/core/kernels/cast_op_impl_int64.cc +++ b/tensorflow/core/kernels/cast_op_impl_int64.cc @@ -19,9 +19,6 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -#ifdef TENSORFLOW_USE_SYCL -typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL std::function GetCpuCastFromInt64(DataType dst_dtype) { diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc index f89948350c3..da495876bbc 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.cc +++ b/tensorflow/core/kernels/concat_lib_cpu.cc @@ -89,7 +89,7 @@ void ConcatSYCL(const Eigen::SyclDevice& d, const std::vector::ConstMatrix>>&, \ typename TTypes::Matrix* output); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL) +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL) #undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index 9628a7efa4b..1a05d371ce7 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -230,7 +230,8 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2") .HostMemory("axis"), \ ConcatV2Op) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); + REGISTER_KERNEL_BUILDER(Name("Concat") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -246,6 +247,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2") .HostMemory("axis") .HostMemory("output"), ConcatV2Op); + #undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index ac842005af2..9721f7f0685 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -30,6 +30,10 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/macros.h" +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + namespace tensorflow { ConstantOp::ConstantOp(OpKernelConstruction* ctx) @@ -52,18 +56,6 @@ ConstantOp::~ConstantOp() {} REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Const").Device(DEVICE_SYCL).TypeConstraint("dtype"), \ - ConstantOp); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -REGISTER_SYCL_KERNEL(bool); -REGISTER_SYCL_KERNEL(int64); -#undef REGISTER_SYCL_KERNEL -#endif - #if GOOGLE_CUDA #define REGISTER_KERNEL(D, TYPE) \ REGISTER_KERNEL_BUILDER( \ @@ -85,6 +77,22 @@ REGISTER_KERNEL(GPU, bool); #undef REGISTER_KERNEL #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(D, TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Const").Device(DEVICE_##D).TypeConstraint("dtype"), \ + ConstantOp); +REGISTER_SYCL_KERNEL(SYCL, float); +REGISTER_SYCL_KERNEL(SYCL, double); +REGISTER_SYCL_KERNEL(SYCL, uint8); +REGISTER_SYCL_KERNEL(SYCL, int8); +REGISTER_SYCL_KERNEL(SYCL, uint16); +REGISTER_SYCL_KERNEL(SYCL, int16); +REGISTER_SYCL_KERNEL(SYCL, int64); +REGISTER_SYCL_KERNEL(SYCL, bool); +#undef REGISTER_SYCL_KERNEL +#endif + HostConstantOp::HostConstantOp(OpKernelConstruction* ctx) : OpKernel(ctx), tensor_(ctx->output_type(0)) { const TensorProto* proto = nullptr; @@ -116,9 +124,6 @@ REGISTER_KERNEL_BUILDER(Name("Const") #endif #ifdef TENSORFLOW_USE_SYCL -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Const") .Device(DEVICE_SYCL) .HostMemory("output") @@ -144,12 +149,33 @@ struct FillFunctor { }; #ifdef TENSORFLOW_USE_SYCL +namespace Eigen { +namespace internal { + +template +struct scalar_const_op { + const T val; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + scalar_const_op(const scalar_const_op & x) + : val(x.val) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T v) : val(v) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()() const { + return val; + } +}; +} +} + // Partial specialization of FillFunctor. template struct FillFunctor { void operator()(const SYCLDevice& d, typename TTypes::Flat out, - typename TTypes::ConstScalar in) { - To32Bit(out).device(d) = To32Bit(out).constant(in()); + T in) { + Eigen::internal::scalar_const_op f(in); + To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f); } }; #endif // TENSORFLOW_USE_SYCL @@ -184,6 +210,43 @@ class FillOp : public OpKernel { } }; +#ifdef TENSORFLOW_USE_SYCL +template +class FillOp : public OpKernel { + public: + explicit FillOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& Tdims = context->input(0); + OP_REQUIRES( + context, IsLegacyVector(Tdims.shape()), + errors::InvalidArgument("dims must be a vector of int32, got shape ", + Tdims.shape().DebugString())); + const Tensor& Tvalue = context->input(1); + + T data_host; + auto device = context->eigen_sycl_device(); + auto size = sizeof(T); + auto src_ptr = GetBase(&Tvalue); + device.memcpyDeviceToHost(&data_host, static_cast(src_ptr), size); + + OP_REQUIRES(context, IsLegacyScalar(Tvalue.shape()), + errors::InvalidArgument("value must be a scalar, got shape ", + Tvalue.shape().DebugString())); + auto dims = Tdims.flat(); + TensorShape shape; + OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( + reinterpret_cast(dims.data()), + dims.size(), &shape)); + Tensor* out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, shape, &out)); + functor::FillFunctor functor; + functor(context->eigen_device(), out->flat(), + data_host); + } +}; +#endif // TENSORFLOW_USE_SYCL + #define REGISTER_KERNEL(D, TYPE) \ REGISTER_KERNEL_BUILDER(Name("Fill") \ .Device(DEVICE_##D) \ @@ -199,8 +262,14 @@ REGISTER_KERNEL(CPU, quint8); #undef REGISTER_CPU_KERNEL #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL(SYCL, float) -REGISTER_KERNEL(SYCL, double) +REGISTER_KERNEL(SYCL, float); +REGISTER_KERNEL(SYCL, double); +REGISTER_KERNEL(SYCL, uint8); +REGISTER_KERNEL(SYCL, int8); +REGISTER_KERNEL(SYCL, uint16); +REGISTER_KERNEL(SYCL, int16); +REGISTER_KERNEL(SYCL, int64); + REGISTER_KERNEL_BUILDER(Name("Fill") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -261,8 +330,10 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CPU); #undef REGISTER_CPU #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL(float, SYCL); REGISTER_KERNEL(bool, SYCL); +REGISTER_KERNEL(float, SYCL); +REGISTER_KERNEL(double, SYCL); +REGISTER_KERNEL(int64, SYCL); REGISTER_KERNEL_BUILDER(Name("ZerosLike") .Device(DEVICE_SYCL) .TypeConstraint("T") diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 6a79be5a952..23b59e15686 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -67,6 +67,22 @@ void SwitchOp::Compute(OpKernelContext* context) { .TypeConstraint("T"), \ SwitchOp) +#if TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ + SwitchOp) + +#define REGISTER_SYCL_REF_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ + SwitchOp) +#endif // TENSORFLOW_USE_SYCL + TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH); TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH); TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH); @@ -74,10 +90,19 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH); +#ifdef TENSORFLOW_USE_SYCL +TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH); +TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); +#endif // TENSORFLOW_USE_SYCL + #undef REGISTER_CPU_SWITCH #undef REGISTER_CPU_REF_SWITCH #undef REGISTER_GPU_SWITCH #undef REGISTER_GPU_REF_SWITCH +#ifdef TENSORFLOW_USE_SYCL +#undef REGISTER_SYCL_SWITCH +#undef REGISTER_SYCL_REF_SWITCH +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -112,28 +137,36 @@ REGISTER_GPU_HOST_REF_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL #undef REGISTER_GPU_HOST_REF_KERNEL -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T") \ - .HostMemory("pred"), \ +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) -REGISTER_SYCL_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#define REGISTER_SYCL_REF_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) -REGISTER_SYCL_REF_SWITCH(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); -#undef REGISTER_SYCL_KERNEL -#undef REGISTER_SYCL_REF_SWITCH +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_REF_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(bool); +REGISTER_SYCL_HOST_REF_KERNEL(bool); +REGISTER_SYCL_HOST_KERNEL(string); +REGISTER_SYCL_HOST_REF_KERNEL(string); +#undef REGISTER_SYCL_HOST_KERNEL +#undef REGISTER_SYCL_HOST_REF_KERNEL #endif // TENSORFLOW_USE_SYCL class RefSelectOp : public OpKernel { @@ -233,52 +266,76 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_KERNEL #undef REGISTER_GPU_REF_KERNEL -#if TENSORFLOW_USE_SYCL +// Special GPU kernels for int32 and string. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +#define REGISTER_GPU_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Merge") \ + .Device(DEVICE_GPU) \ + .HostMemory("inputs") \ + .HostMemory("output") \ + .HostMemory("value_index") \ + .TypeConstraint("T"), \ + MergeOp); \ + REGISTER_KERNEL_BUILDER(Name("RefMerge") \ + .Device(DEVICE_GPU) \ + .HostMemory("inputs") \ + .HostMemory("output") \ + .HostMemory("value_index") \ + .TypeConstraint("T"), \ + MergeOp) + +REGISTER_GPU_HOST_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(string); +REGISTER_GPU_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_GPU_HOST_KERNEL + +#ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Merge") \ .Device(DEVICE_SYCL) \ .TypeConstraint("T") \ .HostMemory("value_index"), \ - MergeOp) -REGISTER_SYCL_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); + MergeOp); #define REGISTER_SYCL_REF_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("RefMerge") \ .Device(DEVICE_SYCL) \ .TypeConstraint("T") \ .HostMemory("value_index"), \ - MergeOp) -REGISTER_SYCL_REF_KERNEL(bool); + MergeOp); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); +REGISTER_SYCL_KERNEL(bool); +REGISTER_SYCL_REF_KERNEL(bool); + #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL -// Special GPU kernels for int32 and string. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. -#define REGISTER_GPU_HOST_KERNEL(type) \ +#define REGISTER_SYCL_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Merge") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .HostMemory("inputs") \ .HostMemory("output") \ .HostMemory("value_index") \ .TypeConstraint("T"), \ MergeOp); \ REGISTER_KERNEL_BUILDER(Name("RefMerge") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .HostMemory("inputs") \ .HostMemory("output") \ .HostMemory("value_index") \ .TypeConstraint("T"), \ MergeOp) -REGISTER_GPU_HOST_KERNEL(int32); -REGISTER_GPU_HOST_KERNEL(string); -REGISTER_GPU_HOST_KERNEL(ResourceHandle); +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +REGISTER_SYCL_HOST_KERNEL(ResourceHandle); -#undef REGISTER_GPU_HOST_KERNEL +#undef REGISTER_SYCL_HOST_KERNEL +#endif // TENSORFLOW_USE_SYCL void EnterOp::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { @@ -306,46 +363,22 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_KERNEL #undef REGISTER_GPU_REF_KERNEL -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("Enter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) -REGISTER_SYCL_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); - -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) -REGISTER_SYCL_REF_KERNEL(bool); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); +REGISTER_SYCL_KERNEL(bool); +REGISTER_SYCL_REF_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL -#define REGISTER_SYCL_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Enter") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - EnterOp) - -#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefEnter") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - EnterOp) - -REGISTER_SYCL_HOST_KERNEL(int32); -REGISTER_SYCL_HOST_REF_KERNEL(int32); -REGISTER_SYCL_HOST_KERNEL(string); -REGISTER_SYCL_HOST_REF_KERNEL(string); -REGISTER_SYCL_HOST_KERNEL(ResourceHandle); - -#undef REGISTER_SYCL_HOST_KERNEL -#undef REGISTER_SYCL_HOST_REF_KERNEL -#endif +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -375,6 +408,33 @@ REGISTER_GPU_HOST_KERNEL(ResourceHandle); #undef REGISTER_GPU_HOST_KERNEL #undef REGISTER_GPU_HOST_REF_KERNEL +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Enter") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + EnterOp) + +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefEnter") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + EnterOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_REF_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +REGISTER_SYCL_HOST_REF_KERNEL(string); +REGISTER_SYCL_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_SYCL_HOST_KERNEL +#undef REGISTER_SYCL_HOST_REF_KERNEL +#endif // TENSORFLOW_USE_SYCL + void ExitOp::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { context->forward_ref_input_to_ref_output(0, 0); @@ -399,65 +459,63 @@ REGISTER_GPU_KERNEL(bool); #undef REGISTER_GPU_KERNEL #undef REGISTER_GPU_REF_KERNEL -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp) -REGISTER_SYCL_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefExit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("RefExit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp) -REGISTER_SYCL_REF_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); +REGISTER_SYCL_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. -#define REGISTER_SYCL_HOST_KERNEL(type) \ +#define REGISTER_GPU_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Exit") \ - .Device(DEVICE_SYCL) \ + .Device(DEVICE_GPU) \ .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ ExitOp); \ REGISTER_KERNEL_BUILDER(Name("RefExit") \ - .Device(DEVICE_SYCL) \ + .Device(DEVICE_GPU) \ .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ ExitOp) -REGISTER_SYCL_HOST_KERNEL(int32); -REGISTER_SYCL_HOST_KERNEL(string); -#undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +REGISTER_GPU_HOST_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(string); -// Special GPU kernels for int32 and string. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. -#define REGISTER_GPU_HOST_KERNEL(type) \ +#undef REGISTER_GPU_HOST_KERNEL + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Exit") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ ExitOp); \ REGISTER_KERNEL_BUILDER(Name("RefExit") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ ExitOp) -REGISTER_GPU_HOST_KERNEL(int32); -REGISTER_GPU_HOST_KERNEL(string); +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); -#undef REGISTER_GPU_HOST_KERNEL +#undef REGISTER_SYCL_HOST_KERNEL +#endif // TENSORFLOW_USE_SYCL void NextIterationOp::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { @@ -485,6 +543,21 @@ REGISTER_GPU_KERNEL(bool); #undef REGISTER_GPU_KERNEL +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ + NextIterationOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"),\ + NextIterationOp) + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); +REGISTER_SYCL_KERNEL(bool); + +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL + // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. @@ -507,31 +580,7 @@ REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("NextIteration") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - NextIterationOp) - REGISTER_SYCL_KERNEL(bool); - TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefNextIteration") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - NextIterationOp) - REGISTER_SYCL_REF_KERNEL(bool); - TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); -#undef REGISTER_SYCL_KERNEL -#undef REGISTER_SYCL_REF_KERNEL - -// Special GPU kernels for int32 and string. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. +#ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("NextIteration") \ .Device(DEVICE_SYCL) \ @@ -548,6 +597,7 @@ REGISTER_GPU_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); + #undef REGISTER_SYCL_HOST_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc index 8cf1eac41ed..5fd38d9dc25 100644 --- a/tensorflow/core/kernels/cwise_op_abs.cc +++ b/tensorflow/core/kernels/cwise_op_abs.cc @@ -22,17 +22,6 @@ REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32, REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128); #endif -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Abs") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64); REGISTER2(UnaryOp, GPU, "ComplexAbs", functor::abs, complex64, complex128); @@ -48,4 +37,13 @@ REGISTER_KERNEL_BUILDER(Name("Abs") UnaryOp>); #endif +#if TENSORFLOW_USE_SYCL +REGISTER3(UnaryOp, SYCL, "Abs", functor::abs, float, double, int64); +REGISTER_KERNEL_BUILDER(Name("Abs") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .TypeConstraint("T"), + UnaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index 65801da3c7c..12cc6c8bdd4 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Acos") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double); #endif + +#if TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index f6e9b59cf8d..acf1f2ad491 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -19,26 +19,6 @@ namespace tensorflow { REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32, int64); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Add") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL - -REGISTER_KERNEL_BUILDER(Name("Add") - .Device(DEVICE_SYCL) - .HostMemory("x") - .HostMemory("y") - .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double); @@ -54,4 +34,15 @@ REGISTER_KERNEL_BUILDER(Name("Add") BinaryOp>); #endif + +#if TENSORFLOW_USE_SYCL +REGISTER2(BinaryOp, SYCL, "Add", functor::add, float, double); +REGISTER_KERNEL_BUILDER(Name("Add") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc index c9ebfe759b1..c28e27d95ae 100644 --- a/tensorflow/core/kernels/cwise_op_asin.cc +++ b/tensorflow/core/kernels/cwise_op_asin.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Asin") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double); #endif + +#if TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc index 72645b303fc..7d73de48102 100644 --- a/tensorflow/core/kernels/cwise_op_atan.cc +++ b/tensorflow/core/kernels/cwise_op_atan.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Atan") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double); #endif + +#if TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index c74e10576d5..0111e9d5fd1 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Ceil") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double); #endif + +#if TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc index 634c90adc63..d4b3b0e3935 100644 --- a/tensorflow/core/kernels/cwise_op_cos.cc +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -19,19 +19,11 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Cos") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index 1e2300832fc..d44c1bf473e 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -24,32 +24,6 @@ REGISTER5(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, int16, int32, int64); REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Div") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); \ - REGISTER_KERNEL_BUILDER( \ - Name("RealDiv") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); -REGISTER_SYCL_KERNEL(float) -REGISTER_SYCL_KERNEL(double) -#undef REGISTER_SYCL_KERNEL -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. -REGISTER_KERNEL_BUILDER(Name("Div") - .Device(DEVICE_SYCL) - .HostMemory("x") - .HostMemory("y") - .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); -#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8, uint16, int16, int64, complex64, complex128); @@ -70,4 +44,15 @@ REGISTER_KERNEL_BUILDER(Name("Div") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(BinaryOp, SYCL, "Div", functor::div, float, double); +REGISTER2(BinaryOp, SYCL, "RealDiv", functor::div, float, double); +REGISTER_KERNEL_BUILDER(Name("Div") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc index 93ea768836f..dfac8960e7c 100644 --- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc @@ -35,8 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("Equal") #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER2(BinaryOp, SYCL, "Equal", functor::equal_to, float, double); - +REGISTER5(BinaryOp, SYCL, "Equal", functor::equal_to, float, double, uint8, + int8, int16); REGISTER_KERNEL_BUILDER(Name("Equal") .Device(DEVICE_SYCL) .HostMemory("x") diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc index 2e3a60cf794..9d4d6544276 100644 --- a/tensorflow/core/kernels/cwise_op_exp.cc +++ b/tensorflow/core/kernels/cwise_op_exp.cc @@ -19,19 +19,11 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Exp") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double); #endif + +#if TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc index 5573c2bcc2f..4f723080060 100644 --- a/tensorflow/core/kernels/cwise_op_expm1.cc +++ b/tensorflow/core/kernels/cwise_op_expm1.cc @@ -22,6 +22,6 @@ REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double, REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(UnaryOp, SYCL, "Expm1", functor::expm1, float); +REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 59e32d7f6f4..5a142b9ce9f 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -18,19 +18,10 @@ limitations under the License. namespace tensorflow { REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Floor") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc index 6b5a806aa21..ba89899fb32 100644 --- a/tensorflow/core/kernels/cwise_op_greater.cc +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -34,11 +34,8 @@ REGISTER_KERNEL_BUILDER(Name("Greater") BinaryOp>); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(BinaryOp, SYCL, "Greater", functor::greater, float); +REGISTER2(BinaryOp, SYCL, "Greater", functor::greater, float, double); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Greater") .Device(DEVICE_SYCL) .HostMemory("x") @@ -47,5 +44,4 @@ REGISTER_KERNEL_BUILDER(Name("Greater") .TypeConstraint("T"), BinaryOp>); #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc index ac215282561..8f0c483aecd 100644 --- a/tensorflow/core/kernels/cwise_op_greater_equal.cc +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -35,7 +35,7 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float); +REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .Device(DEVICE_SYCL) diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc index 0faeffa95ca..53ec1c1c63f 100644 --- a/tensorflow/core/kernels/cwise_op_isfinite.cc +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -19,20 +19,12 @@ namespace tensorflow { REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("IsFinite") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc index df63006b3fd..4b34744304f 100644 --- a/tensorflow/core/kernels/cwise_op_isinf.cc +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("IsInf") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc index e1cf7a86375..ad2dd3f722c 100644 --- a/tensorflow/core/kernels/cwise_op_isnan.cc +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("IsNan") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index a38f1024a9a..136c3666dfc 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -35,7 +35,6 @@ REGISTER_KERNEL_BUILDER(Name("Less") #endif #ifdef TENSORFLOW_USE_SYCL REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64); - REGISTER_KERNEL_BUILDER(Name("Less") .Device(DEVICE_SYCL) .HostMemory("x") diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc index 3a2cc2ae0e8..97a2508d129 100644 --- a/tensorflow/core/kernels/cwise_op_less_equal.cc +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -35,8 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(BinaryOp, SYCL, "LessEqual", functor::less_equal, float); - +REGISTER6(BinaryOp, SYCL, "LessEqual", functor::less_equal, float, double, + int64, uint8, int8, int16); REGISTER_KERNEL_BUILDER(Name("LessEqual") .Device(DEVICE_SYCL) .HostMemory("x") @@ -45,5 +45,4 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") .TypeConstraint("T"), BinaryOp>); #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc index 5e74e778c76..7fdfdff0e38 100644 --- a/tensorflow/core/kernels/cwise_op_log.cc +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -19,19 +19,11 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Log") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc index edb821318e8..25ad7b24bb1 100644 --- a/tensorflow/core/kernels/cwise_op_log1p.cc +++ b/tensorflow/core/kernels/cwise_op_log1p.cc @@ -19,19 +19,11 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Log1p") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc index 7311f25ec0c..87d54e380b4 100644 --- a/tensorflow/core/kernels/cwise_op_maximum.cc +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -35,11 +35,7 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(BinaryOp, SYCL, "Maximum", functor::maximum, float); - -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. +REGISTER3(BinaryOp, SYCL, "Maximum", functor::maximum, float, double, int64); REGISTER_KERNEL_BUILDER(Name("Maximum") .Device(DEVICE_SYCL) .HostMemory("x") @@ -48,5 +44,4 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") .TypeConstraint("T"), BinaryOp>); #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc index 99e5a766203..442171193bf 100644 --- a/tensorflow/core/kernels/cwise_op_minimum.cc +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -35,8 +35,7 @@ REGISTER_KERNEL_BUILDER(Name("Minimum") #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(BinaryOp, SYCL, "Minimum", functor::minimum, float); - +REGISTER3(BinaryOp, SYCL, "Minimum", functor::minimum, float, double, int64); REGISTER_KERNEL_BUILDER(Name("Minimum") .Device(DEVICE_SYCL) .HostMemory("x") diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index 5273522626b..1ee09b161e0 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -20,24 +20,6 @@ namespace tensorflow { REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8, int32); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Mul") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); -REGISTER_SYCL_KERNEL(float) -REGISTER_SYCL_KERNEL(double) -#undef REGISTER_SYCL_KERNEL -REGISTER_KERNEL_BUILDER(Name("Mul") - .Device(DEVICE_SYCL) - .HostMemory("x") - .HostMemory("y") - .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); -#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double, uint8); @@ -53,4 +35,14 @@ REGISTER_KERNEL_BUILDER(Name("Mul") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER3(BinaryOp, SYCL, "Mul", functor::mul, float, double, uint8); +REGISTER_KERNEL_BUILDER(Name("Mul") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc index c4a9b228835..d1ae5ba374a 100644 --- a/tensorflow/core/kernels/cwise_op_neg.cc +++ b/tensorflow/core/kernels/cwise_op_neg.cc @@ -19,40 +19,27 @@ namespace tensorflow { REGISTER7(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32, complex64, int64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Neg") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); +#if GOOGLE_CUDA +REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Neg") - .Device(DEVICE_SYCL) + .Device(DEVICE_GPU) .HostMemory("x") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); +#endif -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - -#if GOOGLE_CUDA -REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64); - -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. +#ifdef TENSORFLOW_USE_SYCL +REGISTER3(UnaryOp, SYCL, "Neg", functor::neg, float, double, int64); REGISTER_KERNEL_BUILDER(Name("Neg") - .Device(DEVICE_GPU) + .Device(DEVICE_SYCL) .HostMemory("x") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc index f1780168e45..5fb0735ac19 100644 --- a/tensorflow/core/kernels/cwise_op_pow.cc +++ b/tensorflow/core/kernels/cwise_op_pow.cc @@ -19,20 +19,11 @@ namespace tensorflow { REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32, int64, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Pow") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double, int64); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(BinaryOp, SYCL, "Pow", functor::pow, float, double); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc index e192f89782d..c8e28a56a34 100644 --- a/tensorflow/core/kernels/cwise_op_round.cc +++ b/tensorflow/core/kernels/cwise_op_round.cc @@ -19,15 +19,14 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double, int32, int64); +#if GOOGLE_CUDA +REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double, + int32, int64); +#endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double); namespace functor { DEFINE_UNARY2(round, float, double); } // namespace functor #endif - -#if GOOGLE_CUDA -REGISTER5(UnaryOp, GPU, "Round", functor::round, Eigen::half, float, double, - int32, int64); -#endif } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc index f23725f48e3..bcd8f515645 100644 --- a/tensorflow/core/kernels/cwise_op_rsqrt.cc +++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc @@ -19,21 +19,12 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Rsqrt") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Rsqrt", functor::rsqrt, float, double); +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "RsqrtGrad", functor::rsqrt_grad, float, Eigen::half, double, complex64, complex128); diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index 1612429ddf5..626d70bbe1b 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -184,7 +184,9 @@ REGISTER_SELECT_GPU(complex128); SelectOp); REGISTER_SELECT_SYCL(float); +REGISTER_SELECT_SYCL(double); REGISTER_SELECT_SYCL(int32); +REGISTER_SELECT_SYCL(int64); #undef REGISTER_SELECT_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc index dedd414db55..a4084d5ad17 100644 --- a/tensorflow/core/kernels/cwise_op_sign.cc +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -34,10 +34,7 @@ REGISTER_KERNEL_BUILDER(Name("Sign") #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER(UnaryOp, SYCL, "Sign", functor::sign, float); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. +REGISTER3(UnaryOp, SYCL, "Sign", functor::sign, float, double, int64); REGISTER_KERNEL_BUILDER(Name("Sign") .Device(DEVICE_SYCL) .HostMemory("x") diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc index ab54c61b56d..b91ff1ac30b 100644 --- a/tensorflow/core/kernels/cwise_op_sin.cc +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -19,19 +19,11 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Sin") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double); +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc index 55acf648db0..2de84a94169 100644 --- a/tensorflow/core/kernels/cwise_op_sqrt.cc +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -19,26 +19,22 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Sqrt") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double); +#endif // TENSORFLOW_USE_SYCL + REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float, Eigen::half, double, complex64, complex128); #if GOOGLE_CUDA REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float, Eigen::half, double); #endif + +//#ifdef TENSORFLOW_USE_SYCL +//REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double); +//#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index afcacfec1c7..07a4b0b084d 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -19,18 +19,6 @@ namespace tensorflow { REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double, int32, int64, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Square") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC - #if GOOGLE_CUDA REGISTER4(UnaryOp, GPU, "Square", functor::square, float, Eigen::half, double, int64); @@ -45,4 +33,14 @@ REGISTER_KERNEL_BUILDER(Name("Square") .TypeConstraint("T"), UnaryOp>); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER3(UnaryOp, SYCL, "Square", functor::square, float, double, int64); +REGISTER_KERNEL_BUILDER(Name("Square") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .TypeConstraint("T"), + UnaryOp>); +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index eab1e2a09c9..eb173c7040d 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -24,28 +24,7 @@ REGISTER7(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32, // int32 version of this op is needed, so explicitly include it. REGISTER(BinaryOp, CPU, "Sub", functor::sub, int32); #endif // __ANDROID_TYPES_SLIM__ -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Sub") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); - REGISTER_SYCL_KERNEL(float); - REGISTER_SYCL_KERNEL(double); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. -REGISTER_KERNEL_BUILDER(Name("Sub") - .Device(DEVICE_SYCL) - .HostMemory("x") - .HostMemory("y") - .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER6(BinaryOp, GPU, "Sub", functor::sub, float, Eigen::half, double, int64, complex64, complex128); @@ -62,4 +41,14 @@ REGISTER_KERNEL_BUILDER(Name("Sub") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER3(BinaryOp, SYCL, "Sub", functor::sub, float, double, int64); +REGISTER_KERNEL_BUILDER(Name("Sub") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc index 9c850c94207..4f6c2fce428 100644 --- a/tensorflow/core/kernels/cwise_op_tan.cc +++ b/tensorflow/core/kernels/cwise_op_tan.cc @@ -18,19 +18,11 @@ limitations under the License. namespace tensorflow { REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Tan") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC - #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double); +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index 1dbc13061ba..19a7a0a17c9 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -20,22 +20,14 @@ namespace tensorflow { REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double, complex64, complex128); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Tanh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC - #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double); +#endif // TENSORFLOW_USE_SYC + REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float, Eigen::half, double, complex64, complex128); #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h index 77b330f5899..d853b628e4d 100644 --- a/tensorflow/core/kernels/cwise_ops_gradients.h +++ b/tensorflow/core/kernels/cwise_ops_gradients.h @@ -151,39 +151,30 @@ struct functor_traits> { namespace tensorflow { namespace functor { +template +struct SimpleBinaryFunctor {}; template -struct SimpleBinaryFunctor { +struct SimpleBinaryFunctorBase { void operator()(const Device& d, typename Functor::tout_type out, typename Functor::tin_type in0, - typename Functor::tin_type in1); + typename Functor::tin_type in1) { + out.device(d) = in0.binaryExpr(in1, typename Functor::func()); + } }; // Partial specialization of BinaryFunctor for CPU devices typedef Eigen::ThreadPoolDevice CPUDevice; template -struct SimpleBinaryFunctor { - void operator()(const CPUDevice& d, typename Functor::tout_type out, - typename Functor::tin_type in0, - typename Functor::tin_type in1) { - out.device(d) = in0.binaryExpr(in1, typename Functor::func()); - } -}; - +struct SimpleBinaryFunctor + : SimpleBinaryFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL -// Partial specialization of BinaryFunctor for SYCL devices typedef Eigen::SyclDevice SYCLDevice; template -struct SimpleBinaryFunctor { - void operator()(const SYCLDevice& d, typename Functor::tout_type out, - typename Functor::tin_type in0, - typename Functor::tin_type in1) { - out.device(d) = in0.binaryExpr(in1, typename Functor::func()); - } -}; - +struct SimpleBinaryFunctor + : SimpleBinaryFunctorBase {}; #endif // TENSORFLOW_USE_SYCL template diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc index 55a7657ea84..f1d99851476 100644 --- a/tensorflow/core/kernels/debug_ops.cc +++ b/tensorflow/core/kernels/debug_ops.cc @@ -28,25 +28,25 @@ REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_CPU), CopyOp); REGISTER_KERNEL_BUILDER(Name("CopyHost").Device(DEVICE_CPU), CopyOp); -#ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_SYCL), CopyOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_GPU), CopyOp); REGISTER_KERNEL_BUILDER(Name("CopyHost") - .Device(DEVICE_SYCL) + .Device(DEVICE_GPU) .HostMemory("input") .HostMemory("output"), CopyOp); -#endif // TENSORFLOW_USE_SYCL +#endif -#if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_GPU), CopyOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_SYCL), CopyOp); REGISTER_KERNEL_BUILDER(Name("CopyHost") - .Device(DEVICE_GPU) + .Device(DEVICE_SYCL) .HostMemory("input") .HostMemory("output"), CopyOp); -#endif +#endif // TENSORFLOW_USE_SYCL // Register debug identity (non-ref and ref) ops. REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU), @@ -126,15 +126,16 @@ TF_CALL_double(REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_SYCL -#define REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(type) \ +#define REGISTER_SYCL_DEBUG_NUMERIC_SUMMARY_COUNT(type) \ REGISTER_KERNEL_BUILDER(Name("DebugNumericSummary") \ .Device(DEVICE_SYCL) \ .HostMemory("input") \ .HostMemory("output") \ .TypeConstraint("T"), \ DebugNumericSummaryOp); -REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(float); -REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double); +TF_CALL_bool(REGISTER_SYCL_DEBUG_NUMERIC_SUMMARY_COUNT); +TF_CALL_INTEGRAL_TYPES(REGISTER_SYCL_DEBUG_NUMERIC_SUMMARY_COUNT); +TF_CALL_float(REGISTER_SYCL_DEBUG_NUMERIC_SUMMARY_COUNT); +TF_CALL_double(REGISTER_SYCL_DEBUG_NUMERIC_SUMMARY_COUNT); #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 2efa95c381f..84204caa45a 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -17,6 +17,9 @@ limitations under the License. #define TENSORFLOW_KERNELS_DEBUG_OP_H_ #include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL #include "tensorflow/core/debug/debug_io_utils.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" @@ -63,6 +66,20 @@ class CopyOp : public OpKernel { // The input tensor is on the host (CPU): deep-copy from CPU to CPU. *copied_tensor = tensor::DeepCopy(src_tensor); } +#elif defined(TENSORFLOW_USE_SYCL) + Device* device = static_cast(context->device()); + // Determine if the input tensor is not on CPU (e.g., on GPU). + bool off_host_input = device->device_type() == DEVICE_SYCL && + !context->input_alloc_attr(0).on_host(); + if(off_host_input) { + auto size = src_tensor.NumElements() * sizeof(src_tensor.dtype()); + auto dst_ptr = GetBase(copied_tensor); + auto src_ptr = GetBase(&src_tensor); + typedef decltype(src_tensor.dtype()) ttype; + device->eigen_sycl_device()->memcpy(dst_ptr, static_cast(src_ptr), size); + } else { + *copied_tensor = tensor::DeepCopy(src_tensor); + } #else *copied_tensor = tensor::DeepCopy(src_tensor); #endif diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 767f143727c..6da63620c2d 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -126,6 +126,9 @@ class DenseUpdateOp : public OpKernel { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -136,26 +139,6 @@ TF_CALL_ALL_TYPES(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); #undef REGISTER_KERNELS -#if TENSORFLOW_USE_SYCL -typedef Eigen::SyclDevice SYCLDevice; -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Assign") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - AssignOpT); \ - REGISTER_KERNEL_BUILDER( \ - Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint("T"), \ - DenseUpdateOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("AssignSub").Device(DEVICE_SYCL).TypeConstraint("T"), \ - DenseUpdateOp); - -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif - #if GOOGLE_CUDA // Only register 'Assign' on GPU for the subset of types also supported by // 'Variable' (see variable_ops.cc.) @@ -175,6 +158,16 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ +REGISTER_KERNEL_BUILDER( \ + Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ + AssignOpT); + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); +#undef REGISTER_SYCL_KERNELS +#endif // TENSORFLOW_USE_SYCL + #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("AssignAdd").Device(DEVICE_CPU).TypeConstraint("T"), \ @@ -214,4 +207,16 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // end GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint("T"), \ + DenseUpdateOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignSub").Device(DEVICE_SYCL).TypeConstraint("T"), \ + DenseUpdateOp); + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_KERNELS); +#undef REGISTER_KERNELS +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc index 08ae787c863..2fb092c1637 100644 --- a/tensorflow/core/kernels/dynamic_stitch_op.cc +++ b/tensorflow/core/kernels/dynamic_stitch_op.cc @@ -165,20 +165,6 @@ class DynamicStitchOp : public OpKernel { TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH); #undef REGISTER_DYNAMIC_STITCH -#ifdef TENSORFLOW_USE_SYCL -#define REGISTER_DYNAMIC_STITCH_SYCL(type) \ - REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T") \ - .HostMemory("indices") \ - .HostMemory("data") \ - .HostMemory("merged"), \ - DynamicStitchOp) - -TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH_SYCL); -#undef REGISTER_DYNAMIC_STITCH_SYCL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA #define REGISTER_DYNAMIC_STITCH_GPU(type) \ REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \ @@ -194,4 +180,18 @@ TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_GPU); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +// TODO{lukeiwanski}: That most likely will case trouble +#define REGISTER_DYNAMIC_STITCH_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("data") \ + .HostMemory("merged"), \ + DynamicStitchOp) + +TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH_SYCL); +#undef REGISTER_DYNAMIC_STITCH_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index 0df8f9d3edf..49b94c26b6a 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -56,16 +56,21 @@ DEFINE_SETZERO_CPU(complex128); template void SetZeroFunctor::operator()( const Eigen::SyclDevice& d, typename TTypes::Flat out) { - out.device(d) = out.constant(T(0)); + To32Bit(out).device(d) = To32Bit(out).constant(T(0)); } #define DEFINE_SETZERO_SYCL(T) \ template struct SetZeroFunctor; -DEFINE_SETZERO_SYCL(float); DEFINE_SETZERO_SYCL(bool); +DEFINE_SETZERO_SYCL(float); DEFINE_SETZERO_SYCL(double); +DEFINE_SETZERO_SYCL(uint8); +DEFINE_SETZERO_SYCL(int8); +DEFINE_SETZERO_SYCL(uint16); +DEFINE_SETZERO_SYCL(int16); +DEFINE_SETZERO_SYCL(int32); +DEFINE_SETZERO_SYCL(int64); #undef DEFINE_SETZERO_SYCL #endif // TENSORFLOW_USE_SYCL - } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index 94fe22ed310..062945239a3 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -344,6 +344,7 @@ TF_CALL_half(REGISTER_GPU); .Label("eigen"), \ MatMulOp) TF_CALL_float(REGISTER_SYCL); +TF_CALL_double(REGISTER_SYCL); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index a6650f369ba..9297064dd66 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -118,6 +118,12 @@ class PackOp : public OpKernel { return; } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + if (std::is_same::value) { + ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); + return; + } +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -160,26 +166,18 @@ REGISTER_KERNEL_BUILDER(Name("Pack") #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL - #define REGISTER_SYCL(type) \ REGISTER_KERNEL_BUILDER( \ Name("Pack").Device(DEVICE_SYCL).TypeConstraint("T"), \ PackOp) -REGISTER_SYCL(float); -REGISTER_SYCL(double); -#undef REGISTER_SYCL - -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); REGISTER_KERNEL_BUILDER(Name("Pack") .Device(DEVICE_SYCL) .HostMemory("values") .HostMemory("output") .TypeConstraint("T"), PackOp); - +#undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc index 91984319c60..4c431935794 100644 --- a/tensorflow/core/kernels/pad_op.cc +++ b/tensorflow/core/kernels/pad_op.cc @@ -212,12 +212,7 @@ REGISTER_KERNEL_BUILDER(Name("Pad") .HostMemory("paddings"), \ PadOp) -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); - -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); REGISTER_KERNEL_BUILDER(Name("Pad") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -226,6 +221,7 @@ REGISTER_KERNEL_BUILDER(Name("Pad") .HostMemory("paddings") .HostMemory("output"), PadOp); +#undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index f3c7e0f26b1..0baf4484c3d 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -48,6 +48,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL namespace functor { using random::PhiloxRandom; @@ -564,4 +567,193 @@ TF_CALL_int64(REGISTER_INT); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + +namespace functor { + +using namespace cl; + +template +struct FillPhiloxRandomKernel; + +template +struct FillPhiloxRandomKernel { + typedef typename Distribution::ResultElementType T; + using write_accessor = sycl::accessor; + + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) + : data_(data), + gen_(gen), + dist_(dist) { + } + + void operator()(sycl::nd_item<1> item) { + const size_t kGroupSize = Distribution::kResultElementCount; + + const size_t item_id = item.get_global(0); + const size_t total_item_count = item.get_global_range(0); + size_t offset = item_id * kGroupSize; + gen_.Skip(item_id); + + const size_t size = data_.get_size() / sizeof(T); + T* data = ConvertToActualTypeSycl(T, data_); + + while (offset + kGroupSize <= size) { + const typename Distribution::ResultType samples = dist_(&gen_); + for (size_t i = 0; i < kGroupSize; ++i) { + data[offset + i] = samples[i]; + } + + offset += (total_item_count - 1) * kGroupSize; + gen_.Skip(total_item_count - 1); + } + + const typename Distribution::ResultType samples = dist_(&gen_); + for (size_t i = 0; i < kGroupSize; ++i) { + if (offset >= size) { + return; + } + data[offset] = samples[i]; + ++offset; + } + } + + private: + write_accessor data_; + random::PhiloxRandom gen_; + Distribution dist_; +}; + + +template +struct FillPhiloxRandomKernel { + typedef typename Distribution::ResultElementType T; + using write_accessor = sycl::accessor; + + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) + : data_(data), + gen_(gen), + dist_(dist) { + } + + void operator()(sycl::nd_item<1> item) { + using random::PhiloxRandom; + using random::SingleSampleAdapter; + + const size_t kReservedSamplesPerOutput = 256; + const size_t kGroupSize = Distribution::kResultElementCount; + const size_t kGeneratorSkipPerOutputGroup = kGroupSize * + kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; + + const size_t item_id = item.get_global(0); + const size_t total_item_count = item.get_global_range(0); + size_t group_index = item_id; + size_t offset = group_index * kGroupSize; + + T* data = ConvertToActualTypeSycl(T, data_); + const size_t size = data_.get_size() / sizeof(T); + + while (offset < size) { + // Since each output takes a variable number of samples, we need to + // realign the generator to the beginning for the current output group + PhiloxRandom gen = gen_; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter single_samples(&gen); + + const typename Distribution::ResultType samples = dist_(&single_samples); + + for (size_t i = 0; i < kGroupSize; ++i) { + if (offset >= size) { + return; + } + data[offset] = samples[i]; + ++offset; + } + + offset += (total_item_count - 1) * kGroupSize; + group_index += total_item_count; + } + } + + private: + write_accessor data_; + random::PhiloxRandom gen_; + Distribution dist_; +}; + +template +class FillRandomKernel; +// Partial specialization for SYCL to fill the entire region with randoms +// It splits the work into several tasks and run them in parallel +template +void FillPhiloxRandom::operator()( + OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64 size, + Distribution dist) { + + const size_t group_size = device.maxSyclThreadsPerBlock(); + const size_t group_count = (size + group_size - 1) / group_size; + + auto buffer = device.get_sycl_buffer(data); + + device.sycl_queue().submit([&](sycl::handler& cgh) { + auto access = buffer.template get_access(cgh); + + FillPhiloxRandomKernel task(access, gen, dist); + cgh.parallel_for>( + sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)), + task + ); + }); +} + +} + +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + SYCLDevice, random::UniformDistribution >; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp >); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp >); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + SYCLDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE> >); + +#define REGISTER_INT(IntType) \ + REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .HostMemory("minval") \ + .HostMemory("maxval") \ + .TypeConstraint("Tout"), \ + RandomUniformIntOp); + +TF_CALL_float(REGISTER); +TF_CALL_double(REGISTER); +TF_CALL_int32(REGISTER_INT); +TF_CALL_int64(REGISTER_INT); + +#undef REGISTER +#undef REGISTER_INT + +#endif // TENSORFLOW_USE_SYCL + } // end namespace tensorflow diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h index b52901c38e3..97bcaf1a49a 100644 --- a/tensorflow/core/kernels/random_op.h +++ b/tensorflow/core/kernels/random_op.h @@ -54,6 +54,18 @@ struct FillPhiloxRandom { }; #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +// Declares the partially SYCL-specialized functor struct. +template +struct FillPhiloxRandom { + void operator()(OpKernelContext* ctx, const SYCLDevice& d, + random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64 size, + Distribution dist); +}; +#endif // TENSORFLOW_USE_SYCL + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h index 19071b47f14..625cea42282 100644 --- a/tensorflow/core/kernels/reduction_ops_common.h +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -268,31 +268,6 @@ struct ReduceFunctor template struct ReduceFunctor : ReduceFunctorBase{}; - -template -struct ReduceFunctor > { - template - static void Reduce(const SYCLDevice& d, OUT_T out, IN_T in, - const ReductionAxes& reduction_axes, - const Eigen::internal::MeanReducer& reducer) { - typedef typename IN_T::Index Index; - // Eigen sum reductions are much faster on GPU than mean reductions: - // Simply trigger them by computing the sum of the weighted inputs. - Index num_coeffs_to_reduce = 1; - for (int i = 0; i < Eigen::internal::array_size::value; - ++i) { - num_coeffs_to_reduce *= in.dimension(reduction_axes[i]); - } - T scale = T(1.0) / num_coeffs_to_reduce; - out.device(d) = (in * scale).sum(reduction_axes); - } - - template - static void FillIdentity(const SYCLDevice& d, OUT_T out, - const Eigen::internal::MeanReducer& reducer) { - FillIdentityEigenImpl(d, out, reducer); - } -}; #endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc index 5ab97d1eeec..d243e7c55f4 100644 --- a/tensorflow/core/kernels/reduction_ops_max.cc +++ b/tensorflow/core/kernels/reduction_ops_max.cc @@ -67,7 +67,7 @@ REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), \ ReductionOp>); REGISTER_SYCL_KERNELS(float); -#undef REGISTER_SYCL_KERNELS +REGISTER_SYCL_KERNELS(double); REGISTER_KERNEL_BUILDER( Name("Max") @@ -78,6 +78,7 @@ REGISTER_KERNEL_BUILDER( .TypeConstraint("T") .TypeConstraint("Tidx"), ReductionOp>); +#undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc index e018cb55dd1..a733851809b 100644 --- a/tensorflow/core/kernels/reduction_ops_mean.cc +++ b/tensorflow/core/kernels/reduction_ops_mean.cc @@ -54,6 +54,7 @@ REGISTER_GPU_KERNELS(double); .HostMemory("reduction_indices"), \ ReductionOp>); REGISTER_SYCL_KERNELS(float); +REGISTER_SYCL_KERNELS(double); #undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc index ec240421b9a..1e394bea41f 100644 --- a/tensorflow/core/kernels/reduction_ops_min.cc +++ b/tensorflow/core/kernels/reduction_ops_min.cc @@ -67,7 +67,7 @@ REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), \ ReductionOp>); REGISTER_SYCL_KERNELS(float); -#undef REGISTER_SYCL_KERNELS +REGISTER_SYCL_KERNELS(double); REGISTER_KERNEL_BUILDER( Name("Min") @@ -78,6 +78,7 @@ REGISTER_KERNEL_BUILDER( .TypeConstraint("T") .TypeConstraint("Tidx"), ReductionOp>); +#undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc index e04c655dabb..e3e678628de 100644 --- a/tensorflow/core/kernels/reduction_ops_prod.cc +++ b/tensorflow/core/kernels/reduction_ops_prod.cc @@ -54,19 +54,10 @@ REGISTER_GPU_KERNELS(double); .TypeConstraint("Tidx") \ .HostMemory("reduction_indices"), \ ReductionOp>); +REGISTER_SYCL_KERNELS(int32); REGISTER_SYCL_KERNELS(float); REGISTER_SYCL_KERNELS(double); #undef REGISTER_SYCL_KERNELS - -REGISTER_KERNEL_BUILDER( - Name("Prod") - .Device(DEVICE_SYCL) - .TypeConstraint("T") - .TypeConstraint("Tidx") - .HostMemory("input") - .HostMemory("output") - .HostMemory("reduction_indices"), - ReductionOp>); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc index 938ca66a0cb..eddd71fc28e 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -74,11 +74,8 @@ REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), \ ReductionOp>); REGISTER_SYCL_KERNELS(float); -#undef REGISTER_SYCL_KERNELS +REGISTER_SYCL_KERNELS(double); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER( Name("Sum") .Device(DEVICE_SYCL) @@ -88,6 +85,7 @@ REGISTER_KERNEL_BUILDER( .HostMemory("output") .HostMemory("reduction_indices"), ReductionOp>); +#undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index d70398bea57..d8d30e87e22 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -156,7 +156,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint("T"), \ EluGradOp) -REGISTER_SYCL_KERNELS(float); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h index e2e0bd48dd1..365c6201a54 100644 --- a/tensorflow/core/kernels/relu_op.h +++ b/tensorflow/core/kernels/relu_op.h @@ -175,10 +175,6 @@ void EluGradOp::OperateNoTemplate(OpKernelContext* context, } // namespace tensorflow -#ifdef TENSORFLOW_USE_SYCL -#undef EIGEN_USE_SYCL -#endif // TENSORFLOW_USE_SYCL - #undef EIGEN_USE_THREADS #endif // TENSORFLOW_KERNELS_RELU_OP_H_ diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc index 6589a546243..a46fcebb42d 100644 --- a/tensorflow/core/kernels/reshape_op.cc +++ b/tensorflow/core/kernels/reshape_op.cc @@ -34,6 +34,20 @@ REGISTER_KERNEL_BUILDER(Name("Reshape") TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL +#if GOOGLE_CUDA +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Reshape") + .Device(DEVICE_GPU) + .HostMemory("tensor") + .HostMemory("shape") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tshape"), + ReshapeOp); +#endif + #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Reshape") \ @@ -42,8 +56,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); .TypeConstraint("T") \ .TypeConstraint("Tshape"), \ ReshapeOp); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#undef REGISTER_SYCL_KERNEL +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); REGISTER_KERNEL_BUILDER(Name("Reshape") .Device(DEVICE_SYCL) @@ -53,20 +66,6 @@ REGISTER_KERNEL_BUILDER(Name("Reshape") .TypeConstraint("T") .TypeConstraint("Tshape"), ReshapeOp); +#undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL - -#if GOOGLE_CUDA -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. -REGISTER_KERNEL_BUILDER(Name("Reshape") - .Device(DEVICE_GPU) - .HostMemory("tensor") - .HostMemory("shape") - .HostMemory("output") - .TypeConstraint("T") - .TypeConstraint("Tshape"), - ReshapeOp); -#endif - } // namespace tensorflow diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index 596dac9087a..ca5b2f42b9c 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -144,9 +144,9 @@ class ReverseOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); -#define HANDLE_REVERSE(NDIMS) \ - case NDIMS: \ - HandleReverseCase(context, dims.vec(), output); \ +#define HANDLE_REVERSE(NDIMS) \ + case NDIMS: \ + HandleReverseCase(context, dims.vec(), output); \ return; switch (input_dims) { @@ -367,7 +367,10 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2") .TypeConstraint("Tidx") \ .HostMemory("axis"), \ ReverseV2Op) +TF_CALL_uint8(REGISTER_SYCL_KERNELS); +TF_CALL_int8(REGISTER_SYCL_KERNELS); TF_CALL_float(REGISTER_SYCL_KERNELS); +TF_CALL_double(REGISTER_SYCL_KERNELS); REGISTER_KERNEL_BUILDER(Name("Reverse") .Device(DEVICE_SYCL) @@ -385,5 +388,4 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2") .HostMemory("output"), ReverseV2Op); #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h index 63add61ba72..ac9590f6118 100644 --- a/tensorflow/core/kernels/scatter_functor.h +++ b/tensorflow/core/kernels/scatter_functor.h @@ -75,6 +75,50 @@ struct Assign { } }; +#ifdef TENSORFLOW_USE_SYCL +template +struct AssignSYCL {}; +template <> +struct AssignSYCL { + template + static void Run(Device d, Params p, Update u) { + p.device(d) = u; + } +}; + +template <> +struct AssignSYCL { + template + static void Run(Device d, Params p, Update u) { + p.device(d) += u; + } +}; + +template <> +struct AssignSYCL { + template + static void Run(Device d, Params p, Update u) { + p.device(d) -= u; + } +}; + +template <> +struct AssignSYCL { + template + static void Run(Device d, Params p, Update u) { + p.device(d) = p * u; + } +}; + +template <> +struct AssignSYCL { + template + static void Run(Device d, Params p, Update u) { + p.device(d) = p / u; + } +}; +#endif // TENSORFLOW_USE_SYCL + } // namespace internal } // namespace scatter_op @@ -110,6 +154,31 @@ struct ScatterFunctorBase { } }; +#ifdef TENSORFLOW_USE_SYCL +template +struct ScatterFunctorBase { + Index operator()(OpKernelContext* c, const SYCLDevice& d, + typename TTypes::Matrix params, + typename TTypes::ConstMatrix updates, + typename TTypes::ConstFlat indices) { + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); + for (Index i = 0; i < N; i++) { + // Grab the index and check its validity. An earlier version of the + // code checked it and then grabbed it from memory a second time, which + // was a security risk since it could have changed in between. + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) return i; + // Copy last Ndim-1 dimensions of updates[i] to params[index] + scatter_op::internal::AssignSYCL::Run(d, params.template chip<0>(index), + updates.template chip<0>(i)); + } + return -1; + } +}; +#endif // TENSORFLOW_USE_SYCL + template struct ScatterFunctorBase { Index operator()(OpKernelContext* c, const CPUDevice& d, diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 8eaeafd19aa..704399e413f 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -31,6 +31,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL // Check whether updates.shape = indices.shape[:batch_dim] + // params_shape[slice_dim:] @@ -415,6 +418,19 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPECS); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SCATTER_ND_ADD_SUB_SYCL(type) \ + REGISTER_SCATTER_ND_ADD_SUB(type, SYCL); + +#define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ + REGISTER_SCATTER_ND_UPDATE(type, SYCL); + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); +#undef REGISTER_SCATTER_ND_ADD_SUB_SYCL +#undef REGISTER_SCATTER_ND_UPDATE_SYCL +#endif // TENSORFLOW_USE_SYCL + #undef REGISTER_SCATTER_ND_ADD #undef REGISTER_SCATTER_ND_ADD_SUB #undef REGISTER_SCATTER_ND_ADD_SUB_CPU diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h index bbe2c6864ff..788797b668d 100644 --- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -38,6 +38,9 @@ limitations under the License. namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL class OpKernelContext; @@ -186,6 +189,92 @@ TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH) #undef REGISTER_SCATTER_ND_INDEX #undef REGISTER_SCATTER_ND_FULL +#ifdef TENSORFLOW_USE_SYCL +// Implementation of update functor for SYCL. +template +struct ScatterNdFunctor { + Index operator()( + const SYCLDevice& d, const Index slice_size, + const Eigen::array output_shape_prefix, + typename TTypes::Tensor Tparams, + typename TTypes::ConstTensor Tindices, + typename TTypes::ConstTensor Tupdates, + typename TTypes::Tensor Toutput) { + // error_loc is -1 if there's no out-of-bounds index, + // otherwise it is the location of an OOB index in Tindices. + Index error_loc = -1; + + const Eigen::DenseIndex batch_size = Tindices.dimension(0); + + Index batch_strides[IXDIM]; + for (int dim = IXDIM - 1; dim >= 0; --dim) { + if (dim == IXDIM - 1) { + batch_strides[dim] = 1; + } else { + batch_strides[dim] = + batch_strides[dim + 1] * output_shape_prefix[dim + 1]; + } + } + + for (Eigen::DenseIndex loc = 0; loc < batch_size; ++loc) { + Index i = 0; + bool out_of_bounds = false; + for (int dim = 0; dim < IXDIM; ++dim) { + const Index ix_d = internal::SubtleMustCopy(Tindices(loc, dim)); + out_of_bounds |= !FastBoundsCheck(ix_d, output_shape_prefix[dim]); + i += ix_d * batch_strides[dim]; + } + if (TF_PREDICT_FALSE(out_of_bounds)) { + error_loc = loc; + break; + } else { + auto input_chip = Toutput.template chip<0>(i); + auto output_chip = input_chip.device(d); + auto update_chip = Tupdates.template chip<0>(loc); + update_executor::UpdateExecutor< + decltype(input_chip), decltype(update_chip), decltype(output_chip), + OP>::Execute(input_chip, update_chip, output_chip); + } + } + + return error_loc; + } +}; + +#define REGISTER_SCATTER_ND_FULL_SYCL(T, Index, op) \ + template Index \ + ScatterNdFunctor::operator()( \ + const SYCLDevice& d, const Index slice_size, \ + const Eigen::array \ + output_shape_prefix, \ + typename TTypes::Tensor Tparams, \ + typename TTypes::ConstTensor Tindices, \ + typename TTypes::ConstTensor Tupdates, \ + typename TTypes::Tensor Toutput) + +#define REGISTER_SCATTER_ND_INDEX_SYCL(type, op) \ + REGISTER_SCATTER_ND_FULL_SYCL(type, int32, op); \ + REGISTER_SCATTER_ND_FULL_SYCL(type, int64, op) + +#define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ + REGISTER_SCATTER_ND_INDEX_SYCL(type, scatter_nd_op::UpdateOp::ASSIGN); + +#define REGISTER_SCATTER_ND_MATH_SYCL(type) \ + REGISTER_SCATTER_ND_INDEX_SYCL(type, scatter_nd_op::UpdateOp::ADD); \ + REGISTER_SCATTER_ND_INDEX_SYCL(type, scatter_nd_op::UpdateOp::SUB); + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL) +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_MATH_SYCL) +REGISTER_SCATTER_ND_UPDATE_SYCL(int32); +REGISTER_SCATTER_ND_MATH_SYCL(int32); + +#undef REGISTER_SCATTER_ND_MATH_SYCL +#undef REGISTER_SCATTER_ND_UPDATE_SYCL +#undef REGISTER_SCATTER_ND_INDEX_SYCL +#undef REGISTER_SCATTER_ND_FULL_SYCL + +#endif // TENSORFLOW_USE_SYCL + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 1c7d50e161c..53ddb5a3d24 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -78,11 +78,11 @@ void SendOp::Compute(OpKernelContext* ctx) { REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); -#if TENSORFLOW_USE_SYCL +#ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp); REGISTER_KERNEL_BUILDER( Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp); -#endif +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER( @@ -142,17 +142,17 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); -#if TENSORFLOW_USE_SYCL +#ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp); -#endif +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER( Name("_HostRecv").Device(DEVICE_GPU).HostMemory("tensor"), RecvOp); -#if TENSORFLOW_USE_SYCL +#ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER( Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp); -#endif +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index c8ea9230201..831b5900376 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -91,11 +91,6 @@ class RangeOp : public OpKernel { #define REGISTER_GPU_KERNEL(T) REGISTER_KERNEL(DEVICE_GPU, T) #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T) -TF_CALL_float(REGISTER_SYCL_KERNEL); -TF_CALL_double(REGISTER_SYCL_KERNEL); -TF_CALL_int32(REGISTER_SYCL_KERNEL); -TF_CALL_int64(REGISTER_SYCL_KERNEL); -#undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL TF_CALL_float(REGISTER_CPU_KERNEL); @@ -112,6 +107,14 @@ TF_CALL_int64(REGISTER_GPU_KERNEL); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +TF_CALL_float(REGISTER_SYCL_KERNEL); +TF_CALL_double(REGISTER_SYCL_KERNEL); +TF_CALL_int32(REGISTER_SYCL_KERNEL); +TF_CALL_int64(REGISTER_SYCL_KERNEL); +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL + #undef REGISTER_KERNEL #undef REGISTER_CPU_KERNEL #undef REGISTER_GPU_KERNEL diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc index 177a32464ba..a7d1c86699d 100644 --- a/tensorflow/core/kernels/shape_ops.cc +++ b/tensorflow/core/kernels/shape_ops.cc @@ -32,76 +32,77 @@ REGISTER_KERNEL_BUILDER(Name("Shape") .TypeConstraint("out_type"), ShapeOp); -#ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ +#if GOOGLE_CUDA +#define REGISTER_GPU_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Shape") \ - .Device(DEVICE_SYCL) \ + .Device(DEVICE_GPU) \ .HostMemory("output") \ .TypeConstraint("out_type") \ .TypeConstraint("T"), \ ShapeOp); \ REGISTER_KERNEL_BUILDER(Name("Shape") \ - .Device(DEVICE_SYCL) \ + .Device(DEVICE_GPU) \ .HostMemory("output") \ .TypeConstraint("out_type") \ .TypeConstraint("T"), \ ShapeOp); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#undef REGISTER_SYCL_KERNEL +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Shape") - .Device(DEVICE_SYCL) + .Device(DEVICE_GPU) .HostMemory("input") .HostMemory("output") .TypeConstraint("T") .TypeConstraint("out_type"), ShapeOp); REGISTER_KERNEL_BUILDER(Name("Shape") - .Device(DEVICE_SYCL) + .Device(DEVICE_GPU) .HostMemory("input") .HostMemory("output") .TypeConstraint("T") .TypeConstraint("out_type"), ShapeOp); -#endif // TENSORFLOW_USE_SYCL +#endif -#if GOOGLE_CUDA -#define REGISTER_GPU_KERNEL(type) \ +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Shape") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .HostMemory("output") \ .TypeConstraint("out_type") \ .TypeConstraint("T"), \ ShapeOp); \ REGISTER_KERNEL_BUILDER(Name("Shape") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .HostMemory("output") \ .TypeConstraint("out_type") \ .TypeConstraint("T"), \ ShapeOp); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); -#undef REGISTER_GPU_KERNEL +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Shape") - .Device(DEVICE_GPU) + .Device(DEVICE_SYCL) .HostMemory("input") .HostMemory("output") .TypeConstraint("T") .TypeConstraint("out_type"), ShapeOp); REGISTER_KERNEL_BUILDER(Name("Shape") - .Device(DEVICE_GPU) + .Device(DEVICE_SYCL) .HostMemory("input") .HostMemory("output") .TypeConstraint("T") .TypeConstraint("out_type"), ShapeOp); -#endif + +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL // ShapeN --------------------------------------- REGISTER_KERNEL_BUILDER(Name("ShapeN") @@ -152,7 +153,7 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN") ShapeNOp); #endif -#if TENSORFLOW_USE_SYCL +#ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("ShapeN") \ .Device(DEVICE_SYCL) \ @@ -168,11 +169,7 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN") ShapeNOp) TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#undef REGISTER_SYCL_KERNEL -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("ShapeN") .Device(DEVICE_SYCL) .HostMemory("input") @@ -187,68 +184,65 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN") .TypeConstraint("T") .TypeConstraint("out_type"), ShapeNOp); -#endif // TENSORFLOW_USE_SYCL +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL // Rank ------------------------------------------ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"), RankOp); -#ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ +#if GOOGLE_CUDA +#define REGISTER_GPU_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Rank") \ - .Device(DEVICE_SYCL) \ + .Device(DEVICE_GPU) \ .TypeConstraint("T") \ .HostMemory("output"), \ RankOp); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL // A special GPU kernel for int32 and bool. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Rank") - .Device(DEVICE_SYCL) + .Device(DEVICE_GPU) .TypeConstraint("T") .HostMemory("input") .HostMemory("output"), RankOp); REGISTER_KERNEL_BUILDER(Name("Rank") - .Device(DEVICE_SYCL) + .Device(DEVICE_GPU) .TypeConstraint("T") .HostMemory("input") .HostMemory("output"), RankOp); -#endif // TENSORFLOW_USE_SYCL +#endif -#if GOOGLE_CUDA -#define REGISTER_GPU_KERNEL(type) \ +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Rank") \ - .Device(DEVICE_GPU) \ + .Device(DEVICE_SYCL) \ .TypeConstraint("T") \ .HostMemory("output"), \ RankOp); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); -#undef REGISTER_GPU_KERNEL +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -// A special GPU kernel for int32 and bool. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Rank") - .Device(DEVICE_GPU) + .Device(DEVICE_SYCL) .TypeConstraint("T") .HostMemory("input") .HostMemory("output"), RankOp); REGISTER_KERNEL_BUILDER(Name("Rank") - .Device(DEVICE_GPU) + .Device(DEVICE_SYCL) .TypeConstraint("T") .HostMemory("input") .HostMemory("output"), RankOp); -#endif +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL // Size ------------------------------------------ REGISTER_KERNEL_BUILDER(Name("Size") @@ -312,13 +306,8 @@ REGISTER_KERNEL_BUILDER(Name("Size") .TypeConstraint("out_type") \ .HostMemory("output"), \ SizeOp); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL) #undef REGISTER_SYCL_KERNEL - -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Size") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -371,10 +360,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims") .TypeConstraint("Tdim") \ .HostMemory("dim"), \ ExpandDimsOp); -REGISTER_SYCL_KERNEL(float) -REGISTER_SYCL_KERNEL(double) - -#undef REGISTER_SYCL_KERNEL +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_KERNEL_BUILDER(Name("ExpandDims") .Device(DEVICE_SYCL) @@ -384,6 +370,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims") .HostMemory("dim") .HostMemory("output"), ExpandDimsOp); +#undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL // Squeeze --------------------------------------- @@ -408,18 +395,13 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze") SqueezeOp); #endif -#if TENSORFLOW_USE_SYCL +#ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint("T"),\ SqueezeOp); -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); #undef REGISTER_SYCL_KERNEL - -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Squeeze") .Device(DEVICE_SYCL) .TypeConstraint("T") diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index 2a9ff40f8ca..ee6f9a28cd2 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -328,8 +328,9 @@ namespace functor { DECLARE_SYCL_SPEC(T, 6); \ DECLARE_SYCL_SPEC(T, 7); -TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_FOR_N); DECLARE_FOR_N(int32); +DECLARE_FOR_N(bool); #undef DECLARE_FOR_N #undef DECLARE_SYCL_SPEC @@ -344,11 +345,8 @@ DECLARE_FOR_N(int32); .TypeConstraint("Index"), \ SliceOp) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Slice") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -358,7 +356,6 @@ REGISTER_KERNEL_BUILDER(Name("Slice") .HostMemory("size") .HostMemory("output"), SliceOp); - #undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index de11de32f12..8345a98a0d3 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -90,6 +90,8 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("Softmax").Device(DEVICE_SYCL).TypeConstraint("T"), SoftmaxOp); +REGISTER_KERNEL_BUILDER( + Name("Softmax").Device(DEVICE_SYCL).TypeConstraint("T"), + SoftmaxOp); #endif // TENSORFLOW_USE_SYCL - } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc index e377e4d97a4..6583f96a917 100644 --- a/tensorflow/core/kernels/split_lib_cpu.cc +++ b/tensorflow/core/kernels/split_lib_cpu.cc @@ -50,16 +50,12 @@ void Split::operator()( typename TTypes::ConstTensor input, const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes) { - if (output.size() < 131072) { - output = input.slice(slice_indices, slice_sizes); - } else { output.device(d) = input.slice(slice_indices, slice_sizes); - } } #define DEFINE_SYCL_KERNELS(T) template struct Split; -TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS) +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS); #endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index cca2fc41c2c..d83c9f9c80b 100644 --- a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -247,7 +247,6 @@ class SplitOpGPU : public SplitOpBase { #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL - template class SplitOpSYCL : public SplitOpBase { public: @@ -312,8 +311,7 @@ class SplitOpSYCL : public SplitOpBase { } } }; - -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SPLIT(type) \ REGISTER_KERNEL_BUILDER(Name("Split") \ @@ -349,7 +347,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); .HostMemory("split_dim"), \ SplitOpSYCL) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); #undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index 241d39ba442..54dc280780d 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -40,6 +40,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL class Stack : public ResourceBase { public: @@ -182,6 +185,10 @@ class StackOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp); REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"), StackOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_SYCL).HostMemory("handle"), + StackOp); +#endif // TENSORFLOW_USE_SYCL template class StackPushOp : public AsyncOpKernel { @@ -213,7 +220,11 @@ class StackPushOp : public AsyncOpKernel { static constexpr int kCopyThreshold = 2048; static constexpr double kOccupancy = 0.7; if (swap_memory_ && !alloc_attrs.on_host() && - std::is_same::value && + ( std::is_same::value +#ifdef TENSORFLOW_USE_SYCL + || std::is_same::value +#endif // TENSORFLOW_USE_SYCL + ) && tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) { DeviceContext* device_ctxt = ctx->op_device_context(); auto device = static_cast(ctx->device()); @@ -289,6 +300,31 @@ REGISTER_GPU_HOST_KERNEL(bool); #undef REGISTER_GPU_HOST_KERNEL +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("StackPush") \ + .Device(DEVICE_SYCL) \ + .HostMemory("handle") \ + .TypeConstraint("T"), \ + StackPushOp); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL); + +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("StackPush") \ + .Device(DEVICE_SYCL) \ + .HostMemory("handle") \ + .HostMemory("elem") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + StackPushOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(bool); +#undef REGISTER_SYCL_KERNEL +#undef REGISTER_SYCL_HOST_KERNEL +#endif // TENSORFLOW_USE_SYCL + class StackPopOp : public AsyncOpKernel { public: explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {} @@ -359,6 +395,31 @@ REGISTER_GPU_HOST_KERNEL(bool); #undef REGISTER_GPU_HOST_KERNEL +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("StackPop") \ + .Device(DEVICE_SYCL) \ + .HostMemory("handle") \ + .TypeConstraint("elem_type"), \ + StackPopOp) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL); + +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("StackPop") \ + .Device(DEVICE_SYCL) \ + .HostMemory("handle") \ + .HostMemory("elem") \ + .TypeConstraint("elem_type"), \ + StackPopOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(bool); + +#undef REGISTER_SYCL_KERNEL +#undef REGISTER_SYCL_HOST_KERNEL +#endif // TENSORFLOW_USE_SYCL + class StackCloseOp : public OpKernel { public: explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -376,5 +437,8 @@ class StackCloseOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp); REGISTER_KERNEL_BUILDER( Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp); - +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("StackClose").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 10593516f71..2b40f07099c 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -480,12 +480,8 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") .TypeConstraint("Index"), \ StridedSliceAssignOp) -REGISTER_SYCL(float); -REGISTER_SYCL(double); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("StridedSlice") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -516,6 +512,5 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") .HostMemory("end") .HostMemory("strides"), StridedSliceAssignOp) -#undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index d820db348e5..0ca62ff6404 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -295,7 +295,7 @@ DECLARE_FOR_N_CPU(bfloat16); INSTANTIATE(SYCLDevice, T, STRIDED_SLICE_INSTANTIATE_DIM) TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL); -TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_FOR_N_SYCL); DECLARE_FOR_N_SYCL(int32); #undef DECLARE_FOR_N_SYCL diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index 9822b021ebc..bc9229f6eac 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ b/tensorflow/core/kernels/tile_ops.cc @@ -261,7 +261,9 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(HANDLE_TYPE_NAME_SYCL); TF_CALL_double(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int16(HANDLE_TYPE_NAME_SYCL); TF_CALL_int32(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int64(HANDLE_TYPE_NAME_SYCL); #endif // TENSORFLOW_USE_SYCL #undef HANDLE_TYPE_NAME_CPU @@ -514,7 +516,9 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); TF_CALL_float(HANDLE_TYPE_NAME_SYCL); TF_CALL_double(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int16(HANDLE_TYPE_NAME_SYCL); TF_CALL_int32(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int64(HANDLE_TYPE_NAME_SYCL); #undef HANDLE_TYPE_NAME_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index f06cc5514c8..db3f0464391 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -69,9 +69,13 @@ typedef Eigen::SyclDevice SYCLDevice; #define DEFINE_DIM(T, NDIM) template struct Tile; #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) +TF_CALL_bool(DEFINE_TYPE); TF_CALL_float(DEFINE_TYPE); TF_CALL_double(DEFINE_TYPE); +TF_CALL_uint8(DEFINE_TYPE); TF_CALL_int32(DEFINE_TYPE); +TF_CALL_int16(DEFINE_TYPE); +TF_CALL_int64(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE @@ -82,9 +86,13 @@ TF_CALL_int32(DEFINE_TYPE); template struct ReduceAndReshape; #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) +TF_CALL_bool(DEFINE_TYPE); TF_CALL_float(DEFINE_TYPE); TF_CALL_double(DEFINE_TYPE); +TF_CALL_uint8(DEFINE_TYPE); +TF_CALL_int16(DEFINE_TYPE); TF_CALL_int32(DEFINE_TYPE); +TF_CALL_int64(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 5c2d371430f..39923a1f2ef 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -420,13 +420,6 @@ TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); -#ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); -TF_CALL_float(REGISTER_SYCL_KERNELS); -TF_CALL_double(REGISTER_SYCL_KERNELS); -#undef REGISTER_SYCL_KERNELS -#endif - #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { @@ -447,6 +440,14 @@ REGISTER_KERNELS(GPU, Eigen::half); REGISTER_KERNELS(GPU, float); REGISTER_KERNELS(GPU, double); #endif + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); +TF_CALL_float(REGISTER_SYCL_KERNELS); +TF_CALL_double(REGISTER_SYCL_KERNELS); +#undef REGISTER_SYCL_KERNELS +#endif // TENSORFLOW_USE_SYCL + #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS diff --git a/tensorflow/core/kernels/transpose_functor.h b/tensorflow/core/kernels/transpose_functor.h index 99f7d8e56d5..7af5568c1ae 100644 --- a/tensorflow/core/kernels/transpose_functor.h +++ b/tensorflow/core/kernels/transpose_functor.h @@ -75,6 +75,44 @@ void Transpose(const Device& d, const Tensor& in, break; } } + +#ifdef TENSORFLOW_USE_SYCL +// For SYCL lets always go through Eigen +template +void TransposeSYCL(const Device& d, const Tensor& in, + const gtl::ArraySlice perm, Tensor* out) { + switch (in.dims()) { + case 1: + TransposeUsingEigen(d, in, perm, out); + break; + case 2: + TransposeUsingEigen(d, in, perm, out); + break; + case 3: + TransposeUsingEigen(d, in, perm, out); + break; + case 4: + TransposeUsingEigen(d, in, perm, out); + break; + case 5: + TransposeUsingEigen(d, in, perm, out); + break; + case 6: + TransposeUsingEigen(d, in, perm, out); + break; + case 7: + TransposeUsingEigen(d, in, perm, out); + break; + case 8: + TransposeUsingEigen(d, in, perm, out); + break; + default: + errors::Unimplemented("Unsupported TransposeUsingEigen for: ", + in.dims()); + break; + } +} +#endif // TENSORFLOW_USE_SYCL } // namespace internal } // namespace tensorflow diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc index 3681b9a1291..5fa38d3f11c 100644 --- a/tensorflow/core/kernels/transpose_functor_cpu.cc +++ b/tensorflow/core/kernels/transpose_functor_cpu.cc @@ -125,11 +125,38 @@ Status DoTranspose(const SYCLDevice& d, const Tensor& in, CHECK_EQ(in.dims(), perm.size()); CHECK_EQ(in.dtype(), out->dtype()); switch (in.dtype()) { + case DT_BOOL: + case DT_INT8: + case DT_QINT8: + case DT_QUINT8: + case DT_UINT8: + tensorflow::internal::TransposeSYCL(d, in, perm, out); + break; + + case DT_BFLOAT16: + case DT_HALF: + case DT_INT16: + case DT_QINT16: + case DT_QUINT16: + case DT_UINT16: + tensorflow::internal::TransposeSYCL(d, in, perm, out); + break; case DT_FLOAT: - case DT_DOUBLE: case DT_INT32: - internal::Transpose(d, in, perm, out); + case DT_QINT32: + tensorflow::internal::TransposeSYCL(d, in, perm, out); + break; + + case DT_COMPLEX64: + case DT_DOUBLE: + case DT_INT64: + tensorflow::internal::TransposeSYCL(d, in, perm, out); + break; + + case DT_COMPLEX128: + tensorflow::internal::TransposeSYCL(d, in, perm, + out); break; default: diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 4d303f01732..8e6cf505a21 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -224,10 +224,7 @@ Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, .TypeConstraint("Tperm") \ .HostMemory("perm"), \ TransposeSyclOp); -REGISTER(float); -REGISTER(bool); -REGISTER(int32); +TF_CALL_POD_TYPES(REGISTER); #undef REGISTER #endif - } // namespace tensorflow diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc index e4c79ae17bb..c3bebfcbf9d 100644 --- a/tensorflow/core/kernels/unpack_op.cc +++ b/tensorflow/core/kernels/unpack_op.cc @@ -159,20 +159,15 @@ REGISTER_KERNEL_BUILDER(Name("Unpack") Name("Unpack").Device(DEVICE_SYCL).TypeConstraint("T"), \ UnpackOp) -REGISTER_SYCL(float); -REGISTER_SYCL(double); -#undef REGISTER_SYCL +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); -// A special SYCL kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Unpack") .Device(DEVICE_SYCL) .HostMemory("value") .HostMemory("output") .TypeConstraint("T"), UnpackOp); - +#undef REGISTER_SYCL #endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index 7a4d9dc6503..30137b2bef0 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -32,36 +32,6 @@ REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU), IsVariableInitializedOp); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Variable") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("dtype"), \ - VariableOp); \ - REGISTER_KERNEL_BUILDER(Name("VariableV2") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("dtype"), \ - VariableOp); \ - REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("dtype"), \ - TemporaryVariableOp); \ - REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - DestroyTemporaryVariableOp); \ - REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("dtype") \ - .HostMemory("is_initialized"), \ - IsVariableInitializedOp); - -REGISTER_SYCL_KERNEL(float); -REGISTER_SYCL_KERNEL(double); -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA // Only register 'Variable' on GPU for the subset of types also supported by // 'Assign' (see dense_update_ops.cc.) @@ -90,4 +60,29 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Variable").Device(DEVICE_SYCL).TypeConstraint("dtype"), \ + VariableOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint("dtype"),\ + VariableOp); \ + REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("dtype"), \ + TemporaryVariableOp); \ + REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T"), \ + DestroyTemporaryVariableOp); \ + REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("dtype") \ + .HostMemory("is_initialized"), \ + IsVariableInitializedOp); + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h index 03b155344ce..c15a6436d6e 100644 --- a/tensorflow/core/lib/random/random_distributions.h +++ b/tensorflow/core/lib/random/random_distributions.h @@ -27,6 +27,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/lib/random/philox_random.h" + namespace tensorflow { namespace random { @@ -373,7 +374,7 @@ class TruncatedNormalDistribution { BoxMullerFloat(x0, x1, &f[0], &f[1]); for (int i = 0; i < 2; ++i) { - if (fabs(f[i]) < kTruncateValue) { + if (Eigen::numext::abs(f[i]) < kTruncateValue) { results[index++] = Eigen::half(f[i]); if (index >= kResultElementCount) { return results; @@ -416,7 +417,7 @@ class TruncatedNormalDistribution { BoxMullerFloat(x0, x1, &f[0], &f[1]); for (int i = 0; i < 2; ++i) { - if (fabs(f[i]) < kTruncateValue) { + if (Eigen::numext::abs(f[i]) < kTruncateValue) { results[index++] = f[i]; if (index >= kResultElementCount) { return results; @@ -458,7 +459,7 @@ class TruncatedNormalDistribution { BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]); for (int i = 0; i < 2; ++i) { - if (fabs(d[i]) < kTruncateValue) { + if (Eigen::numext::abs(d[i]) < kTruncateValue) { results[index++] = d[i]; if (index >= kResultElementCount) { return results; @@ -483,12 +484,12 @@ void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) { u1 = epsilon; } const float v1 = 2.0f * M_PI * Uint32ToFloat(x1); - const float u2 = sqrt(-2.0f * log(u1)); -#if defined(__linux__) - sincosf(v1, f0, f1); + const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1)); +#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) + *f0 = Eigen::numext::sin(v1); + *f1 = Eigen::numext::cos(v1); #else - *f0 = sinf(v1); - *f1 = cosf(v1); + sincosf(v1, f0, f1); #endif *f0 *= u2; *f1 *= u2; @@ -509,12 +510,12 @@ void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, u1 = epsilon; } const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3); - const double u2 = sqrt(-2.0 * log(u1)); -#if defined(__linux__) - sincos(v1, d0, d1); + const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1)); +#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) + *d0 = Eigen::numext::sin(v1); + *d1 = Eigen::numext::cos(v1); #else - *d0 = sin(v1); - *d1 = cos(v1); + sincos(v1, d0, d1); #endif *d0 *= u2; *d1 *= u2; diff --git a/tensorflow/g3doc/api_docs/cc/ClassEnv.md b/tensorflow/g3doc/api_docs/cc/ClassEnv.md index 43f75fefb9f..0d06b376d32 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassEnv.md +++ b/tensorflow/g3doc/api_docs/cc/ClassEnv.md @@ -6,7 +6,7 @@ Callers may wish to provide a custom Env object to get fine grain control. All Env implementations are safe for concurrent access from multiple threads without any external synchronization. -###Member Details +### Member Details #### `tensorflow::Env::Env()` {#tensorflow_Env_Env} diff --git a/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md b/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md index e367f5f042d..8d0efd16381 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md +++ b/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md @@ -4,7 +4,7 @@ An implementation of Env that forwards all calls to another Env . May be useful to clients who wish to override just part of the functionality of another Env . -###Member Details +### Member Details #### `tensorflow::EnvWrapper::EnvWrapper(Env *t)` {#tensorflow_EnvWrapper_EnvWrapper} diff --git a/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShape.md b/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShape.md index ac2c26093de..8c599c05809 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShape.md +++ b/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShape.md @@ -4,7 +4,7 @@ Manages the partially known dimensions of a Tensor and their sizes. -###Member Details +### Member Details #### `tensorflow::PartialTensorShape::PartialTensorShape()` {#tensorflow_PartialTensorShape_PartialTensorShape} diff --git a/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShapeUtils.md b/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShapeUtils.md index ca3666ba8fd..a5c25fecd9e 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShapeUtils.md +++ b/tensorflow/g3doc/api_docs/cc/ClassPartialTensorShapeUtils.md @@ -4,7 +4,7 @@ Static helper routines for ` PartialTensorShape `. Includes a few common predica -###Member Details +### Member Details #### `string tensorflow::PartialTensorShapeUtils::PartialShapeListString(const gtl::ArraySlice< PartialTensorShape > &shapes)` {#string_tensorflow_PartialTensorShapeUtils_PartialShapeListString} diff --git a/tensorflow/g3doc/api_docs/cc/ClassRandomAccessFile.md b/tensorflow/g3doc/api_docs/cc/ClassRandomAccessFile.md index 1a1526f66d5..5e595fe35a4 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassRandomAccessFile.md +++ b/tensorflow/g3doc/api_docs/cc/ClassRandomAccessFile.md @@ -4,7 +4,7 @@ A file abstraction for randomly reading the contents of a file. -###Member Details +### Member Details #### `tensorflow::RandomAccessFile::RandomAccessFile()` {#tensorflow_RandomAccessFile_RandomAccessFile} diff --git a/tensorflow/g3doc/api_docs/cc/ClassSession.md b/tensorflow/g3doc/api_docs/cc/ClassSession.md index 6829548530d..326ec07a998 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassSession.md +++ b/tensorflow/g3doc/api_docs/cc/ClassSession.md @@ -41,7 +41,7 @@ A Session allows concurrent calls to Run() , though a Session must be created / Only one thread must call Close() , and Close() must only be called after all other calls to Run() have returned. -###Member Details +### Member Details #### `tensorflow::Session::Session()` {#tensorflow_Session_Session} diff --git a/tensorflow/g3doc/api_docs/cc/ClassStatus.md b/tensorflow/g3doc/api_docs/cc/ClassStatus.md index 8956af75ec3..bd2db0c08b1 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassStatus.md +++ b/tensorflow/g3doc/api_docs/cc/ClassStatus.md @@ -4,7 +4,7 @@ Denotes success or failure of a call in Tensorflow. -###Member Details +### Member Details #### `tensorflow::Status::Status()` {#tensorflow_Status_Status} diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensor.md b/tensorflow/g3doc/api_docs/cc/ClassTensor.md index b909bffe3a5..57b8b11da48 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassTensor.md +++ b/tensorflow/g3doc/api_docs/cc/ClassTensor.md @@ -4,7 +4,7 @@ Represents an n-dimensional array of values. -###Member Details +### Member Details #### `tensorflow::Tensor::Tensor()` {#tensorflow_Tensor_Tensor} diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md b/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md index 51fad8c2fa1..3fdb593c931 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md +++ b/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md @@ -8,7 +8,7 @@ A tensor's shape is denoted by its number of dimensions and a size for each If you know the exact shape of your Tensor when you create the TensorShape object, you can specify it then, or you can create a TensorShape with zero dimensions and one element, and call AddDim() to add dimensions later. -###Member Details +### Member Details #### `uint8 tensorflow::TensorShape::buf[16][16]` {#uint8_tensorflow_TensorShape_buf_16_} diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md b/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md index 7d8c36ddec5..a7a71f89f90 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md +++ b/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md @@ -4,7 +4,7 @@ Static helper routines for ` TensorShape `. Includes a few common predicates on -###Member Details +### Member Details #### `static bool tensorflow::TensorShapeUtils::IsScalar(const TensorShape &shape)` {#static_bool_tensorflow_TensorShapeUtils_IsScalar} diff --git a/tensorflow/g3doc/api_docs/cc/ClassThread.md b/tensorflow/g3doc/api_docs/cc/ClassThread.md index 56127d72ad9..2c967a25654 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassThread.md +++ b/tensorflow/g3doc/api_docs/cc/ClassThread.md @@ -4,7 +4,7 @@ Represents a thread used to run a Tensorflow function. -###Member Details +### Member Details #### `tensorflow::Thread::Thread()` {#tensorflow_Thread_Thread} diff --git a/tensorflow/g3doc/api_docs/cc/ClassWritableFile.md b/tensorflow/g3doc/api_docs/cc/ClassWritableFile.md index a7e250d6977..5123601d382 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassWritableFile.md +++ b/tensorflow/g3doc/api_docs/cc/ClassWritableFile.md @@ -4,7 +4,7 @@ A file abstraction for sequential writing. The implementation must provide buffering since callers may append small fragments at a time to the file. -###Member Details +### Member Details #### `tensorflow::WritableFile::WritableFile()` {#tensorflow_WritableFile_WritableFile} diff --git a/tensorflow/g3doc/api_docs/cc/StructSessionOptions.md b/tensorflow/g3doc/api_docs/cc/StructSessionOptions.md index f0dbe1a304a..cc008dffe1c 100644 --- a/tensorflow/g3doc/api_docs/cc/StructSessionOptions.md +++ b/tensorflow/g3doc/api_docs/cc/StructSessionOptions.md @@ -4,7 +4,7 @@ Configuration information for a Session . -###Member Details +### Member Details #### `Env* tensorflow::SessionOptions::env` {#Env_tensorflow_SessionOptions_env} diff --git a/tensorflow/g3doc/api_docs/cc/StructState.md b/tensorflow/g3doc/api_docs/cc/StructState.md index a0335b20e06..7e45ab5c6f3 100644 --- a/tensorflow/g3doc/api_docs/cc/StructState.md +++ b/tensorflow/g3doc/api_docs/cc/StructState.md @@ -4,7 +4,7 @@ -###Member Details +### Member Details #### `tensorflow::error::Code tensorflow::Status::State::code` {#tensorflow_error_Code_tensorflow_Status_State_code} diff --git a/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md b/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md index 509491f27c2..9d6f703235d 100644 --- a/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md +++ b/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md @@ -4,7 +4,7 @@ Represents the value of one dimension in a TensorShape . -###Member Details +### Member Details #### `int64 tensorflow::TensorShapeDim::size` {#int64_tensorflow_TensorShapeDim_size} diff --git a/tensorflow/g3doc/api_docs/cc/StructThreadOptions.md b/tensorflow/g3doc/api_docs/cc/StructThreadOptions.md index 35db265ecd1..0127995eab8 100644 --- a/tensorflow/g3doc/api_docs/cc/StructThreadOptions.md +++ b/tensorflow/g3doc/api_docs/cc/StructThreadOptions.md @@ -4,7 +4,7 @@ Options to configure a Thread . Note that the options are all hints, and the underlying implementation may choose to ignore it. -###Member Details +### Member Details #### `size_t tensorflow::ThreadOptions::stack_size` {#size_t_tensorflow_ThreadOptions_stack_size} diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md index 1253942a949..5e9ea5cda35 100644 --- a/tensorflow/g3doc/api_docs/python/math_ops.md +++ b/tensorflow/g3doc/api_docs/python/math_ops.md @@ -2091,7 +2091,7 @@ Computes the SVD of each inner matrix in `tensor` such that # a is a tensor. # s is a tensor of singular values. # u is a tensor of left singular vectors. -#v is a tensor of right singular vectors. +# v is a tensor of right singular vectors. s, u, v = svd(a) s = svd(a, compute_uv=False) ``` diff --git a/tensorflow/g3doc/tutorials/image_recognition/index.md b/tensorflow/g3doc/tutorials/image_recognition/index.md index 09150009539..ab67af73a83 100644 --- a/tensorflow/g3doc/tutorials/image_recognition/index.md +++ b/tensorflow/g3doc/tutorials/image_recognition/index.md @@ -61,7 +61,7 @@ vision tasks. We're excited to see what the community will do with this model. -##Usage with Python API +## Usage with Python API `classify_image.py` downloads the trained model from `tensorflow.org` when the program is run for the first time. You'll need about 200M of free space diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py index e981fe4f968..e251e4aa0ef 100644 --- a/tensorflow/python/debug/cli/analyzer_cli_test.py +++ b/tensorflow/python/debug/cli/analyzer_cli_test.py @@ -490,7 +490,8 @@ def setUpClass(cls): cls._is_gpu_available = test.is_gpu_available() if cls._is_gpu_available: - cls._main_device = "/job:localhost/replica:0/task:0/gpu:0" + gpu_name = test_util.gpu_device_name() + cls._main_device = "/job:localhost/replica:0/task:0" + gpu_name else: cls._main_device = "/job:localhost/replica:0/task:0/cpu:0" @@ -1195,7 +1196,8 @@ def setUpClass(cls): cls._is_gpu_available = test.is_gpu_available() if cls._is_gpu_available: - cls._main_device = "/job:localhost/replica:0/task:0/gpu:0" + gpu_name = test_util.gpu_device_name() + cls._main_device = "/job:localhost/replica:0/task:0" + gpu_name else: cls._main_device = "/job:localhost/replica:0/task:0/cpu:0" diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py index c6b229f786b..b268b762cec 100644 --- a/tensorflow/python/debug/lib/session_debug_testlib.py +++ b/tensorflow/python/debug/lib/session_debug_testlib.py @@ -57,7 +57,8 @@ def setUpClass(cls): if test.is_gpu_available(): cls._expected_partition_graph_count = 2 cls._expected_num_devices = 2 - cls._main_device = "/job:localhost/replica:0/task:0/gpu:0" + gpu_name = test_util.gpu_device_name() + cls._main_device = "/job:localhost/replica:0/task:0" + gpu_name else: cls._expected_partition_graph_count = 1 cls._expected_num_devices = 1 diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 6c7cbbff9cb..e75501d5adb 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -1391,9 +1391,8 @@ def b1(i, x): self.assertEqual(45, rx.eval()) def _testWhileGrad_ColocateGradients(self, colocate): - gpu_dev_name = test.gpu_device_name() if test.is_gpu_available( - ) else "/gpu:0" - gpu_short_name = gpu_dev_name.split("/")[-1] + gpu_dev_name = test.gpu_device_name() if test.is_gpu_available() else "/gpu:0" + gpu_short_name = gpu_dev_name.split('/')[-1].lower() with self.test_session(graph=ops.Graph()) as sess: v = constant_op.constant(2.0, name="v") diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py index 81eee48d2e8..1f797813f4a 100644 --- a/tensorflow/python/kernel_tests/stage_op_test.py +++ b/tensorflow/python/kernel_tests/stage_op_test.py @@ -31,7 +31,7 @@ def testSimple(self): with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) - with ops.device('/gpu:0'): + with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.float32]) stage = stager.put([v]) y = stager.get() diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md index 3a55fe8ece4..45bead00744 100644 --- a/tensorflow/tools/tfprof/README.md +++ b/tensorflow/tools/tfprof/README.md @@ -5,7 +5,7 @@ Author: Xin Pan (xpan@google.com, github: panyx0718) Consultants: Jon Shlens, Pete Warden -###Major Features +### Major Features 1. Measure model parameters, float operations, tensor shapes. 2. Measure op execution times, requested memory size and device placement. @@ -442,7 +442,7 @@ TensorFlow checkpoint. It defines _checkpoint_variable op type. It also provides checkpointed tensors' values. -##Options +## Options `-max_depth`: Show ops that are at most this number of hops from starting op in the tree/graph structure. diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 318fc78f16f..594f02ab771 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -73,11 +73,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", urls = [ - "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/290bfb42684a.tar.gz", - "https://bitbucket.org/eigen/eigen/get/290bfb42684a.tar.gz", + "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/123510871143.tar.gz", + "https://bitbucket.org/benoitsteiner/opencl/get/123510871143.tar.gz", ], - sha256 = "269c8bf20e8ac1aa8f5caf1ab2ca7be4909ec6ae085177a647aae138cd069b12", - strip_prefix = "eigen-eigen-290bfb42684a", + sha256 = "f0d1d09fed01642893ed237c3f41842429f32009bd3f52eb09fd040970641792", + strip_prefix = "benoitsteiner-opencl-123510871143", build_file = str(Label("//third_party:eigen.BUILD")), ) diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl index 66dd9aea7be..3c509b8a7cf 100755 --- a/third_party/sycl/crosstool/computecpp.tpl +++ b/third_party/sycl/crosstool/computecpp.tpl @@ -63,7 +63,7 @@ def main(): bc_out = filename + '.sycl' # strip asan for the device - computecpp_device_compiler_flags = ['-sycl-compress-name', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem', + computecpp_device_compiler_flags = ['-sycl-compress-name', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem', COMPUTECPP_INCLUDE, '-std=c++11', '-sycl', '-emit-llvm', '-no-serial-memop', '-Xclang', '-cl-denorms-are-zero', '-Xclang', '-cl-fp32-correctly-rounded-divide-sqrt'] computecpp_device_compiler_flags += [flag for flag in compiler_flags if not flag.startswith(('-fsanitize'))] @@ -74,7 +74,6 @@ def main(): if not flag.startswith(('-MF', '-MD',)) if not '.d' in flag ] - host_compiler_flags[host_compiler_flags.index('-c')] = "--include" host_compiler_flags = ['-xc++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-c', bc_out] + host_compiler_flags