Fix broken headings in Markdown files #64

Open · wants to merge 18 commits into base: master

2 changes: 1 addition & 1 deletion README.md
@@ -57,7 +57,7 @@ Hello, TensorFlow!
>>>
```

##For more information
## For more information

* [TensorFlow website](http://tensorflow.org)
* [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
4 changes: 2 additions & 2 deletions configure
@@ -504,7 +504,7 @@ if [ "$TF_NEED_OPENCL" == "1" ]; then
while true; do
fromuser=""
if [ -z "$HOST_CXX_COMPILER" ]; then
default_cxx_host_compiler=$(which clang++-3.6 || true)
default_cxx_host_compiler=$(which g++-4.8 || true)
read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER
fromuser="1"
if [ -z "$HOST_CXX_COMPILER" ]; then
@@ -527,7 +527,7 @@ done
while true; do
fromuser=""
if [ -z "$HOST_C_COMPILER" ]; then
default_c_host_compiler=$(which clang-3.6 || true)
default_c_host_compiler=$(which gcc-4.8 || true)
read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER
fromuser="1"
if [ -z "$HOST_C_COMPILER" ]; then
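
For context, the two configure hunks above only change the default compilers offered by the interactive prompts; because of the surrounding `if [ -z "$HOST_CXX_COMPILER" ]` / `if [ -z "$HOST_C_COMPILER" ]` checks, the questions are skipped entirely when those variables are already set. A minimal non-interactive sketch (not part of this diff, and assuming gcc/g++ 4.8 are the host compilers you want):

```sh
# Hypothetical non-interactive run: pre-seed the variables that the
# prompts shown in the hunks above would otherwise ask for.
export TF_NEED_OPENCL=1
export HOST_CXX_COMPILER="$(which g++-4.8)"
export HOST_C_COMPILER="$(which gcc-4.8)"
./configure
```
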
2 changes: 1 addition & 1 deletion tensorflow/contrib/tfprof/README.md
@@ -6,7 +6,7 @@ Author: Xin Pan ([email protected], github: panyx0718)

Consultants: Jon Shlens, Pete Warden

###Major Features
### Major Features

1. Measure model parameters, float operations, tensor shapes.
2. Measure op execution times, requested memory size and device placement.
1 change: 1 addition & 0 deletions tensorflow/core/BUILD
@@ -1597,6 +1597,7 @@ cc_library(
hdrs = if_not_windows([
"common_runtime/sycl/sycl_allocator.h",
"common_runtime/sycl/sycl_device.h",
"common_runtime/sycl/sycl_util.h",
"common_runtime/sycl/sycl_device_context.h",
]),
copts = tf_copts(),
3 changes: 1 addition & 2 deletions tensorflow/core/common_runtime/direct_session_test.cc
@@ -871,8 +871,6 @@ class BlockingOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc("");

REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_SYCL), BlockingOp);

static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
FunctionDefLibrary library_graph_def;
if (use_function_lib) {
@@ -910,6 +908,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
->set_opt_level(OptimizerOptions_Level_L0);
(*options.config.mutable_device_count())["CPU"] = 2;
(*options.config.mutable_device_count())["GPU"] = 0;
(*options.config.mutable_device_count())["SYCL"] = 0;

options.config.add_session_inter_op_thread_pool();
auto* p = options.config.add_session_inter_op_thread_pool();
@@ -138,7 +138,8 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelWarmup) {
DirectSession* ds = static_cast<DirectSession*>(session.get());
CostModelManager::CostModelMap cost_models;
ds->ExportCostModels(&cost_models);
CHECK_EQ(cost_models.size(), 1);
ASSERT_GE(2, cost_models.size());
ASSERT_LE(1, cost_models.size());
const CostModel* cm = (*cost_models.begin()).second;
EXPECT_EQ(measure_steps, cm->GetUpdateTimes());
}
@@ -155,10 +156,16 @@ static void TestHWAccelerator(bool enableHWTrace) {
test::FillValues<float>(&x_tensor, {1, 1});
Node* x = test::graph::Constant(&graph, x_tensor);
x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
#ifdef TENSORFLOW_USE_SYCL
x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
#endif // TENSORFLOW_USE_SYCL

// y = A * x
Node* y = test::graph::Matmul(&graph, a, x, false, false);
y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
#ifdef TENSORFLOW_USE_SYCL
y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
#endif // TENSORFLOW_USE_SYCL

Node* y_neg = test::graph::Unary(&graph, "Neg", y);
y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
@@ -169,6 +176,9 @@ static void TestHWAccelerator(bool enableHWTrace) {
SessionOptions options;
(*options.config.mutable_device_count())["CPU"] = 1;
(*options.config.mutable_device_count())["GPU"] = 1;
#ifdef TENSORFLOW_USE_SYCL
(*options.config.mutable_device_count())["SYCL"] = 1;
#endif // TENSORFLOW_USE_SYCL
options.config.set_allow_soft_placement(true);
options.config.mutable_graph_options()->set_build_cost_model(1);
std::unique_ptr<Session> session(NewSession(options));
6 changes: 3 additions & 3 deletions tensorflow/core/common_runtime/memory_types.cc
@@ -45,12 +45,12 @@ struct EndpointEq {
static Status ProcessMemoryTypes(
DeviceType device_type, const Graph* g,
std::function<Status(const Edge*, MemoryType, MemoryType)> fn) {
if (device_type != DEVICE_GPU) {
// On non-GPU devices, HOST_MEMORY and DEVICE_MEMORY are always
if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) {
// On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always
// compatible.
return Status::OK();
}
// For GPU device, HOST_MEMORY and DEVICE_MEMORY is not
// For GPU and SYCL devices, HOST_MEMORY and DEVICE_MEMORY are not
// compatible. I.e., a conversion/transfer must be done.
//
// {node id, slot id} -> memory type.
18 changes: 18 additions & 0 deletions tensorflow/core/common_runtime/memory_types_test.cc
@@ -34,6 +34,9 @@ TEST(MemoryTypeChecker, Int32OK) {
// There is a kernel for adding two int32s on host memory.
TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
#endif // TENSORFLOW_USE_SYCL
delete g;
}

@@ -53,6 +56,15 @@ TEST(MemoryTypeChecker, Int32NotOk) {
TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/gpu:0", g));
TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
// There is no kernel for casting int32/host memory to float/device
// memory.
EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_SYCL, g)));

// But we can insert _HostSend/_HostRecv to ensure the invariant.
TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g));
TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
#endif // TENSORFLOW_USE_SYCL
delete g;
}

@@ -74,6 +86,12 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) {
// int Switch's output on GPU has HOST_MEMORY constraint.
EXPECT_EQ(memory_type, HOST_MEMORY);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
auto si = test::graph::Switch(g, test::graph::Constant(g, vi), pred);
TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type));
// int Switch's output on SYCL has HOST_MEMORY constraint.
EXPECT_EQ(memory_type, HOST_MEMORY);
#endif // TENSORFLOW_USE_SYCL
delete g;
}

4 changes: 3 additions & 1 deletion tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -18,6 +18,8 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device.h"

#include "tensorflow/core/common_runtime/sycl/sycl_util.h"

namespace tensorflow {

class SYCLDeviceFactory : public DeviceFactory {
@@ -34,7 +36,7 @@ class SYCLDeviceFactory : public DeviceFactory {
devices->push_back(
new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(),
SYCLDevice::GetShortDeviceDescription(),
cl::sycl::gpu_selector(), cpu_allocator()));
GetSYCLDevice(), cpu_allocator()));
}
return Status::OK();
}
57 changes: 57 additions & 0 deletions tensorflow/core/common_runtime/sycl/sycl_util.h
@@ -0,0 +1,57 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if !TENSORFLOW_USE_SYCL
#error This file must only be included when building TensorFlow with SYCL support
#endif

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_

// For DMA helper
#include "tensorflow/core/common_runtime/dma_helper.h"
#include "tensorflow/core/framework/tensor.h"

namespace tensorflow {
inline void* GetBase(const Tensor* src) {
return const_cast<void*>(DMAHelper::base(src));
}

inline void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }

inline cl::sycl::device GetSYCLDevice() {
// Obtain list of supported devices from Eigen
for (const auto& device : Eigen::get_sycl_supported_devices()) {
if (device.is_gpu()) {
// returns first found GPU
return device;
}
}

// Currently Intel GPU is not supported
LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, trying OpenCL CPU";

for (const auto& device : Eigen::get_sycl_supported_devices()) {
if (device.is_cpu()) {
// returns first found CPU
return device;
}
}
// No supported OpenCL device was found.
LOG(FATAL) << "No OpenCL GPU or CPU found that is supported by ComputeCpp";
}
}  // namespace tensorflow

#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
2 changes: 1 addition & 1 deletion tensorflow/core/debug/debug_gateway.cc
@@ -84,7 +84,7 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
// Determine if the tensor is on device (GPU) or host (CPU).
// The second part of the check is necessary because even an OpKernel on GPU
// may have output tensors allocated on CPU.
if (device->name().find("gpu:") != string::npos &&
if ((device->name().find("gpu:") != string::npos || device->name().find("SYCL:") != string::npos) &&
!ctx->output_alloc_attr(output_slot).on_host()) {
// GPU tensors: Copy it to host (CPU).
DeviceContext* device_ctxt = ctx->op_device_context();
18 changes: 16 additions & 2 deletions tensorflow/core/debug/debug_gateway_test.cc
@@ -45,6 +45,8 @@ class SessionDebugMinusAXTest : public ::testing::Test {

#if GOOGLE_CUDA
const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
#elif defined(TENSORFLOW_USE_SYCL)
const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
#else
const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
#endif
@@ -302,6 +304,8 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) {
// through RunMetadata, given whether GPU is involved.
#if GOOGLE_CUDA
ASSERT_EQ(2, run_metadata.partition_graphs().size());
#elif defined(TENSORFLOW_USE_SYCL)
ASSERT_EQ(2, run_metadata.partition_graphs().size());
#else
ASSERT_EQ(1, run_metadata.partition_graphs().size());
#endif
@@ -336,7 +340,7 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) {
ASSERT_EQ(1, debug_nan_count_tensor_vals[0].scalar<int64>()());
}

#ifndef GOOGLE_CUDA
#if !defined(GOOGLE_CUDA) && !defined(TENSORFLOW_USE_SYCL)
// TODO(cais): Reinstate the following test for concurrent debugged runs on
// a GPU once the root cause of the ~0.5% flakiness has been addressed.
// (b/34081273)
@@ -499,6 +503,8 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {

#if GOOGLE_CUDA
const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
#elif defined(TENSORFLOW_USE_SYCL)
const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
#else
const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
#endif
@@ -599,6 +605,8 @@ class SessionDebugVariableTest : public ::testing::Test {

#if GOOGLE_CUDA
const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
#elif defined(TENSORFLOW_USE_SYCL)
const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
#else
const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
#endif
@@ -818,6 +826,8 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) {

#if GOOGLE_CUDA
ASSERT_EQ(2, run_metadata.partition_graphs().size());
#elif defined(TENSORFLOW_USE_SYCL)
ASSERT_EQ(2, run_metadata.partition_graphs().size());
#else
ASSERT_EQ(1, run_metadata.partition_graphs().size());
#endif
@@ -855,13 +865,17 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) {
ASSERT_EQ(2, debug_nan_count_tensor_vals[0].scalar<int64>()());
}

#if GOOGLE_CUDA
#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_SYCL)
class SessionDebugGPUSwitchTest : public ::testing::Test {
public:
void Initialize() {
Graph graph(OpRegistry::Global());

#ifdef GOOGLE_CUDA
const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
#elif defined(TENSORFLOW_USE_SYCL)
const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
#endif

Tensor vb(DT_BOOL, TensorShape({}));
vb.scalar<bool>()() = true;
4 changes: 2 additions & 2 deletions tensorflow/core/framework/op_kernel.cc
@@ -94,9 +94,9 @@ OpKernel::OpKernel(OpKernelConstruction* context)
OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
context->graph_def_version()));

// Kernels executing on GPU tie up very few resources on the CPU where the
// Kernels executing on GPU/SYCL tie up very few resources on the CPU where the
// scheduler runs: we consider them as inexpensive.
expensive_ = context->device_type() != DeviceType(DEVICE_GPU);
expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL);
}

OpKernel::~OpKernel() {}
4 changes: 4 additions & 0 deletions tensorflow/core/graph/testlib.cc
@@ -36,6 +36,10 @@ namespace tensorflow {
REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), HostConstantOp);
REGISTER_KERNEL_BUILDER(
Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), HostConstantOp);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(
Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
#endif // TENSORFLOW_USE_SYCL

// Register the HostConst Op
// Returns a constant tensor on the host. Useful for writing C++ tests
3 changes: 2 additions & 1 deletion tensorflow/core/kernels/BUILD
@@ -32,6 +32,7 @@ load(
"tf_kernel_library",
"cc_header_only_library",
)
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
load(
@@ -433,7 +434,7 @@ ARRAY_DEPS = [
"//tensorflow/core:proto_text",
"//tensorflow/core:protos_all_cc",
"//third_party/eigen3",
]
] + if_sycl(["//tensorflow/core:sycl_runtime"])

cc_library(
name = "array_not_windows",
3 changes: 0 additions & 3 deletions tensorflow/core/kernels/aggregate_ops.cc
@@ -165,9 +165,6 @@ REGISTER_KERNEL_BUILDER(Name("AddN")
REGISTER_ADDN(float, SYCL);
REGISTER_ADDN(double, SYCL);

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")