Skip to content

Commit 5e855c5

Browse files
Merge pull request #13976 from benoitsteiner/branch_173415707
Branch 173415707
2 parents 0d2fd1f + 52837da commit 5e855c5

File tree

237 files changed

+7414
-2663
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

237 files changed

+7414
-2663
lines changed

configure.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,19 @@ def set_monolithic():
963963
write_to_bazelrc('build --define framework_shared_object=true')
964964

965965

966+
def create_android_bazelrc_configs():
967+
# Flags for --config=android
968+
write_to_bazelrc('build:android --crosstool_top=//external:android/crosstool')
969+
write_to_bazelrc(
970+
'build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain')
971+
# Flags for --config=android_arm
972+
write_to_bazelrc('build:android_arm --config=android')
973+
write_to_bazelrc('build:android_arm --cpu=armeabi-v7a')
974+
# Flags for --config=android_arm64
975+
write_to_bazelrc('build:android_arm64 --config=android')
976+
write_to_bazelrc('build:android_arm64 --cpu=arm64-v8a')
977+
978+
966979
def main():
967980
# Make a copy of os.environ to be clear when functions and getting and setting
968981
# environment variables.
@@ -1033,7 +1046,7 @@ def main():
10331046
set_cc_opt_flags(environ_cp)
10341047
set_mkl()
10351048
set_monolithic()
1036-
1049+
create_android_bazelrc_configs()
10371050

10381051
if __name__ == '__main__':
10391052
main()

tensorflow/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ filegroup(
331331
"//tensorflow/compiler/jit/kernels:all_files",
332332
"//tensorflow/compiler/jit/legacy_flags:all_files",
333333
"//tensorflow/compiler/jit/ops:all_files",
334+
"//tensorflow/compiler/plugin:all_files",
334335
"//tensorflow/compiler/tests:all_files",
335336
"//tensorflow/compiler/tf2xla:all_files",
336337
"//tensorflow/compiler/tf2xla/cc:all_files",
@@ -456,7 +457,6 @@ filegroup(
456457
"//tensorflow/contrib/training:all_files",
457458
"//tensorflow/contrib/util:all_files",
458459
"//tensorflow/contrib/verbs:all_files",
459-
"//tensorflow/contrib/xla_tf_graph:all_files",
460460
"//tensorflow/core:all_files",
461461
"//tensorflow/core/debug:all_files",
462462
"//tensorflow/core/distributed_runtime:all_files",

tensorflow/c/eager/c_api.cc

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -440,11 +440,19 @@ tensorflow::Status ValidateInputTypeAndPlacement(
440440
if (expected_device != actual_device) {
441441
switch (ctx->policy) {
442442
case TFE_DEVICE_PLACEMENT_EXPLICIT:
443+
// TODO(xpan): See if we could bubble python related error up
444+
// to python level.
443445
return tensorflow::errors::InvalidArgument(
444-
"cannot compute ", op->name, " as input #", i,
445-
" was expected to be on ", expected_device->name(),
446-
" but is actually on ", actual_device->name(),
447-
" (operation running on ", op_device->name(), ")");
446+
"Tensors on conflicting devices:"
447+
" cannot compute ",
448+
op->name, " as input #", i, " was expected to be on ",
449+
expected_device->name(), " but is actually on ",
450+
actual_device->name(), " (operation running on ",
451+
op_device->name(), ")",
452+
" Tensors can be copied explicitly using .gpu() or .cpu(),"
453+
" or transparently copied by using tfe.enable_eager_execution("
454+
"tfe.DEVICE_PLACEMENT_SILENT). Copying tensors between devices"
455+
" may slow down your model");
448456
case TFE_DEVICE_PLACEMENT_WARN:
449457
LOG(WARNING) << "before computing " << op->name << " input #" << i
450458
<< " was expected to be on " << expected_device->name()

tensorflow/compiler/aot/compile.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,12 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
100100
if (!flags.out_session_module.empty()) {
101101
TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::SessionModule> module,
102102
computation.Snapshot());
103+
// Serialize the SessionModule deterministically so that all the outputs of
104+
// a tf_library genrule are deterministic.
105+
string proto;
106+
TF_RET_CHECK(SerializeToStringDeterministic(*module, &proto));
103107
TF_RETURN_IF_ERROR(
104-
WriteBinaryProto(Env::Default(), flags.out_session_module, *module));
108+
WriteStringToFile(Env::Default(), flags.out_session_module, proto));
105109
}
106110
xla::cpu::CpuAotCompilationOptions aot_opts(
107111
flags.target_triple, flags.target_cpu, flags.target_features,

tensorflow/compiler/aot/tfcompile.bzl

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ def tf_library(name, graph, config,
129129
# Rule that runs tfcompile to produce the header and object file.
130130
header_file = name + ".h"
131131
object_file = name + ".o"
132-
session_module_pb = name + "_session_module.pb"
133132
ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
134133
native.genrule(
135134
name=("gen_" + name),
@@ -140,7 +139,6 @@ def tf_library(name, graph, config,
140139
outs=[
141140
header_file,
142141
object_file,
143-
session_module_pb,
144142
],
145143
cmd=("$(location " + tfcompile_tool + ")" +
146144
" --graph=$(location " + tfcompile_graph + ")" +
@@ -150,7 +148,6 @@ def tf_library(name, graph, config,
150148
" --target_triple=" + target_llvm_triple() +
151149
" --out_header=$(@D)/" + header_file +
152150
" --out_object=$(@D)/" + object_file +
153-
" --out_session_module=$(@D)/" + session_module_pb +
154151
" " + (tfcompile_flags or "")),
155152
tools=[tfcompile_tool],
156153
visibility=visibility,
@@ -168,6 +165,34 @@ def tf_library(name, graph, config,
168165
tags=tags,
169166
)
170167

168+
# Rule that runs tfcompile to produce the SessionModule proto, useful for
169+
# debugging. TODO(b/64813587): Once the SessionModule proto is
170+
# deterministic, move this into the main rule above.
171+
session_module_pb = name + "_session_module.pb"
172+
native.genrule(
173+
name=(name + "_session_module"),
174+
srcs=[
175+
tfcompile_graph,
176+
config,
177+
],
178+
outs=[
179+
session_module_pb,
180+
],
181+
cmd=("$(location " + tfcompile_tool + ")" +
182+
" --graph=$(location " + tfcompile_graph + ")" +
183+
" --config=$(location " + config + ")" +
184+
" --entry_point=" + ep +
185+
" --cpp_class=" + cpp_class +
186+
" --target_triple=" + target_llvm_triple() +
187+
" --out_session_module=$(@D)/" + session_module_pb +
188+
" " + (tfcompile_flags or "")),
189+
tools=[tfcompile_tool],
190+
visibility=visibility,
191+
testonly=testonly,
192+
local=1,
193+
tags=tags,
194+
)
195+
171196
# The cc_library rule packaging up the header and object file, and needed
172197
# kernel implementations.
173198
need_xla_data_proto = (tfcompile_flags and

tensorflow/compiler/plugin/BUILD

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,17 @@ cc_library(
4040
#"//tensorflow/compiler/plugin/example:example_lib",
4141
],
4242
)
43+
44+
#-----------------------------------------------------------------------------
45+
46+
filegroup(
47+
name = "all_files",
48+
srcs = glob(
49+
["**/*"],
50+
exclude = [
51+
"**/METADATA",
52+
"**/OWNERS",
53+
],
54+
),
55+
visibility = ["//tensorflow:__subpackages__"],
56+
)

tensorflow/compiler/tf2xla/kernels/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ package(
55
)
66

77
load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
8-
load("//tensorflow/compiler/xla:xla.bzl", "export_dynamic_linkopts")
98

109
tf_kernel_library(
1110
name = "xla_ops",
@@ -83,6 +82,7 @@ tf_kernel_library(
8382
"//tensorflow/compiler/tf2xla:xla_compiler",
8483
"//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
8584
"//tensorflow/compiler/xla:literal_util",
85+
"//tensorflow/compiler/xla:shape_util",
8686
"//tensorflow/compiler/xla:util",
8787
"//tensorflow/compiler/xla:xla_data_proto",
8888
"//tensorflow/compiler/xla/client:client_library",
@@ -152,6 +152,7 @@ cc_library(
152152
srcs = ["index_ops_kernel_argmax_float_1d.cc"],
153153
visibility = ["//visibility:public"],
154154
deps = [
155+
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
155156
"//tensorflow/core:framework_lite",
156157
"//third_party/eigen3",
157158
],
@@ -163,6 +164,7 @@ cc_library(
163164
srcs = ["index_ops_kernel_argmax_float_2d.cc"],
164165
visibility = ["//visibility:public"],
165166
deps = [
167+
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
166168
"//tensorflow/core:framework_lite",
167169
"//third_party/eigen3",
168170
],

tensorflow/compiler/tf2xla/kernels/index_ops.cc

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ limitations under the License.
2222
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
2323
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
2424
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
25+
#include "tensorflow/compiler/xla/shape_util.h"
2526
#include "tensorflow/core/framework/kernel_def_builder.h"
2627
#include "tensorflow/core/framework/op_kernel.h"
2728
#include "tensorflow/core/framework/register_types.h"
@@ -82,16 +83,24 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) {
8283
std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
8384
std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
8485
// Compute a mask that has 1s for elements equal to the maximum.
85-
xla::ComputationDataHandle mask = b->ConvertElementType(
86+
xla::ComputationDataHandle partial_mask = b->ConvertElementType(
8687
b->Eq(input, input_max, broadcast_dims), xla_index_type);
8788

88-
// Multiply by the vector [0, 1, 2, ...] to convert each 1 into its index.
89-
// TODO(phawkins): add a bitwise And operator to HLO, use a bitwise and
90-
// instead of a multiplication here.
89+
// In order to make identity elements for a bitwise And, we:
90+
// Left shift the 1 to the leftmost bit, yielding 0x10...0
91+
// Arithmetic right shift the 1 back to the rightmost bit, yielding 0xFF...F
92+
int32 bits_in_type =
93+
xla::ShapeUtil::ByteSizeOfPrimitiveType(xla_index_type) * 8 - 1;
94+
xla::ComputationDataHandle shift_amount =
95+
XlaHelpers::IntegerLiteral(b, index_type, bits_in_type);
96+
xla::ComputationDataHandle full_mask = b->ShiftRightArithmetic(
97+
b->ShiftLeft(partial_mask, shift_amount), shift_amount);
98+
99+
// And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its index.
91100
xla::ComputationDataHandle iota;
92101
OP_REQUIRES_OK(ctx, XlaHelpers::Iota(b, index_type, axis_size, &iota));
93102
xla::ComputationDataHandle product =
94-
b->Mul(mask, iota, /*broadcast_dimensions=*/{axis});
103+
b->And(full_mask, iota, /*broadcast_dimensions=*/{axis});
95104

96105
// If there are multiple maximum elements, choose the one with the highest
97106
// index.

tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License.
1616
#define EIGEN_USE_THREADS
1717

1818
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
19+
#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
1920
#include "tensorflow/core/framework/tensor_types.h"
2021
#include "tensorflow/core/platform/dynamic_annotations.h"
2122
#include "tensorflow/core/platform/macros.h"
@@ -47,3 +48,5 @@ EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {
4748
extern "C" void TF_EXPORT argmax_float_1d_xla_impl(void* out, void** data) {
4849
tensorflow::argmax_float_1d_xla_impl(out, data);
4950
}
51+
52+
REGISTER_CUSTOM_CALL_TARGET(argmax_float_1d_xla_impl);

tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License.
1616
#define EIGEN_USE_THREADS
1717

1818
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
19+
#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
1920
#include "tensorflow/core/framework/tensor_types.h"
2021
#include "tensorflow/core/platform/dynamic_annotations.h"
2122
#include "tensorflow/core/platform/macros.h"
@@ -49,3 +50,5 @@ EIGEN_STRONG_INLINE void argmax_float_2d_xla_impl(void* out, void** data) {
4950
extern "C" void TF_EXPORT argmax_float_2d_xla_impl(void* out, void** data) {
5051
tensorflow::argmax_float_2d_xla_impl(out, data);
5152
}
53+
54+
REGISTER_CUSTOM_CALL_TARGET(argmax_float_2d_xla_impl);

0 commit comments

Comments
 (0)