
Commit aa8a9fa

colesbury authored and facebook-github-bot committed
Extend DispatchStub to support CUDA dispatch (pytorch#9664)
Summary: This is a modification of the strategy from pytorch#8919 and pytorch#9579.

Previously, the CPU architecture-specific kernels self-registered with the DispatchStub. When linking as part of a static library, this requires passing --whole-archive to the linker to ensure that the object files for the kernels are included; Caffe2 and TensorFlow use that strategy. We ran into issues with --whole-archive blowing up the binary size of some downstream projects at Facebook.

This PR avoids --whole-archive for CPU kernels. The downside is that the generic code needs to know whether kernels are compiled with AVX and with AVX2 (via HAVE_AVX_CPU_DEFINITION and HAVE_AVX2_CPU_DEFINITION). The CUDA kernels still self-register with DispatchStub, because the CPU library does not know whether the CUDA library will be available at runtime.

There are a few major changes to DispatchStub:

- The environment variable ATEN_CPU_CAPABILITY overrides the CPU capability detection code (previously ATEN_DISABLE_AVX and ATEN_DISABLE_AVX2).
- DispatchStub is defined in the generic native code instead of in the CPU_CAPABILITY_DEFAULT kernel.

Pull Request resolved: pytorch#9664
Differential Revision: D8943350
Pulled By: colesbury
fbshipit-source-id: 329229b0ee9ff94fc001b960287814bd734096ef
1 parent 3e9e3ef commit aa8a9fa

11 files changed (+252, -152)
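The registration change the summary describes can be illustrated with a small, self-contained mock (all names below are invented for illustration; none of this is ATen code). The first style needs --whole-archive when archived statically; the second is what this PR switches the CPU kernels to.

```cpp
// Illustrative mock only, not ATen code.
#include <cstdio>

using kernel_fn = void (*)();

// Style 1: self-registration through a static initializer (old CPU path, still
// used for CUDA). If this translation unit sits in a static archive and nothing
// references its symbols, the linker may drop it and the constructor never
// runs, which is why self-registering kernels need --whole-archive.
kernel_fn g_self_registered = nullptr;
struct SelfRegister {
  explicit SelfRegister(kernel_fn fn) { g_self_registered = fn; }
};
static void simd_kernel() { std::puts("self-registered kernel"); }
static SelfRegister register_simd_kernel(&simd_kernel);

// Style 2: the generic code declares a static member and refers to it
// directly, so the object file that defines it must be linked in (new CPU path).
struct Stub {
  static kernel_fn AVX2;  // defined by the capability-specific translation unit
};
static void avx2_kernel() { std::puts("explicitly wired kernel"); }
kernel_fn Stub::AVX2 = &avx2_kernel;  // in ATen: REGISTER_DISPATCH in the AVX2 build

int main() {
  if (g_self_registered) g_self_registered();  // only runs if the TU was linked
  Stub::AVX2();                                // always resolvable by the linker
}
```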

.jenkins/pytorch/test.sh

+3, -6
@@ -44,13 +44,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
   (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
 fi
 
-export ATEN_DISABLE_AVX=
-export ATEN_DISABLE_AVX2=
 if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then
-  export ATEN_DISABLE_AVX=1
-fi
-if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
-  export ATEN_DISABLE_AVX2=1
+  export ATEN_CPU_CAPABILITY=default
+elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
+  export ATEN_CPU_CAPABILITY=avx
 fi
 
 test_python_nn() {

aten/src/ATen/native/DispatchStub.cpp

+44
@@ -0,0 +1,44 @@
+#include "DispatchStub.h"
+
+#include <ATen/Error.h>
+
+#include <cpuinfo.h>
+#include <cstdlib>
+#include <cstring>
+
+namespace at { namespace native {
+
+static CPUCapability compute_cpu_capability() {
+  auto envar = std::getenv("ATEN_CPU_CAPABILITY");
+  if (envar) {
+    if (strcmp(envar, "avx2") == 0) {
+      return CPUCapability::AVX2;
+    }
+    if (strcmp(envar, "avx") == 0) {
+      return CPUCapability::AVX;
+    }
+    if (strcmp(envar, "default") == 0) {
+      return CPUCapability::DEFAULT;
+    }
+    AT_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar);
+  }
+
+#ifndef __powerpc__
+  if (cpuinfo_initialize()) {
+    if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) {
+      return CPUCapability::AVX2;
+    }
+    if (cpuinfo_has_x86_avx()) {
+      return CPUCapability::AVX;
+    }
+  }
+#endif
+  return CPUCapability::DEFAULT;
+}
+
+CPUCapability get_cpu_capability() {
+  static CPUCapability capability = compute_cpu_capability();
+  return capability;
+}
+
+}} // namespace at::native
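get_cpu_capability() caches its result in a function-local static, so the ATEN_CPU_CAPABILITY override is consulted exactly once, before the first dispatched call. A hypothetical standalone caller (not part of this diff) could pin the capability up front:

```cpp
// Hypothetical usage sketch, not part of the diff: force the generic (non-AVX)
// CPU kernels for this process. The variable must be set before the first call
// that goes through a DispatchStub, since the capability is computed only once.
#include <cstdlib>

int main() {
  setenv("ATEN_CPU_CAPABILITY", "default", /*overwrite=*/1);  // or "avx" / "avx2"
  // ... run ATen/PyTorch code; every stub's CPU path now uses the DEFAULT slot ...
}
```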

aten/src/ATen/native/DispatchStub.h

+125
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <ATen/Error.h>
+#include <ATen/ScalarType.h>
+#include <type_traits>
+
+// Implements instruction set specific function dispatch.
+//
+// Kernels that may make use of specialized instruction sets (e.g. AVX) are
+// compiled multiple times with different compiler flags (e.g. -mavx). A
+// DispatchStub contains a table of function pointers for a kernel. At runtime,
+// the fastest available kernel is chosen based on the features reported by
+// cpuinfo.
+//
+// Example:
+//
+// In native/MyKernel.h:
+//   using fn_type = void(*)(const Tensor& x);
+//   DECLARE_DISPATCH(fn_type, stub);
+//
+// In native/MyKernel.cpp
+//   DEFINE_DISPATCH(stub);
+//
+// In native/cpu/MyKernel.cpp:
+//   void kernel(const Tensor& x) { ... }
+//   REGISTER_DISPATCH(stub, &kernel);
+//
+// To call:
+//   stub(kCPU, tensor);
+
+// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wundefined-var-template"
+#endif
+
+namespace at { namespace native {
+
+enum class CPUCapability {
+  DEFAULT = 0,
+  AVX = 1,
+  AVX2 = 2,
+  NUM_OPTIONS
+};
+
+CPUCapability get_cpu_capability();
+
+template <typename FnPtr, typename T>
+struct DispatchStub {
+  static_assert(std::is_pointer<FnPtr>::value, "FnPtr should be a pointer type");
+
+  template <typename... ArgTypes>
+  void operator()(Backend backend, ArgTypes... args) {
+    if (backend == Backend::CPU) {
+      if (!cpu_dispatch_ptr) {
+        cpu_dispatch_ptr = choose_cpu_impl();
+      }
+      (*cpu_dispatch_ptr)(args...);
+    } else if (backend == Backend::CUDA) {
+      AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel");
+      (*cuda_dispatch_ptr)(args...);
+    } else {
+      AT_ERROR("DispatchStub: unsupported backend", backend);
+    }
+  }
+
+  FnPtr choose_cpu_impl() {
+    auto capability = static_cast<int>(get_cpu_capability());
+    (void)capability;
+#ifdef HAVE_AVX2_CPU_DEFINITION
+    if (capability >= static_cast<int>(CPUCapability::AVX2)) {
+      AT_ASSERTM(AVX2, "DispatchStub: missing AVX2 kernel");
+      return AVX2;
+    }
+#endif
+#ifdef HAVE_AVX_CPU_DEFINITION
+    if (capability >= static_cast<int>(CPUCapability::AVX)) {
+      AT_ASSERTM(AVX, "DispatchStub: missing AVX kernel");
+      return AVX;
+    }
+#endif
+    AT_ASSERTM(DEFAULT, "DispatchStub: missing default kernel");
+    return DEFAULT;
+  }
+
+  FnPtr cpu_dispatch_ptr = nullptr;
+  FnPtr cuda_dispatch_ptr = nullptr;
+  static FnPtr DEFAULT;
+#ifdef HAVE_AVX_CPU_DEFINITION
+  static FnPtr AVX;
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+  static FnPtr AVX2;
+#endif
+};
+
+namespace {
+template <typename FnPtr, typename T>
+struct RegisterDispatch {
+  RegisterDispatch(DispatchStub<FnPtr, T>& stub, FnPtr value) {
+    stub.cuda_dispatch_ptr = value;
+  }
+};
+} // anonymous namespace
+
+#define DECLARE_DISPATCH(fn, name) \
+  extern struct name : DispatchStub<fn, name> {} name
+
+#define DEFINE_DISPATCH(name) struct name name
+
+#if defined(__CUDACC__)
+#define REGISTER_DISPATCH(name, fn) \
+  static RegisterDispatch<decltype(fn), struct name> name ## __register(name, fn);
+#elif defined(CPU_CAPABILITY)
+#define REGISTER_DISPATCH(name, fn) \
+  template <> decltype(fn) DispatchStub<decltype(fn), struct name>::CPU_CAPABILITY = fn;
+#endif
+
+
+}} // namespace at::native
+
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
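The usage comment at the top of the header shows the intended wiring. Spelled out for a hypothetical kernel (the my_* names and file paths are illustrative, and this only compiles inside ATen where Tensor, kCPU and the macros are visible):

```cpp
// Sketch of the wiring for a hypothetical kernel, following the usage comment
// in DispatchStub.h. The my_* names and file placement are illustrative only.

// native/MyKernel.h -- visible to the generic code:
using my_fn = void (*)(const Tensor& x);
DECLARE_DISPATCH(my_fn, my_stub);    // extern struct my_stub : DispatchStub<my_fn, my_stub> {} my_stub;

// native/MyKernel.cpp -- compiled once, without -mavx/-mavx2, always linked:
DEFINE_DISPATCH(my_stub);            // struct my_stub my_stub;

// native/cpu/MyKernel.cpp -- compiled once per CPU_CAPABILITY (DEFAULT/AVX/AVX2):
static void my_kernel(const Tensor& x) { /* vectorizable work */ }
REGISTER_DISPATCH(my_stub, &my_kernel);  // fills in the matching static slot

// Call sites in generic code: the first CPU call picks DEFAULT/AVX/AVX2 via
// get_cpu_capability(); a CUDA call requires the CUDA build to have registered
// its pointer through the RegisterDispatch constructor.
//   my_stub(kCPU, tensor);
//   my_stub(kCUDA, tensor);
```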

aten/src/ATen/native/ReduceOps.cpp

+7, -4
@@ -17,6 +17,9 @@
 namespace at {
 namespace native {
 
+DEFINE_DISPATCH(sum_kernel);
+DEFINE_DISPATCH(prod_kernel);
+
 static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
   ScalarType scalarType = self.type().scalarType();
   ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? ScalarType::Long : scalarType);
@@ -127,7 +130,7 @@ Tensor sum(const Tensor &self) {
 Tensor _sum_cpu(const Tensor& self) {
   if (self.is_contiguous()) {
     Tensor result = at::empty({}, self.type());
-    sum_kernel(result, self, at::nullopt);
+    sum_kernel(kCPU, result, self, at::nullopt);
     return result;
   }
   return self._sumall();
@@ -148,7 +151,7 @@ Tensor prod(const Tensor &self) {
 Tensor _prod_cpu(const Tensor &self) {
   if (self.is_contiguous()) {
     Tensor result = at::empty({}, self.type());
-    prod_kernel(result, self, at::nullopt);
+    prod_kernel(kCPU, result, self, at::nullopt);
     return result;
   }
   return self._prodall();
@@ -222,7 +225,7 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
     return result;
   if (self.is_contiguous() && result.is_contiguous()) {
     _dimreduce_setup(result, self, dim);
-    sum_kernel(result, self, dim);
+    sum_kernel(kCPU, result, self, dim);
     if (!keepdim) result.squeeze_(dim);
     return result;
   }
@@ -260,7 +263,7 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
     return result;
   if (self.is_contiguous() && result.is_contiguous()) {
     _dimreduce_setup(result, self, dim);
-    prod_kernel(result, self, dim);
+    prod_kernel(kCPU, result, self, dim);
     if (!keepdim) result.squeeze_(dim);
     return result;
   }
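The DEFINE_DISPATCH lines added here (and in SoftMax.cpp and UnaryOps.cpp below) are what replaces the old self-registration on the CPU side. Given the macros in DispatchStub.h they expand roughly as follows; sum_fn is a placeholder for the function-pointer type used by the real declaration, which lives in a header not shown in this diff:

```cpp
// Rough expansion of the new lines; "sum_fn" is a placeholder type.

// DECLARE_DISPATCH(sum_fn, sum_kernel);   // in the shared header
extern struct sum_kernel : DispatchStub<sum_fn, sum_kernel> {} sum_kernel;

// DEFINE_DISPATCH(sum_kernel);            // the line added above
struct sum_kernel sum_kernel;

// Old call site:  sum_kernel(result, self, at::nullopt);
// New call site:  sum_kernel(kCPU, result, self, at::nullopt);
// The extra kCPU argument is consumed by DispatchStub::operator(), which picks
// the CPU slot (DEFAULT/AVX/AVX2) or the CUDA pointer before forwarding args.
```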

aten/src/ATen/native/SoftMax.cpp

+10, -4
@@ -128,7 +128,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) {
       dim >= 0 && dim < input.dim(),
       "dim must be non-negative and less than input dimensions");
   if (input.ndimension() > 0 && dim == input.ndimension() - 1) {
-    softmax_lastdim_kernel(output, input);
+    softmax_lastdim_kernel(kCPU, output, input);
   } else {
     AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] {
       host_softmax<scalar_t, false>(output, input, dim);
@@ -147,7 +147,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) {
       dim >= 0 && dim < input.dim(),
       "dim must be non-negative and less than input dimensions");
   if (input.ndimension() > 0 && dim == input.ndimension() - 1) {
-    log_softmax_lastdim_kernel(output, input);
+    log_softmax_lastdim_kernel(kCPU, output, input);
   } else {
     AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] {
       host_softmax<scalar_t, true>(output, input, dim);
@@ -176,7 +176,7 @@ Tensor softmax_backward_cpu(
       dim >= 0 && dim < grad.dim(),
       "dim must be non-negative and less than input dimensions");
   if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) {
-    softmax_backward_lastdim_kernel(grad_input, grad, output);
+    softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output);
   } else {
     AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] {
       host_softmax_backward<scalar_t, false>(grad_input, grad, output, dim);
@@ -205,13 +205,19 @@ Tensor log_softmax_backward_cpu(
       dim >= 0 && dim < grad.dim(),
       "dim must be non-negative and less than input dimensions");
   if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) {
-    log_softmax_backward_lastdim_kernel(grad_input, grad, output);
+    log_softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output);
   } else {
     AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] {
       host_softmax_backward<scalar_t, true>(grad_input, grad, output, dim);
     });
   }
   return grad_input;
 }
+
+DEFINE_DISPATCH(softmax_lastdim_kernel);
+DEFINE_DISPATCH(log_softmax_lastdim_kernel);
+DEFINE_DISPATCH(softmax_backward_lastdim_kernel);
+DEFINE_DISPATCH(log_softmax_backward_lastdim_kernel);
+
 }
 }

aten/src/ATen/native/UnaryOps.cpp

+26, -2
@@ -92,14 +92,14 @@ Tensor& fill_(Tensor& self, const Tensor& value) {
   Tensor& _##op##__cpu(Tensor& self_) { \
     if (self_.numel() > 0) { \
       Tensor self = sort_strides(self_); \
-      op##Impl(self, self); \
+      op##Impl(kCPU, self, self); \
     } \
     return self_; \
   } \
   Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \
     result.resize_(self.sizes()); \
     if (result.numel() > 0) { \
-      op##Impl(result, self); \
+      op##Impl(kCPU, result, self); \
     } \
     return result; \
   }
@@ -145,5 +145,29 @@ IMPLEMENT_UNARY_OP_VEC(tan)
 IMPLEMENT_UNARY_OP_VEC(tanh)
 IMPLEMENT_UNARY_OP_VEC(trunc)
 
+DEFINE_DISPATCH(absImpl);
+DEFINE_DISPATCH(acosImpl);
+DEFINE_DISPATCH(asinImpl);
+DEFINE_DISPATCH(atanImpl);
+DEFINE_DISPATCH(ceilImpl);
+DEFINE_DISPATCH(cosImpl);
+DEFINE_DISPATCH(erfImpl);
+DEFINE_DISPATCH(erfcImpl);
+DEFINE_DISPATCH(expImpl);
+DEFINE_DISPATCH(expm1Impl);
+DEFINE_DISPATCH(floorImpl);
+DEFINE_DISPATCH(logImpl);
+DEFINE_DISPATCH(log10Impl);
+DEFINE_DISPATCH(log1pImpl);
+DEFINE_DISPATCH(log2Impl);
+DEFINE_DISPATCH(roundImpl);
+DEFINE_DISPATCH(rsqrtImpl);
+DEFINE_DISPATCH(sigmoidImpl);
+DEFINE_DISPATCH(sinImpl);
+DEFINE_DISPATCH(sqrtImpl);
+DEFINE_DISPATCH(tanImpl);
+DEFINE_DISPATCH(tanhImpl);
+DEFINE_DISPATCH(truncImpl);
+
 }
 } // namespace at
