
Commit 4acb7e8

Update
[ghstack-poisoned]
1 parent d9e267b commit 4acb7e8


49 files changed: +1613 -5376 lines changed

Some content is hidden: large commits have some content hidden by default.
@@ -0,0 +1,53 @@
+name: Run Float8nocompile Tests
+
+on:
+  push:
+    branches:
+      - main
+      - 'gh/**'
+    paths:
+      - 'torchao/prototype/float8nocompile/**'
+  pull_request:
+    branches:
+      - main
+      - 'gh/**'
+    paths:
+      - 'torchao/prototype/float8nocompile/**'
+
+concurrency:
+  group: floatnocompile_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+# jobs:
+#   test:
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         include:
+#           - name: H100
+#             runs-on: linux.aws.h100
+#             torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
+#             gpu-arch-type: "cuda"
+#             gpu-arch-version: "12.4"
+
+#     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+#     with:
+#       timeout: 300
+#       runner: ${{ matrix.runs-on }}
+#       gpu-arch-type: ${{ matrix.gpu-arch-type }}
+#       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+#       submodules: recursive
+#       script: |
+#         conda create -n venv python=3.9 -y
+#         conda activate venv
+#         export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+#         python -m pip install --upgrade pip
+#         pip install ${{ matrix.torch-spec }}
+#         pip install -r dev-requirements.txt
+#         pip install .
+#         cd torchao/prototype/float8nocompile
+#         pytest kernels/ --verbose -s
+#         pytest test/train_test.py --verbose -s

.github/workflows/torchao_experimental_test.yml

+9 -4

@@ -37,8 +37,10 @@ jobs:
           # of torch and torchao, which we do not want to use
           pip install executorch
           pip install torch==2.7.0.dev20250311 --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
-          pip install -r dev-requirements.txt
-          USE_CPP=1 TORCHAO_BUILD_KLEIDIAI=1 pip install .
+          pip install numpy
+          pip install pytest
+          pip install parameterized
+          USE_CPP=1 TOCHAO_BUILD_KLEIDIAI=1 pip install .
       - name: Run python tests
         run: |
           conda activate venv
@@ -97,8 +99,11 @@ jobs:
           python -c "import torch; print(torch.__version__)"
       - name: Install requirements
         run: |
-          pip install -r dev-requirements.txt
-          pip install pyyaml importlib-metadata
+          pip install cmake
+          pip install parameterized
+          pip install pyyaml
+          pip install numpy
+          pip install importlib-metadata
       - name: Print pip freeze
         run: |
           pip freeze

dev-requirements.txt

-3

@@ -26,9 +26,6 @@ importlib_metadata
 # Custom CUDA Extensions
 ninja
 
-# CPU kernels
-cmake<4.0.0,>=3.19.0
-
 # Linting
 ruff==0.6.8
 pre-commit

scripts/clean_release_notes.py

+1 -1

@@ -223,7 +223,7 @@ def format_commit(commit_line: str) -> str:
     After: * Commit title (https://github.com/pytorch/ao/pull/123)
     """
     # Remove author, put PR link in parentheses
-    commit_line = re.sub(" by @.* in (.*)", r" (\g<1>)", commit_line)
+    commit_line = re.sub(" by @.* in (.*)", r" (\\g<1>)", commit_line)
     # Capitalize first letter
     commit_line = commit_line.lstrip("* ")
     commit_line = "* " + commit_line[0].upper() + commit_line[1:]

test/quantization/test_galore_quant.py

-2

@@ -38,7 +38,6 @@
 
 
 @pytest.mark.skip("skipping for now, see comments below")
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
 @pytest.mark.parametrize(
     "dim1,dim2,dtype,signed,blocksize",
     TEST_CONFIGS,
@@ -90,7 +89,6 @@ def test_galore_quantize_blockwise(dim1, dim2, dtype, signed, blocksize):
     TEST_CONFIGS,
 )
 @skip_if_rocm("ROCm enablement in progress")
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
 def test_galore_dequant_blockwise(dim1, dim2, dtype, signed, blocksize):
     g = torch.randn(dim1, dim2, device="cuda", dtype=dtype) * 0.01
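For context, a minimal sketch (with hypothetical test names) of the difference between the unconditional @pytest.mark.skip that remains on the first test and the conditional @pytest.mark.skipif guards removed here:

```python
import pytest
import torch

# Unconditional: the test is always reported as skipped, whatever the environment.
@pytest.mark.skip("skipping for now, see comments below")
def test_always_skipped():
    pass

# Conditional: the test runs only when a CUDA device is present; otherwise it is
# skipped with the given reason instead of failing on the cuda allocation below.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
def test_needs_cuda():
    x = torch.randn(4, 4, device="cuda")
    assert x.is_cuda
```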

test/quantization/test_qat.py

-71

@@ -133,18 +133,6 @@ def forward(self, x):
         return x
 
 
-class M4(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.linear = torch.nn.Linear(512, 256, bias=False).to(torch.float)
-
-    def example_inputs(self):
-        return (torch.randn(1, 512).to(torch.float),)
-
-    def forward(self, x):
-        return self.linear(x)
-
-
 class ModelWithLinearBias(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1401,65 +1389,6 @@ def test_qat_linear_bias(self):
         example_inputs = m.example_inputs()
         m(*example_inputs)
 
-    @unittest.skipIf(
-        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
-    )
-    def test_fake_quantize_per_token_vs_convert(self):
-        """
-        Test that the following produce the exact same numerics:
-        1. FakeQuantizer with asymmetric per_token config
-        2. torchao.quantization.utils.per_token_dynamic_quant
-        """
-        from torchao.quantization.utils import per_token_dynamic_quant
-
-        torch.manual_seed(self.SEED)
-        x = torch.randn(1, 235, 2048)
-        config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
-        fake_quantizer = FakeQuantizer(config)
-        fake_quantizer_out = fake_quantizer(x)
-        baseline_out = per_token_dynamic_quant(x)
-        torch.testing.assert_close(fake_quantizer_out, baseline_out, atol=0, rtol=0)
-
-    @unittest.skipIf(
-        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
-    )
-    def test_qat_8da4w_prepare_vs_convert(self):
-        """
-        Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produces
-        numerics that match exactly over N trials.
-        """
-        from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
-        from torchao.quantization.utils import compute_error
-
-        num_trials = 1000
-        group_size = 16
-        non_inf_sqnr = []
-
-        for seed in range(self.SEED, self.SEED + num_trials):
-            torch.manual_seed(seed)
-            m = M4()
-            torch.manual_seed(seed)
-            x = m.example_inputs()
-
-            quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
-            prepared = quantizer.prepare(m)
-            prepared_out = prepared(*x)
-            converted = quantizer.convert(prepared)
-            converted_out = converted(*x)
-            sqnr = compute_error(prepared_out, converted_out).item()
-            if sqnr != float("inf"):
-                non_inf_sqnr.append(sqnr)
-
-        avg_sqnr = (
-            sum(non_inf_sqnr) / len(non_inf_sqnr) if len(non_inf_sqnr) > 0 else -1
-        )
-        fail_message = "%s/%s trials did not match exactly, average sqnr = %s" % (
-            len(non_inf_sqnr),
-            num_trials,
-            avg_sqnr,
-        )
-        self.assertEqual(len(non_inf_sqnr), 0, fail_message)
-
 
 if __name__ == "__main__":
     unittest.main()
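The removed test_qat_8da4w_prepare_vs_convert counted a trial as an exact match when its SQNR came back infinite. As a rough sketch of that idea (a hypothetical helper, not necessarily the same formula as torchao.quantization.utils.compute_error):

```python
import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> float:
    # Signal power over error power, in decibels; exact agreement gives zero
    # error power and therefore an infinite SQNR.
    error = reference - candidate
    return (10 * torch.log10(reference.pow(2).sum() / error.pow(2).sum())).item()

x = torch.randn(4, 8)
print(sqnr_db(x, x))         # inf    -> exact match
print(sqnr_db(x, x + 1e-3))  # finite -> small mismatch
```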

torchao/_executorch_ops.py

-2

@@ -5,8 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 import torch
 
-# TODO: delete these ops
-
 
 def _quantized_decomposed_quantize_per_channel_group_wrapper(*args, **kwargs):
     """

torchao/csrc/cuda/fp6_llm/fp6_linear.cu

+14 -38

@@ -21,7 +21,6 @@
 //
 // MODIFICATION NOTE (2024-09-25): added SM75 support (https://github.com/pytorch/ao/pull/942):
 // - Modified the TilingConfig parameters for SM75 to deal with smaller shared memory
-// - Added proper architecture check at both host and device level
 //
 
 
@@ -99,24 +98,7 @@ void fpx_linear_kernel(cudaStream_t stream,
     static_assert(std::is_same<InputDataType, half>::value || std::is_same<InputDataType, __nv_bfloat16>::value, "Type must be 'half' or '__nv_bfloat16'");
     assert(M_Global % 256 == 0);
     assert(K_Global % 64 == 0);
-    assert(N_Global > 0);
-
-    // Check GPU Compute Capability before proceeding
-    int device, major, minor;
-    CHECK_CUDA(cudaGetDevice(&device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
-
-    // Early exit with error for unsupported architectures
-    if ((major < 7) || (major == 7 && minor < 5)) {
-        TORCH_CHECK(false, "Quant-LLM Error: This kernel requires GPU with SM75 (Turing) or higher architecture. "
-                    "Your current device has SM", major, minor, " which is not supported.");
-    }
-
-    const bool is_sm75_gpu = (major == 7) && (minor == 5);
-    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value) {
-        TORCH_CHECK(false, "Quant-LLM Error: BFloat16 inputs are not supported on SM75 (Turing) GPUs.");
-    }
+    assert(N_Global>0);
 
     // Work around to support more N shapes:
     size_t N_PowerOf2;
@@ -127,6 +109,17 @@ void fpx_linear_kernel(cudaStream_t stream,
     if(N_Global>64 && N_Global<=128) N_PowerOf2 = 128;
     if(N_Global>128) N_PowerOf2 = ((N_Global-1)/128+1) * 128;
 
+    // Check GPU Compute Capability
+    int device, major, minor;
+    CHECK_CUDA(cudaGetDevice(&device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
+    const bool is_sm75_gpu = (major == 7) && (minor == 5);
+    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value)
+        TORCH_CHECK(false, "Bfloat16 inputs are not supported for SM75");
+    if ((major < 7) || (major == 7 && minor < 5))
+        TORCH_CHECK(false, "FP6LLM_API Error: FP6LLM requires GPU with SM75 or higher!\n");
+
 if (is_sm75_gpu && (N_PowerOf2 == 64 || N_PowerOf2 == 128 || N_PowerOf2 % 128 == 0)) {
     // For SM75 and N >= 64, we use a different TilingConfig to deal with smaller shared memory.
     if (Split_K == 1) {
@@ -143,7 +136,7 @@ void fpx_linear_kernel(cudaStream_t stream,
             case 64: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
             case 128: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
             default: if (N_PowerOf2 % 128 != 0) {
-                TORCH_CHECK(false, "Quant-LLM Error: Unsupported N dimension ", N_PowerOf2);
+                TORCH_CHECK(false, "FP6LLM_API Error: Unsupported N dimension ", N_PowerOf2);
             }
             Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
         }
@@ -156,7 +149,7 @@ void fpx_linear_kernel(cudaStream_t stream,
             case 64: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
             case 128: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
             default: if (N_PowerOf2 % 128 != 0) {
-                TORCH_CHECK(false, "Quant-LLM Error: Unsupported N dimension ", N_PowerOf2);
+                TORCH_CHECK(false, "FP6LLM_API Error: Unsupported N dimension ", N_PowerOf2);
             }
             Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
         }
@@ -217,23 +210,6 @@ torch::Tensor fp_eXmY_linear_forward_cuda(
                                            torch::Tensor _scales,
                                            int64_t splitK=1)
 {
-    // Check GPU Compute Capability before proceeding
-    int device, major, minor;
-    CHECK_CUDA(cudaGetDevice(&device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
-
-    // Early exit with error for unsupported architectures
-    if ((major < 7) || (major == 7 && minor < 5)) {
-        TORCH_CHECK(false, "Quant-LLM Error: This kernel requires GPU with SM75 (Turing) or higher architecture. "
-                    "Your current device has SM", major, minor, " which is not supported.");
-    }
-
-    const bool is_sm75_gpu = (major == 7) && (minor == 5);
-    if (is_sm75_gpu && _in_feats.scalar_type() == at::ScalarType::BFloat16) {
-        TORCH_CHECK(false, "Quant-LLM Error: BFloat16 inputs are not supported on SM75 (Turing) GPUs.");
-    }
-
     const int64_t NBITS = 1 + EXPONENT + MANTISSA;
     int num_in_feats = _in_feats.size(0);
     int num_in_channels = _in_feats.size(1);
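These hunks keep the SM75 requirement but move the host-side capability check later into fpx_linear_kernel and drop it from fp_eXmY_linear_forward_cuda. If a similar guard were wanted on the Python side before dispatching to the extension, a minimal sketch using public PyTorch APIs (the helper name is made up) could look like this:

```python
import torch

def meets_sm75_requirement() -> bool:
    # The FP6-LLM / Quant-LLM kernel requires an SM75 (Turing) or newer GPU;
    # the CUDA code above additionally rejects bfloat16 inputs on SM75 itself.
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) >= (7, 5)

print(meets_sm75_requirement())
```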

torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh

+5 -14

@@ -51,14 +51,17 @@
  * B: col major, FP16
  * C: col major, FP16
  */
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
-template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
+template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
 __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
                                   const half *B,
                                   OutputDataType* C,
                                   const size_t M_Global, const size_t N_Global, const size_t K_Global,
                                   int Split_K)
 {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
+    static_assert(false, "Quant-LLM kernel: At least Turing generation (sm75) is required.");
+    // __trap(); // fails at runtime instead of compile time
+#endif
 #ifdef DEBUG_MODE
     assert(K_Global%TilingConfig::TILE_K==0);
     assert(M_Global%TilingConfig::TILE_M==0);
@@ -230,15 +233,3 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
         }
     }
 }
-#else
-// Stub implementation for older architectures
-template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
-__global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
-                                  const half *B,
-                                  OutputDataType* C,
-                                  const size_t M_Global, const size_t N_Global, const size_t K_Global,
-                                  int Split_K)
-{
-// NOOP, should never actually be called
-}
-#endif

torchao/experimental/CMakeLists.txt

-1

@@ -40,7 +40,6 @@ include_directories(${TORCHAO_INCLUDE_DIRS})
 if(TORCHAO_BUILD_CPU_AARCH64)
     message(STATUS "Building with cpu/aarch64")
     add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)
-    add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
 
     # Defines torchao_kernels_aarch64
     add_subdirectory(kernels/cpu/aarch64)

torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt

+1 -1

@@ -19,7 +19,7 @@ if (TORCHAO_BUILD_CPU_AARCH64)
     # intelligence (AI) workloads tailored for Arm® CPUs.
     FetchContent_Declare(kleidiai
         GIT_REPOSITORY https://git.gitlab.arm.com/kleidi/kleidiai.git
-        GIT_TAG v1.5.0)
+        GIT_TAG v1.2.0)
     FetchContent_MakeAvailable(kleidiai)
 
     target_link_libraries(torchao_kernels_aarch64 PUBLIC kleidiai)
