
Commit 4acb7e8

Update
[ghstack-poisoned]
1 parent d9e267b commit 4acb7e8


49 files changed: +1613 -5376 lines changed

Some content is hidden: large commits have some content hidden by default.
@@ -0,0 +1,53 @@
+name: Run Float8nocompile Tests
+
+on:
+  push:
+    branches:
+      - main
+      - 'gh/**'
+    paths:
+      - 'torchao/prototype/float8nocompile/**'
+  pull_request:
+    branches:
+      - main
+      - 'gh/**'
+    paths:
+      - 'torchao/prototype/float8nocompile/**'
+
+concurrency:
+  group: floatnocompile_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+# jobs:
+#   test:
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         include:
+#           - name: H100
+#             runs-on: linux.aws.h100
+#             torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
+#             gpu-arch-type: "cuda"
+#             gpu-arch-version: "12.4"
+
+#     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+#     with:
+#       timeout: 300
+#       runner: ${{ matrix.runs-on }}
+#       gpu-arch-type: ${{ matrix.gpu-arch-type }}
+#       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+#       submodules: recursive
+#       script: |
+#         conda create -n venv python=3.9 -y
+#         conda activate venv
+#         export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+#         python -m pip install --upgrade pip
+#         pip install ${{ matrix.torch-spec }}
+#         pip install -r dev-requirements.txt
+#         pip install .
+#         cd torchao/prototype/float8nocompile
+#         pytest kernels/ --verbose -s
+#         pytest test/train_test.py --verbose -s

.github/workflows/torchao_experimental_test.yml

+9 -4

@@ -37,8 +37,10 @@ jobs:
           # of torch and torchao, which we do not want to use
           pip install executorch
           pip install torch==2.7.0.dev20250311 --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
-          pip install -r dev-requirements.txt
-          USE_CPP=1 TORCHAO_BUILD_KLEIDIAI=1 pip install .
+          pip install numpy
+          pip install pytest
+          pip install parameterized
+          USE_CPP=1 TOCHAO_BUILD_KLEIDIAI=1 pip install .
       - name: Run python tests
         run: |
           conda activate venv
@@ -97,8 +99,11 @@ jobs:
           python -c "import torch; print(torch.__version__)"
       - name: Install requirements
         run: |
-          pip install -r dev-requirements.txt
-          pip install pyyaml importlib-metadata
+          pip install cmake
+          pip install parameterized
+          pip install pyyaml
+          pip install numpy
+          pip install importlib-metadata
       - name: Print pip freeze
         run: |
           pip freeze

dev-requirements.txt

-3

@@ -26,9 +26,6 @@ importlib_metadata
 # Custom CUDA Extensions
 ninja
 
-# CPU kernels
-cmake<4.0.0,>=3.19.0
-
 # Linting
 ruff==0.6.8
 pre-commit

scripts/clean_release_notes.py

+1 -1

@@ -223,7 +223,7 @@ def format_commit(commit_line: str) -> str:
     After: * Commit title (https://github.com/pytorch/ao/pull/123)
     """
     # Remove author, put PR link in parentheses
-    commit_line = re.sub(" by @.* in (.*)", r" (\g<1>)", commit_line)
+    commit_line = re.sub(" by @.* in (.*)", r" (\\g<1>)", commit_line)
     # Capitalize first letter
     commit_line = commit_line.lstrip("* ")
     commit_line = "* " + commit_line[0].upper() + commit_line[1:]

test/quantization/test_galore_quant.py

-2

@@ -38,7 +38,6 @@
 
 
 @pytest.mark.skip("skipping for now, see comments below")
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
 @pytest.mark.parametrize(
     "dim1,dim2,dtype,signed,blocksize",
     TEST_CONFIGS,
@@ -90,7 +89,6 @@ def test_galore_quantize_blockwise(dim1, dim2, dtype, signed, blocksize):
     TEST_CONFIGS,
 )
 @skip_if_rocm("ROCm enablement in progress")
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
 def test_galore_dequant_blockwise(dim1, dim2, dtype, signed, blocksize):
     g = torch.randn(dim1, dim2, device="cuda", dtype=dtype) * 0.01
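For context, a minimal sketch (with hypothetical test names) of the difference between the unconditional @pytest.mark.skip that remains on the first test and the conditional @pytest.mark.skipif guards removed here:

```python
import pytest
import torch

# Unconditional: the test is always reported as skipped, whatever the environment.
@pytest.mark.skip("skipping for now, see comments below")
def test_always_skipped():
    pass

# Conditional: the test runs only when a CUDA device is present; otherwise it is
# skipped with the given reason instead of failing on the cuda allocation below.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
def test_needs_cuda():
    x = torch.randn(4, 4, device="cuda")
    assert x.is_cuda
```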

test/quantization/test_qat.py

-71

@@ -133,18 +133,6 @@ def forward(self, x):
         return x
 
 
-class M4(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.linear = torch.nn.Linear(512, 256, bias=False).to(torch.float)
-
-    def example_inputs(self):
-        return (torch.randn(1, 512).to(torch.float),)
-
-    def forward(self, x):
-        return self.linear(x)
-
-
 class ModelWithLinearBias(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1401,65 +1389,6 @@ def test_qat_linear_bias(self):
         example_inputs = m.example_inputs()
         m(*example_inputs)
 
-    @unittest.skipIf(
-        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
-    )
-    def test_fake_quantize_per_token_vs_convert(self):
-        """
-        Test that the following produce the exact same numerics:
-        1. FakeQuantizer with asymmetric per_token config
-        2. torchao.quantization.utils.per_token_dynamic_quant
-        """
-        from torchao.quantization.utils import per_token_dynamic_quant
-
-        torch.manual_seed(self.SEED)
-        x = torch.randn(1, 235, 2048)
-        config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
-        fake_quantizer = FakeQuantizer(config)
-        fake_quantizer_out = fake_quantizer(x)
-        baseline_out = per_token_dynamic_quant(x)
-        torch.testing.assert_close(fake_quantizer_out, baseline_out, atol=0, rtol=0)
-
-    @unittest.skipIf(
-        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
-    )
-    def test_qat_8da4w_prepare_vs_convert(self):
-        """
-        Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produces
-        numerics that match exactly over N trials.
-        """
-        from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
-        from torchao.quantization.utils import compute_error
-
-        num_trials = 1000
-        group_size = 16
-        non_inf_sqnr = []
-
-        for seed in range(self.SEED, self.SEED + num_trials):
-            torch.manual_seed(seed)
-            m = M4()
-            torch.manual_seed(seed)
-            x = m.example_inputs()
-
-            quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
-            prepared = quantizer.prepare(m)
-            prepared_out = prepared(*x)
-            converted = quantizer.convert(prepared)
-            converted_out = converted(*x)
-            sqnr = compute_error(prepared_out, converted_out).item()
-            if sqnr != float("inf"):
-                non_inf_sqnr.append(sqnr)
-
-        avg_sqnr = (
-            sum(non_inf_sqnr) / len(non_inf_sqnr) if len(non_inf_sqnr) > 0 else -1
-        )
-        fail_message = "%s/%s trials did not match exactly, average sqnr = %s" % (
-            len(non_inf_sqnr),
-            num_trials,
-            avg_sqnr,
-        )
-        self.assertEqual(len(non_inf_sqnr), 0, fail_message)
-
 
 if __name__ == "__main__":
     unittest.main()
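The removed test_qat_8da4w_prepare_vs_convert counted a trial as an exact match when its SQNR came back infinite. As a rough sketch of that idea (a hypothetical helper, not necessarily the same formula as torchao.quantization.utils.compute_error):

```python
import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> float:
    # Signal power over error power, in decibels; exact agreement gives zero
    # error power and therefore an infinite SQNR.
    error = reference - candidate
    return (10 * torch.log10(reference.pow(2).sum() / error.pow(2).sum())).item()

x = torch.randn(4, 8)
print(sqnr_db(x, x))         # inf    -> exact match
print(sqnr_db(x, x + 1e-3))  # finite -> small mismatch
```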

torchao/_executorch_ops.py

-2

@@ -5,8 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 import torch
 
-# TODO: delete these ops
-
 
 def _quantized_decomposed_quantize_per_channel_group_wrapper(*args, **kwargs):
     """

torchao/csrc/cuda/fp6_llm/fp6_linear.cu

+14 -38

@@ -21,7 +21,6 @@
 //
 // MODIFICATION NOTE (2024-09-25): added SM75 support (https://github.com/pytorch/ao/pull/942):
 // - Modified the TilingConfig parameters for SM75 to deal with smaller shared memory
-// - Added proper architecture check at both host and device level
 //
 
 
@@ -99,24 +98,7 @@ void fpx_linear_kernel(cudaStream_t stream,
     static_assert(std::is_same<InputDataType, half>::value || std::is_same<InputDataType, __nv_bfloat16>::value, "Type must be 'half' or '__nv_bfloat16'");
     assert(M_Global % 256 == 0);
     assert(K_Global % 64 == 0);
-    assert(N_Global > 0);
-
-    // Check GPU Compute Capability before proceeding
-    int device, major, minor;
-    CHECK_CUDA(cudaGetDevice(&device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
-
-    // Early exit with error for unsupported architectures
-    if ((major < 7) || (major == 7 && minor < 5)) {
-        TORCH_CHECK(false, "Quant-LLM Error: This kernel requires GPU with SM75 (Turing) or higher architecture. "
-                    "Your current device has SM", major, minor, " which is not supported.");
-    }
-
-    const bool is_sm75_gpu = (major == 7) && (minor == 5);
-    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value) {
-        TORCH_CHECK(false, "Quant-LLM Error: BFloat16 inputs are not supported on SM75 (Turing) GPUs.");
-    }
+    assert(N_Global>0);
 
     // Work around to support more N shapes:
     size_t N_PowerOf2;
@@ -127,6 +109,17 @@ void fpx_linear_kernel(cudaStream_t stream,
     if(N_Global>64 && N_Global<=128) N_PowerOf2 = 128;
     if(N_Global>128) N_PowerOf2 = ((N_Global-1)/128+1) * 128;
 
+    // Check GPU Compute Capability
+    int device, major, minor;
+    CHECK_CUDA(cudaGetDevice(&device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
+    const bool is_sm75_gpu = (major == 7) && (minor == 5);
+    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value)
+        TORCH_CHECK(false, "Bfloat16 inputs are not supported for SM75");
+    if ((major < 7) || (major == 7 && minor < 5))
+        TORCH_CHECK(false, "FP6LLM_API Error: FP6LLM requires GPU with SM75 or higher!\n");
+
 if (is_sm75_gpu && (N_PowerOf2 == 64 || N_PowerOf2 == 128 || N_PowerOf2 % 128 == 0)) {
     // For SM75 and N >= 64, we use a different TilingConfig to deal with smaller shared memory.
     if (Split_K == 1) {
@@ -143,7 +136,7 @@ void fpx_linear_kernel(cudaStream_t stream,
             case 64: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
             case 128: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
             default: if (N_PowerOf2 % 128 != 0) {
-                TORCH_CHECK(false, "Quant-LLM Error: Unsupported N dimension ", N_PowerOf2);
+                TORCH_CHECK(false, "FP6LLM_API Error: Unsupported N dimension ", N_PowerOf2);
             }
             Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, InputDataType, EXPONENT, MANTISSA>(stream, Weight, Scales, B, C, M_Global, N_Global, K_Global, Split_K); break;
         }
@@ -156,7 +149,7 @@ void fpx_linear_kernel(cudaStream_t stream,
             case 64: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
             case 128: Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
             default: if (N_PowerOf2 % 128 != 0) {
-                TORCH_CHECK(false, "Quant-LLM Error: Unsupported N dimension ", N_PowerOf2);
+                TORCH_CHECK(false, "FP6LLM_API Error: Unsupported N dimension ", N_PowerOf2);
             }
             Kernel_Ex<TilingConfig<4, 1, 8>, InputDataType, float, EXPONENT, MANTISSA>(stream, Weight, Scales, B, Reduction_Workspace, M_Global, N_Global, K_Global, Split_K); break;
         }
@@ -217,23 +210,6 @@ torch::Tensor fp_eXmY_linear_forward_cuda(
                                            torch::Tensor _scales,
                                            int64_t splitK=1)
 {
-    // Check GPU Compute Capability before proceeding
-    int device, major, minor;
-    CHECK_CUDA(cudaGetDevice(&device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-    CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
-
-    // Early exit with error for unsupported architectures
-    if ((major < 7) || (major == 7 && minor < 5)) {
-        TORCH_CHECK(false, "Quant-LLM Error: This kernel requires GPU with SM75 (Turing) or higher architecture. "
-                    "Your current device has SM", major, minor, " which is not supported.");
-    }
-
-    const bool is_sm75_gpu = (major == 7) && (minor == 5);
-    if (is_sm75_gpu && _in_feats.scalar_type() == at::ScalarType::BFloat16) {
-        TORCH_CHECK(false, "Quant-LLM Error: BFloat16 inputs are not supported on SM75 (Turing) GPUs.");
-    }
-
     const int64_t NBITS = 1 + EXPONENT + MANTISSA;
     int num_in_feats = _in_feats.size(0);
     int num_in_channels = _in_feats.size(1);
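These hunks keep the SM75 requirement but move the host-side capability check later into fpx_linear_kernel and drop it from fp_eXmY_linear_forward_cuda. If a similar guard were wanted on the Python side before dispatching to the extension, a minimal sketch using public PyTorch APIs (the helper name is made up) could look like this:

```python
import torch

def meets_sm75_requirement() -> bool:
    # The FP6-LLM / Quant-LLM kernel requires an SM75 (Turing) or newer GPU;
    # the CUDA code above additionally rejects bfloat16 inputs on SM75 itself.
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) >= (7, 5)

print(meets_sm75_requirement())
```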

torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh

+5 -14

@@ -51,14 +51,17 @@
  * B: col major, FP16
  * C: col major, FP16
  */
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
-template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
+template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
 __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
                                   const half *B,
                                   OutputDataType* C,
                                   const size_t M_Global, const size_t N_Global, const size_t K_Global,
                                   int Split_K)
 {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
+    static_assert(false, "Quant-LLM kernel: At least Turing generation (sm75) is required.");
+    // __trap(); // fails at runtime instead of compile time
+#endif
 #ifdef DEBUG_MODE
     assert(K_Global%TilingConfig::TILE_K==0);
     assert(M_Global%TilingConfig::TILE_M==0);
@@ -230,15 +233,3 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
         }
     }
 }
-#else
-// Stub implementation for older architectures
-template<typename TilingConfig, typename InputDataType, typename OutputDataType, int EXPONENT, int MANTISSA>
-__global__ void QUANT_GEMM_Kernel(const uint4* Weight, const half* Scales,
-                                  const half *B,
-                                  OutputDataType* C,
-                                  const size_t M_Global, const size_t N_Global, const size_t K_Global,
-                                  int Split_K)
-{
-// NOOP, should never actually be called
-}
-#endif

torchao/experimental/CMakeLists.txt

-1

@@ -40,7 +40,6 @@ include_directories(${TORCHAO_INCLUDE_DIRS})
 if(TORCHAO_BUILD_CPU_AARCH64)
     message(STATUS "Building with cpu/aarch64")
     add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)
-    add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
 
     # Defines torchao_kernels_aarch64
     add_subdirectory(kernels/cpu/aarch64)

torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt

+1 -1

@@ -19,7 +19,7 @@ if (TORCHAO_BUILD_CPU_AARCH64)
     # intelligence (AI) workloads tailored for Arm® CPUs.
     FetchContent_Declare(kleidiai
         GIT_REPOSITORY https://git.gitlab.arm.com/kleidi/kleidiai.git
-        GIT_TAG v1.5.0)
+        GIT_TAG v1.2.0)
     FetchContent_MakeAvailable(kleidiai)
 
     target_link_libraries(torchao_kernels_aarch64 PUBLIC kleidiai)
