4 changes: 2 additions & 2 deletions modelopt/onnx/quantization/gs_patching.py
@@ -70,8 +70,8 @@ def _export_tensor_proto(tensor: gs.Constant) -> onnx.TensorProto:
    vals = tensor.values
    if _onnx_supports_int4() and dtype in [onnx.TensorProto.INT4, onnx.TensorProto.UINT4]:
        signed = dtype == onnx.TensorProto.INT4
-        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(dtype)
-        vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(np_dtype)
+        packed_dtype = np.int8 if signed else np.uint8
+        vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(packed_dtype)

    onnx_tensor = onnx.helper.make_tensor(
        tensor.name,
35 changes: 34 additions & 1 deletion modelopt/onnx/quantization/int4.py
@@ -99,6 +99,29 @@
CLIP_MIN = 1e-5


def safe_cupy_array(tensor):
    """Convert an ml_dtypes.int4 tensor to numpy.int8 for CuPy compatibility.

    In ONNX 1.19, int4 tensors use ml_dtypes.int4, which CuPy doesn't support.
    This function converts them to regular numpy.int8 while preserving values.

    Args:
        tensor: numpy array that may have ml_dtypes.int4 dtype

    Returns:
        cupy or numpy array (numpy if cupy is unavailable) with numpy.int8 dtype
        if the input was ml_dtypes.int4, otherwise with its dtype unchanged
    """
    try:
        import ml_dtypes

        if hasattr(tensor, "dtype") and tensor.dtype == ml_dtypes.int4:
            return np.asarray(tensor.astype(numpy.int8))
    except ImportError:
        pass

    return np.asarray(tensor)
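
For illustration, a quick usage sketch of the helper (assuming ml_dtypes is installed; within this module, np is CuPy when a GPU is available and plain NumPy otherwise):

import ml_dtypes
import numpy

# int4 values fit in [-8, 7], so they round-trip losslessly through int8.
t = numpy.array([-8, -1, 0, 7], dtype=numpy.float32).astype(ml_dtypes.int4)
out = safe_cupy_array(t)  # cupy.ndarray on GPU hosts, numpy.ndarray otherwise
assert out.dtype == numpy.int8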

Comment on lines +102 to +123

🛠️ Refactor suggestion | 🟠 Major

🧩 Analysis chain

Well-implemented compatibility helper for ml_dtypes.int4.

The safe_cupy_array function correctly handles the conversion of ml_dtypes.int4 tensors to numpy.int8 for CuPy compatibility. The dynamic import of ml_dtypes and conditional casting ensure backward compatibility with ONNX versions that don't use ml_dtypes.int4.

However, verify that all weight-loading scenarios are covered. The previous review comment flagged that weights loaded via numpy_helper.to_array (e.g., lines 531, 557, 750, 842, 931, 1162, 1249) followed by np.asarray might fail with CuPy if those weights are ml_dtypes.int4. While the typical quantization flow loads float32 weights, confirm there are no edge cases where already-quantized INT4 weights are loaded and processed within this module.

Run the following script to identify all weight-loading patterns:


🏁 Script executed:

#!/bin/bash
# Description: Find all numpy_helper.to_array usage followed by np.asarray in int4.py

rg -n -A 5 'numpy_helper\.to_array' modelopt/onnx/quantization/int4.py | \
  rg -B 2 -A 2 'np\.asarray'

Length of output: 1093


Enforce safe_cupy_array on all weight loads
Wrap each raw np.asarray call that follows numpy_helper.to_array (lines 536, 562, 755, 847, 936, 1167, and 1253) with safe_cupy_array to convert any ml_dtypes.int4 to numpy.int8 and ensure CuPy compatibility.
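
For illustration, a minimal sketch of the suggested wrapping at one such call site (the initializer variable and surrounding context are hypothetical; onnx.numpy_helper.to_array is the loader these call sites already use):

from onnx import numpy_helper

# Before: np.asarray can hand CuPy an ml_dtypes.int4 array under ONNX 1.19.
w = np.asarray(numpy_helper.to_array(initializer))

# After: ml_dtypes.int4 is cast to numpy.int8 before the array reaches np
# (CuPy on GPU hosts, NumPy otherwise).
w = safe_cupy_array(numpy_helper.to_array(initializer))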


def _quantize_gather_nodes(
    graph: onnx.GraphProto,
    nodes_to_exclude: list[str],
@@ -271,19 +294,26 @@ def quantize_rtn(
        scales[name] = np.asnumpy(scales[name])
        gemm_weights_quantized[name] = numpy.asarray(qw)
    scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
    dq_node_attributes = {"axis": 0, "block_size": block_size}
    qdq.insert_dq_nodes(
        graph,
        scales,
        quantized_weights=gemm_weights_quantized,
        attributes=dq_node_attributes,
        precision_info=precision_info,
    )

    if gather_w_map is not None:
        assert gather_s_map is not None, "scale-map not found for quantizable gather nodes"
        gather_dq_node_attributes = {
            "axis": gather_quantize_axis,
            "block_size": gather_block_size,
        }
        qdq.insert_dq_nodes(
            graph,
            gather_s_map,
            quantized_weights=gather_w_map,
            attributes=gather_dq_node_attributes,
            precision_info=precision_info,
        )
    else:
@@ -299,7 +329,10 @@
    )

    logger.info(f"RTN quantization completed in {time.time() - t_start:.2f} seconds")
-    return gs.export_onnx(graph)
+    model = gs.export_onnx(graph)
+    model.ir_version = 10
+
+    return model


class AWQClipHelper:
51 changes: 43 additions & 8 deletions tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py
@@ -20,7 +20,7 @@
from functools import partial

import torch
-from _test_utils.import_helper import skip_if_no_libcudnn, skip_if_onnx_version_above_1_18
+from _test_utils.import_helper import skip_if_no_libcudnn
from _test_utils.onnx_quantization.lib_test_models import SimpleMLP, export_as_onnx, find_init
from _test_utils.torch_quantization.quantize_common import get_awq_config

@@ -39,9 +39,45 @@
# test_qdq_utils_fp8.py::test_fused_q[bf16,fp16] fails if this script runs after the int4 test, but not before.


-def test_int4_awq(tmp_path):
-    skip_if_onnx_version_above_1_18()
+def test_safe_cupy_array(monkeypatch):
"""Comprehensive test for safe_cupy_array covering all code paths."""
import builtins

import numpy # Import actual numpy for creating int4 tensors

# Test 1: Regular numpy array (should hit line 122)
result = int4.safe_cupy_array(numpy.array([1, 2, 3, 4], dtype=numpy.float32))
assert isinstance(result, np.ndarray)

# Test 2: With real ml_dtypes.int4 (covers lines 117-118)
try:
import ml_dtypes

int4_tensor = numpy.array([1, 2, -3, 4], dtype=numpy.float32).astype(ml_dtypes.int4)
result = int4.safe_cupy_array(int4_tensor)
assert isinstance(result, np.ndarray) and result.dtype == numpy.int8
expected = int4_tensor.astype(numpy.int8)
actual = result.get() if int4.has_cupy else result
np.testing.assert_array_equal(actual, expected)
except ImportError:
Comment on lines +58 to +62

⚠️ Potential issue | 🟠 Major

Use numpy.testing here to avoid cupy alias mismatch.

Under cupy, np is cupy, so np.testing.assert_array_equal may not accept NumPy arrays (actual/expected). Call numpy’s testing explicitly.

-        np.testing.assert_array_equal(actual, expected)
+        numpy.testing.assert_array_equal(actual, expected)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
        assert isinstance(result, np.ndarray) and result.dtype == numpy.int8
        expected = int4_tensor.astype(numpy.int8)
        actual = result.get() if int4.has_cupy else result
-        np.testing.assert_array_equal(actual, expected)
+        numpy.testing.assert_array_equal(actual, expected)
    except ImportError:
🤖 Prompt for AI Agents
In tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py around lines 58 to 62,
the test calls np.testing.assert_array_equal which under CuPy can resolve to
cupy.testing and fail when comparing NumPy arrays; replace that call with
numpy.testing.assert_array_equal so the NumPy testing function is used
explicitly (ensure the existing numpy import is used), i.e. change the assertion
to call numpy.testing.assert_array_equal(actual, expected).
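
For context, a minimal sketch of the alias pattern behind this issue (the exact import guard is an assumption, inferred from the int4.has_cupy flag this test already uses):

try:
    import cupy as np  # on GPU hosts, np is CuPy for the whole module

    has_cupy = True
except ImportError:
    import numpy as np  # CPU fallback: np is plain NumPy

    has_cupy = False

# Under CuPy, np.testing resolves to cupy.testing, so comparisons of host-side
# NumPy arrays should call numpy.testing.assert_array_equal explicitly.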

        pass  # ml_dtypes not available

    # Test 3: When ml_dtypes import fails (covers ImportError catch and line 122)
    original_import = builtins.__import__

    def mock_import(name, *args, **kwargs):
        if name == "ml_dtypes":
            raise ImportError("ml_dtypes not available")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", mock_import)

    # Use actual numpy for creating the array
    result = int4.safe_cupy_array(numpy.array([5, 6, 7, 8], dtype=numpy.int8))
    assert isinstance(result, np.ndarray)


def test_int4_awq(tmp_path):
    def _forward_loop(model, dataloader):
        """Forward loop for calibration."""
        for data in dataloader:
@@ -94,20 +130,19 @@ def _forward_loop(model, dataloader):
        scale_awq_lite = find_init(onnx_model_awq_lite, scale_names[i])

        if int4.has_cupy:
-            wq_onnx_awq_lite = np.array(wq_onnx_awq_lite)
-            scale_awq_lite = np.array(scale_awq_lite)
+            wq_onnx_awq_lite = int4.safe_cupy_array(wq_onnx_awq_lite)
+            scale_awq_lite = int4.safe_cupy_array(scale_awq_lite)

        wq_onnx_awq_lite = dq_tensor(wq_onnx_awq_lite, scale_awq_lite, block_size)

        wq_torch_awq_clip = model_torch_copy.net[i * 2].weight_quantizer(
            model_torch_copy.net[i * 2].weight
        )
        wq_onnx_awq_clip = find_init(onnx_model_awq_clip, wq_names[i])
        scale_awq_clip = find_init(onnx_model_awq_clip, scale_names[i])

        if int4.has_cupy:
-            wq_onnx_awq_clip = np.array(wq_onnx_awq_clip)
-            scale_awq_clip = np.array(scale_awq_clip)
+            wq_onnx_awq_clip = int4.safe_cupy_array(wq_onnx_awq_clip)
+            scale_awq_clip = int4.safe_cupy_array(scale_awq_clip)

        wq_onnx_awq_clip = dq_tensor(wq_onnx_awq_clip, scale_awq_clip, block_size)
