
Commit 31f605e

Remove reliance on __jax_array__ to unwrap variables. (#21719)
JAX uses `__jax_array__` to handle non-JAX types. For instance, when doing `a * v` where `a` is a `jax.Array` and `v` is a `keras.Variable`, the `jax.Array.__mul__` implementation calls `v.__jax_array__()` because `v` is not a JAX type. However, `__jax_array__` did not work in all contexts, and the next version of JAX further restricts the contexts in which it works.

The fix rarely involves explicitly calling `v.value`. Instead, we rely on mechanisms that are already in place to unwrap variables in most contexts:

- ops are always supposed to call `convert_to_tensor` on tensor inputs, and `convert_to_tensor` extracts values from variables
- using `keras.ops` instead of native ops (`+`, `-`, `*`, `/`, `<`, `>`, `&`, etc.) unwraps variables

It is already a best practice to use `keras.ops` instead of native ops:

- to support the creation of functional models via `KerasTensor`s and their serialization
- to have consistent type promotion between backends
- to support sparse tensors and ragged tensors

This was tested via a separate PR #21702 that won't be submitted because of https://github.com/keras-team/keras/pull/21702/files#diff-900deadc65fc119ce93fb813e340dcb644b8eab9e7c0207bf37cdc05b8e8796e.
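As a minimal illustration of the two unwrapping mechanisms the message refers to (a sketch, not part of the diff; it assumes the JAX backend and uses made-up variable names):

```python
import jax.numpy as jnp
import keras
from keras import ops

# Hypothetical example values, just to show the pattern.
v = keras.Variable(initializer="ones", shape=(3,), name="v")
a = jnp.arange(3.0)

# Native JAX op on a Variable: `jnp` has to coerce `v` via `v.__jax_array__()`,
# which newer JAX versions restrict.
# out = a * v

# Mechanism 1: `keras.ops` functions call `convert_to_tensor` on their inputs,
# which extracts the value from the Variable.
out = ops.multiply(a, v)

# Mechanism 2: explicit conversion to a backend tensor before using native ops.
out = jnp.multiply(a, ops.convert_to_tensor(v))
```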
1 parent ce61e6b commit 31f605e

15 files changed: +84 additions, −46 deletions

keras/src/backend/jax/nn.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -396,6 +396,8 @@ def depthwise_conv(
     feature_group_count = (
         inputs.shape[-1] if data_format == "channels_last" else inputs.shape[1]
     )
+    kernel = convert_to_tensor(kernel)
+    inputs = convert_to_tensor(inputs)
     kernel = jnp.reshape(
         kernel,
         kernel.shape[:-2] + (1, feature_group_count * kernel.shape[-1]),
```

keras/src/backend/jax/numpy.py

Lines changed: 16 additions & 9 deletions
```diff
@@ -543,15 +543,18 @@ def clip(x, x_min, x_max):
 
 def concatenate(xs, axis=0):
     bcoo_count = builtins.sum(isinstance(x, jax_sparse.BCOO) for x in xs)
-    if bcoo_count:
-        if bcoo_count == len(xs):
-            axis = canonicalize_axis(axis, len(xs[0].shape))
-            return jax_sparse.bcoo_concatenate(xs, dimension=axis)
-        else:
-            xs = [
-                x.todense() if isinstance(x, jax_sparse.JAXSparse) else x
-                for x in xs
-            ]
+    if bcoo_count == len(xs):
+        axis = canonicalize_axis(axis, len(xs[0].shape))
+        return jax_sparse.bcoo_concatenate(xs, dimension=axis)
+    elif bcoo_count:
+        xs = [
+            x.todense()
+            if isinstance(x, jax_sparse.JAXSparse)
+            else convert_to_tensor(x)
+            for x in xs
+        ]
+    else:
+        xs = [convert_to_tensor(x) for x in xs]
     return jnp.concatenate(xs, axis=axis)
 
 
@@ -1087,6 +1090,7 @@ def reshape(x, newshape):
         if None not in output_shape:
             newshape = output_shape
         return jax_sparse.bcoo_reshape(x, new_sizes=newshape)
+    x = convert_to_tensor(x)
     return jnp.reshape(x, newshape)
 
 
@@ -1149,10 +1153,12 @@ def sort(x, axis=-1):
 
 
 def split(x, indices_or_sections, axis=0):
+    x = convert_to_tensor(x)
     return jnp.split(x, indices_or_sections, axis=axis)
 
 
 def stack(x, axis=0):
+    x = [convert_to_tensor(t) for t in x]
     return jnp.stack(x, axis=axis)
 
 
@@ -1338,6 +1344,7 @@ def squeeze(x, axis=None):
             axis = tuple(i for i, d in enumerate(x.shape) if d == 1)
         axis = to_tuple_or_list(axis)
         return jax_sparse.bcoo_squeeze(x, dimensions=axis)
+    x = convert_to_tensor(x)
    return jnp.squeeze(x, axis=axis)
```

keras/src/backend/jax/optimizer.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -36,13 +36,14 @@ def _backend_apply_gradients(self, grads, trainable_variables):
             new_g_accs = jax.lax.cond(
                 is_update_step,
                 lambda: [jnp.zeros(g.shape, dtype=g.dtype) for g in acc_grads],
-                lambda: [g + acc_g for g, acc_g in zip(grads, acc_grads)],
+                lambda: [g + acc_g.value for g, acc_g in zip(grads, acc_grads)],
             )
 
             grads = jax.lax.cond(
                 is_update_step,
                 lambda: [
-                    (g + acc_g) / steps for g, acc_g in zip(grads, acc_grads)
+                    (g + acc_g.value) / steps
+                    for g, acc_g in zip(grads, acc_grads)
                 ],
                 lambda: list(grads),
             )
```
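For context on the optimizer change above, a small sketch (assuming the JAX backend; names are illustrative) of why the branches read `.value`: the accumulator is a `keras.Variable`, and `.value` yields its backing `jax.Array`, so `g + acc_g.value` is plain JAX arithmetic with no `__jax_array__` coercion inside `jax.lax.cond`.

```python
import jax
import jax.numpy as jnp
import keras

# Illustrative stand-ins for one gradient and its accumulator variable.
g = jnp.ones((3,))
acc_g = keras.Variable(initializer="zeros", shape=(3,), name="acc_g")
is_update_step = False

new_acc = jax.lax.cond(
    is_update_step,
    lambda: jnp.zeros(g.shape, dtype=g.dtype),  # reset on update steps
    lambda: g + acc_g.value,  # accumulate using the variable's backing array
)
```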

keras/src/layers/attention/attention.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -121,7 +121,7 @@ def _calculate_scores(self, query, key):
         if self.score_mode == "dot":
             scores = ops.matmul(query, ops.transpose(key, axes=[0, 2, 1]))
             if self.scale is not None:
-                scores *= self.scale
+                scores = ops.multiply(scores, self.scale)
         elif self.score_mode == "concat":
             # Reshape tensors to enable broadcasting.
             # Reshape into [batch_size, Tq, 1, dim].
```

keras/src/layers/layer_test.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -678,7 +678,7 @@ def __init__(self):
             def call(self, x):
                 # Should not autocast.
                 assertDType(self.v, "float32")
-                return ops.cast(x, "float32") + self.v
+                return ops.add(ops.cast(x, "float32"), self.v)
 
         # A layer that is explicitly full precision.
         class InnerLayerTwo(layers.Layer):
@@ -694,7 +694,7 @@ def __init__(self):
             def call(self, x):
                 # Should not autocast.
                 assertDType(self.v, "float32")
-                return x + self.v
+                return ops.add(x, self.v)
 
         # A layer that is explicitly mixed precision but with autocast=False
         # weight.
@@ -732,7 +732,7 @@ def call(self, x):
                 # Should autocast.
                 assertDType(self.v, "float16")
                 return self.inner_three(
-                    self.inner_two(self.inner_one(x + self.v))
+                    self.inner_two(self.inner_one(ops.add(x, self.v)))
                 )
 
         layer = MixedPrecisionLayer()
@@ -935,7 +935,7 @@ def call(self, x):
                 x = x + backend.random.normal(
                     shape=(), seed=self._seed_generator
                 )
-                return x + self.tw + self.ntw
+                return ops.add(x, ops.add(self.tw, self.ntw))
 
         data = np.random.random((3, 4))
         layer = TestLayer()
```

keras/src/layers/normalization/layer_normalization_test.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -99,8 +99,8 @@ def test_correctness(self):
         ).astype("float32")
 
         out = layer(inputs)
-        out -= layer.beta
-        out /= layer.gamma
+        out = ops.subtract(out, layer.beta)
+        out = ops.divide(out, layer.gamma)
 
         self.assertAllClose(ops.mean(out), 0.0, atol=1e-1)
         self.assertAllClose(ops.std(out), 1.0, atol=1e-1)
```

keras/src/layers/normalization/rms_normalization_test.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -38,10 +38,12 @@ def test_correctness(self):
         inputs = ops.convert_to_tensor(inputs)
 
         out = layer(inputs)
-        expected = (
-            inputs
-            * ops.rsqrt(ops.mean(ops.square(inputs), axis=-1, keepdims=True))
-            * layer.scale
+        expected = ops.multiply(
+            ops.multiply(
+                inputs,
+                ops.rsqrt(ops.mean(ops.square(inputs), axis=-1, keepdims=True)),
+            ),
+            layer.scale,
         )
 
         self.assertAllClose(out, expected, atol=1e-1)
```

keras/src/layers/rnn/gru.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -261,7 +261,7 @@ def call(self, inputs, states, training=False):
         matrix_x = ops.matmul(inputs, self.kernel)
         if self.use_bias:
             # biases: bias_z_i, bias_r_i, bias_h_i
-            matrix_x += input_bias
+            matrix_x = ops.add(matrix_x, input_bias)
 
         x_z, x_r, x_h = ops.split(matrix_x, 3, axis=-1)
 
```
keras/src/layers/rnn/lstm.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -276,9 +276,9 @@ def call(self, inputs, states, training=False):
 
         z = ops.matmul(inputs, self.kernel)
 
-        z += ops.matmul(h_tm1, self.recurrent_kernel)
+        z = ops.add(z, ops.matmul(h_tm1, self.recurrent_kernel))
         if self.use_bias:
-            z += self.bias
+            z = ops.add(z, self.bias)
 
         z = ops.split(z, 4, axis=1)
         c, o = self._compute_carry_and_output_fused(z, c_tm1)
```

keras/src/layers/rnn/simple_rnn.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -160,7 +160,7 @@ def call(self, sequence, states, training=False):
             sequence = sequence * dp_mask
         h = ops.matmul(sequence, self.kernel)
         if self.bias is not None:
-            h += self.bias
+            h = ops.add(h, self.bias)
 
         if training and rec_dp_mask is not None:
             prev_output = prev_output * rec_dp_mask
```
