@@ -2279,26 +2279,237 @@ class AutoDiffHLOReturn
   }
 };
 
-class AutoDiffSort
-    : public AutoDiffOpInterface::ExternalModel<AutoDiffSort, SortOp> {
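+// Rebuilds `original` as a new stablehlo.sort over `newOperands` (the
+// original inputs plus any extra values appended by the caller). The
+// comparator body is cloned unchanged; each extra operand contributes a pair
+// of unused scalar block arguments, so extra values are permuted in lockstep
+// with the inputs without influencing the ordering.
+//
+// Sketch (hypothetical shapes): sorting one tensor<4xf32> with one appended
+// tangent produces a comparator of type
+//   (tensor<f32>, tensor<f32>, tensor<f32>, tensor<f32>) -> tensor<i1>
+// whose cloned body reads only the first two arguments.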
+stablehlo::SortOp
+constructSortOpWithExtraOperands(OpBuilder &builder, stablehlo::SortOp original,
+                                 SmallVectorImpl<Value> &newOperands) {
+  auto newSortOp = stablehlo::SortOp::create(
+      builder, original.getLoc(), newOperands, original.getDimensionAttr(),
+      original.getIsStableAttr());
+
+  auto &newComparator = newSortOp.getComparator();
+  auto *newBlock = new Block();
+  newComparator.push_back(newBlock);
+
+  {
+    // The comparator receives two scalar arguments per sort operand.
+    SmallVector<Type> scalarArgTys;
+    for (auto arg : newOperands) {
+      auto elTy = RankedTensorType::get(
+          {}, cast<TensorType>(arg.getType()).getElementType());
+      scalarArgTys.push_back(elTy);
+      scalarArgTys.push_back(elTy);
+    }
+    newBlock->addArguments(
+        scalarArgTys,
+        SmallVector<Location>(scalarArgTys.size(), original.getLoc()));
+  }
+
+  auto &origComparator = original.getComparator();
+  auto &origBlock = origComparator.front();
+
+  // Map the original comparator arguments onto the leading arguments of the
+  // new block, then clone the body.
+  IRMapping mapper;
+  for (int64_t i = 0; i < origBlock.getNumArguments(); i++)
+    mapper.map(origBlock.getArgument(i), newBlock->getArgument(i));
+
+  {
+    OpBuilder::InsertionGuard guard(builder);
+    builder.setInsertionPointToStart(newBlock);
+    for (Operation &origOpInside : origBlock) {
+      builder.clone(origOpInside, mapper);
+    }
+  }
+
+  return newSortOp;
+}
+
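+// Forward mode: the tangents of the active inputs are appended to the sort as
+// extra operands, so they undergo exactly the permutation applied to the
+// primal values; the corresponding extra results become the tangents of the
+// sorted outputs.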
+class AutoDiffSortFwd
+    : public AutoDiffOpInterface::ExternalModel<AutoDiffSortFwd, SortOp> {
 public:
   LogicalResult createForwardModeTangent(Operation *op, OpBuilder &builder,
                                          MGradientUtils *gutils) const {
+    if (gutils->width > 1) {
+      op->emitError(
+          "TODO: AutoDiffSortFwd does not support batched forward mode");
+      return failure();
+    }
 
-    // TODO: we may need to record, for every successor, which of its inputs
-    // need a shadow to recreate the body correctly.
-    llvm::SmallDenseSet<unsigned> operandPositionsToShadow;
-    llvm::SmallDenseSet<unsigned> resultPositionsToShadow;
+    auto sortOp = cast<stablehlo::SortOp>(op);
 
-    for (auto res : op->getResults())
-      if (!gutils->isConstantValue(res)) {
-        operandPositionsToShadow.insert(res.getResultNumber());
-        resultPositionsToShadow.insert(res.getResultNumber());
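+    // Maps each active input index to the position of its appended tangent in
+    // `newOperands` (and hence in the new sort's results).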
+    DenseMap<int32_t, int32_t> gradMapping;
+
+    SmallVector<Value> newOperands;
+    for (auto operand : sortOp.getInputs()) {
+      newOperands.push_back(gutils->getNewFromOriginal(operand));
+    }
+    for (auto [i, operand] : llvm::enumerate(sortOp.getInputs())) {
+      if (!gutils->isConstantValue(operand)) {
+        newOperands.push_back(gutils->invertPointerM(operand, builder));
+        gradMapping[i] = newOperands.size() - 1;
       }
+    }
 
-    return mlir::enzyme::detail::controlFlowForwardHandler(
-        op, builder, gutils, operandPositionsToShadow, resultPositionsToShadow);
+    auto newSortOp =
+        constructSortOpWithExtraOperands(builder, sortOp, newOperands);
+
+    SmallVector<Value> replacementResults(sortOp.getNumResults());
+    for (int32_t i = 0; i < sortOp.getNumResults(); i++) {
+      replacementResults[i] = newSortOp.getResults()[i];
+      auto origRes = sortOp.getResults()[i];
+      if (!gutils->isConstantValue(origRes)) {
+        // The tangent of result i is the sorted tangent operand recorded in
+        // gradMapping.
+        int32_t j = gradMapping[i];
+        gutils->setDiffe(origRes, newSortOp.getResults()[j], builder);
+      }
+    }
+
+    gutils->replaceOrigOpWith(op, replacementResults);
+    gutils->originalToNewFnOps[op] = newSortOp;
+    gutils->eraseIfUnused(op);
+    return success();
+  }
+};
+
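+// Reverse mode: cacheValues sorts an iota alongside the inputs and caches the
+// resulting permutation; the adjoint then scatter-adds each result cotangent
+// back to its pre-sort position.
+//
+// Worked example (hypothetical values): sorting x = [3, 1, 2] ascending gives
+// [1, 2, 3] with cached indices [1, 2, 0]. A cotangent [a, b, c] on the sorted
+// result is scattered to positions [1, 2, 0], so dx = [c, a, b].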
+class AutoDiffSortRev
+    : public ReverseAutoDiffOpInterface::ExternalModel<AutoDiffSortRev,
+                                                       stablehlo::SortOp> {
+public:
+  LogicalResult createReverseModeAdjoint(Operation *orig, OpBuilder &builder,
+                                         MGradientUtilsReverse *gutils,
+                                         SmallVector<Value> caches) const {
+    auto sortOp = cast<stablehlo::SortOp>(orig);
+
+    if (gutils->width > 1) {
+      orig->emitError(
+          "TODO: AutoDiffSortRev does not support batched reverse mode");
+      return failure();
+    }
+
+    auto indices = gutils->popCache(caches[0], builder);
+    auto indicesTy = cast<RankedTensorType>(indices.getType());
+
+    SmallVector<int64_t> newIndicesShape(indicesTy.getShape().begin(),
+                                         indicesTy.getShape().end());
+    newIndicesShape.push_back(1);
+
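+    // Scatter consumes indices with a trailing index_vector dimension; append
+    // a unit dim so each index is a rank-1 vector into the sort dimension.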
+    indices = stablehlo::ReshapeOp::create(
+        builder, orig->getLoc(),
+        RankedTensorType::get(newIndicesShape, indicesTy.getElementType()),
+        indices);
+
+    auto inTy = cast<RankedTensorType>(orig->getOperand(0).getType());
+    auto inRank = inTy.getRank();
+
+    SmallVector<int64_t> batchingDims;
+    for (int32_t d = 0; d < inRank; d++) {
+      if (d != sortOp.getDimension()) {
+        batchingDims.push_back(d);
+      }
+    }
+
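+    // Dimension numbers: no update_window_dims; the sort dimension is both the
+    // inserted window dim and the scatter-dims-to-operand-dims target; every
+    // other dimension is a batching dim shared between operand and indices;
+    // the trailing unit dim added above is the index vector.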
+    auto scatterDims = stablehlo::ScatterDimensionNumbersAttr::get(
+        orig->getContext(), SmallVector<int64_t>(),
+        SmallVector<int64_t>{static_cast<int64_t>(sortOp.getDimension())},
+        batchingDims, batchingDims,
+        SmallVector<int64_t>{static_cast<int64_t>(sortOp.getDimension())},
+        indicesTy.getRank());
+
+    for (size_t i = 0; i < orig->getNumResults(); i++) {
+      if (gutils->isConstantValue(orig->getResult(i)) ||
+          gutils->isConstantValue(orig->getOperand(i)))
+        continue;
+
+      // Scatter-add the cotangent of sorted result i through the cached
+      // permutation into the diffe of operand i, then zero the result's
+      // diffe.
+      auto inDiffe = gutils->diffe(orig->getResult(i), builder);
+      auto inDiffeTy = cast<RankedTensorType>(inDiffe.getType());
+      gutils->zeroDiffe(orig->getResult(i), builder);
+
+      auto outDiffe = gutils->diffe(orig->getOperand(i), builder);
+
+      Region combiner;
+      {
+        // Update computation: add the scattered update to the existing value.
+        Block *block = new Block();
+        combiner.push_back(block);
+        block->addArgument(
+            RankedTensorType::get({}, inDiffeTy.getElementType()),
+            orig->getLoc());
+        block->addArgument(
+            RankedTensorType::get({}, inDiffeTy.getElementType()),
+            orig->getLoc());
+        OpBuilder::InsertionGuard guard(builder);
+        builder.setInsertionPointToStart(block);
+        stablehlo::ReturnOp::create(
+            builder, orig->getLoc(),
+            ValueRange{stablehlo::AddOp::create(builder, orig->getLoc(),
+                                                block->getArgument(0),
+                                                block->getArgument(1))});
+      }
+
+      auto scatterOp = stablehlo::ScatterOp::create(
+          builder, orig->getLoc(), outDiffe, indices, inDiffe, scatterDims,
+          builder.getBoolAttr(false), builder.getBoolAttr(true));
+      scatterOp.getUpdateComputation().takeBody(combiner);
+
+      gutils->setDiffe(orig->getOperand(i), scatterOp.getResults()[0],
+                       builder);
+    }
+
+    return success();
+  }
+
+  SmallVector<Value> cacheValues(Operation *orig,
+                                 MGradientUtilsReverse *gutils) const {
+    auto sortOp = cast<stablehlo::SortOp>(orig);
+
+    if (gutils->width > 1)
+      return {};
+
+    bool allConstant = true;
+    for (auto input : sortOp.getInputs()) {
+      if (!gutils->isConstantValue(input)) {
+        allConstant = false;
+        break;
+      }
+    }
+
+    if (allConstant)
+      return {};
+
+    auto newOp = gutils->getNewFromOriginal(orig);
+    OpBuilder cacheBuilder(newOp);
+
+    SmallVector<Value> newOperands(sortOp.getInputs().size() + 1);
+    for (auto [i, operand] : llvm::enumerate(sortOp.getInputs())) {
+      newOperands[i] = gutils->getNewFromOriginal(operand);
+    }
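+    // Sort an iota along the sort dimension together with the new inputs: the
+    // extra result is the permutation the sort applies, and is the only value
+    // that needs to be cached for the reverse pass.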
+    auto OpTy = cast<TensorType>(newOperands[0].getType());
+    auto iotaOp = stablehlo::IotaOp::create(
+        cacheBuilder, orig->getLoc(),
+        RankedTensorType::get(OpTy.getShape(),
+                              cacheBuilder.getIntegerType(32, false)),
+        sortOp.getDimensionAttr());
+    newOperands[newOperands.size() - 1] = iotaOp.getResult();
+
+    auto newSortOp =
+        constructSortOpWithExtraOperands(cacheBuilder, sortOp, newOperands);
+    auto newResults = newSortOp.getResults();
+
+    SmallVector<Value> caches;
+    caches.push_back(gutils->initAndPushCache(newResults[newResults.size() - 1],
+                                              cacheBuilder));
+
+    SmallVector<Value> replacements;
+    for (size_t i = 0; i < newResults.size() - 1; i++) {
+      replacements.push_back(newResults[i]);
+    }
+
+    gutils->replaceOrigOpWith(orig, replacements);
+    gutils->eraseIfUnused(orig);
+    gutils->originalToNewFnOps[orig] = newSortOp;
+
+    return caches;
   }
+
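+  // No shadow values are needed; the adjoint is materialized directly in
+  // createReverseModeAdjoint.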
+  void createShadowValues(Operation *op, OpBuilder &builder,
+                          MGradientUtilsReverse *gutils) const {}
 };
 
 class AutoDiffBatchNormTrainingRev
@@ -3701,8 +3912,6 @@ void mlir::enzyme::registerStableHLODialectAutoDiffInterface(
     stablehlo::StablehloDialect *) {
   registerInterfaces(context);
 
-  // SortOp::attachInterface<AutoDiffSort>(*context);
-
   WhileOp::attachInterface<WhileOpEnzymeOpsRemover>(*context);
   IfOp::attachInterface<IfOpEnzymeOpsRemover>(*context);
 
@@ -3722,6 +3931,8 @@ void mlir::enzyme::registerStableHLODialectAutoDiffInterface(
   IfOp::attachInterface<AutoDiffIfFwd>(*context);
   IfOp::attachInterface<AutoDiffIfCF>(*context);
 
+  SortOp::attachInterface<AutoDiffSortFwd>(*context);
+  SortOp::attachInterface<AutoDiffSortRev>(*context);
   WhileOp::attachInterface<AutoDiffWhileFwd>(*context);
   WhileOp::attachInterface<AutoDiffWhileRev>(*context);
   ReduceOp::attachInterface<AutoDiffReduceCF<ReduceOp>>(*context);