[Runtime] Constant cache manager and runtime pipeline #342

Open · wants to merge 94 commits into base: main

Commits (94):
13faa33
add cpuruntime dialect
May 14, 2024
161848e
format
May 14, 2024
447ef12
add dependency
May 14, 2024
a73dcc1
fix new MLIR
May 14, 2024
1cfede8
add
May 15, 2024
57ba92e
Merge remote-tracking branch 'origin/main' into yijie/cpuruntime
May 15, 2024
3d3308c
move codes from dnn-compiler
niuxiaog May 15, 2024
4d25de6
Merge branch 'yijie/cpuruntime' into yijie/pipeline
May 15, 2024
475faf8
update
May 15, 2024
4f112c0
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog May 15, 2024
0ac087d
fix
May 15, 2024
74b0d34
remove at exit
May 16, 2024
2cebba9
fix lint
May 16, 2024
d1b35a1
Merge branch 'yijie/cpuruntime' into yijie/pipeline
May 16, 2024
34d10ea
Add kmp_* wrapper for gomp environment
May 16, 2024
55c1043
Merge remote-tracking branch 'origin' into yijie/pipeline
May 16, 2024
e1490bb
Merge branch 'yijie/fake_omp' into yijie/pipeline
May 16, 2024
80a597f
fix
May 16, 2024
0b4332b
fix
May 16, 2024
c43f481
Merge branch 'main' into yijie/fake_omp
May 23, 2024
b1c79a2
add wrapper
May 23, 2024
382171b
fix lint
May 23, 2024
ef75da8
Merge branch 'yijie/fake_omp' of https://github.com/intel/graph-compi…
May 23, 2024
f1fd0ae
fix
May 23, 2024
a773ea6
f
May 23, 2024
84933c2
fix
May 23, 2024
4cca4df
add reference
May 23, 2024
678cef9
enable const cache
May 24, 2024
c12156c
reduce size
May 24, 2024
d50a3e8
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog May 27, 2024
6219935
Add single operand check
niuxiaog May 27, 2024
5eb0ac0
Add cache manager
niuxiaog May 27, 2024
c3e186d
Use llvm global [need to cowork with yijie/mainfunc_wrapper]
niuxiaog May 28, 2024
e24b1df
rename
May 28, 2024
34064f3
Merge branch 'main' of https://github.com/intel/graph-compiler into y…
May 28, 2024
70c5e97
Merge branch 'main' of https://github.com/intel/graph-compiler into y…
May 28, 2024
1e06c98
fix license.py
May 28, 2024
3f656b7
Merge branch 'yijie/fake_omp' into yijie/pipeline
May 28, 2024
24cee01
Merge branch 'yijie/pipeline' into yijie/mainfunc_wrapper
May 28, 2024
7c32bc5
fix
May 28, 2024
4540fb6
fix lint
May 28, 2024
381677a
fix comments
May 28, 2024
8c50b67
Rename; Add llvm dependence
niuxiaog May 28, 2024
25f611e
Change dtype
niuxiaog May 28, 2024
4363915
Fix visibility and type
niuxiaog May 29, 2024
fdfc53e
Merge branch 'main' of https://github.com/intel/graph-compiler into y…
May 29, 2024
60042e1
Merge branch 'yijie/pipeline' into yijie/mainfunc_wrapper
May 29, 2024
b54b310
fix
May 29, 2024
9d04cd2
format
May 29, 2024
206c3f3
cleanup
May 30, 2024
824946b
Revert "cleanup"
May 30, 2024
bc9a7ad
refine options
May 30, 2024
3bd954c
Merge branch 'yijie/mainfunc_wrapper' into yijie/const_cache_jit
May 30, 2024
94f2813
Support complex topo
niuxiaog May 30, 2024
0f67f75
Rename
niuxiaog Jun 3, 2024
d7663a5
Split into short functions
niuxiaog Jun 4, 2024
3f34e97
Add a test
niuxiaog Jun 5, 2024
22c3d76
Adapt to constant PropertyType
niuxiaog Jun 11, 2024
5c92931
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Jul 24, 2024
9218762
Revert "Adapt to constant PropertyType"
niuxiaog Jul 24, 2024
4e447dd
Fix link
niuxiaog Jul 24, 2024
d4d81a6
Fold arith.constant
niuxiaog Jul 25, 2024
afec52a
Add compile_time_fold and runtime_fold.
niuxiaog Jul 25, 2024
9c4fd70
Fix license and tidy
niuxiaog Jul 26, 2024
fad5f92
Fix link
niuxiaog Jul 26, 2024
57f887d
Only enable runtime folding
niuxiaog Jul 29, 2024
1fc3b9f
Rename and polish
niuxiaog Jul 29, 2024
aaa4ed4
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Jul 31, 2024
bfc12c7
Add accuracy tests on mlp
niuxiaog Aug 7, 2024
346965f
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Aug 7, 2024
75fcaed
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Aug 19, 2024
f9c2425
Support MemRef args
niuxiaog Aug 20, 2024
d8d2d79
Add to pipeline
niuxiaog Aug 20, 2024
fc739e5
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Aug 26, 2024
22c4474
Forbid buffer_to_tensor case
niuxiaog Aug 26, 2024
968677d
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Sep 2, 2024
1473a88
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Sep 5, 2024
e20d059
Add shape info to global
niuxiaog Sep 6, 2024
99811f2
Merge branch 'xgniu/constant_weights_folding' into xgniu/folding_manager
niuxiaog Sep 11, 2024
3a47e28
Merge branch 'yijie/const_cache_jit' into xgniu/folding_manager
niuxiaog Sep 11, 2024
36fc758
Make things work
niuxiaog Sep 13, 2024
362ad2b
Merge branch 'xgniu/constant_weights_folding' into xgniu/folding_manager
niuxiaog Sep 13, 2024
8d08752
Unify attr name
niuxiaog Sep 13, 2024
ad24768
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Sep 14, 2024
edbb708
Clean tests.
niuxiaog Sep 14, 2024
fa30e4a
Updates
niuxiaog Sep 14, 2024
b8b0dd2
Move manager
niuxiaog Sep 14, 2024
361bed6
Merge branch 'xgniu/constant_weights_folding' into xgniu/folding_manager
niuxiaog Sep 14, 2024
6a041dd
Use atomic
niuxiaog Sep 14, 2024
c876358
Fix
niuxiaog Sep 14, 2024
a255c7b
Merge branch 'main' into xgniu/constant_weights_folding
niuxiaog Sep 18, 2024
77e0f02
Merge into one pass
niuxiaog Sep 18, 2024
2df16c2
Skip case
niuxiaog Sep 18, 2024
d8aedad
Merge branch 'xgniu/constant_weights_folding' into xgniu/folding_manager
niuxiaog Sep 19, 2024
Changes from 3 commits:
38 changes: 33 additions & 5 deletions lib/gc/Transforms/ConstantTensorFolding.cpp
@@ -592,7 +592,7 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder,
globalIndexes.insert(globalIndexes.begin(), globalIndexes.size());
auto moduleOp = dyn_cast<ModuleOp>(topOp);
addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder,
"__" + name + "_buffer_ids_", globalIndexes);
"__" + name + "_buffer_ids", globalIndexes);

auto returnOp =
builder.create<func::ReturnOp>(topOp->getLoc(), outputValuesInFold);
@@ -605,6 +605,24 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder,
});
}

+  // the ranks of folded results.
+  SmallVector<int32_t> foldRanks;
+  // the shapes of folded results.
+  SmallVector<int64_t> foldShapes;
+  for (Value &tensor : outputValuesInFold) {
+    auto t = dyn_cast<TensorType>(tensor.getType());
+    Type eleType = t.getElementType();
+    int64_t bitWidth = eleType.getIntOrFloatBitWidth() / 8; // bytes
+    ArrayRef<int64_t> shape = t.getShape();
+    foldRanks.push_back(shape.size());
+    foldShapes.insert(foldShapes.end(), shape.begin(), shape.end());
+    foldShapes.push_back(bitWidth);
+  }
+  addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__folded_ranks",
+                    foldRanks);
+  addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, "__folded_shapes",
+                    foldShapes);

foldFunc.setVisibility(SymbolTable::Visibility::Public);
foldFunc->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
UnitAttr::get(context));
@@ -621,11 +639,13 @@ void modifyComputeFunc(MLIRContext *context, OpBuilder &builder,
std::unordered_set<int> &constArgsIndexes,
SmallVector<Type> &outputTypes,
SmallVector<Value> &outputValues) {
-  // the indexes of args to the folding func.
+  // the indexes of args to the folding func, including to-fold tensors and
+  // folded results.
   SmallVector<int32_t> foldArgs;
-  // the indexes of folded args.
+  // the indexes of folded results.
   SmallVector<int32_t> foldIds;
-  // the indexes of args to the computing func.
+  // the indexes of args to the computing func, including non-fold tensors and
+  // folded results.
   SmallVector<int32_t> computeArgs;

// modify the BlockArguments of block
@@ -705,7 +725,7 @@ void modifyComputeFunc(MLIRContext *context, OpBuilder &builder,
addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__compute_args",
computeArgs);

-  addGlobalI32(moduleOp, moduleOp.getLoc(), builder, "__num_orig_num_args",
+  addGlobalI32(moduleOp, moduleOp.getLoc(), builder, "__num_orig_args",
oriNumArgs);
}

@@ -730,6 +750,14 @@ void canonicalizeAndClean(MLIRContext *context, Operation *topOp) {
op->removeAttr("onednn_graph.in_const_subgraph");
}
});
+  topOp->walk([&](func::FuncOp op) {
+    if (op.getOperation()->getAttr("compiletime_const_args_index")) {
+      op.getOperation()->removeAttr("compiletime_const_args_index");
+    }
+    if (op.getOperation()->getAttr("runtime_const_args_index")) {
+      op.getOperation()->removeAttr("runtime_const_args_index");
+    }
+  });
}

// Operate on tensors. Create fold() and compute() on module. The
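As context for the new metadata: the pass stores one rank per folded result in __folded_ranks and, for each result, its dims followed by the element size in bytes in __folded_shapes; the number of folded results is not stored in either array and would come from the count-prefixed __fold_buffer_ids global. Below is a minimal sketch of a decoder, assuming the runtime has already resolved the global symbols; the FoldedBuffer struct and decodeFoldedShapes are illustrative, not part of this PR.

#include <cstddef>
#include <cstdint>
#include <vector>

struct FoldedBuffer {
  std::vector<int64_t> shape; // dims of one folded result
  int64_t sizeInBytes;        // numel * element size in bytes
};

// foldedRanks/foldedShapes point at the JIT-resolved globals; numFolded is
// taken from element 0 of the count-prefixed __fold_buffer_ids array.
std::vector<FoldedBuffer> decodeFoldedShapes(const int32_t *foldedRanks,
                                             int64_t numFolded,
                                             const int64_t *foldedShapes) {
  std::vector<FoldedBuffer> out;
  std::size_t pos = 0;
  for (int64_t i = 0; i < numFolded; ++i) {
    FoldedBuffer buf;
    int64_t numel = 1;
    // Read this tensor's dims from the flat shape array.
    for (int32_t d = 0; d < foldedRanks[i]; ++d) {
      buf.shape.push_back(foldedShapes[pos]);
      numel *= foldedShapes[pos++];
    }
    // The trailing per-tensor entry is the element size in bytes.
    buf.sizeInBytes = numel * foldedShapes[pos++];
    out.push_back(std::move(buf));
  }
  return out;
}

Appending the element size after each shape keeps __folded_shapes a single flat array; a reader only has to walk it in lockstep with __folded_ranks.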
2 changes: 1 addition & 1 deletion test/gc/Transforms/test_constant_tensor_folding-1.mlir
@@ -32,7 +32,7 @@ module {

// COM: expected output:
// COM: module {
-// COM: llvm.mlir.global external constant @__num_orig_num_args(3 : i32) {addr_space = 0 : i32} : i32
+// COM: llvm.mlir.global external constant @__num_orig_args(3 : i32) {addr_space = 0 : i32} : i32
// COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32>
// COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32>
// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64>
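Reading these expectations with the count-prefixed layout the pass uses (element 0 of each array is the entry count): __num_orig_args records that the original function took 3 arguments; __compute_args = [3 | 2, 3, 4] says compute() takes three arguments, the remaining original argument 2 plus the folded results at indexes 3 and 4; __fold_args = [4 | 0, 1, 3, 4] says fold() is passed the constant arguments 0 and 1 and the folded-result slots 3 and 4; __fold_buffer_ids = [2 | 0, 1] registers two cached buffers with ids 0 and 1. (This decoding is inferred from the pass code above; the test itself only checks the raw arrays.)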
2 changes: 1 addition & 1 deletion test/gc/Transforms/test_constant_tensor_folding.mlir
@@ -74,7 +74,7 @@ module {

// COM: expected output:
// COM: module {
-// COM: llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32
+// COM: llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32
// COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32>
// COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32>
// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64>
(changes to an additional test file)
@@ -141,7 +141,7 @@
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
module {
-llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32
+llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32
llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32>
llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32>
llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64>
(changes to an additional test file)
@@ -111,7 +111,7 @@
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
module {
-llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32
+llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32
llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32>
llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32>
llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64>