diff --git a/cmake/llvm-version-imex.txt b/cmake/llvm-version-imex.txt index 33000613b..04e84c095 100644 --- a/cmake/llvm-version-imex.txt +++ b/cmake/llvm-version-imex.txt @@ -1 +1 @@ -add6b2f35f2bcf1f59a2ab2d5b3dab124fe0895a +7842374103b26933d71a8fe354cd4d8715d55b1c diff --git a/cmake/llvm-version.txt b/cmake/llvm-version.txt index 33000613b..04e84c095 100644 --- a/cmake/llvm-version.txt +++ b/cmake/llvm-version.txt @@ -1 +1 @@ -add6b2f35f2bcf1f59a2ab2d5b3dab124fe0895a +7842374103b26933d71a8fe354cd4d8715d55b1c diff --git a/include/gc/Dialect/LLVMIR/XeVMOps.td b/include/gc/Dialect/LLVMIR/XeVMOps.td index 4b4ee6814..c5fe511dd 100644 --- a/include/gc/Dialect/LLVMIR/XeVMOps.td +++ b/include/gc/Dialect/LLVMIR/XeVMOps.td @@ -70,7 +70,7 @@ def XeVM_L1StoreCacheControl : XeVM_StoreCacheControl<"L1">; def XeVM_L3StoreCacheControl : XeVM_StoreCacheControl<"L3">; def XeVM_BlockLoad2dOp : XeVM_Op<"blockload2d">, - Results<(outs FixedVectorOf<[XeVM_ElemType]>:$res)>, + Results<(outs FixedVectorOfRankAndType<[1,2,3], [XeVM_ElemType]>:$res)>, Arguments<(ins Arg:$ptr, I32:$base_width, @@ -137,7 +137,7 @@ def XeVM_BlockStore2dOp : XeVM_Op<"blockstore2d">, I32Attr:$tile_width, I32Attr:$tile_height, I32Attr:$v_blocks, - FixedVectorOf<[XeVM_ElemType]>:$stored_val, + FixedVectorOfRankAndType<[1, 2, 3], [XeVM_ElemType]>:$stored_val, DefaultValuedAttr:$l1_cache_control, DefaultValuedAttr:$l3_cache_control )> { @@ -243,7 +243,7 @@ def XeVM_PrecisionTypeAttr : I32EnumAttr<"PrecisionType", } def XeVM_DPASOp : XeVM_Op<"dpas">, - Results<(outs FixedVectorOf<[XeVM_MatrixElemType]>:$d)>, + Results<(outs FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$d)>, Arguments<(ins FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$c, FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$a, diff --git a/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h b/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h index 0c92458ed..9c0ba87db 100644 --- 
a/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h +++ b/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h @@ -27,13 +27,13 @@ static inline int64_t getDnnlDataTypeVal(RewriterBase &rewriter, auto context = rewriter.getContext(); auto tattr = dyn_cast_or_null(attr); assert(tattr); - if (tattr == TypeAttr::get(FloatType::getF32(context))) { + if (tattr == TypeAttr::get(Float32Type::get(context))) { return static_cast(dnnl_f32); - } else if (tattr == TypeAttr::get(FloatType::getF64(context))) { + } else if (tattr == TypeAttr::get(Float64Type::get(context))) { return static_cast(dnnl_f64); - } else if (tattr == TypeAttr::get(FloatType::getBF16(context))) { + } else if (tattr == TypeAttr::get(BFloat16Type::get(context))) { return static_cast(dnnl_bf16); - } else if (tattr == TypeAttr::get(FloatType::getF16(context))) { + } else if (tattr == TypeAttr::get(Float16Type::get(context))) { return static_cast(dnnl_f16); } else if (tattr == TypeAttr::get( IntegerType::get(context, 32, IntegerType::Signed))) { diff --git a/include/gc/Transforms/Utils/StructuredOpMatcher.h b/include/gc/Transforms/Utils/StructuredOpMatcher.h index 66d398474..131888b1b 100644 --- a/include/gc/Transforms/Utils/StructuredOpMatcher.h +++ b/include/gc/Transforms/Utils/StructuredOpMatcher.h @@ -163,7 +163,7 @@ struct HasStaticStrides { SmallVector strides; if (auto memRefType = dyn_cast_or_null(operandType)) { int64_t offset; - if (failed(getStridesAndOffset(memRefType, strides, offset))) + if (failed(memRefType.getStridesAndOffset(strides, offset))) return false; if (llvm::any_of(strides, [](int64_t stride) { return stride == ShapedType::kDynamic; @@ -244,7 +244,8 @@ struct NumDpsInits { // Callable object to validate number of input operands for `op`. 
struct NumDpsInputs { NumDpsInputs() = delete; - explicit NumDpsInputs(std::function fun) : fun(std::move(fun)){}; + explicit NumDpsInputs(std::function fun) + : fun(std::move(fun)){}; bool operator()(Operation *op) { if (auto linalgOp = dyn_cast_or_null(op)) diff --git a/lib/gc/Dialect/Linalgx/Utils.cpp b/lib/gc/Dialect/Linalgx/Utils.cpp index fe9096fe7..73a1c9f93 100644 --- a/lib/gc/Dialect/Linalgx/Utils.cpp +++ b/lib/gc/Dialect/Linalgx/Utils.cpp @@ -385,7 +385,7 @@ bool isGenericAttrEquivalent(linalg::GenericOp op, ShapedType shapeA, DenseMap replaceMap; std::map iterMap; // get shape-to-loop map - AffineMap inverse = inversePermutation(concatAffineMaps(inMaps)); + AffineMap inverse = inversePermutation(concatAffineMaps(inMaps, context)); assert(inverse && "shape-to-loops map to be non-null"); assert(dimSize == inverse.getResults().size()); // renumber the dim id based on shape-to-loop map @@ -492,8 +492,10 @@ bool isGenericPackedMatmulOpImpl(linalg::GenericOp genericOp, return false; } // Check for packing - ValueRange inputs = genericOp.getDpsInputs(); - ValueRange outputs = genericOp.getDpsInits(); + auto inputsVec = genericOp.getDpsInputs(); + ValueRange inputs = inputsVec; + auto outputsVec = genericOp.getDpsInits(); + ValueRange outputs = outputsVec; auto shapeA = cast(inputs.front().getType()); auto shapeB = cast(inputs.back().getType()); auto shapeC = cast(outputs.back().getType()); diff --git a/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp b/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp index 785a5bc03..f8fc07bee 100644 --- a/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp +++ b/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp @@ -551,11 +551,11 @@ static LogicalResult verifyBrgemmDataTypes(ArrayAttr dtypes, auto context = op.getContext(); -#define FTAttr(t) TypeAttr::get(FloatType::get##t(context)) +#define FTAttr(t) TypeAttr::get(t::get(context)) #define ITAttr(s, w) TypeAttr::get(IntegerType::get(context, w, IntegerType::s)) SmallVector> validDataTypes = 
{ - {FTAttr(F32), FTAttr(F32)}, - {FTAttr(BF16), FTAttr(BF16)}, + {FTAttr(Float32Type), FTAttr(Float32Type)}, + {FTAttr(BFloat16Type), FTAttr(BFloat16Type)}, {ITAttr(Unsigned, 8), ITAttr(Signed, 8)}, {ITAttr(Signed, 8), ITAttr(Unsigned, 8)}, {ITAttr(Unsigned, 8), ITAttr(Unsigned, 8)}, diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp index 2c48c214e..dfdf366d9 100644 --- a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp +++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp @@ -718,7 +718,7 @@ StringRef createStaticMain(OpBuilder &builder, ModuleOp &module, auto offsetPtr = constArgs.end(); constArgs.emplace_back(0); constArgs.append(shape.begin(), shape.end()); - if (failed(getStridesAndOffset(type, constArgs, *offsetPtr))) { + if (failed(type.getStridesAndOffset(constArgs, *offsetPtr))) { gcLogD("Failed to get strides and offset of arg", i, " of the function ", funcName.begin()); return {}; @@ -929,8 +929,9 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) { builder.getI64IntegerAttr(static_cast(wgSize))); TargetDeviceSpecInterface devSpec = TargetDeviceSpecAttr::get(ctx, dltiAttrs); - auto sysSpec = - TargetSystemSpecAttr::get(ctx, ArrayRef(std::pair(devStr, devSpec))); + DataLayoutEntryInterface dl = + DataLayoutEntryAttr::get(ctx, devStr, devSpec); + auto sysSpec = TargetSystemSpecAttr::get(ctx, ArrayRef(dl)); mod = mlirModule.clone(); mod.getOperation()->setAttr("#dlti.sys_spec", sysSpec); PassManager pm{ctx}; diff --git a/lib/gc/Transforms/DecomposeAggregatedOps.cpp b/lib/gc/Transforms/DecomposeAggregatedOps.cpp index a9cf889a9..3f84a8b3a 100644 --- a/lib/gc/Transforms/DecomposeAggregatedOps.cpp +++ b/lib/gc/Transforms/DecomposeAggregatedOps.cpp @@ -42,7 +42,7 @@ struct DecomposeAggregatedOps void runOnOperation() override { RewritePatternSet patterns(getOperation().getContext()); patterns.add(patterns.getContext()); - 
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; diff --git a/lib/gc/Transforms/DecomposeTensorOperation.cpp b/lib/gc/Transforms/DecomposeTensorOperation.cpp index 3f4f4ecf9..758d97717 100644 --- a/lib/gc/Transforms/DecomposeTensorOperation.cpp +++ b/lib/gc/Transforms/DecomposeTensorOperation.cpp @@ -170,8 +170,7 @@ struct DecomposeTensorOperationPass patterns.add(patterns.getContext()); tensor::populateDecomposeTensorConcatPatterns(patterns); - if (failed(applyPatternsAndFoldGreedily(getOperation(), - std::move(patterns)))) { + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { return signalPassFailure(); } } diff --git a/lib/gc/Transforms/DeepTileContractionOp.cpp b/lib/gc/Transforms/DeepTileContractionOp.cpp index 21de7b778..c53138f44 100644 --- a/lib/gc/Transforms/DeepTileContractionOp.cpp +++ b/lib/gc/Transforms/DeepTileContractionOp.cpp @@ -405,7 +405,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp, // the extra copy generated by bufferization. So remove the dummy loop // at this early stage. 
if (!isDummyLoop(tilingResult->loops.back())) { - b.replaceOp(currentOp, tilingResult->replacements); + b.replaceOp(currentOp, tilingResult->mergeResult.replacements); currentOp = dyn_cast(tilingResult->tiledOps.back()); if (iteratorTypes[d] == mlir::utils::IteratorType::reduction) result.reductionLoops.push_back(tilingResult->loops.back()); @@ -477,7 +477,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp, b, cast(currentOp.getOperation()), tileOption); if (failed(tilingResult)) return failure(); - b.replaceOp(currentOp, tilingResult->replacements); + b.replaceOp(currentOp, tilingResult->mergeResult.replacements); currentOp = dyn_cast(tilingResult->tiledOps.back()); } } @@ -1029,8 +1029,7 @@ struct DeepTileContractionOp dialect->getCanonicalizationPatterns(patterns); for (RegisteredOperationName op : ctx.getRegisteredOperations()) op.getCanonicalizationPatterns(patterns, &ctx); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) return signalPassFailure(); } }; diff --git a/lib/gc/Transforms/FoldTensorOperation.cpp b/lib/gc/Transforms/FoldTensorOperation.cpp index e0bf23abb..abd84ab16 100644 --- a/lib/gc/Transforms/FoldTensorOperation.cpp +++ b/lib/gc/Transforms/FoldTensorOperation.cpp @@ -44,8 +44,7 @@ struct FoldTensorOperationPass // Use to remove useless tensor operation like extract or // insert slice. 
config.strictMode = GreedyRewriteStrictness::ExistingOps; - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(pattern), - config); + (void)applyPatternsGreedily(getOperation(), std::move(pattern), config); } }; } // namespace diff --git a/lib/gc/Transforms/GPU/AllocsToSLM.cpp b/lib/gc/Transforms/GPU/AllocsToSLM.cpp index 46ec2a4ad..06c4dce6b 100644 --- a/lib/gc/Transforms/GPU/AllocsToSLM.cpp +++ b/lib/gc/Transforms/GPU/AllocsToSLM.cpp @@ -152,7 +152,7 @@ struct AllocsToSLM : public gc::impl::AllocsToSLMBase { RewritePatternSet patterns(ctx); patterns.add(patterns.getContext()); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; diff --git a/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp b/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp index bc78fe937..8edeca784 100644 --- a/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp +++ b/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp @@ -2124,17 +2124,17 @@ struct LinalgToXeGPU : public gc::impl::LinalgToXeGPUBase { // Run GEMM pattern first to allow fusion with its consumers. RewritePatternSet gemmPatterns(&getContext()); populateLinalgGemmToXeGPUPatterns(gemmPatterns, options); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(gemmPatterns)); + (void)applyPatternsGreedily(getOperation(), std::move(gemmPatterns)); // Convert memory fill ops. RewritePatternSet fillPatterns(&getContext()); populateLinalgMemoryFillToXeGPUPatterns(fillPatterns, options); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(fillPatterns)); + (void)applyPatternsGreedily(getOperation(), std::move(fillPatterns)); // Convert other remaining ops. 
RewritePatternSet patterns(&getContext()); populateLinalgEltwiseToXeGPUPatterns(patterns, options); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; diff --git a/lib/gc/Transforms/GPU/Pipeline.cpp b/lib/gc/Transforms/GPU/Pipeline.cpp index 5386fbf38..f90d9f562 100644 --- a/lib/gc/Transforms/GPU/Pipeline.cpp +++ b/lib/gc/Transforms/GPU/Pipeline.cpp @@ -154,7 +154,8 @@ void populateGPUPipeline(OpPassManager &pm, pm.addPass(createGpuKernelOutliningPass()); pm.addPass(createConvertXeVMToLLVMPass()); pm.addPass(createGpuXeVMAttachTarget()); - pm.addNestedPass(createConvertGpuOpsToLLVMSPVOps()); + pm.addNestedPass( + createConvertGpuOpsToLLVMSPVOps({.use64bitIndex = true})); pm.addNestedPass(createConvertIndexToLLVMPass()); pm.addNestedPass(createArithToLLVMConversionPass()); pm.addPass(createReconcileUnrealizedCastsPass()); diff --git a/lib/gc/Transforms/IterativeTilingAndFusion.cpp b/lib/gc/Transforms/IterativeTilingAndFusion.cpp index a486c29b0..d492e01e2 100644 --- a/lib/gc/Transforms/IterativeTilingAndFusion.cpp +++ b/lib/gc/Transforms/IterativeTilingAndFusion.cpp @@ -813,7 +813,7 @@ void iterativeTilingAndFusionUntilExhaustion( defaultTilingOfType(rewriter, op, isaOpTy, cfg); if (succeeded(tilingResult)) { tiledOps.insert(tilingResult->tiledOps[0]); - rewriter.replaceOp(op, tilingResult->replacements); + rewriter.replaceOp(op, tilingResult->mergeResult.replacements); break; } } diff --git a/lib/gc/Transforms/LowerToTileVector.cpp b/lib/gc/Transforms/LowerToTileVector.cpp index d105eaeb8..9690b2461 100644 --- a/lib/gc/Transforms/LowerToTileVector.cpp +++ b/lib/gc/Transforms/LowerToTileVector.cpp @@ -614,8 +614,7 @@ struct LowerToTileVectorPass // Init patterns use to remove useless tensor operation like extract or // insert slice. 
configInit.strictMode = GreedyRewriteStrictness::ExistingOps; - (void)applyPatternsAndFoldGreedily(funcOp, std::move(patternsInit), - configInit); + (void)applyPatternsGreedily(funcOp, std::move(patternsInit), configInit); RewritePatternSet firstPatterns(ctx); // All the dynamic shape will reject to lower. @@ -623,8 +622,8 @@ struct LowerToTileVectorPass GreedyRewriteConfig configFirstPn; // We only apply the lowering pattern on existing operations configFirstPn.strictMode = GreedyRewriteStrictness::ExistingOps; - (void)applyPatternsAndFoldGreedily(funcOp, std::move(firstPatterns), - configFirstPn); + (void)applyPatternsGreedily(funcOp, std::move(firstPatterns), + configFirstPn); // Error case: // ``` // linalg.copy : <1x32xf32> @@ -649,10 +648,10 @@ struct LowerToTileVectorPass vector::populateVectorTransferPermutationMapLoweringPatterns(secondPattern); // Remove unnessary broadcast operation vector::populateSinkVectorOpsPatterns(secondPattern); - // Second fold (with the help of the `applyPatternsAndFoldGreedily` + // Second fold (with the help of the `applyPatternsGreedily` // function) can help us to eliminate redundant operation like consecutive // read and write. 
- (void)applyPatternsAndFoldGreedily(funcOp, std::move(secondPattern)); + (void)applyPatternsGreedily(funcOp, std::move(secondPattern)); // may need other patterns to reduce redundant operations } }; diff --git a/lib/gc/Transforms/MemRefToCPURuntime.cpp b/lib/gc/Transforms/MemRefToCPURuntime.cpp index d18506e54..2498ad83a 100644 --- a/lib/gc/Transforms/MemRefToCPURuntime.cpp +++ b/lib/gc/Transforms/MemRefToCPURuntime.cpp @@ -51,7 +51,7 @@ uint64_t getMemRefSizeInBytes(MemRefType memrefType) { if (!layout.isIdentity()) { int64_t offset; SmallVector strides; - if (failed(getStridesAndOffset(memrefType, strides, offset))) { + if (failed(memrefType.getStridesAndOffset(strides, offset))) { return UINT64_MAX; } diff --git a/lib/gc/Transforms/MergeNestedForall.cpp b/lib/gc/Transforms/MergeNestedForall.cpp index 07eb5ffbf..bd35e2e9d 100644 --- a/lib/gc/Transforms/MergeNestedForall.cpp +++ b/lib/gc/Transforms/MergeNestedForall.cpp @@ -82,8 +82,7 @@ struct MergeNestedForall patterns.add(patterns.getContext()); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) return signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp b/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp index 0eabd6e1b..c312abe6f 100644 --- a/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp +++ b/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp @@ -391,7 +391,7 @@ class ConvertLinalgToMicrokernel patterns.add>( &getContext()); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + if (failed(applyPatternsGreedily(getOperation(), patternSet))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp b/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp index 647d8f784..8a5d97f0a 100644 --- 
a/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp +++ b/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp @@ -63,7 +63,7 @@ class ConvertBrgemmDispatchOpRewriter SmallVector operands; SmallVector operandTypes; IntegerType integer64 = IntegerType::get(rewriter.getContext(), 64); - FloatType float32 = FloatType::getF32(rewriter.getContext()); + FloatType float32 = Float32Type::get(rewriter.getContext()); // M, N, K, LDA, LDB, LDC, stride_a, stride_b // they are in the same order with BrgemmDispatchOp inputs @@ -215,7 +215,7 @@ class ConvertMicrokernelToDnnlFunc &getContext()); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + if (failed(applyPatternsGreedily(getOperation(), patternSet))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp b/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp index 2f66feee4..058d55357 100644 --- a/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp +++ b/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp @@ -205,8 +205,7 @@ class EarlyDispatchMicrokernel // Ignore newly created Ops GreedyRewriteConfig config; config.strictMode = GreedyRewriteStrictness::ExistingOps; - if (failed( - applyPatternsAndFoldGreedily(getOperation(), patternSet, config))) + if (failed(applyPatternsGreedily(getOperation(), patternSet, config))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp b/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp index 9e58a76cf..164edb609 100644 --- a/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp +++ b/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp @@ -275,7 +275,7 @@ class ExpandMicrokernel patterns.add(&getContext()); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + if (failed(applyPatternsGreedily(getOperation(), 
patternSet))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp b/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp index 9865f5220..59554ef67 100644 --- a/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp +++ b/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp @@ -296,7 +296,7 @@ class MergeBranchMicrokernelContext patterns.add(&getContext(), dispatchAnalysis); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) { + if (failed(applyPatternsGreedily(getOperation(), patternSet))) { signalPassFailure(); } } diff --git a/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp b/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp index ad8a0631f..4363795ca 100644 --- a/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp +++ b/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp @@ -421,8 +421,7 @@ class MicrokernelInvariantCodeMotion // Ignore newly created Ops GreedyRewriteConfig config; config.strictMode = GreedyRewriteStrictness::ExistingOps; - if (failed( - applyPatternsAndFoldGreedily(getOperation(), patternSet, config))) { + if (failed(applyPatternsGreedily(getOperation(), patternSet, config))) { signalPassFailure(); } } diff --git a/lib/gc/Transforms/OneDNNGraphToLinalg.cpp b/lib/gc/Transforms/OneDNNGraphToLinalg.cpp index 5a75c37cd..138d3176d 100644 --- a/lib/gc/Transforms/OneDNNGraphToLinalg.cpp +++ b/lib/gc/Transforms/OneDNNGraphToLinalg.cpp @@ -515,8 +515,7 @@ struct ConvertOneDNNGraphToLinalg MatMulOpBatchFlatten // clang-format on >(ctx); - if (failed(applyPatternsAndFoldGreedily(getOperation(), - std::move(patternsPre)))) { + if (failed(applyPatternsGreedily(getOperation(), std::move(patternsPre)))) { signalPassFailure(); } // ========================================== diff --git a/lib/gc/Transforms/Utils/ValueUtils.cpp 
b/lib/gc/Transforms/Utils/ValueUtils.cpp index c6285df18..6db2fa5df 100644 --- a/lib/gc/Transforms/Utils/ValueUtils.cpp +++ b/lib/gc/Transforms/Utils/ValueUtils.cpp @@ -110,7 +110,7 @@ FailureOr> getStrides(Value value) { auto memrefType = cast(valueType); SmallVector strides; int64_t offset; - if (failed(getStridesAndOffset(memrefType, strides, offset))) + if (failed(memrefType.getStridesAndOffset(strides, offset))) return failure(); return strides; } diff --git a/src/dnnl/JsonParser.h b/src/dnnl/JsonParser.h index 6d9bc2893..9615219d8 100644 --- a/src/dnnl/JsonParser.h +++ b/src/dnnl/JsonParser.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -12,7 +12,6 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions * and limitations under the License. 
- * * SPDX-License-Identifier: Apache-2.0 */ @@ -179,8 +178,8 @@ class JsonParser { GC_DTYPE("u8", b.getIntegerType(8, true)), GC_DTYPE("f64", b.getF64Type()), GC_DTYPE("boolean", b.getI1Type()), - GC_DTYPE("f8_e5m2", b.getFloat8E5M2Type()), - GC_DTYPE("f8_e4m3", b.getFloat8E4M3FNType()), + GC_DTYPE("f8_e5m2", mlir::Float8E5M2Type::get(b.getContext())), + GC_DTYPE("f8_e4m3", mlir::Float8E4M3FNType::get(b.getContext())), GC_DTYPE("s4", b.getIntegerType(4, false)), GC_DTYPE("u4", b.getIntegerType(4, true)), }; diff --git a/test/benchgc/src/benchgc/mlir/util.py b/test/benchgc/src/benchgc/mlir/util.py index 9ff5b8f45..26c2c1e50 100644 --- a/test/benchgc/src/benchgc/mlir/util.py +++ b/test/benchgc/src/benchgc/mlir/util.py @@ -187,12 +187,12 @@ def attach_dlti(flags: argparse.Namespace, module: ir.Module): dlti_template = f""" module attributes {{ dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", {l1_data_cache_size} : ui32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", {l2_cache_size} : ui64>, - #dlti.dl_entry<"L3_cache_size_in_bytes", {l3_cache_size} : ui64>, - #dlti.dl_entry<"num_threads", {num_threads} : i32>, - #dlti.dl_entry<"max_vector_width", {max_vector_width} : i64>> + "CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = {l1_data_cache_size} : ui32, + "L2_cache_size_in_bytes" = {l2_cache_size} : ui64, + "L3_cache_size_in_bytes" = {l3_cache_size} : ui64, + "num_threads" = {num_threads} : i32, + "max_vector_width" = {max_vector_width} : i64> >}} {{}} """ with module.context: diff --git a/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir b/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir index 399467290..35666487a 100644 --- a/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir +++ b/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: target={{.*}} // RUN: gc-opt %s --finalize-memref-to-llvm
--convert-scf-to-cf --convert-cpuruntime-to-llvm --convert-func-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | FileCheck %s module { diff --git a/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir b/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir index c32cb618e..b25562fc0 100644 --- a/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir +++ b/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: target={{.*}} // RUN: gc-opt --split-input-file --convert-memref-to-cpuruntime %s -verify-diagnostics | FileCheck %s func.func @alloca() { diff --git a/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir b/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir index 3b3f4a26e..444edcda4 100644 --- a/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir +++ b/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt %s --gpu-to-llvm --convert-gpu-to-llvm-spv --gpu-module-to-binary | FileCheck %s +// RUN: gc-opt %s --gpu-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-module-to-binary | FileCheck %s module attributes {gpu.container_module} { // CHECK-LABEL:gpu.binary @entry_kernel diff --git a/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir b/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir index 61848dcb7..ccb9ca418 100644 --- a/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir +++ b/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir @@ -150,12 +150,12 @@ func.func @matmul_2Dx4D_bf16(%arg0: tensor<4096x4096xbf16>, %arg1: tensor<128x12 module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : i32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : i32>, - 
#dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : i32>, - #dlti.dl_entry<"num_threads", 56 : i32>, - #dlti.dl_entry<"max_vector_width", 512 : i32>> + "CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 49152 : i32, + "L2_cache_size_in_bytes" = 2097152 : i32, + "L3_cache_size_in_bytes" = 110100480 : i32, + "num_threads" = 56 : i32, + "max_vector_width" = 512 : i32> >} { // CHECK: #[[mapA:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3 * 2 + d4)> // CHECK: #[[mapB:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2, d4)> diff --git a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir index 3f28e68bc..282277d71 100644 --- a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir +++ b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s +// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s module @gemm attributes {gpu.container_module} { gpu.module @kernel { diff --git a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir index f4bb29f2a..c114673c3 100644 --- a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir +++ b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt %s 
--convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s +// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s module @gemm attributes {gpu.container_module} { @@ -23,8 +23,9 @@ module @gemm attributes {gpu.container_module} { %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32> %c0 = arith.constant 0 : i32 - %thread_x = gpu.thread_id x - %thread_x_i32 = arith.index_cast %thread_x : index to i32 + %thread_x = gpu.thread_id x + %thread_x_i64 = arith.index_cast %thread_x : index to i64 + %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32 %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32 %loaded_f32_modified = vector.insertelement %thread_x_f32, %loaded_f32[%c0 : i32] : vector<8xf32> %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32> diff --git a/test/mlir/test/gc/cpu-runner/tid.mlir b/test/mlir/test/gc/cpu-runner/tid.mlir index aedcc0a20..ff0fcd451 100644 --- a/test/mlir/test/gc/cpu-runner/tid.mlir +++ b/test/mlir/test/gc/cpu-runner/tid.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: target={{.*}} // RUN: gc-opt %s 
--convert-cpuruntime-to-llvm --convert-openmp-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s module { func.func private @omp_get_thread_num() -> i32 diff --git a/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp b/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp index a3ba8261b..518c50526 100644 --- a/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp +++ b/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp @@ -26,12 +26,12 @@ using namespace mlir; static const char code1[] = R"mlir( module attributes { dlti.target_system_spec = #dlti.target_system_spec< -"CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>, - #dlti.dl_entry<"L3_cache_size_in_bytes", "110100480">, - #dlti.dl_entry<"num_threads", 56 : i32>, - #dlti.dl_entry<"max_vector_width", 512 : i64>> +"CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 49152 : ui32, + "L2_cache_size_in_bytes" = 2097152 : ui64, + "L3_cache_size_in_bytes" = "110100480", + "num_threads" = 56 : i32, + "max_vector_width" = 512 : i64> >} {} )mlir"; @@ -56,9 +56,9 @@ TEST(TargetDescriptionAnalysis, CPUNormal) { static const char code2[] = R"mlir( module attributes { dlti.target_system_spec = #dlti.target_system_spec< -"CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui32>> +"CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 49152 : ui32, + "L2_cache_size_in_bytes" = 2097152 : ui32> >} {} )mlir"; diff --git a/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp b/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp index d2d15d8a4..ba92536e7 100644 --- a/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp +++
b/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp @@ -62,7 +62,7 @@ module @test { )mlir"; constexpr char matmulAddStatic[] = R"mlir( -module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"max_work_group_size", 16 : i64>>>} { +module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" = #dlti.target_device_spec<"max_work_group_size" = 16 : i64>>} { func.func @entry(%arg0: memref<128x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<128x256xf16>) { %0 = bufferization.to_tensor %arg0 restrict : memref<128x256xf16> %1 = bufferization.to_tensor %arg1 restrict : memref<256x256xf16>