diff --git a/.github/workflows/PR-5.x.yaml b/.github/workflows/PR-5.x.yaml
index 03162e9f3e3..e1e12deeee8 100644
--- a/.github/workflows/PR-5.x.yaml
+++ b/.github/workflows/PR-5.x.yaml
@@ -14,6 +14,9 @@ jobs:
   Ubuntu2004-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-ARM64.yaml@main
 
+  Ubuntu2004-ARM64-FastCV:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-ARM64-FastCV.yaml@main
+
   Ubuntu2004-x64-CUDA:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U20-Cuda.yaml@main
 
diff --git a/modules/bgsegm/tutorials/bgsegm_bg_subtraction/bgsegm_bg_subtraction.markdown b/modules/bgsegm/tutorials/bgsegm_bg_subtraction/bgsegm_bg_subtraction.markdown
index 4340b3654ee..07bfffb2325 100644
--- a/modules/bgsegm/tutorials/bgsegm_bg_subtraction/bgsegm_bg_subtraction.markdown
+++ b/modules/bgsegm/tutorials/bgsegm_bg_subtraction/bgsegm_bg_subtraction.markdown
@@ -30,8 +30,8 @@ In the following, we will have a look at two algorithms from the `bgsegm` module
 ### BackgroundSubtractorMOG
 
 It is a Gaussian Mixture-based Background/Foreground Segmentation Algorithm. It was introduced in
-the paper "An improved adaptive background mixture model for real-time tracking with shadow
-detection" by P. KadewTraKuPong and R. Bowden in 2001. It uses a method to model each background
+the paper "An Improved Adaptive Background Mixture Model for Realtime Tracking with Shadow
+Detection" by P. KaewTraKulPong and R. Bowden in 2001. It uses a method to model each background
 pixel by a mixture of K Gaussian distributions (K = 3 to 5). The weights of the mixture represent
 the time proportions that those colours stay in the scene. The probable background colours are the
 ones which stay longer and more static.
diff --git a/modules/cudaimgproc/src/color.cpp b/modules/cudaimgproc/src/color.cpp
index cad5dcd394d..94ffe90fa2f 100644
--- a/modules/cudaimgproc/src/color.cpp
+++ b/modules/cudaimgproc/src/color.cpp
@@ -70,8 +70,8 @@ namespace cv { namespace cuda {
         template <int cn>
         void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
 
-        template <int cn>
-        void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+        template <int cn, typename Depth>
+        void MHCdemosaic(PtrStepSz<Depth> src, int2 sourceOffset, PtrStepSz<Depth> dst, int2 firstRed, cudaStream_t stream);
     }
 }}
 
@@ -2136,7 +2136,7 @@ void cv::cuda::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn,
         GpuMat src = _src.getGpuMat();
         const int depth = _src.depth();
 
-        CV_Assert( depth == CV_8U );
+        CV_Assert( depth == CV_8U || depth == CV_16U);
         CV_Assert( src.channels() == 1 );
         CV_Assert( dcn == 3 || dcn == 4 );
 
@@ -2148,16 +2148,27 @@ void cv::cuda::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn,
         Size wholeSize;
         Point ofs;
         src.locateROI(wholeSize, ofs);
-        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
 
         const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
                                         code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
 
-        if (dcn == 3)
-            cv::cuda::device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
-        else
-            cv::cuda::device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
-
+        if (dcn == 3) { // srcWhole must describe the whole parent buffer: the ROI offset (ofs) is passed separately
+            if (depth == CV_8U) {
+                PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
+                cv::cuda::device::MHCdemosaic<3, uchar>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+            } else {
+                PtrStepSz<ushort> srcWhole(wholeSize.height, wholeSize.width, reinterpret_cast<ushort*>(src.datastart), src.step); // buffer start, not ROI start
+                cv::cuda::device::MHCdemosaic<3, ushort>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+            }
+        } else {
+            if (depth == CV_8U) {
+                PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
+                cv::cuda::device::MHCdemosaic<4, uchar>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+            } else {
+                PtrStepSz<ushort> srcWhole(wholeSize.height, wholeSize.width, reinterpret_cast<ushort*>(src.datastart), src.step); // buffer start, not ROI start
+                cv::cuda::device::MHCdemosaic<4, ushort>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+            }
+        }
         break;
     }
 
@@ -2166,7 +2177,7 @@ void cv::cuda::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn,
         GpuMat src = _src.getGpuMat();
         const int depth = _src.depth();
 
-        CV_Assert( depth == CV_8U );
+        CV_Assert( depth == CV_8U || depth == CV_16U);
 
         _dst.create(_src.size(), CV_MAKE_TYPE(depth, 1));
         GpuMat dst = _dst.getGpuMat();
@@ -2176,12 +2187,17 @@ void cv::cuda::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn,
         Size wholeSize;
         Point ofs;
         src.locateROI(wholeSize, ofs);
-        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
 
         const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
                                         code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
 
-        cv::cuda::device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+        if (depth == CV_8U) { // srcWhole must describe the whole parent buffer: the ROI offset (ofs) is passed separately
+            PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
+            cv::cuda::device::MHCdemosaic<1, uchar>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+        } else {
+            PtrStepSz<ushort> srcWhole(wholeSize.height, wholeSize.width, reinterpret_cast<ushort*>(src.datastart), src.step); // buffer start, not ROI start
+            cv::cuda::device::MHCdemosaic<1, ushort>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+        }
 
         break;
     }
diff --git a/modules/cudaimgproc/src/cuda/debayer.cu b/modules/cudaimgproc/src/cuda/debayer.cu
index bfe4b6f5ea8..dfd3b9aa11d 100644
--- a/modules/cudaimgproc/src/cuda/debayer.cu
+++ b/modules/cudaimgproc/src/cuda/debayer.cu
@@ -390,6 +390,17 @@ namespace cv { namespace cuda { namespace device
     //
     // ported to CUDA
 
+    template<typename Depth> __device__
+    typename TypeVec<Depth, 3>::vec_type make_3(Depth x, Depth y, Depth z);
+
+    template<> __device__ TypeVec<uchar, 3>::vec_type make_3<uchar>(uchar x, uchar y, uchar z) {
+        return make_uchar3(x, y, z);
+    }
+
+    template<> __device__ TypeVec<ushort, 3>::vec_type make_3<ushort>(ushort x, ushort y, ushort z) {
+        return make_ushort3(x, y, z);
+    }
+
     template <typename DstType, class Ptr2D>
     __global__ void MHCdemosaic(PtrStepSz<DstType> dst, Ptr2D src, const int2 firstRed)
     {
@@ -506,34 +517,36 @@ namespace cv { namespace cuda { namespace device
         alternate.x = (x + firstRed.x) % 2;
         alternate.y = (y + firstRed.y) % 2;
 
-        // in BGR sequence;
-        uchar3 pixelColor =
+        typedef typename VecTraits<DstType>::elem_type SrcElemType;
+        typedef typename TypeVec<SrcElemType, 3>::vec_type SrcType;
+
+        SrcType pixelColor =
             (alternate.y == 0) ?
                 ((alternate.x == 0) ?
-                    make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
-                    make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
+                    make_3<SrcElemType>(saturate_cast<SrcElemType>(PATTERN.y), saturate_cast<SrcElemType>(PATTERN.x), saturate_cast<SrcElemType>(C)) :
+                    make_3<SrcElemType>(saturate_cast<SrcElemType>(PATTERN.w), saturate_cast<SrcElemType>(C), saturate_cast<SrcElemType>(PATTERN.z))) :
                 ((alternate.x == 0) ?
-                    make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
-                    make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
+                    make_3<SrcElemType>(saturate_cast<SrcElemType>(PATTERN.z), saturate_cast<SrcElemType>(C), saturate_cast<SrcElemType>(PATTERN.w)) :
+                    make_3<SrcElemType>(saturate_cast<SrcElemType>(C), saturate_cast<SrcElemType>(PATTERN.x), saturate_cast<SrcElemType>(PATTERN.y)));
 
         dst(y, x) = toDst<DstType>(pixelColor);
     }
 
-    template <int cn>
-    void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
+    template <int cn, typename Depth>
+    void MHCdemosaic(PtrStepSz<Depth> src, int2 sourceOffset, PtrStepSz<Depth> dst, int2 firstRed, cudaStream_t stream)
     {
-        typedef typename TypeVec<uchar, cn>::vec_type dst_t;
+        typedef typename TypeVec<Depth, cn>::vec_type dst_t;
 
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 
         if (sourceOffset.x || sourceOffset.y) {
-            cv::cudev::TextureOff<uchar> texSrc(src, sourceOffset.y, sourceOffset.x);
-            MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+            cv::cudev::TextureOff<Depth> texSrc(src, sourceOffset.y, sourceOffset.x);
+            MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<Depth>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
         }
         else {
-            cv::cudev::Texture<uchar> texSrc(src);
-            MHCdemosaic<dst_t, cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+            cv::cudev::Texture<Depth> texSrc(src);
+            MHCdemosaic<dst_t, cv::cudev::TexturePtr<Depth>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
         }
 
         cudaSafeCall( cudaGetLastError() );
@@ -542,9 +555,12 @@ namespace cv { namespace cuda { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-    template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-    template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<1, uchar>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<3, uchar>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<4, uchar>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<1, ushort>(PtrStepSz<ushort> src, int2 sourceOffset, PtrStepSz<ushort> dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<3, ushort>(PtrStepSz<ushort> src, int2 sourceOffset, PtrStepSz<ushort> dst, int2 firstRed, cudaStream_t stream);
+    template void MHCdemosaic<4, ushort>(PtrStepSz<ushort> src, int2 sourceOffset, PtrStepSz<ushort> dst, int2 firstRed, cudaStream_t stream);
 }}}
 
 #endif /* CUDA_DISABLER */
diff --git a/modules/cudev/test/CMakeLists.txt b/modules/cudev/test/CMakeLists.txt
index ff936cad54c..d8768ab6c56 100644
--- a/modules/cudev/test/CMakeLists.txt
+++ b/modules/cudev/test/CMakeLists.txt
@@ -20,9 +20,16 @@ if(OCV_DEPENDENCIES_FOUND)
     ocv_check_windows_crt_linkage()
     set(target_libs ${target_libs} ${CUDA_LIBRARIES})
     if(CUDA_VERSION VERSION_LESS "11.0")
-      ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++11")
+      # Windows version does not support --std option
+      if(UNIX OR APPLE)
+        ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++11")
+      endif()
     else()
-      ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++14")
+      if(CUDA_VERSION VERSION_LESS "12.8")
+        ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++14")
+      else()
+        ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++17")
+      endif()
       ocv_warnings_disable(CMAKE_CXX_FLAGS -Wdeprecated-declarations)
     endif()
     CUDA_ADD_EXECUTABLE(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} OPTIONS ${OPENCV_CUDA_OPTIONS_opencv_test_cudev})
diff --git a/modules/fastcv/include/opencv2/fastcv.hpp b/modules/fastcv/include/opencv2/fastcv.hpp
index af188dfcb09..292e83a2dc3 100644
--- a/modules/fastcv/include/opencv2/fastcv.hpp
+++ b/modules/fastcv/include/opencv2/fastcv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -11,6 +11,7 @@
 #include "opencv2/fastcv/arithm.hpp"
 #include "opencv2/fastcv/bilateralFilter.hpp"
 #include "opencv2/fastcv/blur.hpp"
+#include "opencv2/fastcv/channel.hpp"
 #include "opencv2/fastcv/cluster.hpp"
 #include "opencv2/fastcv/draw.hpp"
 #include "opencv2/fastcv/edges.hpp"
diff --git a/modules/fastcv/include/opencv2/fastcv/arithm.hpp b/modules/fastcv/include/opencv2/fastcv/arithm.hpp
index 5a0c43b2408..29f5fdfe1f9 100644
--- a/modules/fastcv/include/opencv2/fastcv/arithm.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/arithm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -8,6 +8,10 @@
 
 #include <opencv2/core.hpp>
 
+#define FCV_CMP_EQ(val1,val2) (fabs((val1) - (val2)) < FLT_EPSILON) // approximate float equality; args parenthesized so expression arguments expand safely
+
+#define FCV_OPTYPE(depth,op) (((depth)<<3) + (op)) // packs (depth, op) into one integer usable as a switch key
+
 namespace cv {
 namespace fastcv {
 
@@ -26,6 +30,59 @@ CV_EXPORTS_W void matmuls8s32(InputArray src1, InputArray src2, OutputArray dst)
 
 //! @}
 
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Arithmetic add and subtract operations for two matrices
+ *        It is optimized for Qualcomm's processors
+ * @param src1 First source matrix, can be of type CV_8U, CV_16S, CV_32F.
+ *             Note: CV_32F not supported for subtract
+ * @param src2 Second source matrix of same type and size as src1
+ * @param dst Resulting matrix of the same type and size as the source matrices
+ * @param op  type of operation - 0 for add and 1 for subtract
+ */
+CV_EXPORTS_W void arithmetic_op(InputArray src1, InputArray src2, OutputArray dst, int op);
+
+//! @}
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Matrix multiplication of two float type matrices
+ *        R = a*A*B + b*C where A,B,C,R are matrices and a,b are constants
+ *        It is optimized for Qualcomm's processors
+ * @param src1 First source matrix of type CV_32F
+ * @param src2 Second source matrix of type CV_32F whose row count equals the column count of src1
+ * @param dst Resulting matrix of type CV_32F
+ * @param alpha multiplying factor for src1 and src2
+ * @param src3 Optional third matrix of type CV_32F to be added to matrix product
+ * @param beta multiplying factor for src3
+ */
+CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, OutputArray dst, float alpha = 1.0,
+                           InputArray src3 = noArray(), float beta = 0.0);
+
+//! @}
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Integral of a YCbCr420 image.
+ *        Note: Input height should be multiple of 2. Input width and stride should be multiple of 16.
+ *              Output stride should be multiple of 8.
+ *              It is optimized for Qualcomm's processors
+ * @param Y Input Y component of 8UC1 YCbCr420 image.
+ * @param CbCr Input CbCr component(interleaved) of 8UC1 YCbCr420 image.
+ * @param IY Output Y integral of CV_32S one channel, size (Y height + 1)*(Y width + 1)
+ * @param ICb Output Cb integral of CV_32S one channel, size (Y height/2 + 1)*(Y width/2 + 1)
+ * @param ICr Output Cr integral of CV_32S one channel, size (Y height/2 + 1)*(Y width/2 + 1)
+ */
+CV_EXPORTS_W void integrateYUV(InputArray Y, InputArray CbCr, OutputArray IY, OutputArray ICb, OutputArray ICr);
+
+//! @}
+
 } // fastcv::
 } // cv::
 
diff --git a/modules/fastcv/include/opencv2/fastcv/blur.hpp b/modules/fastcv/include/opencv2/fastcv/blur.hpp
index 99d1cd3d655..fdb2326d993 100644
--- a/modules/fastcv/include/opencv2/fastcv/blur.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/blur.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -58,6 +58,22 @@ CV_EXPORTS_W void filter2D(InputArray _src, OutputArray _dst, int ddepth, InputA
 CV_EXPORTS_W void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY);
 //! @}
 
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Calculates the local subtractive and contrastive normalization of the image.
+ *        Each pixel of the image is normalized by the mean and standard deviation of the patch centred at the pixel.
+ *        It is optimized for Qualcomm's processors.
+ * @param _src Input image, should have one channel CV_8U or CV_32F
+ * @param _dst Output array, should be one channel, CV_8S if src of type CV_8U, or CV_32F if src of CV_32F
+ * @param pSize Patch size for mean and std dev calculation
+ * @param useStdDev If true, both mean and std dev are used for normalization; if false, only the mean is used
+ */
+CV_EXPORTS_W void normalizeLocalBox(InputArray _src, OutputArray _dst, Size pSize, bool useStdDev);
+
+//! @}
+
 } // fastcv::
 } // cv::
 
diff --git a/modules/fastcv/include/opencv2/fastcv/channel.hpp b/modules/fastcv/include/opencv2/fastcv/channel.hpp
new file mode 100644
index 00000000000..7b911a15f71
--- /dev/null
+++ b/modules/fastcv/include/opencv2/fastcv/channel.hpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#ifndef OPENCV_FASTCV_CHANNEL_HPP
+#define OPENCV_FASTCV_CHANNEL_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace fastcv {
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Creates one multi-channel mat out of several single-channel CV_8U mats.
+ *        Optimized for Qualcomm's processors
+ * @param mv input vector of matrices to be merged; all the matrices in mv must be of CV_8UC1 and have the same size
+ *           Note: the number of mats can be 2, 3 or 4.
+ * @param dst output array of depth CV_8U and same size as mv[0]; The number of channels
+ *            will be the total number of matrices in the matrix array
+ */
+CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst);
+
+//! @}
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Splits a CV_8U multi-channel mat into several CV_8UC1 mats
+ *        Optimized for Qualcomm's processors
+ * @param src input 2,3 or 4 channel mat of depth CV_8U
+ * @param mv  output vector of size src.channels() of CV_8UC1 mats
+ */
+CV_EXPORTS_W void split(InputArray src, OutputArrayOfArrays mv);
+
+//! @}
+
+} // fastcv::
+} // cv::
+
+#endif // OPENCV_FASTCV_CHANNEL_HPP
diff --git a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
index 6c20a21ab78..962730f40f9 100644
--- a/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
+++ b/modules/fastcv/include/opencv2/fastcv/pyramid.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -16,7 +16,7 @@ namespace fastcv {
 
 /**
  * @brief Creates a gradient pyramid from an image pyramid
- *
+ *        Note: The borders are ignored during gradient calculation.
  * @param pyr Input pyramid of 1-channel 8-bit images. Only continuous images are supported.
  * @param dx Horizontal Sobel gradient pyramid of the same size as pyr
  * @param dy Verical Sobel gradient pyramid of the same size as pyr
diff --git a/modules/fastcv/perf/perf_arithm.cpp b/modules/fastcv/perf/perf_arithm.cpp
new file mode 100644
index 00000000000..c1cf04a5438
--- /dev/null
+++ b/modules/fastcv/perf/perf_arithm.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+typedef perf::TestBaseWithParam<tuple<Size, int>> IntegrateYUVPerfTest;
+
+PERF_TEST_P(IntegrateYUVPerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(CV_8U)                                     // image depth
+                      )
+           )
+{
+    cv::Size srcSize = get<0>(GetParam());
+    int depth = get<1>(GetParam());
+
+    cv::Mat Y(srcSize, depth), CbCr(srcSize.height/2, srcSize.width, depth);
+    cv::Mat IY, ICb, ICr;
+    RNG& rng = cv::theRNG();
+    cvtest::randUni(rng, Y, Scalar::all(0), Scalar::all(255));
+    cvtest::randUni(rng, CbCr, Scalar::all(0), Scalar::all(255));
+
+    TEST_CYCLE() cv::fastcv::integrateYUV(Y, CbCr, IY, ICb, ICr);
+
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
\ No newline at end of file
diff --git a/modules/fastcv/perf/perf_blur.cpp b/modules/fastcv/perf/perf_blur.cpp
index bca8f80974a..8c8fd59b66a 100644
--- a/modules/fastcv/perf/perf_blur.cpp
+++ b/modules/fastcv/perf/perf_blur.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -120,4 +120,29 @@ PERF_TEST_P(SepFilter2DPerfTest, run,
     SANITY_CHECK_NOTHING();
 }
 
+typedef perf::TestBaseWithParam<tuple<Size, int, Size, int>> NormalizeLocalBoxPerfTest;
+
+PERF_TEST_P(NormalizeLocalBoxPerfTest, run,
+    ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p), // image size
+                       ::testing::Values(CV_8U,CV_32F),                             // src image depth
+                       ::testing::Values(Size(3,3),Size(5,5)),                      // patch size
+                       ::testing::Values(0,1)                                       // use std dev or not
+                       )
+           )
+{
+    cv::Size srcSize = get<0>(GetParam());
+    int depth = get<1>(GetParam());
+    Size sz = get<2>(GetParam());
+    bool useStdDev = get<3>(GetParam());
+
+    cv::Mat src(srcSize, depth);
+    cv::Mat dst;
+    RNG& rng = cv::theRNG();
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(255));
+
+    TEST_CYCLE() cv::fastcv::normalizeLocalBox(src, dst, sz, useStdDev);
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
\ No newline at end of file
diff --git a/modules/fastcv/perf/perf_matmul.cpp b/modules/fastcv/perf/perf_matmul.cpp
index 83af7618b31..a8e4f314b55 100644
--- a/modules/fastcv/perf/perf_matmul.cpp
+++ b/modules/fastcv/perf/perf_matmul.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -10,6 +10,9 @@ namespace opencv_test {
 typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/> MatMulPerfParams;
 typedef perf::TestBaseWithParam<MatMulPerfParams> MatMulPerfTest;
 
+typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/, float> MatMulGemmPerfParams;
+typedef perf::TestBaseWithParam<MatMulGemmPerfParams> MatMulGemmPerfTest;
+
 PERF_TEST_P(MatMulPerfTest, run,
     ::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1
                        ::testing::Values(8, 16, 128, 256), // cols1
@@ -37,4 +40,34 @@ PERF_TEST_P(MatMulPerfTest, run,
     SANITY_CHECK_NOTHING();
 }
 
+PERF_TEST_P(MatMulGemmPerfTest, run,
+    ::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1
+                       ::testing::Values(8, 16, 128, 256), // cols1
+                       ::testing::Values(8, 16, 128, 256), // cols2
+                       ::testing::Values(2.5, 5.8))   // alpha
+           )
+{
+    auto p = GetParam();
+    int rows1 = std::get<0>(p);
+    int cols1 = std::get<1>(p);
+    int cols2 = std::get<2>(p);
+    float alpha = std::get<3>(p);
+
+    RNG& rng = cv::theRNG();
+    Mat src1(rows1, cols1, CV_32FC1), src2(cols1, cols2, CV_32FC1);
+    cvtest::randUni(rng, src1, Scalar::all(-128.0), Scalar::all(128.0));
+    cvtest::randUni(rng, src2, Scalar::all(-128.0), Scalar::all(128.0));
+
+    Mat dst;
+
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::gemm(src1, src2, dst, alpha, noArray(), 0);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
diff --git a/modules/fastcv/src/arithm.cpp b/modules/fastcv/src/arithm.cpp
index bf8077cbe7b..8b9a4be1f48 100644
--- a/modules/fastcv/src/arithm.cpp
+++ b/modules/fastcv/src/arithm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -32,5 +32,177 @@ void matmuls8s32(InputArray _src1, InputArray _src2, OutputArray _dst)
                            (int32_t*)dst.data, dst.step);
 }
 
+void arithmetic_op(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
+{
+    // Element-wise add (op == 0) or subtract (op == 1) of two same-type, same-size matrices via FastCV.
+    CV_Assert(!_src1.empty() && (_src1.depth() == CV_8U || _src1.depth() == CV_16S || _src1.depth() == CV_32F));
+    CV_Assert(!_src2.empty() && _src2.type() == _src1.type());
+    CV_Assert(_src2.size() == _src1.size());
+    Mat src1 = _src1.getMat();
+    Mat src2 = _src2.getMat();
+
+    _dst.create(_src1.rows(), _src1.cols(), _src1.type());
+    Mat dst = _dst.getMat();
+
+    INITIALIZATION_CHECK;
+
+    fcvConvertPolicy policy = FASTCV_CONVERT_POLICY_SATURATE;
+    int nStripes = cv::getNumThreads();
+    // Row width is passed in elements, so interleaved channels are counted; equals cols for 1-channel input.
+    int width = src1.cols * src1.channels();
+    int func = FCV_OPTYPE(_src1.depth(), op);
+    switch(func)
+    {
+        case FCV_OPTYPE(CV_8U, 0):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                          int rangeHeight = range.end - range.start;
+                          const uchar* yS1 =  src1.data + static_cast<size_t>(range.start)*src1.step[0];
+                          const uchar* yS2 =  src2.data + static_cast<size_t>(range.start)*src2.step[0];
+                          uchar* yD = dst.data + static_cast<size_t>(range.start)*dst.step[0];
+                          fcvAddu8(yS1, width, rangeHeight, src1.step[0],
+                                     yS2, src2.step[0], policy, yD, dst.step[0]);
+                          }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_16S, 0):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                          int rangeHeight = range.end - range.start;
+                          const short* yS1 =  (short*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(short));
+                          const short* yS2 =  (short*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(short));
+                          short* yD = (short*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(short));
+                          fcvAdds16_v2(yS1, width, rangeHeight, src1.step[0],
+                                     yS2, src2.step[0], policy, yD, dst.step[0]);
+                          }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_32F, 0):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                          int rangeHeight = range.end - range.start;
+                          const float* yS1 =  (float*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(float));
+                          const float* yS2 =  (float*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(float));
+                          float* yD = (float*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(float));
+                          fcvAddf32(yS1, width, rangeHeight, src1.step[0],
+                                     yS2, src2.step[0], yD, dst.step[0]);
+                          }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_8U, 1):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                          int rangeHeight = range.end - range.start;
+                          const uchar* yS1 =  src1.data + static_cast<size_t>(range.start)*src1.step[0];
+                          const uchar* yS2 =  src2.data + static_cast<size_t>(range.start)*src2.step[0];
+                          uchar* yD = dst.data + static_cast<size_t>(range.start)*dst.step[0];
+                          fcvSubtractu8(yS1, width, rangeHeight, src1.step[0],
+                                     yS2, src2.step[0], policy, yD, dst.step[0]);
+                          }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_16S, 1):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                          int rangeHeight = range.end - range.start;
+                          const short* yS1 =  (short*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(short));
+                          const short* yS2 =  (short*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(short));
+                          short* yD = (short*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(short));
+                          fcvSubtracts16(yS1, width, rangeHeight, src1.step[0],
+                                     yS2, src2.step[0], policy, yD, dst.step[0]);
+                          }, nStripes);
+            break;
+        default:
+            CV_Error(cv::Error::StsBadArg, cv::format("op type is not supported"));
+            break;
+    }
+}
+
+
+void gemm(InputArray _src1, InputArray _src2, OutputArray _dst, float alpha, InputArray _src3, float beta)
+{
+    // Computes dst = alpha*src1*src2 + beta*src3 on CV_32FC1 matrices; src3 is optional.
+    CV_Assert(!_src1.empty() && _src1.type() == CV_32FC1);
+    CV_Assert(_src1.cols() == _src2.rows());
+    Mat src1 = _src1.getMat();
+
+    CV_Assert(!_src2.empty() && _src2.type() == CV_32FC1);
+    Mat src2 = _src2.getMat();
+
+    int n = src1.cols, m = src1.rows, k = src2.cols;
+
+    Mat src3 = _src3.getMat();
+    bool isSrc3 = !_src3.empty();
+    // src3 is only read when beta != 0; it is then accessed as an m x k float matrix, so validate it.
+    bool useSrc3 = isSrc3 && !FCV_CMP_EQ(beta,0);
+    CV_Assert(!useSrc3 || (src3.type() == CV_32FC1 && src3.rows == m && src3.cols == k));
+
+    _dst.create(m, k, CV_32FC1);
+    Mat dst = _dst.getMat();
+
+    CV_Assert(!FCV_CMP_EQ(alpha,0));
+
+    cv::Mat dst_temp1;
+    float *dstp = NULL;
+    bool inplace = false;
+    size_t dst_stride;
+    fcvStatus status = FASTCV_SUCCESS;
+
+    INITIALIZATION_CHECK;
+
+    // Write into a temporary when dst shares its buffer with an input; it is copied back at the end.
+    if(src1.data == dst.data || src2.data == dst.data || (isSrc3 && (src3.data == dst.data)))
+    {
+        dst_temp1 = cv::Mat(m, k, CV_32FC1);
+        dstp = dst_temp1.ptr<float>();
+        inplace = true;
+        dst_stride = dst_temp1.step[0];
+    }
+    else
+    {
+        dstp = (float32_t*)dst.data;
+        dst_stride = dst.step[0];
+    }
+
+    status = fcvMatrixMultiplyf32_v2((float32_t*)src1.data, n, m, src1.step[0], (float32_t*)src2.data, k,
+                                        src2.step[0], dstp, dst_stride);
+
+    // alpha == 0 was rejected above, so the scaling step is skipped only for alpha == 1.
+    if(!FCV_CMP_EQ(alpha,1) && status == FASTCV_SUCCESS)
+        status = fcvMultiplyScalarf32(dstp, k, m, dst_stride, alpha, dstp, dst_stride);
+
+    if(useSrc3 && status == FASTCV_SUCCESS)
+    {
+        if(!FCV_CMP_EQ(beta,1))
+        {
+            cv::Mat dst3 = cv::Mat(m, k, CV_32FC1);
+            status = fcvMultiplyScalarf32((float32_t*)src3.data, k, m, src3.step[0], beta, (float32_t*)dst3.data, dst3.step[0]);
+            if(status == FASTCV_SUCCESS)
+                fcvAddf32_v2(dstp, k, m, dst_stride, (float32_t*)dst3.data, dst3.step[0], dstp, dst_stride);
+        }
+        else
+            fcvAddf32_v2(dstp, k, m, dst_stride, (float32_t*)src3.data, src3.step[0], dstp, dst_stride);
+    }
+
+    // Copy the temporary result back into the destination that aliases an input.
+    if(inplace)
+        dst_temp1.copyTo(dst);
+}
+
+void integrateYUV(InputArray _Y, InputArray _CbCr, OutputArray _IY, OutputArray _ICb, OutputArray _ICr)
+{
+    CV_Assert(!_Y.empty() && !_CbCr.empty());
+    CV_Assert(_Y.type() == _CbCr.type() && _Y.type() == CV_8UC1);
+    Mat Y = _Y.getMat();
+    Mat CbCr = _CbCr.getMat();
+    int Ywidth = Y.cols;
+    int Yheight = Y.rows;
+
+    INITIALIZATION_CHECK;
+
+    _IY.create(Yheight + 1, Ywidth + 1, CV_32SC1);
+    _ICb.create(Yheight/2 + 1, Ywidth/2 + 1, CV_32SC1);
+    _ICr.create(Yheight/2 + 1, Ywidth/2 + 1, CV_32SC1);
+
+    Mat IY_ = _IY.getMat();
+    Mat ICb_ = _ICb.getMat();
+    Mat ICr_ = _ICr.getMat();
+
+    fcvIntegrateImageYCbCr420PseudoPlanaru8(Y.data, CbCr.data, Ywidth, Yheight, Y.step[0],
+                                            CbCr.step[0], (uint32_t*)IY_.data, (uint32_t*)ICb_.data, (uint32_t*)ICr_.data,
+                                            IY_.step[0], ICb_.step[0], ICr_.step[0]);
+}
+
 } // fastcv::
 } // cv::
diff --git a/modules/fastcv/src/blur.cpp b/modules/fastcv/src/blur.cpp
index 66058a37b5a..3ce22e07375 100644
--- a/modules/fastcv/src/blur.cpp
+++ b/modules/fastcv/src/blur.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -361,5 +361,26 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, InputArray _kern
     }
 }
 
+void normalizeLocalBox(InputArray _src, OutputArray _dst, Size pSize, bool useStdDev)
+{
+    // Local box normalization via FastCV: CV_8UC1 input gives CV_8SC1 output, CV_32FC1 stays CV_32FC1.
+    CV_Assert(!_src.empty());
+    int type = _src.type();
+    CV_Assert(type == CV_8UC1 || type == CV_32FC1);
+    CV_Assert(pSize.width > 0 && pSize.height > 0); // the patch size is forwarded to FastCV as-is
+    INITIALIZATION_CHECK;
+
+    // Take the input header first so a reallocation of _dst cannot invalidate it when both are the same array.
+    Mat src = _src.getMat();
+    _dst.create(src.size(), type == CV_8UC1 ? CV_8SC1 : CV_32FC1);
+    Mat dst = _dst.getMat();
+    if(type == CV_8UC1)
+        fcvNormalizeLocalBoxu8(src.data, src.cols, src.rows, src.step[0],
+                              pSize.width, pSize.height, useStdDev, (int8_t*)dst.data, dst.step[0]);
+    else if(type == CV_32FC1)
+        fcvNormalizeLocalBoxf32((float*)src.data, src.cols, src.rows, src.step[0],
+                              pSize.width, pSize.height, useStdDev, (float*)dst.data, dst.step[0]);
+}
+
 } // fastcv::
 } // cv::
\ No newline at end of file
diff --git a/modules/fastcv/src/channel.cpp b/modules/fastcv/src/channel.cpp
new file mode 100644
index 00000000000..0283ac4ab7c
--- /dev/null
+++ b/modules/fastcv/src/channel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "precomp.hpp"
+
+namespace cv {
+namespace fastcv {
+
+void merge(InputArrayOfArrays _mv, OutputArray _dst)
+{
+    // Interleaves 2, 3 or 4 single-channel CV_8U planes into one multi-channel CV_8U matrix.
+    // Every plane must have the size, row stride and type of the first one.
+    // The rows are cut into stripes and each stripe is converted by one FastCV call.
+    CV_Assert(!_mv.empty());
+    std::vector<cv::Mat> planes;
+    _mv.getMatVector(planes);
+    int count = planes.size();
+
+    CV_Assert(!planes.empty());
+
+    CV_Assert(count == 2 || count == 3 || count == 4);
+    CV_Assert(!planes[0].empty());
+    CV_Assert(planes[0].dims <= 2);
+
+    for(const cv::Mat& plane : planes)
+    {
+        CV_Assert(plane.size == planes[0].size && plane.step[0] == planes[0].step[0] && plane.type() == CV_8UC1);
+    }
+
+    _dst.create(planes[0].dims, planes[0].size, CV_MAKE_TYPE(CV_8U,count));
+    Mat dst = _dst.getMat();
+
+    INITIALIZATION_CHECK;
+
+    int nStripes = cv::getNumThreads();
+    const cv::Range allRows(0, planes[0].rows);
+
+    // Byte offset of the first row of stripe r in a matrix whose rows are `step` bytes apart.
+    auto offsetOf = [](const cv::Range &r, size_t step) { return static_cast<size_t>(r.start) * step; };
+
+    if(count == 2)
+    {
+        // Two input planes.
+        cv::parallel_for_(allRows, [&](const cv::Range &r){
+            fcvChannelCombine2Planesu8(planes[0].data + offsetOf(r, planes[0].step[0]), planes[0].cols, r.size(), planes[0].step[0],
+                                       planes[1].data + offsetOf(r, planes[1].step[0]), planes[1].step[0],
+                                       dst.data + offsetOf(r, dst.step[0]), dst.step[0]);
+        }, nStripes);
+    }
+    else if(count == 3)
+    {
+        // Three input planes.
+        cv::parallel_for_(allRows, [&](const cv::Range &r){
+            fcvChannelCombine3Planesu8(planes[0].data + offsetOf(r, planes[0].step[0]), planes[0].cols, r.size(), planes[0].step[0],
+                                       planes[1].data + offsetOf(r, planes[1].step[0]), planes[1].step[0],
+                                       planes[2].data + offsetOf(r, planes[2].step[0]), planes[2].step[0],
+                                       dst.data + offsetOf(r, dst.step[0]), dst.step[0]);
+        }, nStripes);
+    }
+    else if(count == 4)
+    {
+        // Four input planes.
+        cv::parallel_for_(allRows, [&](const cv::Range &r){
+            fcvChannelCombine4Planesu8(planes[0].data + offsetOf(r, planes[0].step[0]), planes[0].cols, r.size(), planes[0].step[0],
+                                       planes[1].data + offsetOf(r, planes[1].step[0]), planes[1].step[0],
+                                       planes[2].data + offsetOf(r, planes[2].step[0]), planes[2].step[0],
+                                       planes[3].data + offsetOf(r, planes[3].step[0]), planes[3].step[0],
+                                       dst.data + offsetOf(r, dst.step[0]), dst.step[0]);
+        }, nStripes);
+    }
+    else
+    {
+        // Not reachable after the assertion on count above; kept as a safety net.
+        CV_Error(cv::Error::StsBadArg, cv::format("count is not supported"));
+    }
+}
+
+void split(InputArray _src, OutputArrayOfArrays _mv)
+{
+    // De-interleaves a 2-, 3- or 4-channel CV_8U matrix into one single-channel plane per channel.
+    // Two channels go through fcvDeinterleaveu8 in a single pass; three and four channels are
+    // extracted one channel at a time with fcvChannelExtractu8.
+    // The rows are cut into stripes that are distributed with cv::parallel_for_.
+    CV_Assert(!_src.empty());
+    Mat src = _src.getMat();
+
+    const int depth = src.depth();
+    const int cn = src.channels();
+
+    CV_Assert(depth == CV_8U && (cn == 2 || cn == 3 || cn == 4));
+    CV_Assert(src.dims <= 2);
+
+    // Size the output collection first, then allocate each plane with the geometry of the source.
+    _mv.create(cn, 1, depth);
+    for(int plane = 0; plane < cn; ++plane)
+    {
+        _mv.create(src.dims, src.size, depth, plane);
+    }
+
+    std::vector<cv::Mat> planes(cn);
+    _mv.getMatVector(planes);
+
+    INITIALIZATION_CHECK;
+
+    // Images below 640x480 pixels with 3 or 4 channels are processed as a single stripe.
+    int nStripes = cv::getNumThreads();
+    if((cn == 3 || cn == 4) && src.rows * src.cols < 640 * 480)
+        nStripes = 1;
+
+    // Each stripe is a band of consecutive rows handled by one FastCV call.
+    const cv::Range allRows(0, src.rows);
+
+    if(cn == 2)
+    {
+        // Both output planes are produced by the same call.
+        cv::parallel_for_(allRows, [&](const cv::Range &stripe){
+            const size_t firstRow = static_cast<size_t>(stripe.start);
+            fcvDeinterleaveu8(src.data + firstRow * src.step[0], src.cols, stripe.size(), src.step[0],
+                              planes[0].data + firstRow * planes[0].step[0], planes[0].step[0],
+                              planes[1].data + firstRow * planes[1].step[0], planes[1].step[0]);
+        }, nStripes);
+        return;
+    }
+
+    // cn is 3 or 4 here: the two cases differ only in the source format given to FastCV.
+    const fcvImageFormat srcFormat = (cn == 3) ? (fcvImageFormat)FASTCV_RGB : (fcvImageFormat)FASTCV_RGBX;
+
+    for(int ch = 0; ch < cn; ++ch)
+    {
+        cv::parallel_for_(allRows, [&](const cv::Range &stripe){
+            const size_t firstRow = static_cast<size_t>(stripe.start);
+            // NOTE(review): the NULL/0 pairs look like unused extra source planes - confirm against the FastCV API.
+            fcvChannelExtractu8(src.data + firstRow * src.step[0], src.cols, stripe.size(), src.step[0],
+                                NULL, 0, NULL, 0, (fcvChannelType)ch, srcFormat,
+                                planes[ch].data + firstRow * planes[ch].step[0], planes[ch].step[0]);
+        }, nStripes);
+    }
+}
+
+} // fastcv::
+} // cv::
diff --git a/modules/fastcv/test/test_arithm.cpp b/modules/fastcv/test/test_arithm.cpp
index 39979908136..216faa7282c 100644
--- a/modules/fastcv/test/test_arithm.cpp
+++ b/modules/fastcv/test/test_arithm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -10,6 +10,9 @@ namespace opencv_test { namespace {
 typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/> MatMulTestParams;
 class MatMulTest : public ::testing::TestWithParam<MatMulTestParams> {};
 
+typedef std::tuple<Size, int /*depth*/, int /*op type*/> ArithmOpTestParams;
+class ArithmOpTest : public ::testing::TestWithParam<ArithmOpTestParams> {};
+
 TEST_P(MatMulTest, accuracy)
 {
     auto p = GetParam();
@@ -48,9 +51,75 @@ TEST_P(MatMulTest, accuracy)
     }
 }
 
+TEST_P(ArithmOpTest, accuracy)
+{
+    // Parameters: image size, matrix depth, operation selector (0 -> add, 1 -> subtract).
+    const ArithmOpTestParams params = GetParam();
+    const Size imageSize = std::get<0>(params);
+    const int matDepth = std::get<1>(params);
+    const int opType = std::get<2>(params);
+
+    // Two random operands drawn from the same value range.
+    RNG& rng = cv::theRNG();
+    Mat lhs(imageSize, matDepth), rhs(imageSize, matDepth);
+    cvtest::randUni(rng, lhs, Scalar::all(0), Scalar::all(128));
+    cvtest::randUni(rng, rhs, Scalar::all(0), Scalar::all(128));
+
+    Mat actual;
+    cv::fastcv::arithmetic_op(lhs, rhs, actual, opType);
+
+    // Reference result from the generic OpenCV routines.
+    Mat expected;
+    switch(opType)
+    {
+        case 0: cv::add(lhs, rhs, expected); break;
+        case 1: cv::subtract(lhs, rhs, expected); break;
+    }
+    EXPECT_EQ(cvtest::norm(expected, actual, cv::NORM_INF), 0);
+    EXPECT_EQ(cvtest::norm(expected, actual, cv::NORM_L2), 0);
+}
+
+typedef testing::TestWithParam<tuple<Size>> IntegrateYUVTest;
+
+TEST_P(IntegrateYUVTest, accuracy)
+{
+    // Compares the bottom-right entry of each integral image with the one produced by cv::integral.
+    const Size srcSize = std::get<0>(GetParam());
+
+    // Luma has the full size; the chroma plane has half the rows and is read as interleaved Cb/Cr bytes.
+    cv::Mat Y(srcSize, CV_8U), CbCr(srcSize.height/2, srcSize.width, CV_8U);
+    RNG& rng = cv::theRNG();
+    cvtest::randUni(rng, Y, Scalar::all(0), Scalar::all(255));
+    cvtest::randUni(rng, CbCr, Scalar::all(0), Scalar::all(255));
+
+    cv::Mat IY, ICb, ICr;
+    cv::fastcv::integrateYUV(Y, CbCr, IY, ICb, ICr);
+
+    // The reference uses core OpenCV only, so it does not depend on the FastCV code under test.
+    std::vector<cv::Mat> chroma;
+    cv::split(CbCr.reshape(2, 0), chroma);
+
+    cv::Mat IY_ref, ICb_ref, ICr_ref;
+    cv::integral(Y, IY_ref, CV_32S);
+    cv::integral(chroma[0], ICb_ref, CV_32S);
+    cv::integral(chroma[1], ICr_ref, CV_32S);
+    auto lastValue = [](const cv::Mat& m) { return m.at<int>(m.rows - 1, m.cols - 1); };
+    EXPECT_EQ(lastValue(IY_ref), lastValue(IY));
+    EXPECT_EQ(lastValue(ICb_ref), lastValue(ICb));
+    EXPECT_EQ(lastValue(ICr_ref), lastValue(ICr));
+}
+
 INSTANTIATE_TEST_CASE_P(FastCV_Extension, MatMulTest,
                          ::testing::Combine(::testing::Values(8, 16, 128, 256),   // rows1
                                             ::testing::Values(8, 16, 128, 256),   // cols1
                                             ::testing::Values(8, 16, 128, 256))); // cols2
 
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, ArithmOpTest,
+                         ::testing::Combine(::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p),   // sz
+                                            ::testing::Values(CV_8U, CV_16S), // depth
+                                            ::testing::Values(0,1))); // op type
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, IntegrateYUVTest,
+                         Values(perf::szVGA, perf::sz720p, perf::sz1080p)); // sz
+
 }} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_blur.cpp b/modules/fastcv/test/test_blur.cpp
index 1dde0261f28..dd7aaacf54f 100644
--- a/modules/fastcv/test/test_blur.cpp
+++ b/modules/fastcv/test/test_blur.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -108,6 +108,24 @@ TEST_P(SepFilter2DTest, accuracy)
     EXPECT_LT(num_diff_pixels, (src.rows+src.cols)*ksize);
 }
 
+typedef testing::TestWithParam<tuple<int>> NormalizeLocalBoxTest;
+
+TEST_P(NormalizeLocalBoxTest, accuracy)
+{
+    // Parameter: 0 or 1, forwarded as the useStdDev flag of normalizeLocalBox.
+    const bool useStdDev = get<0>(GetParam()) != 0;
+    const cv::Mat src = imread(cvtest::findDataFile("cv/shared/baboon.png"), cv::IMREAD_GRAYSCALE);
+
+    cv::Mat dst;
+    cv::fastcv::normalizeLocalBox(src, dst, Size(5,5), useStdDev);
+
+    // The mean of the normalized image must stay below a bound that depends on the mode.
+    const double upperBound = useStdDev ? 1 : 50;
+    const double meanValue = cv::mean(dst)[0];
+    EXPECT_LT(meanValue, upperBound);
+}
+
+
 INSTANTIATE_TEST_CASE_P(FastCV_Extension, GaussianBlurTest, Combine(
 /*image size*/     ::testing::Values(perf::szVGA, perf::sz720p, perf::sz1080p),
 /*image depth*/    ::testing::Values(CV_8U,CV_16S,CV_32S),
@@ -126,4 +144,7 @@ INSTANTIATE_TEST_CASE_P(FastCV_Extension, SepFilter2DTest, Combine(
 /*kernel size*/    Values(3, 5, 7, 9, 11)
 ));
 
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, NormalizeLocalBoxTest, Values(0,1));
+
+
 }} // namespaces opencv_test, ::
\ No newline at end of file
diff --git a/modules/fastcv/test/test_channel.cpp b/modules/fastcv/test/test_channel.cpp
new file mode 100644
index 00000000000..b3ed86ecfed
--- /dev/null
+++ b/modules/fastcv/test/test_channel.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+typedef std::tuple<Size, int> ChannelMergeTestParams;
+class ChannelMergeTest : public ::testing::TestWithParam<ChannelMergeTestParams> {};
+
+typedef std::tuple<Size, int> ChannelSplitTestParams;
+class ChannelSplitTest : public ::testing::TestWithParam<ChannelSplitTestParams> {};
+
+TEST_P(ChannelMergeTest, accuracy)
+{
+    // Parameters: plane size and number of single-channel planes to interleave.
+    const Size planeSize = std::get<0>(GetParam());
+    const int planeCount = std::get<1>(GetParam());
+    const int planeType = CV_8UC1;
+
+    RNG& rng = cv::theRNG();
+
+    // Fill every input plane with random values.
+    std::vector<Mat> planes;
+    for(int idx = 0; idx < planeCount; ++idx)
+    {
+        planes.push_back(Mat(planeSize, planeType));
+        cvtest::randUni(rng, planes.back(), Scalar::all(0), Scalar::all(127));
+    }
+
+    Mat actual;
+    cv::fastcv::merge(planes, actual);
+
+    // cv::merge provides the reference; the two results must be identical.
+    Mat expected;
+    cv::merge(planes, expected);
+
+    EXPECT_EQ(cvtest::norm(expected, actual, cv::NORM_INF), 0);
+}
+
+TEST_P(ChannelSplitTest, accuracy)
+{
+    // Parameters: image size and channel count of the interleaved source.
+    const Size imageSize = std::get<0>(GetParam());
+    const int channels = std::get<1>(GetParam());
+
+    RNG& rng = cv::theRNG();
+    Mat src(imageSize, CV_MAKE_TYPE(CV_8U,channels));
+    cvtest::randUni(rng, src, Scalar::all(0), Scalar::all(127));
+
+    // Split once with FastCV and once with core OpenCV, then compare plane by plane.
+    std::vector<Mat> actual(channels), expected(channels);
+    cv::fastcv::split(src, actual);
+    cv::split(src, expected);
+
+    for(int c = 0; c < channels; ++c)
+    {
+        EXPECT_EQ(cvtest::norm(expected[c], actual[c], cv::NORM_INF), 0);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, ChannelMergeTest,
+                         ::testing::Combine(::testing::Values(perf::szODD, perf::szVGA, perf::sz720p, perf::sz1080p),   // sz
+                                            ::testing::Values(2,3,4)));  // count
+
+INSTANTIATE_TEST_CASE_P(FastCV_Extension, ChannelSplitTest,
+                         ::testing::Combine(::testing::Values(perf::szODD, perf::szVGA, perf::sz720p, perf::sz1080p),   // sz
+                                            ::testing::Values(2,3,4)));    // cn
+
+}} // namespaces opencv_test, ::
diff --git a/modules/fastcv/test/test_pyramid.cpp b/modules/fastcv/test/test_pyramid.cpp
index 29acf9ab9a7..84c834fc9ed 100644
--- a/modules/fastcv/test/test_pyramid.cpp
+++ b/modules/fastcv/test/test_pyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
 */
 
@@ -126,16 +126,17 @@ TEST_P(SobelPyramidTest, accuracy)
     {
         cv::Mat ref, dst;
         double normInf, normL2;
-        ref = refPyrDx[i];
-        dst = pyrDx[i];
+        cv::Rect roi(1, 1, pyr[i].cols - 2, pyr[i].rows - 2);
+        ref = refPyrDx[i](roi);
+        dst = pyrDx[i](roi);
         normInf = cvtest::norm(dst, ref, cv::NORM_INF);
         normL2  = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total();
 
         EXPECT_LE(normInf, 76.1);
         EXPECT_LT(normL2,   0.4);
 
-        ref = refPyrDy[i];
-        dst = pyrDy[i];
+        ref = refPyrDy[i](roi);
+        dst = pyrDy[i](roi);
         normInf = cvtest::norm(dst, ref, cv::NORM_INF);
         normL2  = cvtest::norm(dst, ref, cv::NORM_L2) / dst.total();
 
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp
index f2ad032dcbe..77aa46ae6e6 100644
--- a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp
@@ -141,6 +141,12 @@ ObjectsAssociator::ComputeRgbDistance(const std::vector<Detection> &detections,
             if (tracking_per_class_ && (detections[d].class_label != tracklets[t]->label))
                 continue;
 
+            // Check if RGB features are available
+            auto t_rgb_features = tracklets[t]->GetRgbFeatures();
+            if (!t_rgb_features || t_rgb_features->empty()) {
+                continue; // Skip if no RGB features are available
+            }
+
             // Find best match in rgb feature history
             float min_dist = 1000.0f;
             for (const auto &t_rgb_feature : *(tracklets[t]->GetRgbFeatures())) {
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp
index 62e8d10cf67..7327325a4a7 100644
--- a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp
@@ -7,13 +7,15 @@
 #include "tracklet.hpp"
 
 #include <sstream>
+#include <memory>
 
 namespace vas {
 namespace ot {
 
 Tracklet::Tracklet()
     : id(0), label(-1), association_idx(kNoMatchDetection), status(ST_DEAD), age(0), confidence(0.f),
-      occlusion_ratio(0.f), association_delta_t(0.f), association_fail_count(0) {
+      occlusion_ratio(0.f), association_delta_t(0.f), association_fail_count(0),
+      rgb_features_(std::make_shared<std::deque<cv::Mat>>()) {
 }
 
 Tracklet::~Tracklet() {
@@ -45,12 +47,13 @@ void Tracklet::RenewTrajectory(const cv::Rect2f &bounding_box) {
     trajectory_filtered.push_back(bounding_box);
 }
 
-#define DEFINE_STRING_VAR(var_name, value)                                                                             \
-    std::stringstream __##var_name;                                                                                    \
-    __##var_name << value;                                                                                             \
-    std::string var_name = __##var_name.str();
+std::deque<cv::Mat> *Tracklet::GetRgbFeatures() {
+    return rgb_features_.get(); // Non-owning raw pointer; the deque remains owned by the rgb_features_ shared_ptr
+}
 
-#define ROUND_F(value, scale) (round((value)*scale) / scale)
+void Tracklet::AddRgbFeature(const cv::Mat &feature) {
+    rgb_features_->push_back(feature); // Appends to the back of the history; nothing here limits its length
+}
 
 std::string Tracklet::Serialize() const {
 #ifdef DUMP_OTAV
@@ -97,10 +100,6 @@ std::string Tracklet::Serialize() const {
 #endif
 }
 
-std::deque<cv::Mat> *Tracklet::GetRgbFeatures() {
-    return nullptr;
-}
-
 ZeroTermImagelessTracklet::ZeroTermImagelessTracklet() : Tracklet(), birth_count(1) {
 }
 
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp
index 762e3f6ea65..5282a183357 100644
--- a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp
@@ -13,6 +13,7 @@
 
 #include <cstdint>
 #include <deque>
+#include <memory>
 
 namespace vas {
 namespace ot {
@@ -45,6 +46,7 @@ class Tracklet {
     virtual void RenewTrajectory(const cv::Rect2f &bounding_box);
 
     virtual std::deque<cv::Mat> *GetRgbFeatures();
+    void AddRgbFeature(const cv::Mat &feature);
     virtual std::string Serialize() const; // Returns key:value with comma separated format
 
   public:
@@ -63,6 +65,9 @@ class Tracklet {
     std::deque<cv::Rect2f> trajectory_filtered;
     cv::Rect2f predicted;                      // Result from Kalman prediction. It is for debugging (OTAV)
     mutable std::vector<std::string> otav_msg; // Messages for OTAV
+
+private:
+    std::shared_ptr<std::deque<cv::Mat>> rgb_features_;
 };
 
 class ZeroTermImagelessTracklet : public Tracklet {
diff --git a/modules/matlab/CMakeLists.txt b/modules/matlab/CMakeLists.txt
index 17fd8427445..0bc89cb1ea6 100644
--- a/modules/matlab/CMakeLists.txt
+++ b/modules/matlab/CMakeLists.txt
@@ -113,6 +113,8 @@ ocv_add_module(matlab   BINDINGS
                                  opencv_calib opencv_3d
                                  opencv_stitching opencv_superres
                                  opencv_xfeatures2d
+				 opencv_optflow
+				 opencv_xphoto
 )
 
 # get the commit information
@@ -156,6 +158,8 @@ endforeach()
 # add extra headers by hand
 list(APPEND opencv_extra_hdrs "core=${OPENCV_MODULE_opencv_core_LOCATION}/include/opencv2/core/base.hpp")
 list(APPEND opencv_extra_hdrs "video=${OPENCV_MODULE_opencv_video_LOCATION}/include/opencv2/video/tracking.hpp")
+list(APPEND opencv_extra_hdrs "optflow=${OPENCV_MODULE_opencv_optflow_LOCATION}/include/opencv2/optflow.hpp")
+
 
 # pass the OPENCV_CXX_EXTRA_FLAGS through to the mex compiler
 # remove the visibility modifiers, so the mex gateway is visible
diff --git a/modules/matlab/generator/parse_tree.py b/modules/matlab/generator/parse_tree.py
index 0a7ef364841..41d6d24d123 100644
--- a/modules/matlab/generator/parse_tree.py
+++ b/modules/matlab/generator/parse_tree.py
@@ -1,4 +1,10 @@
-import collections
+import sys
+import collections
+try:
+    # collections.abc exists since Python 3.3 and is the only home of Iterable from Python 3.10 on
+    from collections.abc import Iterable as IterableType
+except ImportError:
+    IterableType = collections.Iterable  # Python 2 has no collections.abc module
 from textwrap import fill
 from filters import *
 try:
@@ -371,7 +377,7 @@ def todict(obj):
         return obj
     elif isinstance(obj, dict):
         return dict((key, todict(val)) for key, val in obj.items())
-    elif isinstance(obj, collections.Iterable):
+    elif isinstance(obj, IterableType):
         return [todict(val) for val in obj]
     elif hasattr(obj, '__dict__'):
         return todict(vars(obj))
diff --git a/modules/matlab/include/opencv2/matlab/bridge.hpp b/modules/matlab/include/opencv2/matlab/bridge.hpp
index c6fe605b2a4..f0e947a7da0 100644
--- a/modules/matlab/include/opencv2/matlab/bridge.hpp
+++ b/modules/matlab/include/opencv2/matlab/bridge.hpp
@@ -55,6 +55,21 @@
 #include <opencv2/photo.hpp>
 #include <opencv2/stitching.hpp>
 #include <opencv2/video.hpp>
+#include <opencv2/optflow.hpp>
+#include <opencv2/xphoto.hpp>
+
+/* This 'using' line was added in order to fix the following Error.
+ * Failed to compile currentUIFramework:
+ * modules/matlab/src/currentUIFramework.cpp:
+ * In function void mexFunction(int, mxArray**, int, const mxArray**)
+ * error: string was not declared in this scope
+ * string retval; in line 41
+ *
+ * This error happens at the last stage of opencv build, when compiling the mex bindings
+ * TODO: This is NOT the optimal fix, and needs to be addressed
+ */
+using std::string;
+
 
 namespace cv {
 namespace bridge {
@@ -85,17 +100,21 @@ typedef cv::Ptr<AlignMTB> Ptr_AlignMTB;
 typedef cv::Ptr<CalibrateDebevec> Ptr_CalibrateDebevec;
 typedef cv::Ptr<CalibrateRobertson> Ptr_CalibrateRobertson;
 typedef cv::Ptr<DenseOpticalFlow> Ptr_DenseOpticalFlow;
-typedef cv::Ptr<DualTVL1OpticalFlow> Ptr_DualTVL1OpticalFlow;
+typedef cv::Ptr<cv::optflow::DualTVL1OpticalFlow> Ptr_DualTVL1OpticalFlow;
 typedef cv::Ptr<MergeDebevec> Ptr_MergeDebevec;
 typedef cv::Ptr<MergeMertens> Ptr_MergeMertens;
 typedef cv::Ptr<MergeRobertson> Ptr_MergeRobertson;
 typedef cv::Ptr<Stitcher> Ptr_Stitcher;
 typedef cv::Ptr<Tonemap> Ptr_Tonemap;
 typedef cv::Ptr<TonemapDrago> Ptr_TonemapDrago;
-typedef cv::Ptr<TonemapDurand> Ptr_TonemapDurand;
+typedef cv::Ptr<cv::xphoto::TonemapDurand> Ptr_TonemapDurand;
 typedef cv::Ptr<TonemapMantiuk> Ptr_TonemapMantiuk;
 typedef cv::Ptr<TonemapReinhard> Ptr_TonemapReinhard;
 typedef cv::Ptr<float> Ptr_float;
+typedef cv::Ptr<cv::GeneralizedHoughBallard> Ptr_GeneralizedHoughBallard;
+typedef cv::Ptr<cv::GeneralizedHoughGuil> Ptr_GeneralizedHoughGuil;
+
+
 
 // ----------------------------------------------------------------------------
 //                          PREDECLARATIONS
@@ -527,6 +546,15 @@ class Bridge {
   Bridge& operator=(const Ptr_float& ) { return *this; }
   Ptr_float toPtrFloat() { return Ptr_float(); }
   operator Ptr_float() { return toPtrFloat(); }
+
+  // ---------------------------   Ptr_GeneralizedHoughBallard   --------------
+  Bridge& operator=(const Ptr_GeneralizedHoughBallard& ) { return *this; }  // stub: the argument is ignored
+  operator Ptr_GeneralizedHoughBallard() { return Ptr_GeneralizedHoughBallard(); }  // stub: default-constructed Ptr
+
+  // ---------------------------   Ptr_GeneralizedHoughGuil   ----------------------
+  Bridge& operator=(const Ptr_GeneralizedHoughGuil& ) { return *this; }  // stub: the argument is ignored
+  operator Ptr_GeneralizedHoughGuil() { return Ptr_GeneralizedHoughGuil(); }  // stub: default-constructed Ptr
+
 }; // class Bridge