diff --git a/benchmarks/GraphBLAS-sharp.Benchmarks/Configs/Kronecker.txt b/benchmarks/GraphBLAS-sharp.Benchmarks/Configs/Kronecker.txt
new file mode 100644
index 00000000..b16b59a8
--- /dev/null
+++ b/benchmarks/GraphBLAS-sharp.Benchmarks/Configs/Kronecker.txt
@@ -0,0 +1,3 @@
+can_634
+Si2
+lshp1561
diff --git a/benchmarks/GraphBLAS-sharp.Benchmarks/GraphBLAS-sharp.Benchmarks.fsproj b/benchmarks/GraphBLAS-sharp.Benchmarks/GraphBLAS-sharp.Benchmarks.fsproj
index 2dd0c406..ea52ed24 100644
--- a/benchmarks/GraphBLAS-sharp.Benchmarks/GraphBLAS-sharp.Benchmarks.fsproj
+++ b/benchmarks/GraphBLAS-sharp.Benchmarks/GraphBLAS-sharp.Benchmarks.fsproj
@@ -23,6 +23,7 @@
     
     
     
+    
     
     
     
diff --git a/benchmarks/GraphBLAS-sharp.Benchmarks/Matrix/Kronecker.fs b/benchmarks/GraphBLAS-sharp.Benchmarks/Matrix/Kronecker.fs
new file mode 100644
index 00000000..f8c7b880
--- /dev/null
+++ b/benchmarks/GraphBLAS-sharp.Benchmarks/Matrix/Kronecker.fs
@@ -0,0 +1,142 @@
+module GraphBLAS.FSharp.Benchmarks.Matrix.Kronecker
+
+open System.IO
+open BenchmarkDotNet.Attributes
+open Brahma.FSharp
+open GraphBLAS.FSharp
+open GraphBLAS.FSharp.IO
+open GraphBLAS.FSharp.Backend.Quotes
+open GraphBLAS.FSharp.Objects
+open GraphBLAS.FSharp.Objects.ClContextExtensions
+open GraphBLAS.FSharp.Benchmarks
+
+[]
+[]
+[]
+[)>]
+type Benchmarks<'elem when 'elem : struct>(
+        buildFunToBenchmark,
+        converter: string -> 'elem,
+        converterBool,
+        buildMatrix) =
+
+    let mutable funToBenchmark = None
+
+    let mutable matrix = Unchecked.defaultof>
+
+    let mutable matrixHost = Unchecked.defaultof<_>
+
+    member val ResultMatrix = Unchecked.defaultof option> with get, set
+
+    []
+    member val OclContextInfo = Unchecked.defaultof with get, set
+
+    []
+    member val InputMatrixReader = Unchecked.defaultof with get, set
+
+    member this.OclContext: ClContext = (fst this.OclContextInfo).ClContext
+    member this.WorkGroupSize = snd this.OclContextInfo
+
+    member this.Processor =
+        let p = (fst this.OclContextInfo).Queue
+        p.Error.Add(fun e -> failwithf "%A" e)
+        p
+
+    static member AvailableContexts = Utils.availableContexts
+
+    static member InputMatrixProviderBuilder pathToConfig =
+        let datasetFolder = ""
+        pathToConfig
+        |> Utils.getMatricesFilenames
+        |> Seq.map
+            (fun matrixFilename ->
+                printfn "%A" matrixFilename
+
+                match Path.GetExtension matrixFilename with
+                | ".mtx" ->
+                    MtxReader(Utils.getFullPathToMatrix datasetFolder matrixFilename)
+                | _ -> failwith "Unsupported matrix format")
+
+    member this.FunToBenchmark =
+        match funToBenchmark with
+        | None ->
+            let x = buildFunToBenchmark this.OclContext this.WorkGroupSize
+            funToBenchmark <- Some x
+            x
+        | Some x -> x
+
+    member this.ReadMatrix (reader: MtxReader) =
+        let converter =
+            match reader.Field with
+            | Pattern -> converterBool
+            | _ -> converter
+
+        reader.ReadMatrix converter
+
+    member this.Mxm() =
+        this.ResultMatrix <- this.FunToBenchmark this.Processor DeviceOnly matrix matrix
+
+    member this.ClearInputMatrices() =
+        matrix.Dispose this.Processor
+
+    member this.ClearResult() =
+        match this.ResultMatrix with
+        | Some matrix -> matrix.Dispose this.Processor
+        | None -> ()
+
+    member this.ReadMatrices() =
+        matrixHost <- this.ReadMatrix this.InputMatrixReader
+
+    member this.LoadMatricesToGPU () =
+        matrix <- buildMatrix this.OclContext matrixHost
+
+    abstract member GlobalSetup : unit -> unit
+
+    abstract member Benchmark : unit -> unit
+
+    abstract member IterationCleanup : unit -> unit
+
+    abstract member GlobalCleanup : unit -> unit
+
+module WithoutTransfer =
+    type Benchmark<'elem when 'elem : struct>(
+            buildFunToBenchmark,
+            converter: string -> 'elem,
+            converterBool,
+            buildMatrix) =
+
+        inherit Benchmarks<'elem>(
+            buildFunToBenchmark,
+            converter,
+            converterBool,
+            buildMatrix)
+
+        []
+        override this.GlobalSetup() =
+            this.ReadMatrices()
+            this.LoadMatricesToGPU()
+
+        []
+        override this.Benchmark() =
+            this.Mxm()
+            this.Processor.PostAndReply(Msg.MsgNotifyMe)
+
+        []
+        override this.IterationCleanup () =
+            this.ClearResult()
+
+        []
+        override this.GlobalCleanup () =
+            this.ClearInputMatrices()
+
+    type Float32() =
+
+        inherit Benchmark(
+            Operations.kronecker (ArithmeticOperations.float32MulOption),
+            float32,
+            (fun _ -> Utils.nextSingle (System.Random())),
+            (fun context matrix -> ClMatrix.CSR <| matrix.ToCSR.ToDevice context)
+            )
+
+        static member InputMatrixProvider =
+            Benchmarks<_>.InputMatrixProviderBuilder "Kronecker.txt"
diff --git a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
index ad6b3caf..6bea80a1 100644
--- a/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
+++ b/src/GraphBLAS-sharp.Backend/Common/ClArray.fs
@@ -728,11 +728,11 @@ module ClArray =
         bound<'a, int> Search.Bin.lowerBound clContext
 
     /// 
-    /// Gets the value at the specified position from the input array.
+    /// Gets the value at the specified position from the input array and insert it into given ClCell.
     /// 
     /// OpenCL context.
     /// Should be a power of 2 and greater than 1.
-    let item<'a> (clContext: ClContext) workGroupSize =
+    let itemTo<'a> (clContext: ClContext) workGroupSize =
 
         let kernel =
             <@ fun (ndRange: Range1D) index (array: ClArray<'a>) (result: ClCell<'a>) ->
@@ -744,21 +744,37 @@ module ClArray =
 
         let program = clContext.Compile kernel
 
-        fun (processor: MailboxProcessor<_>) (index: int) (array: ClArray<'a>) ->
+        fun (processor: MailboxProcessor<_>) (index: int) (array: ClArray<'a>) (output: ClCell<'a>) ->
 
             if index < 0 || index >= array.Length then
                 failwith "Index out of range"
 
-            let result =
-                clContext.CreateClCell Unchecked.defaultof<'a>
-
             let kernel = program.GetKernel()
 
             let ndRange = Range1D.CreateValid(1, workGroupSize)
 
-            processor.Post(Msg.MsgSetArguments(fun () -> kernel.KernelFunc ndRange index array result))
+            processor.Post(Msg.MsgSetArguments(fun () -> kernel.KernelFunc ndRange index array output))
             processor.Post(Msg.CreateRunMsg<_, _> kernel)
 
+    /// 
+    /// Gets the value at the specified position from the input array.
+    /// 
+    /// OpenCL context.
+    /// Should be a power of 2 and greater than 1.
+    let item<'a> (clContext: ClContext) workGroupSize =
+
+        let itemTo = itemTo clContext workGroupSize
+
+        fun (processor: MailboxProcessor<_>) (index: int) (array: ClArray<'a>) ->
+
+            if index < 0 || index >= array.Length then
+                failwith "Index out of range"
+
+            let result =
+                clContext.CreateClCell Unchecked.defaultof<'a>
+
+            itemTo processor index array result
+
             result
 
     /// 
diff --git a/src/GraphBLAS-sharp.Backend/Operations/Kronecker.fs b/src/GraphBLAS-sharp.Backend/Operations/Kronecker.fs
index 9ff810bf..8c5835ba 100644
--- a/src/GraphBLAS-sharp.Backend/Operations/Kronecker.fs
+++ b/src/GraphBLAS-sharp.Backend/Operations/Kronecker.fs
@@ -171,7 +171,7 @@ module internal Kronecker =
     let private setPositions<'c when 'c: struct> (clContext: ClContext) workGroupSize =
 
         let setPositions =
-            <@ fun (ndRange: Range1D) rowCount columnCount startIndex (rowOffset: ClCell) (columnOffset: ClCell) (bitmap: ClArray) (values: ClArray<'c>) (resultRows: ClArray) (resultColumns: ClArray) (resultValues: ClArray<'c>) ->
+            <@ fun (ndRange: Range1D) rowCount columnCount startIndex (rowOffset: int) (columnOffset: int) (bitmap: ClArray) (values: ClArray<'c>) (resultRows: ClArray) (resultColumns: ClArray) (resultValues: ClArray<'c>) ->
 
                 let gid = ndRange.GlobalID0
 
@@ -184,8 +184,8 @@ module internal Kronecker =
 
                     let index = startIndex + bitmap.[gid] - 1
 
-                    resultRows.[index] <- rowIndex + rowOffset.Value
-                    resultColumns.[index] <- columnIndex + columnOffset.Value
+                    resultRows.[index] <- rowIndex + rowOffset
+                    resultColumns.[index] <- columnIndex + columnOffset
                     resultValues.[index] <- values.[gid] @>
 
         let kernel = clContext.Compile <| setPositions
@@ -202,9 +202,6 @@ module internal Kronecker =
 
             let kernel = kernel.GetKernel()
 
-            let rowOffset = rowOffset |> clContext.CreateClCell
-            let columnOffset = columnOffset |> clContext.CreateClCell
-
             processor.Post(
                 Msg.MsgSetArguments
                     (fun () ->
@@ -224,23 +221,20 @@ module internal Kronecker =
 
             processor.Post(Msg.CreateRunMsg<_, _> kernel)
 
-            rowOffset.Free processor
-            columnOffset.Free processor
-
             (sum.ToHostAndFree processor) + startIndex
 
     let private copyToResult (clContext: ClContext) workGroupSize =
 
         let copyToResult =
-            <@ fun (ndRange: Range1D) startIndex sourceLength (rowOffset: ClCell) (columnOffset: ClCell) (sourceRows: ClArray) (sourceColumns: ClArray) (sourceValues: ClArray<'c>) (resultRows: ClArray) (resultColumns: ClArray) (resultValues: ClArray<'c>) ->
+            <@ fun (ndRange: Range1D) startIndex sourceLength (rowOffset: int) (columnOffset: int) (sourceRows: ClArray) (sourceColumns: ClArray) (sourceValues: ClArray<'c>) (resultRows: ClArray) (resultColumns: ClArray) (resultValues: ClArray<'c>) ->
 
                 let gid = ndRange.GlobalID0
 
                 if gid < sourceLength then
                     let index = startIndex + gid
 
-                    resultRows.[index] <- rowOffset.Value + sourceRows.[gid]
-                    resultColumns.[index] <- columnOffset.Value + sourceColumns.[gid]
+                    resultRows.[index] <- rowOffset + sourceRows.[gid]
+                    resultColumns.[index] <- columnOffset + sourceColumns.[gid]
                     resultValues.[index] <- sourceValues.[gid] @>
 
         let kernel = clContext.Compile <| copyToResult
@@ -252,9 +246,6 @@ module internal Kronecker =
 
             let kernel = kernel.GetKernel()
 
-            let rowOffset = rowOffset |> clContext.CreateClCell
-            let columnOffset = columnOffset |> clContext.CreateClCell
-
             processor.Post(
                 Msg.MsgSetArguments
                     (fun () ->
@@ -274,14 +265,11 @@ module internal Kronecker =
 
             processor.Post(Msg.CreateRunMsg<_, _> kernel)
 
-            rowOffset.Free processor
-            columnOffset.Free processor
-
     let private insertZero (clContext: ClContext) workGroupSize =
 
         let copy = copyToResult clContext workGroupSize
 
-        fun queue startIndex (zeroCounts: int list array) (matrixZero: COO<'c>) resultMatrix ->
+        fun queue startIndex (zeroCounts: ClArray array) (matrixZero: COO<'c>) resultMatrix ->
 
             let rowCount = zeroCounts.Length
 
@@ -298,32 +286,31 @@ module internal Kronecker =
 
                     startIndex <- startIndex + matrixZero.NNZ
 
-            let rec insertInRowRec zeroCounts row column =
-                match zeroCounts with
-                | [] -> ()
-                | h :: tl ->
-                    insertMany row column h
+            for row in 0 .. rowCount - 1 do
+                let zeroCountInRow = zeroCounts.[row].ToHostAndFree queue
 
-                    insertInRowRec tl row (h + column + 1)
+                let mutable column = 0
 
-            for row in 0 .. rowCount - 1 do
-                insertInRowRec zeroCounts.[row] row 0
+                for count in zeroCountInRow do
+                    insertMany row column count
+
+                    column <- column + count + 1
 
     let private insertNonZero (clContext: ClContext) workGroupSize op =
 
-        let item = ClArray.item clContext workGroupSize
+        let itemTo = ClArray.itemTo clContext workGroupSize
 
         let preparePositions =
             preparePositions clContext workGroupSize op
 
         let setPositions = setPositions clContext workGroupSize
 
-        fun queue (rowsEdges: (int * int) array) (matrixRight: CSR<'b>) (leftValues: ClArray<'a>) (leftColsHost: int array) (resultMatrix: COO<'c>) ->
+        fun queue (rowBoundaries: (int * int) array) (matrixRight: CSR<'b>) (leftValues: ClArray<'a>) (leftColsHost: int array) (resultMatrix: COO<'c>) ->
 
             let setPositions =
                 setPositions queue matrixRight.RowCount matrixRight.ColumnCount
 
-            let rowCount = rowsEdges.Length
+            let rowCount = rowBoundaries.Length
 
             let length =
                 matrixRight.RowCount * matrixRight.ColumnCount
@@ -336,11 +323,15 @@ module internal Kronecker =
 
             let mutable startIndex = 0
 
+            let value =
+                clContext.CreateClCell Unchecked.defaultof<'a>
+
             for row in 0 .. rowCount - 1 do
-                let leftEdge, rightEdge = rowsEdges.[row]
+                let leftEdge, rightEdge = rowBoundaries.[row]
 
                 for i in leftEdge .. rightEdge do
-                    let value = item queue i leftValues
+                    itemTo queue i leftValues value
+
                     let column = leftColsHost.[i]
 
                     let rowOffset = row * matrixRight.RowCount
@@ -348,21 +339,88 @@ module internal Kronecker =
 
                     preparePositions queue value matrixRight mappedMatrix bitmap
 
-                    value.Free queue
-
                     startIndex <- setPositions rowOffset columnOffset startIndex resultMatrix mappedMatrix bitmap
 
+            value.Free queue
             bitmap.Free queue
             mappedMatrix.Free queue
 
             startIndex
 
+    let private countZeroElements (clContext: ClContext) workGroupSize =
+
+        let countZeroElementsInRow =
+            <@ fun (ndRange: Range1D) (firstIndex: int) (lastIndex: int) (columnCount: int) (columns: ClArray) (result: ClArray) ->
+
+                let gid = ndRange.GlobalID0
+
+                let nnzInRow = lastIndex - firstIndex + 1
+
+                if gid <= nnzInRow then
+
+                    if nnzInRow = 0 then
+                        result.[0] <- columnCount
+
+                    elif gid = nnzInRow then
+                        result.[nnzInRow] <- columnCount - columns.[lastIndex] - 1
+
+                    elif gid = 0 then
+                        result.[0] <- columns.[firstIndex]
+
+                    else
+                        result.[gid] <-
+                            columns.[firstIndex + gid]
+                            - columns.[firstIndex + gid - 1]
+                            - 1 @>
+
+        let kernel = clContext.Compile countZeroElementsInRow
+
+        fun (queue: MailboxProcessor<_>) (matrix: CSR<_>) (rowBoundaries: (int * int) array) ->
+
+            let kernel = kernel.GetKernel()
+
+            let (zeroCounts: ClArray array) = Array.zeroCreate matrix.RowCount
+
+            for row in 0 .. matrix.RowCount - 1 do
+
+                let firstIndex = fst rowBoundaries.[row]
+                let lastIndex = snd rowBoundaries.[row]
+
+                let nnzInRow = lastIndex - firstIndex + 1
+                let length = nnzInRow + 1
+
+                let ndRange =
+                    Range1D.CreateValid(length, workGroupSize)
+
+                let result =
+                    clContext.CreateClArrayWithSpecificAllocationMode(DeviceOnly, length)
+
+                queue.Post(
+                    Msg.MsgSetArguments
+                        (fun () ->
+                            kernel.KernelFunc ndRange firstIndex lastIndex matrix.ColumnCount matrix.Columns result)
+                )
+
+                queue.Post(Msg.CreateRunMsg<_, _>(kernel))
+
+                zeroCounts.[row] <- result
+
+            zeroCounts
+
     let private mapAll<'a, 'b, 'c when 'a: struct and 'b: struct and 'c: struct and 'c: equality>
         (clContext: ClContext)
         workGroupSize
         (op: Expr<'a option -> 'b option -> 'c option>)
         =
 
+        let pairwise = ClArray.pairwise clContext workGroupSize
+
+        let mapInPlace =
+            ClArray.mapInPlace <@ fun (a, b) -> (a, b - 1) @> clContext workGroupSize
+
+        let countZeroElements =
+            countZeroElements clContext workGroupSize
+
         let insertNonZero = insertNonZero clContext workGroupSize op
 
         let insertZero = insertZero clContext workGroupSize
@@ -386,35 +444,22 @@ module internal Kronecker =
                   RowCount = matrixLeft.RowCount * matrixRight.RowCount
                   ColumnCount = matrixLeft.ColumnCount * matrixRight.ColumnCount }
 
-            let leftRowPointers = matrixLeft.RowPointers.ToHost queue
             let leftColumns = matrixLeft.Columns.ToHost queue
 
-            let nnzInRows =
-                leftRowPointers
-                |> Array.pairwise
-                |> Array.map (fun (fst, snd) -> snd - fst)
-
-            let rowsEdges =
-                leftRowPointers
-                |> Array.pairwise
-                |> Array.map (fun (fst, snd) -> (fst, snd - 1))
-
-            let (zeroCounts: int list array) = Array.zeroCreate matrixLeft.RowCount
-
-            { 0 .. matrixLeft.RowCount - 1 }
-            |> Seq.iter2
-                (fun edges i ->
-                    zeroCounts.[i] <-
-                        leftColumns.[fst edges..snd edges]
-                        |> Array.toList
-                        |> List.insertAt 0 -1
-                        |> List.insertAt (nnzInRows.[i] + 1) matrixLeft.ColumnCount
-                        |> List.pairwise
-                        |> List.map (fun (fstCol, sndCol) -> sndCol - fstCol - 1))
-                rowsEdges
+            let pairsOfRowPointers =
+                pairwise queue DeviceOnly matrixLeft.RowPointers
+                |> Option.defaultWith
+                    (fun () -> failwith "The state of the matrix is broken. The length of the rowPointers must be >= 2")
+
+            mapInPlace queue pairsOfRowPointers
+
+            let rowBoundaries = pairsOfRowPointers.ToHostAndFree queue
+
+            let zeroCounts =
+                countZeroElements queue matrixLeft rowBoundaries
 
             let startIndex =
-                insertNonZero queue rowsEdges matrixRight matrixLeft.Values leftColumns resultMatrix
+                insertNonZero queue rowBoundaries matrixRight matrixLeft.Values leftColumns resultMatrix
 
             matrixZero
             |> Option.iter (fun m -> insertZero queue startIndex zeroCounts m resultMatrix)