diff --git a/Project.toml b/Project.toml
index 819831d..ce6eb39 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.2.2"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
diff --git a/src/HMatrices.jl b/src/HMatrices.jl
index 72dec53..f4b338b 100644
--- a/src/HMatrices.jl
+++ b/src/HMatrices.jl
@@ -10,6 +10,7 @@ using RecipesBase
 using Distributed
 using Base.Threads
 using AbstractTrees
+using DataFlowTasks
 
 """
     getblock!(block,K,irange,jrange)
@@ -65,6 +66,23 @@ include("multiplication.jl")
 include("triangular.jl")
 include("lu.jl")
 
+# interface of DataFlowTasks
+function DataFlowTasks.memory_overlap(H1::HMatrix, H2::HMatrix)
+    isempty(intersect(rowrange(H1), rowrange(H2))) && (return false)
+    isempty(intersect(colrange(H1), colrange(H2))) && (return false)
+    return true
+end
+function DataFlowTasks.memory_overlap(H1::HMatrix, pairs::Vector{<:NTuple{2,<:HMatrix}})
+    for (A, B) in pairs
+        DataFlowTasks.memory_overlap(H1, A) && (return true)
+        DataFlowTasks.memory_overlap(H1, B) && (return true)
+    end
+    return false
+end
+function DataFlowTasks.memory_overlap(pairs::Vector{<:NTuple{2,<:HMatrix}}, H::HMatrix)
+    return DataFlowTasks.memory_overlap(H, pairs)
+end
+
 export ClusterTree,
     CardinalitySplitter,
     DyadicSplitter,
diff --git a/src/hmatrix.jl b/src/hmatrix.jl
index a3ad640..5429482 100644
--- a/src/hmatrix.jl
+++ b/src/hmatrix.jl
@@ -121,7 +121,9 @@ function _getcol!(col, adjH::Adjoint{<:Any,<:HMatrix}, j, piv)
         shift = pivot(adjH) .- 1
         jl = j - shift[2]
         irange = rowrange(adjH) .- (piv[1] - 1)
-        getcol!(view(col, irange), data(adjH), jl)
+        H = parent(adjH)
+        d = data(H)
+        getcol!(view(col, irange), adjoint(d), jl)
     end
     for child in children(adjH)
         if j ∈ colrange(child)
@@ -421,7 +423,7 @@ function _assemble_dense_block!(hmat, K)
 end
 
 hasdata(adjH::Adjoint{<:Any,<:HMatrix}) = hasdata(adjH.parent)
-data(adjH::Adjoint{<:Any,<:HMatrix}) = adjoint(data(adjH.parent))
+data(adjH::Adjoint{<:Any,<:HMatrix}) = adjoint(adjH.parent.data)
 children(adjH::Adjoint{<:Any,<:HMatrix}) = adjoint(children(adjH.parent))
 pivot(adjH::Adjoint{<:Any,<:HMatrix}) = reverse(pivot(adjH.parent))
 offset(adjH::Adjoint{<:Any,<:HMatrix}) = pivot(adjH) .- 1
diff --git a/src/lu.jl b/src/lu.jl
index f855b38..b49a5e5 100644
--- a/src/lu.jl
+++ b/src/lu.jl
@@ -30,7 +30,8 @@ function LinearAlgebra.lu!(M::HMatrix, compressor; threads = use_threads())
     buffers = [(VectorOfVectors(T), VectorOfVectors(T)) for _ in 1:Threads.nthreads()]
     _lu!(M, compressor, threads, buffers)
     # wrap the result in the LU structure
-    return LU(M, LinearAlgebra.BlasInt[], LinearAlgebra.BlasInt(0))
+    res = @dspawn LU(@R(M), LinearAlgebra.BlasInt[], LinearAlgebra.BlasInt(0)) label = "LU"
+    return fetch(res)
 end
 
 """
@@ -59,10 +60,8 @@ LinearAlgebra.lu(M::HMatrix, args...; kwargs...) = lu!(deepcopy(M), args...; kwa
 
 function _lu!(M::HMatrix, compressor, threads, bufs = nothing)
     if isleaf(M)
-        d = data(M)
-        @assert d isa Matrix
-        lu!(d, NOPIVOT())
-    else
+        @dspawn lu!(data(@RW(M)), NOPIVOT()) label = "Dense LU"
+    else # recurse on children
         @assert !hasdata(M)
         chdM = children(M)
         m, n = size(chdM)
diff --git a/src/multiplication.jl b/src/multiplication.jl
index f2e7f28..59e2308 100644
--- a/src/multiplication.jl
+++ b/src/multiplication.jl
@@ -17,7 +17,7 @@ function hmul!(C::T, A::T, B::T, a, b, compressor, bufs_ = nothing) where {T<:HM
     b == true || rmul!(C, b)
     dict = IdDict{T,Vector{NTuple{2,T}}}()
     _plan_dict!(dict, C, A, B)
-    _hmul!(C, compressor, dict, a, nothing, bufs)
+    _hmul!(C, compressor, dict, a, bufs)
     return C
 end
 
@@ -42,43 +42,131 @@ function _plan_dict!(dict, C::T, A::T, B::T) where {T<:HMatrix}
     return dict
 end
 
-function _hmul!(C::HMatrix, compressor, dict, a, R, bufs)
-    execute_node!(C, compressor, dict, a, R, bufs)
-    shift = pivot(C) .- 1
+# function _hmul!(C::HMatrix, compressor, dict, a, bufs)
+#     # recursive multiplication function. For the node C, if it is a leaf, you
+#     # add the contribution of all the pairs in the dictionary and P directly to
+#     # it. If it is not a leaf, build an approximation of P + all the pairs, and
+#     # pass it down to the children.
+#     T = typeof(C)
+#     S = eltype(C)
+#     pairs = get(dict, C, Tuple{T,T}[])
+#     if isleaf(C)
+#         if isadmissible(C)
+#             @dspawn begin
+#                 D = data(C)::RkMatrix{S}
+#                 L = MulLinearOp(D, P, pairs, a)
+#                 id = Threads.threadid()
+#                 R = compressor(L, axes(L, 1), axes(L, 2), bufs[id])
+#                 C.data = R
+#             end
+#         else
+#             M = data(C)::Matrix{S}
+#             for (A, B) in pairs
+#                 _mul_dense!(M, A, B, a)
+#             end
+#             isnothing(P) || mul!(M, P.A, adjoint(P.B), true, true)
+#         end
+#     else
+#         if isempty(pairs)
+#             Pnew = P
+#         else
+#             L = MulLinearOp(nothing, P, pairs, a)
+#             id = Threads.threadid()
+#             Pnew = compressor(L, axes(L, 1), axes(L, 2), bufs[id])
+#         end
+#         shift = pivot(C) .- 1
+#         for chd in children(C)
+#             if isnothing(Pnew)
+#                 _hmul!(chd, compressor, dict, a, nothing, bufs)
+#             else
+#                 irange = rowrange(chd) .- shift[1]
+#                 jrange = colrange(chd) .- shift[2]
+#                 # Rv = hasdata(C) ? view(Rp,irange,jrange) : nothing
+#                 block  = RkMatrix(Pnew.A[irange, :], Pnew.B[jrange, :])
+#                 _hmul!(chd, compressor, dict, a, block, bufs)
+#             end
+#         end
+#     end
+#     return C
+# end
+
+function _hmul!(C::T, compressor, dict, a, bufs) where {T<:HMatrix}
+    pairs = get(dict, C, Tuple{T,T}[])
+    if isleaf(C) && !isempty(pairs)
+        if isadmissible(C)
+            @dspawn begin
+                @RW C
+                @R pairs
+                L = MulLinearOp(data(C), nothing, pairs, a)
+                id = Threads.threadid()
+                R = compressor(L, axes(L, 1), axes(L, 2), bufs[id])
+                setdata!(C, R)
+            end label = "admissible leaf $(size(C)) ($(length(pairs)))"
+        else
+            @dspawn begin
+                @RW C
+                @R pairs
+                d = data(C)
+                for (A, B) in pairs
+                    _mul_dense!(d, A, B, a)
+                end
+            end label = "non-admissible leaf"
+        end
+    elseif !isempty(pairs)
+        # internal node: compute low rank approximation
+        Rtask = @dspawn begin
+            @R pairs
+            L = MulLinearOp(nothing, nothing, pairs, a)
+            id = Threads.threadid()
+            compressor(L, axes(L, 1), axes(L, 2), bufs[id])
+        end label = "internal node"
+        # move low rank data to leaves
+        add_to_leaves!(C, Rtask, compressor, bufs)
+    end
+    # continue with children of C
     for chd in children(C)
-        irange = rowrange(chd) .- shift[1]
-        jrange = colrange(chd) .- shift[2]
-        Rp     = data(C)
-        Rv     = hasdata(C) ? RkMatrix(Rp.A[irange, :], Rp.B[jrange, :]) : nothing
-        # Rv = hasdata(C) ? view(Rp,irange,jrange) : nothing
-        _hmul!(chd, compressor, dict, a, Rv, bufs)
+        _hmul!(chd, compressor, dict, a, bufs)
     end
-    isleaf(C) || (setdata!(C, nothing))
     return C
 end
 
-# non-recursive execution
-function execute_node!(C::HMatrix, compressor, dict, a, R, bufs)
-    T = typeof(C)
-    S = eltype(C)
-    pairs = get(dict, C, Tuple{T,T}[])
-    isnothing(R) && isempty(pairs) && (return C)
-    if isleaf(C) && !isadmissible(C)
-        d = data(C)::Matrix{S}
-        for (A, B) in pairs
-            _mul_dense!(d, A, B, a)
-        end
-        # isnothing(R) || axpy!(true, R, d)
-        isnothing(R) || mul!(d, R.A, adjoint(R.B), true, true)
+"""
+    add_to_leaves(H::HMatrix,R,compressor)
+
+Add `R` to the leaves of `H`.
+"""
+function add_to_leaves!(H::HMatrix, Rtask, compressor, bufs)
+    shift = pivot(H) .- 1
+    return _add_to_leaves!(H, Rtask, compressor, shift, bufs)
+end
+function _add_to_leaves!(node, Rtask, compressor, shift, bufs)
+    S = typeof(node)
+    T = eltype(node)
+    if isleaf(node)
+        @dspawn begin
+            @RW node
+            R = fetch(Rtask)::RkMatrix{T}
+            irange = rowrange(node) .- shift[1]
+            jrange = colrange(node) .- shift[2]
+            Rv = RkMatrix(R.A[irange, :], R.B[jrange, :])
+            # Rv = view(R,irange,jrange)
+            if isadmissible(node)
+                Rl = data(node)::RkMatrix{T}
+                L = MulLinearOp(Rl, Rv, Tuple{S,S}[], true)
+                id = Threads.threadid()
+                node.data = compressor(L, axes(L, 1), axes(L, 2), bufs[id])
+            else
+                M = data(node)::Matrix{T}
+                mul!(M, Rv.A, adjoint(Rv.B), true, true)
+                # axpy!(true, Rv, M)
+            end
+        end label = "add to leaves"
     else
-        L = MulLinearOp(data(C), R, pairs, a)
-        id = Threads.threadid()
-        R = compressor(L, axes(L, 1), axes(L, 2), bufs[id])
-        id == Threads.threadid() ||
-            (@warn "thread id changed from $id to $(Threads.threadid())")
-        setdata!(C, R)
+        for child in children(node)
+            _add_to_leaves!(child, Rtask, compressor, shift, bufs)
+        end
     end
-    return C
+    return node
 end
 
 """
diff --git a/src/triangular.jl b/src/triangular.jl
index a56f032..cbb7862 100644
--- a/src/triangular.jl
+++ b/src/triangular.jl
@@ -11,7 +11,7 @@ function Base.show(io::IO, ::MIME"text/plain", L::HUnitLowerTriangular)
     return println(io, "Unit lower triangular part of $H")
 end
 
-function LinearAlgebra.ldiv!(L::HUnitLowerTriangular, B::AbstractMatrix)
+function LinearAlgebra.ldiv!(L::HUnitLowerTriangular, B::StridedMatrix)
     H = parent(L)
     if isleaf(H)
         d = data(H)
@@ -51,8 +51,12 @@ function LinearAlgebra.ldiv!(
     H = parent(L)
     @assert isclean(H)
     if isleaf(X)
-        d = data(X)
-        ldiv!(L, d)
+        @dspawn begin
+            @R H
+            @RW X
+            d = data(X)
+            ldiv!(L, d)
+        end label = "dense ldiv"
     elseif isleaf(H) # X not a leaf, but L is a leaf. This should not happen.
         error()
     else
@@ -141,8 +145,12 @@ function LinearAlgebra.rdiv!(
 )
     H = parent(U)
     if isleaf(X)
-        d = data(X)
-        rdiv!(d, U) # b <-- b/L
+        @dspawn begin
+            @RW X
+            @R H
+            d = data(X)
+            rdiv!(d, U) # b <-- b/L
+        end label = "dense rdiv"
     elseif isleaf(H)
         error()
     else
diff --git a/test/compressor_test.jl b/test/compressor_test.jl
index 82e686e..61b8717 100644
--- a/test/compressor_test.jl
+++ b/test/compressor_test.jl
@@ -34,16 +34,16 @@ Random.seed!(1)
         # test fast update of frobenius norm
         m, n = 10000, 1000
         r = 10
-        A = VectorOfVectors(T,m,r)
-        B = VectorOfVectors(T,n,r)
-        A.data .= rand(T, m*r)
-        B.data .= rand(T, n*r)
-        old_norm = norm(Matrix(A)*adjoint(Matrix(B)), 2)
+        A = VectorOfVectors(T, m, r)
+        B = VectorOfVectors(T, n, r)
+        A.data .= rand(T, m * r)
+        B.data .= rand(T, n * r)
+        old_norm = norm(Matrix(A) * adjoint(Matrix(B)), 2)
         a = HMatrices.newcol!(A)
-        a .= rand(T,m)
+        a .= rand(T, m)
         b = HMatrices.newcol!(B)
-        b .= rand(T,n)
-        new_norm = norm(Matrix(A)*adjoint(Matrix(B)), 2)
+        b .= rand(T, n)
+        new_norm = norm(Matrix(A) * adjoint(Matrix(B)), 2)
         @test new_norm ≈ HMatrices._update_frob_norm(old_norm, A, B)
 
         # test simple case where things are not compressible
@@ -67,9 +67,9 @@ Random.seed!(1)
         R = tsvd(M, irange, jrange)
         @test rank(R) == r
         # test recompression using QR-SVD
-        R2 = RkMatrix(hcat(R.A,R.A),hcat(R.B,R.B))
+        R2 = RkMatrix(hcat(R.A, R.A), hcat(R.B, R.B))
         @test rank(R2) == 2r
-        HMatrices.compress!(R2,tsvd)
+        HMatrices.compress!(R2, tsvd)
         @test rank(R2) == r
     end
 end
@@ -101,17 +101,17 @@ end
 
         # test fast update of frobenius norm
         T = SMatrix{3,3,Float64,9}
-        A = VectorOfVectors(T,m,r)
-        B = VectorOfVectors(T,n,r)
-        A.data .= rand(T, m*r)
-        B.data .= rand(T, n*r)
-        R = Matrix(A)*collect(adjoint(Matrix(B)))
+        A = VectorOfVectors(T, m, r)
+        B = VectorOfVectors(T, n, r)
+        A.data .= rand(T, m * r)
+        B.data .= rand(T, n * r)
+        R = Matrix(A) * collect(adjoint(Matrix(B)))
         old_norm = norm(R, 2)
         a = HMatrices.newcol!(A)
-        a .= rand(T,m)
+        a .= rand(T, m)
         b = HMatrices.newcol!(B)
-        b .= rand(T,n)
-        R = Matrix(A)*collect(adjoint(Matrix(B)))
+        b .= rand(T, n)
+        R = Matrix(A) * collect(adjoint(Matrix(B)))
         new_norm = norm(R, 2)
         HMatrices._update_frob_norm(old_norm, A, B)
         @test_broken new_norm ≈ HMatrices._update_frob_norm(old_norm, A, B)