diff --git a/Project.toml b/Project.toml
index ebe8f4ec..1c6fc038 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AbstractGPs"
 uuid = "99985d1d-32ba-4be9-9821-2ec096f28918"
 authors = ["JuliaGaussianProcesses Team"]
-version = "0.5.16"
+version = "0.5.17"
 
 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
diff --git a/src/AbstractGPs.jl b/src/AbstractGPs.jl
index efa6735a..b0cd6885 100644
--- a/src/AbstractGPs.jl
+++ b/src/AbstractGPs.jl
@@ -27,6 +27,7 @@ export rand!,
     mean_vector,
     marginals,
     logpdf,
+    approx_log_evidence,
     elbo,
     dtc,
     posterior,
diff --git a/src/abstract_gp.jl b/src/abstract_gp.jl
index 9163efc6..2fb071b3 100644
--- a/src/abstract_gp.jl
+++ b/src/abstract_gp.jl
@@ -85,3 +85,30 @@ for (m, f) in [
         )
     end
 end
+
+"""
+    approx_log_evidence(approx, lfx::LatentFiniteGP, ys)
+
+Compute an approximation to the log of the marginal likelihood (also known as
+"evidence") under the given `approx`imation to the posterior. The return value
+of `approx_log_evidence` can be used to optimise the hyperparameters of `lfx`.
+"""
+function approx_log_evidence end
+
+"""
+    posterior(fx::FiniteGP, y::AbstractVector{<:Real})
+    posterior(approx, fx::FiniteGP, y::AbstractVector{<:Real})
+    posterior(approx, lfx::LatentFiniteGP, y::AbstractVector)
+
+Construct the posterior distribution over the latent Gaussian process (`fx.f`
+or `lfx.fx.f`), given the observations `y` corresponding to the process's
+finite projection (`fx` or `lfx`).
+
+In the two-argument form, this describes exact GP regression with `y` observed
+under a Gaussian likelihood, and returns a `PosteriorGP`.
+
+In the three-argument form, the first argument specifies the approximation to
+be used (e.g. `VFE`, or approximations defined in other packages such as
+ApproximateGPs.jl), and returns an `ApproxPosteriorGP`.
+"""
+function posterior end
diff --git a/src/deprecations.jl b/src/deprecations.jl
index 459fe675..49975b7e 100644
--- a/src/deprecations.jl
+++ b/src/deprecations.jl
@@ -5,3 +5,6 @@
 @deprecate sampleplot!(plt::RecipesBase.AbstractPlot, gp::FiniteGP, n::Int; kwargs...) sampleplot!(
     plt, gp; samples=n, kwargs...
 )
+
+@deprecate elbo(dtc::DTC, fx, y) approx_log_evidence(dtc, fx, y)
+@deprecate dtc(vfe::Union{VFE,DTC}, fx, y) approx_log_evidence(vfe, fx, y)
diff --git a/src/exact_gpr_posterior.jl b/src/exact_gpr_posterior.jl
index fd0d265b..5cbe98f5 100644
--- a/src/exact_gpr_posterior.jl
+++ b/src/exact_gpr_posterior.jl
@@ -3,10 +3,18 @@ struct PosteriorGP{Tprior,Tdata} <: AbstractGP
     data::Tdata
 end
 
+struct ExactInference end
+
+posterior(::ExactInference, fx::FiniteGP, y::AbstractVector{<:Real}) = posterior(fx, y)
+
+function approx_log_evidence(::ExactInference, fx::FiniteGP, y::AbstractVector{<:Real})
+    return logpdf(fx, y)
+end
+
 """
     posterior(fx::FiniteGP, y::AbstractVector{<:Real})
 
-Construct the posterior distribution over `fx.f` given observations `y` at `x` made under
+Construct the posterior distribution over `fx.f` given observations `y` at `fx.x` made under
 noise `fx.Σy`. This is another `AbstractGP` object. See chapter 2 of [1] for a recap on
 exact inference in GPs. This posterior process has mean function
 ```julia
diff --git a/src/sparse_approximations.jl b/src/sparse_approximations.jl
index 5393a35f..99b25f35 100644
--- a/src/sparse_approximations.jl
+++ b/src/sparse_approximations.jl
@@ -13,7 +13,14 @@ struct VFE{Tfz<:FiniteGP}
     fz::Tfz
 end
 
-const DTC = VFE
+"""
+    DTC(fz::FiniteGP)
+
+Similar to `VFE`, but uses a different objective for `approx_log_evidence`.
+""" +struct DTC{Tfz<:FiniteGP} + fz::Tfz +end struct ApproxPosteriorGP{Tapprox,Tprior,Tdata} <: AbstractGP approx::Tapprox @@ -48,7 +55,7 @@ true processes". In: Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics. 2009. """ -function posterior(vfe::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) +function posterior(vfe::Union{VFE,DTC}, fx::FiniteGP, y::AbstractVector{<:Real}) @assert vfe.fz.f === fx.f U_y = _cholesky(_symmetric(fx.Σy)).U @@ -69,7 +76,7 @@ end """ function update_posterior( - f_post_approx::ApproxPosteriorGP{<:VFE}, + f_post_approx::ApproxPosteriorGP{<:Union{VFE,DTC}}, fx::FiniteGP, y::AbstractVector{<:Real} ) @@ -78,7 +85,9 @@ Update the `ApproxPosteriorGP` given a new set of observations. Here, we retain set of pseudo-points. """ function update_posterior( - f_post_approx::ApproxPosteriorGP{<:VFE}, fx::FiniteGP, y::AbstractVector{<:Real} + f_post_approx::ApproxPosteriorGP{<:Union{VFE,DTC}}, + fx::FiniteGP, + y::AbstractVector{<:Real}, ) @assert f_post_approx.prior === fx.f @@ -111,14 +120,14 @@ end """ function update_posterior( - f_post_approx::ApproxPosteriorGP{<:VFE}, + f_post_approx::ApproxPosteriorGP{<:Union{VFE,DTC}}, z::FiniteGP, ) Update the `ApproxPosteriorGP` given a new set of pseudo-points to append to the existing set of pseudo-points. """ -function update_posterior(f_post_approx::ApproxPosteriorGP{<:VFE}, fz::FiniteGP) +function update_posterior(f_post_approx::ApproxPosteriorGP{<:Union{VFE,DTC}}, fz::FiniteGP) @assert f_post_approx.prior === fz.f z_old = inducing_points(f_post_approx) @@ -161,48 +170,56 @@ function update_posterior(f_post_approx::ApproxPosteriorGP{<:VFE}, fz::FiniteGP) x=f_post_approx.data.x, Σy=f_post_approx.data.Σy, ) - return ApproxPosteriorGP(VFE(fz_new), f_post_approx.prior, cache) + return ApproxPosteriorGP( + _update_approx(f_post_approx.approx, fz_new), f_post_approx.prior, cache + ) end +_update_approx(vfe::VFE, fz_new::FiniteGP) = VFE(fz_new) +_update_approx(dtc::DTC, fz_new::FiniteGP) = DTC(fz_new) + # AbstractGP interface implementation. 
-function Statistics.mean(f::ApproxPosteriorGP{<:VFE}, x::AbstractVector) +function Statistics.mean(f::ApproxPosteriorGP{<:Union{VFE,DTC}}, x::AbstractVector) return mean(f.prior, x) + cov(f.prior, x, inducing_points(f)) * f.data.α end -function Statistics.cov(f::ApproxPosteriorGP{<:VFE}, x::AbstractVector) +function Statistics.cov(f::ApproxPosteriorGP{<:Union{VFE,DTC}}, x::AbstractVector) A = f.data.U' \ cov(f.prior, inducing_points(f), x) return cov(f.prior, x) - At_A(A) + Xt_invA_X(f.data.Λ_ε, A) end -function Statistics.var(f::ApproxPosteriorGP{<:VFE}, x::AbstractVector) +function Statistics.var(f::ApproxPosteriorGP{<:Union{VFE,DTC}}, x::AbstractVector) A = f.data.U' \ cov(f.prior, inducing_points(f), x) return var(f.prior, x) - diag_At_A(A) + diag_Xt_invA_X(f.data.Λ_ε, A) end -function Statistics.cov(f::ApproxPosteriorGP{<:VFE}, x::AbstractVector, y::AbstractVector) +function Statistics.cov( + f::ApproxPosteriorGP{<:Union{VFE,DTC}}, x::AbstractVector, y::AbstractVector +) A_zx = f.data.U' \ cov(f.prior, inducing_points(f), x) A_zy = f.data.U' \ cov(f.prior, inducing_points(f), y) return cov(f.prior, x, y) - A_zx'A_zy + Xt_invA_Y(A_zx, f.data.Λ_ε, A_zy) end -function StatsBase.mean_and_cov(f::ApproxPosteriorGP{<:VFE}, x::AbstractVector) +function StatsBase.mean_and_cov(f::ApproxPosteriorGP{<:Union{VFE,DTC}}, x::AbstractVector) A = f.data.U' \ cov(f.prior, inducing_points(f), x) m_post = mean(f.prior, x) + A' * f.data.m_ε C_post = cov(f.prior, x) - At_A(A) + Xt_invA_X(f.data.Λ_ε, A) return m_post, C_post end -function StatsBase.mean_and_var(f::ApproxPosteriorGP{<:VFE}, x::AbstractVector) +function StatsBase.mean_and_var(f::ApproxPosteriorGP{<:Union{VFE,DTC}}, x::AbstractVector) A = f.data.U' \ cov(f.prior, inducing_points(f), x) m_post = mean(f.prior, x) + A' * f.data.m_ε c_post = var(f.prior, x) - diag_At_A(A) + diag_Xt_invA_X(f.data.Λ_ε, A) return m_post, c_post end -inducing_points(f::ApproxPosteriorGP{<:VFE}) = f.approx.fz.x +inducing_points(f::ApproxPosteriorGP{<:Union{VFE,DTC}}) = f.approx.fz.x """ + approx_log_evidence(vfe::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) elbo(vfe::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) The Titsias Evidence Lower BOund (ELBO) [1]. `y` are observations of `fx`, and `v.z` @@ -228,14 +245,16 @@ true processes". In: Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics. 2009. """ -function elbo(vfe::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) +function approx_log_evidence(vfe::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) @assert vfe.fz.f === fx.f - _dtc, A = _compute_intermediates(fx, y, vfe.fz) - return _dtc - (tr_Cf_invΣy(fx, fx.Σy) - sum(abs2, A)) / 2 + dtc_objective, A = _compute_intermediates(fx, y, vfe.fz) + return dtc_objective - (tr_Cf_invΣy(fx, fx.Σy) - sum(abs2, A)) / 2 end +elbo(vfe::VFE, fx, y) = approx_log_evidence(vfe, fx, y) + """ - dtc(v::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) + approx_log_evidence(dtc::DTC, fx::FiniteGP, y::AbstractVector{<:Real}) The Deterministic Training Conditional (DTC) [1]. `y` are observations of `fx`, and `v.z` are inducing points. @@ -248,11 +267,11 @@ julia> x = randn(1000); julia> z = range(-5.0, 5.0; length=256); -julia> v = VFE(f(z)); +julia> d = DTC(f(z)); julia> y = rand(f(x, 0.1)); -julia> isapprox(dtc(v, f(x, 0.1), y), logpdf(f(x, 0.1), y); atol=1e-6, rtol=1e-6) +julia> isapprox(approx_log_evidence(d, f(x, 0.1), y), logpdf(f(x, 0.1), y); atol=1e-6, rtol=1e-6) true ``` @@ -260,13 +279,13 @@ true Sparse Gaussian Process Regression". 
In: Proceedings of the Ninth International Workshop on Artificial Intelligence and Statistics. 2003 """ -function dtc(vfe::VFE, fx::FiniteGP, y::AbstractVector{<:Real}) - @assert vfe.fz.f === fx.f - _dtc, _ = _compute_intermediates(fx, y, vfe.fz) - return _dtc +function approx_log_evidence(dtc::DTC, fx::FiniteGP, y::AbstractVector{<:Real}) + @assert dtc.fz.f === fx.f + dtc_objective, _ = _compute_intermediates(fx, y, dtc.fz) + return dtc_objective end -# Factor out computations common to the `elbo` and `dtc`. +# Factor out computations of `approx_log_evidence` common to `VFE` and `DTC` function _compute_intermediates(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP) length(fx) == length(y) || throw( DimensionMismatch( diff --git a/test/sparse_approximations.jl b/test/sparse_approximations.jl index 3187e8a5..375ee2a7 100644 --- a/test/sparse_approximations.jl +++ b/test/sparse_approximations.jl @@ -1,133 +1,136 @@ -@testset "approx_posterior_gp" begin - rng = MersenneTwister(123456) - N_cond = 3 - N_a = 5 - N_b = 6 - - # Specify prior. - f = GP(sin, Matern32Kernel()) - - # Sample from prior. - x = collect(range(-1.0, 1.0; length=N_cond)) - fx = f(x, 1e-15) - y = rand(rng, fx) - - # Construct posterior. - f_post = posterior(fx, y) - - # Construct optimal approximate posterior. - f_approx_post = posterior(VFE(f(x, 1e-12)), fx, y) - - # Verify that approximate posterior ≈ posterior at various inputs. - x_test = randn(rng, 100) - @test mean(f_post, x_test) ≈ mean(f_approx_post, x_test) - @test cov(f_post, x_test) ≈ cov(f_approx_post, x_test) - - # Verify that AbstractGP interface is implemented fully and consistently. - a = collect(range(-1.0, 1.0; length=N_a)) - b = randn(rng, N_b) - TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) - - @testset "update_posterior (new observation)" begin - rng = MersenneTwister(1) - X = rand(rng, 10) - y = rand(rng, 10) - Z = rand(rng, 4) - - f = GP(SqExponentialKernel()) - - # online learning - p_fx1 = posterior(VFE(f(Z)), f(X[1:7], 0.1), y[1:7]) - u_p_fx1 = update_posterior(p_fx1, f(X[8:10], 0.1), y[8:10]) - - # batch learning - p_fx2 = posterior(VFE(f(Z)), f(X, 0.1), y) - - @test inducing_points(u_p_fx1) ≈ inducing_points(p_fx2) atol = 1e-5 - @test u_p_fx1.data.m_ε ≈ p_fx2.data.m_ε atol = 1e-5 - @test u_p_fx1.data.Λ_ε.U ≈ p_fx2.data.Λ_ε.U atol = 1e-5 - @test u_p_fx1.data.U ≈ p_fx2.data.U atol = 1e-5 - @test u_p_fx1.data.α ≈ p_fx2.data.α atol = 1e-5 - @test u_p_fx1.data.b_y ≈ p_fx2.data.b_y atol = 1e-5 - @test u_p_fx1.data.B_εf ≈ p_fx2.data.B_εf atol = 1e-5 - @test u_p_fx1.data.x ≈ p_fx2.data.x atol = 1e-5 - @test u_p_fx1.data.Σy ≈ p_fx2.data.Σy atol = 1e-5 - end - - @testset "update_posterior (new pseudo-points)" begin - rng = MersenneTwister(1) - X = rand(rng, 10) - y = rand(rng, 10) - Z1 = rand(rng, 4) - Z2 = rand(rng, 3) - Z = vcat(Z1, Z2) - - f = GP(SqExponentialKernel()) - - # Sequentially adding pseudo points - p_fx1 = posterior(VFE(f(Z1)), f(X, 0.1), y) - u_p_fx1 = update_posterior(p_fx1, f(Z2)) - - # Adding all pseudo-points at once - p_fx2 = posterior(VFE(f(Z)), f(X, 0.1), y) - - @test inducing_points(u_p_fx1) ≈ inducing_points(p_fx2) atol = 1e-5 - @test u_p_fx1.data.m_ε ≈ p_fx2.data.m_ε atol = 1e-5 - @test u_p_fx1.data.Λ_ε.U ≈ p_fx2.data.Λ_ε.U atol = 1e-5 - @test u_p_fx1.data.U ≈ p_fx2.data.U atol = 1e-5 - @test u_p_fx1.data.α ≈ p_fx2.data.α atol = 1e-2 rtol = 1e-2 - @test u_p_fx1.data.b_y ≈ p_fx2.data.b_y atol = 1e-5 - @test u_p_fx1.data.B_εf ≈ p_fx2.data.B_εf atol = 1e-5 - @test u_p_fx1.data.x ≈ p_fx2.data.x atol = 
1e-5 - @test u_p_fx1.data.Σy ≈ p_fx2.data.Σy atol = 1e-5 - end - - @testset "elbo / dtc" begin - x = collect(Float64, range(-1.0, 1.0; length=N_cond)) - f = GP(SqExponentialKernel()) - fx = f(x, 0.1) +@testset "sparse_approximations" begin + @testset "approx_posterior_gp[$(ApproxType)]" for ApproxType in [VFE, DTC] + rng = MersenneTwister(123456) + N_cond = 3 + N_a = 5 + N_b = 6 + + # Specify prior. + f = GP(sin, Matern32Kernel()) + + # Sample from prior. + x = collect(range(-1.0, 1.0; length=N_cond)) + fx = f(x, 1e-15) y = rand(rng, fx) - # Ensure that the elbo is close to the logpdf when appropriate. - @test elbo(VFE(f(x)), fx, y) isa Real - @test elbo(VFE(f(x)), fx, y) ≈ logpdf(fx, y) - @test elbo(VFE(f(x .+ randn(rng, N_cond))), fx, y) < logpdf(fx, y) - - # Ensure that the dtc is close to the logpdf when appropriate. - @test dtc(VFE(f(x)), fx, y) isa Real - @test dtc(VFE(f(x)), fx, y) ≈ logpdf(fx, y) + # Construct posterior. + f_post = posterior(fx, y) + + # Construct optimal approximate posterior. + f_approx_post = posterior(ApproxType(f(x, 1e-12)), fx, y) + + # Verify that approximate posterior ≈ posterior at various inputs. + x_test = randn(rng, 100) + @test mean(f_post, x_test) ≈ mean(f_approx_post, x_test) + @test cov(f_post, x_test) ≈ cov(f_approx_post, x_test) + + # Verify that AbstractGP interface is implemented fully and consistently. + a = collect(range(-1.0, 1.0; length=N_a)) + b = randn(rng, N_b) + TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) + + @testset "update_posterior (new observation)" begin + rng = MersenneTwister(1) + X = rand(rng, 10) + y = rand(rng, 10) + Z = rand(rng, 4) + + f = GP(SqExponentialKernel()) + + # online learning + p_fx1 = posterior(ApproxType(f(Z)), f(X[1:7], 0.1), y[1:7]) + u_p_fx1 = update_posterior(p_fx1, f(X[8:10], 0.1), y[8:10]) + + # batch learning + p_fx2 = posterior(ApproxType(f(Z)), f(X, 0.1), y) + + @test inducing_points(u_p_fx1) ≈ inducing_points(p_fx2) atol = 1e-5 + @test u_p_fx1.data.m_ε ≈ p_fx2.data.m_ε atol = 1e-5 + @test u_p_fx1.data.Λ_ε.U ≈ p_fx2.data.Λ_ε.U atol = 1e-5 + @test u_p_fx1.data.U ≈ p_fx2.data.U atol = 1e-5 + @test u_p_fx1.data.α ≈ p_fx2.data.α atol = 1e-5 + @test u_p_fx1.data.b_y ≈ p_fx2.data.b_y atol = 1e-5 + @test u_p_fx1.data.B_εf ≈ p_fx2.data.B_εf atol = 1e-5 + @test u_p_fx1.data.x ≈ p_fx2.data.x atol = 1e-5 + @test u_p_fx1.data.Σy ≈ p_fx2.data.Σy atol = 1e-5 + end + + @testset "update_posterior (new pseudo-points)" begin + rng = MersenneTwister(1) + X = rand(rng, 10) + y = rand(rng, 10) + Z1 = rand(rng, 4) + Z2 = rand(rng, 3) + Z = vcat(Z1, Z2) + + f = GP(SqExponentialKernel()) + + # Sequentially adding pseudo points + p_fx1 = posterior(ApproxType(f(Z1)), f(X, 0.1), y) + u_p_fx1 = update_posterior(p_fx1, f(Z2)) + + # Adding all pseudo-points at once + p_fx2 = posterior(ApproxType(f(Z)), f(X, 0.1), y) + + @test inducing_points(u_p_fx1) ≈ inducing_points(p_fx2) atol = 1e-5 + @test u_p_fx1.data.m_ε ≈ p_fx2.data.m_ε atol = 1e-5 + @test u_p_fx1.data.Λ_ε.U ≈ p_fx2.data.Λ_ε.U atol = 1e-5 + @test u_p_fx1.data.U ≈ p_fx2.data.U atol = 1e-5 + @test u_p_fx1.data.α ≈ p_fx2.data.α atol = 1e-2 rtol = 1e-2 + @test u_p_fx1.data.b_y ≈ p_fx2.data.b_y atol = 1e-5 + @test u_p_fx1.data.B_εf ≈ p_fx2.data.B_εf atol = 1e-5 + @test u_p_fx1.data.x ≈ p_fx2.data.x atol = 1e-5 + @test u_p_fx1.data.Σy ≈ p_fx2.data.Σy atol = 1e-5 + end + + @testset "approx_log_evidence" begin + x = collect(Float64, range(-1.0, 1.0; length=N_cond)) + f = GP(SqExponentialKernel()) + fx = f(x, 0.1) + y = rand(rng, fx) + + # Ensure that the 
elbo/dtc objective is close to the logpdf when appropriate.
+            @test approx_log_evidence(ApproxType(f(x)), fx, y) isa Real
+            @test approx_log_evidence(ApproxType(f(x)), fx, y) ≈ logpdf(fx, y)
+
+            if ApproxType === VFE
+                @test elbo(ApproxType(f(x)), fx, y) ==
+                    approx_log_evidence(ApproxType(f(x)), fx, y)
+                @test elbo(ApproxType(f(x .+ randn(rng, N_cond))), fx, y) < logpdf(fx, y)
+            end
+        end
+
+        @testset "Type Stability - $T" for T in [Float64, Float32]
+            x = collect(T, range(-1.0, 1.0; length=N_cond))
+            f = GP(T(0), SqExponentialKernel())
+            fx = f(x, T(0.1))
+            y = rand(rng, fx)
+            jitter = T(1e-12)
+
+            @test approx_log_evidence(ApproxType(f(x, jitter)), fx, y) isa T
+            ApproxType === VFE && @test elbo(ApproxType(f(x, jitter)), fx, y) isa T
+
+            post = posterior(ApproxType(f(x, jitter)), fx, y)
+            p_fx = post(x, jitter)
+
+            @test rand(rng, p_fx) isa Vector{T}
+            @test logpdf(p_fx, y) isa T
+        end
     end
 
-    @testset "Type Stability - $T" for T in [Float64, Float32]
-        x = collect(T, range(-1.0, 1.0; length=N_cond))
-        f = GP(T(0), SqExponentialKernel())
-        fx = f(x, T(0.1))
-        y = rand(rng, fx)
-        jitter = T(1e-12)
-
-        @test elbo(VFE(f(x, jitter)), fx, y) isa T
-        @test dtc(VFE(f(x, jitter)), fx, y) isa T
-
-        post = posterior(VFE(f(x, jitter)), fx, y)
-        p_fx = post(x, jitter)
-
-        @test rand(rng, p_fx) isa Vector{T}
-        @test logpdf(p_fx, y) isa T
-    end
-end
+    @testset "tr_Cf_invΣy" begin
+        N = 3
+        x = rand(N)
+        σ² = 1.234
+        Σy_Diagonal = Diagonal(Fill(σ², N))
+        Σy_ScalMat = ScalMat(N, σ²)
+        # Σy_dense = _to_psd(randn(N, N - 2)) # dense observation covariance not currently implemented for sparse approximation, and might be slower than exact inference anyway
+        f = GP(SqExponentialKernel())
 
-@testset "tr_Cf_invΣy" begin
-    N = 3
-    x = rand(N)
-    σ² = 1.234
-    Σy_Diagonal = Diagonal(Fill(σ², N))
-    Σy_ScalMat = ScalMat(N, σ²)
-    # Σy_dense = _to_psd(randn(N, N - 2)) # dense observation covariance not currently implemented for sparse approximation, and might be slower than exact inference anyways
-    f = GP(SqExponentialKernel())
-
-    for Σy in (Σy_Diagonal, Σy_ScalMat)
-        fx = f(x, Σy)
-        Cf = cov(f, x)
-        @test AbstractGPs.tr_Cf_invΣy(fx, Σy) ≈ tr(Cf / Matrix(Σy))
+        for Σy in (Σy_Diagonal, Σy_ScalMat)
+            fx = f(x, Σy)
+            Cf = cov(f, x)
+            @test AbstractGPs.tr_Cf_invΣy(fx, Σy) ≈ tr(Cf / Matrix(Σy))
+        end
     end
 end
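
Reviewer note: a minimal usage sketch of the API this diff settles on. It only exercises names the diff introduces or keeps (`posterior`, `approx_log_evidence`, `elbo`, `VFE`, `DTC`, and the unexported `ExactInference`); the kernel, data, and inducing-point locations below are made-up placeholders, not part of the change.

```julia
using AbstractGPs, Random

rng = MersenneTwister(0)
f = GP(SqExponentialKernel())    # prior over functions
x = randn(rng, 50)               # placeholder training inputs
y = rand(rng, f(x, 0.1))         # placeholder noisy observations
z = range(-2.0, 2.0; length=10)  # placeholder inducing-point locations

# Exact GP regression: the two-argument `posterior`, with `logpdf` as the
# exact log evidence. The `ExactInference` marker makes exact inference fit
# the same three-argument interface (it is not exported, hence the prefix).
f_exact = posterior(f(x, 0.1), y)
approx_log_evidence(AbstractGPs.ExactInference(), f(x, 0.1), y) == logpdf(f(x, 0.1), y)

# Sparse approximations: `VFE` and `DTC` share the same `posterior`;
# `approx_log_evidence` dispatches to the ELBO or the DTC objective.
for approx in (VFE(f(z)), DTC(f(z)))
    f_approx = posterior(approx, f(x, 0.1), y)
    @show approx_log_evidence(approx, f(x, 0.1), y)
end

# `elbo` remains a first-class alias for the VFE objective; `dtc(...)` and
# `elbo(::DTC, ...)` now go through `@deprecate` paths.
elbo(VFE(f(z)), f(x, 0.1), y) == approx_log_evidence(VFE(f(z)), f(x, 0.1), y)
```

The split keeps the posterior computation shared (both approximations go through `_compute_intermediates`), while `approx_log_evidence(vfe::VFE, ...)` subtracts the trace correction term from the DTC objective; that is the only behavioural difference between `VFE` and `DTC`.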