diff --git a/ADBench/run-all.ps1 b/ADBench/run-all.ps1
old mode 100644
new mode 100755
index 4893cc24..c97384d7
--- a/ADBench/run-all.ps1
+++ b/ADBench/run-all.ps1
@@ -29,52 +29,52 @@ https://github.com/awf/ADBench
 
 #>
 
-param(# Which build to test.  
+param(# Which build to test.
       # Builds should leave a script file 'cmake-vars-$buildtype.ps1' in the ADBench directory.
       # which sets $bindir to the build directory,
       # And if only some D,K are valid for GMM, sets $gmm_d_vals, and $gmm_k_vals
       [string]$buildtype="Release",
-      
-      # Estimated time of accurate result achievement. 
-      # A runner cyclically reruns measured function until total time becomes more than that value. 
+
+      # Estimated time of accurate result achievement.
+      # A runner cyclically reruns measured function until total time becomes more than that value.
       # Supported only by the benchmark runner-based tools
       # (those with ToolType cpp, dotnet, julia, or python).
       [double]$minimum_measurable_time = 0.5,
 
-      # Maximum number of times to run the function for timing                
-      [int]$nruns_f=10,        
-      
+      # Maximum number of times to run the function for timing
+      [int]$nruns_f=10,
+
       # Maximum number of times to run the jacobian for timing
-      [int]$nruns_J=10,        
-      
+      [int]$nruns_J=10,
+
       # How many seconds to wait before we believe we have accurate timings
-      [double]$time_limit=10,  
-      
-      # Kill the test after this many seconds 
+      [double]$time_limit=10,
+
+      # Kill the test after this many seconds
       [double]$timeout=300,
 
       # Kill the test if it consumes more than this many gigabytes of RAM.
       [double]$max_memory_amount_in_gb=[double]::PositiveInfinity,
-      
-      # Where to store the ouput, defaults to tmp/ in the project root 
+
+      # Where to store the ouput, defaults to tmp/ in the project root
       [string]$tmpdir="",
-      
+
       # Repeat tests, even if output file exists
       [switch]$repeat,
-      
+
       # Repeat only failed tests
       [switch]$repeat_failures,
-      
+
       # List of tools to run
       [string[]]$tools=@(),
-      
+
       # Don't delete produced jacobians even if they're accurate
-      [switch]$keep_correct_jacobians, 
-      
+      [switch]$keep_correct_jacobians,
+
       # GMM D values to try.  Must be a subset of the list of
-      # compiled values in ADBench/cmake-vars-$buildtype.ps1                         
+      # compiled values in ADBench/cmake-vars-$buildtype.ps1
       [int[]]$gmm_d_vals_param,
-      
+
       # GMM K values to run.  As above.
       [int[]]$gmm_k_vals_param,
 
@@ -209,7 +209,7 @@ function run_command ($indent, $outfile, $timeout, $cmd) {
                 $mem = [math]::round($mem / (1024 * 1024 * 1024), 2)
                 Write-Host "${indent}Killed due to consuming $mem GB of operating memory"
                 Store-NonFatalError "Process killed due to consuming $mem GB of operating memory`n[$cmd $args]"
-                
+
                 $status = [RunCommandStatus]::OutOfMemory
                 break
             }
@@ -327,16 +327,16 @@ Class Tool {
         This will create a Tool:
         - called "Finite"
         - run from binary executables
-        - runs all four tests 
+        - runs all four tests
         - does not do GMM in separate FULL and SPLIT modes
         - doesn't require separate executables for different GMM sizes
         - does not check the correctness of the computed jacobians
 
         .NOTES
-        $objectives is an enumerable variable, 
+        $objectives is an enumerable variable,
         where each flag determines whether to run a certain objective:
         GMM, BA, HAND, LSTM
-        
+
         #>
 
         $this.name = $name
@@ -358,16 +358,16 @@ Class Tool {
         This will create a Tool:
         - called "Finite"
         - run from binary executables
-        - runs all four tests 
+        - runs all four tests
         - does not do GMM in separate FULL and SPLIT modes
         - doesn't require separate executables for different GMM sizes
         - does not check the correctness of the computed jacobians
 
         .NOTES
-        $objectives is an enumerable variable, 
+        $objectives is an enumerable variable,
         where each flag determines whether to run a certain objective:
         GMM, BA, HAND, LSTM
-        
+
         #>
 
         $this.name = $name
@@ -477,11 +477,11 @@ Class Tool {
         if ($run_command_status -eq [RunCommandStatus]::OutOfMemory) {
             $status = [RunTestStatus]::OutOfMemory
         }
-        
+
         return $status
     }
 
-    # Get postfix for tool output file name 
+    # Get postfix for tool output file name
     [string] get_out_name_postfix([string]$objective) {
         $postfix = $this.name
 
@@ -697,7 +697,10 @@ $tool_descriptors = @(
     [Tool]::new("ManualEigen", "cpp", [ObjectiveType] "GMM, BA, Hand, LSTM", $true, $default_tolerance)
     [Tool]::new("ManualEigenVector", "cpp", [ObjectiveType] "GMM", $true, $default_tolerance)
     [Tool]::new("DiffSharpModule", "dotnet", [ObjectiveType] "GMM, BA, Hand, LSTM", $true, $default_tolerance)
-    [Tool]::new("Tapenade", "cpp", [ObjectiveType] "BA, LSTM, GMM, Hand", $true, $default_tolerance)
+    [Tool]::new("Tapenade", "cpp", [ObjectiveType] "GMM, BA", $true, $default_tolerance)
+    [Tool]::new("Clad", "cpp", [ObjectiveType] "GMM, BA", $true, $default_tolerance)
+    [Tool]::new("CladLatest", "cpp", [ObjectiveType] "GMM, BA", $true, $default_tolerance)
+    [Tool]::new("CladLatestTBR", "cpp", [ObjectiveType] "GMM, BA", $true, $default_tolerance)
     [Tool]::new("PyTorch", "python", [ObjectiveType] "BA, LSTM, GMM, Hand", $true, 1e-7)
     [Tool]::new("TorchScript", "python", [ObjectiveType] "GMM", $true, 1e-7)
     [Tool]::new("Tensorflow", "python", [ObjectiveType] "BA, LSTM, GMM, Hand", $true, $default_tolerance)
diff --git a/src/cpp/modules/CMakeLists.txt b/src/cpp/modules/CMakeLists.txt
index eecc1f8c..9a58864f 100644
--- a/src/cpp/modules/CMakeLists.txt
+++ b/src/cpp/modules/CMakeLists.txt
@@ -1,5 +1,8 @@
 project("ADBench_Modules")
 
+# add_subdirectory ("clad")
+add_subdirectory ("cladLatest")
+add_subdirectory ("cladLatestTBR")
 add_subdirectory ("manual")
 add_subdirectory ("manualEigen")
 add_subdirectory ("manualEigenVector")
diff --git a/src/cpp/modules/cladLatest/CMakeLists.txt b/src/cpp/modules/cladLatest/CMakeLists.txt
new file mode 100644
index 00000000..eb70843d
--- /dev/null
+++ b/src/cpp/modules/cladLatest/CMakeLists.txt
@@ -0,0 +1,15 @@
+project("ADBench_CladLatest" CXX)
+
+add_library("CladLatest" MODULE)
+
+SET(clad_install_root "/home/mad-scientist/clubs/open-source/clad/formal/builds/build-llvm-16/inst")
+
+target_sources("CladLatest" PRIVATE "${CMAKE_SOURCE_DIR}/src/cpp/shared/utils.cpp")
+target_sources("CladLatest" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/ba/ba_grad.cpp")
+target_sources("CladLatest" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/CladLatestBA.cpp")
+target_sources("CladLatest" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/gmm/gmm_grad.cpp")
+target_sources("CladLatest" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/CladLatestGMM.cpp")
+
+target_include_directories("CladLatest" PRIVATE ${clad_install_root}/include)
+target_include_directories("CladLatest" PRIVATE ${CMAKE_SOURCE_DIR}/src/cpp/shared)
+# add_compile_definitions("CladLatest" CLAD_NO_NUM_DIFF)
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/CladLatestBA.cpp b/src/cpp/modules/cladLatest/CladLatestBA.cpp
new file mode 100644
index 00000000..f5b06174
--- /dev/null
+++ b/src/cpp/modules/cladLatest/CladLatestBA.cpp
@@ -0,0 +1,115 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "CladLatestBA.h"
+#include "ba/ba_grad.h"
+
+// This function must be called before any other function.
+void CladLatestBA::prepare(BAInput &&input) {
+  this->input = input;
+  result = {std::vector<double>(2 * this->input.p),
+            std::vector<double>(this->input.p),
+            BASparseMat(this->input.n, this->input.m, this->input.p)};
+
+  reproj_err_d = std::vector<double>(2 * (BA_NCAMPARAMS + 3 + 1));
+  reproj_err_d_row = std::vector<double>(BA_NCAMPARAMS + 3 + 1);
+}
+
+BAOutput CladLatestBA::output() { return result; }
+
+void CladLatestBA::calculate_objective(int times) {
+  for (int i = 0; i < times; i++) {
+    ba_objective(input.n, input.m, input.p, input.cams.data(), input.X.data(),
+                 input.w.data(), input.obs.data(), input.feats.data(),
+                 result.reproj_err.data(), result.w_err.data());
+  }
+}
+
+void CladLatestBA::calculate_jacobian(int times) {
+  for (int i = 0; i < times; i++) {
+    result.J.clear();
+    calculate_reproj_error_jacobian_part();
+    calculate_weight_error_jacobian_part();
+  }
+}
+
+void CladLatestBA::calculate_reproj_error_jacobian_part() {
+  double
+      errb[2]; // stores dY
+               // (i-th element equals to 1.0 for calculating i-th jacobian row)
+
+  double err[2]; // stores fictive result
+                 // (Tapenade doesn't calculate an original function in
+                 // reverse mode)
+
+  clad::array_ref<double> cam_gradient_part(reproj_err_d_row.data(),
+                                            reproj_err_d_row.size());
+  clad::array_ref<double> x_gradient_part(
+      reproj_err_d_row.data() + BA_NCAMPARAMS,
+      reproj_err_d_row.size() - BA_NCAMPARAMS);
+  clad::array_ref<double> weight_gradient_part(
+      reproj_err_d_row.data() + BA_NCAMPARAMS + 3,
+      reproj_err_d_row.size() - BA_NCAMPARAMS - 3);
+  std::vector<double> feats_gradient_part(2, 0);
+  clad::array_ref<double> feats_gradient_part_ref(feats_gradient_part.data(),
+                                                  feats_gradient_part.size());
+  clad::array_ref<double> errb_ref(errb, 2);
+
+  for (int i = 0; i < input.p; i++) {
+    std::fill(reproj_err_d_row.begin(), reproj_err_d_row.end(), 0);
+    std::fill(feats_gradient_part.begin(), feats_gradient_part.end(), 0);
+
+    int camIdx = input.obs[2 * i + 0];
+    int ptIdx = input.obs[2 * i + 1];
+
+    // calculate first row
+    errb[0] = 1.0;
+    errb[1] = 0.0;
+    computeReprojError_pullback(
+        &input.cams[camIdx * BA_NCAMPARAMS], &input.X[ptIdx * 3], &input.w[i],
+        &input.feats[i * 2], err, cam_gradient_part, x_gradient_part,
+        weight_gradient_part, feats_gradient_part_ref, errb_ref);
+
+    // fill first row elements
+    for (int j = 0; j < BA_NCAMPARAMS + 3 + 1; j++) {
+      reproj_err_d[2 * j] = reproj_err_d_row[j];
+    }
+
+    std::fill(reproj_err_d_row.begin(), reproj_err_d_row.end(), 0);
+    std::fill(feats_gradient_part.begin(), feats_gradient_part.end(), 0);
+    // calculate second row
+    errb[0] = 0.0;
+    errb[1] = 1.0;
+    computeReprojError_pullback(
+        &input.cams[camIdx * BA_NCAMPARAMS], &input.X[ptIdx * 3], &input.w[i],
+        &input.feats[i * 2], err, cam_gradient_part, x_gradient_part,
+        weight_gradient_part, feats_gradient_part_ref, errb);
+
+    // fill second row elements
+    for (int j = 0; j < BA_NCAMPARAMS + 3 + 1; j++) {
+      reproj_err_d[2 * j + 1] = reproj_err_d_row[j];
+    }
+
+    result.J.insert_reproj_err_block(i, camIdx, ptIdx, reproj_err_d.data());
+  }
+}
+
+void CladLatestBA::calculate_weight_error_jacobian_part() {
+  for (int j = 0; j < input.p; j++) {
+    double wb = 0; // stores calculated derivative
+
+    double err = 0.0; // stores fictive result
+                      // (Tapenade doesn't calculate an original function in
+                      // reverse mode)
+
+    double errb = 1.0; // stores dY
+                       // (equals to 1.0 for derivative calculation)
+
+    computeZachWeightError_pullback(&input.w[j], &err, &wb, &errb);
+    result.J.insert_w_err_block(j, wb);
+  }
+}
+
+extern "C" DLL_PUBLIC ITest<BAInput, BAOutput> *get_ba_test() {
+  return new CladLatestBA();
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/CladLatestBA.h b/src/cpp/modules/cladLatest/CladLatestBA.h
new file mode 100644
index 00000000..f3d4e133
--- /dev/null
+++ b/src/cpp/modules/cladLatest/CladLatestBA.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include "../../shared/ITest.h"
+#include "../../shared/BAData.h"
+#include "../../shared/defs.h"
+
+#include "ba/ba.h"
+
+#include <vector>
+
+class CladLatestBA : public ITest<BAInput, BAOutput>
+{
+private:
+    BAInput input;
+    BAOutput result;
+    std::vector<double> state;
+
+    // buffer for reprojection error jacobian part holding (column-major)
+    std::vector<double> reproj_err_d;
+
+    // buffer for reprojection error jacobian block row holding
+    std::vector<double> reproj_err_d_row;
+public:
+    // This function must be called before any other function.
+    virtual void prepare(BAInput&& input) override;
+
+    virtual void calculate_objective(int times) override;
+    virtual void calculate_jacobian(int times) override;
+    virtual BAOutput output() override;
+
+    ~CladLatestBA() {}
+
+private:
+    void calculate_weight_error_jacobian_part();
+    void calculate_reproj_error_jacobian_part();
+};
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/CladLatestGMM.cpp b/src/cpp/modules/cladLatest/CladLatestGMM.cpp
new file mode 100644
index 00000000..c5d20b0f
--- /dev/null
+++ b/src/cpp/modules/cladLatest/CladLatestGMM.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "CladLatestGMM.h"
+
+#include "gmm/gmm.h"
+#include "gmm/gmm_grad.h"
+
+// This function must be called before any other function.
+void CladLatestGMM::prepare(GMMInput&& input)
+{
+    this->input = input;
+    int Jcols = (this->input.k * (this->input.d + 1) * (this->input.d + 2)) / 2;
+    result = { 0, std::vector<double>(Jcols) };
+}
+
+
+
+GMMOutput CladLatestGMM::output()
+{
+    return result;
+}
+
+
+
+void CladLatestGMM::calculate_objective(int times)
+{
+    for (int i = 0; i < times; i++)
+    {
+        gmm_objective(
+            input.d,
+            input.k,
+            input.n,
+            input.alphas.data(),
+            input.means.data(),
+            input.icf.data(),
+            input.x.data(),
+            input.wishart,
+            &result.objective
+        );
+    }
+}
+
+
+
+void CladLatestGMM::calculate_jacobian(int times)
+{
+    double* alphas_gradient_part = result.gradient.data();
+    double* means_gradient_part = result.gradient.data() + input.alphas.size();
+    double* icf_gradient_part =
+        result.gradient.data() +
+        input.alphas.size() +
+        input.means.size();
+    std::vector<double> d_x(input.x.size(), 0);
+
+    for (int i = 0; i < times; i++)
+    {
+        double tmp = 0.0;       // stores fictive result
+                                // (Tapenade doesn't calculate an original function in reverse mode)
+
+        double errb = 1.0;      // stores dY
+                                // (equals to 1.0 for gradient calculation)
+        
+        int d_d = 0, d_k = 0, d_n = 0;
+        Wishart d_wishart{0, 0};
+        std::fill(result.gradient.begin(), result.gradient.end(), 0);
+        std::fill(d_x.begin(), d_x.end(), 0);
+
+        gmm_objective_pullback(
+            input.d,
+            input.k,
+            input.n,
+            input.alphas.data(),
+            input.means.data(),
+            input.icf.data(),
+            input.x.data(),
+            input.wishart,
+            &tmp,
+            &d_d,
+            &d_k,
+            &d_n,
+            alphas_gradient_part,
+            means_gradient_part,
+            icf_gradient_part,
+            d_x.data(),
+            &d_wishart,
+            &errb
+        );
+    }
+}
+
+
+
+extern "C" DLL_PUBLIC ITest<GMMInput, GMMOutput>* get_gmm_test()
+{
+    return new CladLatestGMM();
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/CladLatestGMM.h b/src/cpp/modules/cladLatest/CladLatestGMM.h
new file mode 100644
index 00000000..7d276dac
--- /dev/null
+++ b/src/cpp/modules/cladLatest/CladLatestGMM.h
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include "../../shared/ITest.h"
+#include "../../shared/GMMData.h"
+
+#include <vector>
+
+class CladLatestGMM : public ITest<GMMInput, GMMOutput>
+{
+private:
+    GMMInput input;
+    GMMOutput result;
+    std::vector<double> state;
+
+public:
+    // This function must be called before any other function.
+    virtual void prepare(GMMInput&& input) override;
+
+    virtual void calculate_objective(int times) override;
+    virtual void calculate_jacobian(int times) override;
+    virtual GMMOutput output() override;
+
+    ~CladLatestGMM() {}
+};
+
diff --git a/src/cpp/modules/cladLatest/ba/ba.h b/src/cpp/modules/cladLatest/ba/ba.h
new file mode 100644
index 00000000..937a686f
--- /dev/null
+++ b/src/cpp/modules/cladLatest/ba/ba.h
@@ -0,0 +1,192 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+#ifndef CLADLATEST_BA_H
+#define CLADLATEST_BA_H
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "defs.h"
+#include "matrix.h"
+
+////////////////////////////////////////////////////////////
+//////////////////// Declarations //////////////////////////
+////////////////////////////////////////////////////////////
+
+// cam: 11 camera in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2]
+//            r1, r2, r3 are angle - axis rotation parameters(Rodrigues)
+//            [C1 C2 C3]' is the camera center
+//            f is the focal length in pixels
+//            [u0 v0]' is the principal point
+//            k1, k2 are radial distortion parameters
+// X: 3 point
+// feats: 2 feature (x,y coordinates)
+// reproj_err: 2
+// projection function:
+// Xcam = R * (X - C)
+// distorted = radial_distort(projective2euclidean(Xcam), radial_parameters)
+// proj = distorted * f + principal_point
+// err = sqsum(proj - measurement)
+template <typename T>
+void computeReprojError(const T *const cam, const T *const X, const T *const w,
+                        const double *const feat, T *err);
+
+// w: 1
+// w_err: 1
+template <typename T> void computeZachWeightError(const T *const w, T *err);
+
+// n number of cameras
+// m number of points
+// p number of observations
+// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2]
+//            r1, r2, r3 are angle - axis rotation parameters(Rodrigues)
+//            [C1 C2 C3]' is the camera center
+//            f is the focal length in pixels
+//            [u0 v0]' is the principal point
+//            k1, k2 are radial distortion parameters
+// X: 3*m points
+// obs: 2*p observations (pairs cameraIdx, pointIdx)
+// feats: 2*p features (x,y coordinates corresponding to observations)
+// reproj_err: 2*p errors of observations
+// w_err: p weight "error" terms
+// projection function:
+// Xcam = R * (X - C)
+// distorted = radial_distort(projective2euclidean(Xcam), radial_parameters)
+// proj = distorted * f + principal_point
+// err = sqsum(proj - measurement)
+template <typename T>
+void ba_objective(int n, int m, int p, const T *const cams, const T *const X,
+                  const T *const w, const int *const obs,
+                  const double *const feats, T *reproj_err, T *w_err);
+
+// rot: 3 rotation parameters
+// pt: 3 point to be rotated
+// rotatedPt: 3 rotated point
+// this is an efficient evaluation (part of
+// the Ceres implementation)
+// easy to understand calculation in matlab:
+//  theta = sqrt(sum(w. ^ 2));
+//  n = w / theta;
+//  n_x = au_cross_matrix(n);
+//  R = eye(3) + n_x*sin(theta) + n_x*n_x*(1 - cos(theta));
+template <typename T>
+void rodrigues_rotate_point(const T *const rot, const T *const pt,
+                            T *rotatedPt);
+
+////////////////////////////////////////////////////////////
+//////////////////// Definitions ///////////////////////////
+////////////////////////////////////////////////////////////
+
+template <typename T> T sqsum(int n, const T *const x) {
+  T res = 0;
+  int i = 0;
+  for (; i < n; i++)
+    res = res + x[i] * x[i];
+  return res;
+}
+
+template <typename T>
+void rodrigues_rotate_point(const T * rot, const T * pt,
+                            T *rotatedPt) {
+  T sqtheta = sqsum(3, rot);
+  int i0 = 0;
+  int i1 = 0;
+  int i2 = 0;
+  T theta, costheta, sintheta, theta_inverse, tmp;
+  T w[3], w_cross_pt[3];
+  T rot_cross_pt[3];
+  if (sqtheta != 0) {
+    theta = sqrt(sqtheta);
+    costheta = cos(theta);
+    sintheta = sin(theta);
+    theta_inverse = 1.0 / theta;
+
+    for (; i0 < 3; i0++)
+      w[i0] = rot[i0] * theta_inverse;
+
+    cross(w, pt, w_cross_pt);
+
+    tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta);
+
+    for (; i1 < 3; i1++)
+      rotatedPt[i1] = pt[i1] * costheta + w_cross_pt[i1] * sintheta + w[i1] * tmp;
+  } else {
+    cross(rot, pt, rot_cross_pt);
+
+    for (; i2 < 3; i2++)
+      rotatedPt[i2] = pt[i2] + rot_cross_pt[i2];
+  }
+}
+
+template <typename T> void radial_distort(const T *const rad_params, T *proj) {
+  T rsq, L;
+  rsq = sqsum(2, proj);
+  L = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq;
+  proj[0] = proj[0] * L;
+  proj[1] = proj[1] * L;
+}
+
+template <typename T>
+void project(const T * cam, const T * X, T *proj) {
+  const T *const C = &cam[3];
+  T Xo[3], Xcam[3];
+
+  Xo[0] = X[0] - C[0];
+  Xo[1] = X[1] - C[1];
+  Xo[2] = X[2] - C[2];
+
+  rodrigues_rotate_point(cam + 0, Xo, Xcam);
+
+  proj[0] = Xcam[0] / Xcam[2];
+  proj[1] = Xcam[1] / Xcam[2];
+
+  radial_distort(cam + 9, proj);
+
+  proj[0] = proj[0] * cam[6] + cam[7];
+  proj[1] = proj[1] * cam[6] + cam[8];
+}
+
+template <typename T>
+void computeReprojError(const T * cam, const T * X, const T * w,
+                        const double * feat, T *err) {
+  T proj[2];
+  project(cam, X, proj);
+
+  err[0] = (*w) * (proj[0] - feat[0]);
+  err[1] = (*w) * (proj[1] - feat[1]);
+}
+
+template <typename T> void computeZachWeightError(const T *const w, T *err) {
+  *err = 1 - (*w) * (*w);
+}
+
+template <typename T>
+void ba_objective(int n, int m, int p, const T *const cams, const T *const X,
+                  const T *const w, const int *const obs,
+                  const double *const feats, T *reproj_err, T *w_err) {
+  for (int i = 0; i < p; i++) {
+    int camIdx = obs[i * 2 + 0];
+    int ptIdx = obs[i * 2 + 1];
+    computeReprojError(&cams[camIdx * BA_NCAMPARAMS], &X[ptIdx * 3], &w[i],
+                       &feats[i * 2], &reproj_err[2 * i]);
+  }
+
+  for (int i = 0; i < p; i++) {
+    computeZachWeightError(&w[i], &w_err[i]);
+  }
+}
+
+template <typename T>
+T computeReprojError_wrapper(const T * cam, const T * X,
+                             const T * w, const double * feat,
+                             T *err) {
+  computeReprojError(cam, X, w, feat, err);
+  return err[0];
+}
+
+template <typename T>
+T computeZachWeightError_wrapper(const T * w, T *err) {
+    computeZachWeightError(w, err);
+    return err[0];
+}
+#endif
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/ba/ba_grad.cpp b/src/cpp/modules/cladLatest/ba/ba_grad.cpp
new file mode 100644
index 00000000..3f9a7772
--- /dev/null
+++ b/src/cpp/modules/cladLatest/ba/ba_grad.cpp
@@ -0,0 +1,519 @@
+#include "ba.h"
+#include "clad/Differentiator/Differentiator.h"
+
+void sqsum_pullback(int n, const double *const x, double _d_y,
+                    clad::array_ref<int> _d_n, clad::array_ref<double> _d_x) {
+  double _d_res = 0;
+  int _d_i = 0;
+  unsigned long _t0;
+  clad::tape<double> _t1 = {};
+  double res = 0;
+  int i = 0;
+  _t0 = 0;
+  for (; i < n; i++) {
+    _t0++;
+    clad::push(_t1, res);
+    res = res + x[i] * x[i];
+  }
+  goto _label0;
+_label0:
+  _d_res += _d_y;
+  for (; _t0; _t0--) {
+    i--;
+    res = clad::pop(_t1);
+    double _r_d0 = _d_res;
+    _d_res += _r_d0;
+    _d_x[i] += _r_d0 * x[i];
+    _d_x[i] += x[i] * _r_d0;
+    _d_res -= _r_d0;
+  }
+}
+void cross_pullback(const double *const a, const double *const b, double *out,
+                    clad::array_ref<double> _d_a, clad::array_ref<double> _d_b,
+                    clad::array_ref<double> _d_out) {
+  double _t0;
+  double _t1;
+  double _t2;
+  _t0 = out[0];
+  out[0] = a[1] * b[2] - a[2] * b[1];
+  _t1 = out[1];
+  out[1] = a[2] * b[0] - a[0] * b[2];
+  _t2 = out[2];
+  out[2] = a[0] * b[1] - a[1] * b[0];
+  {
+    out[2] = _t2;
+    double _r_d2 = _d_out[2];
+    _d_a[0] += _r_d2 * b[1];
+    _d_b[1] += a[0] * _r_d2;
+    _d_a[1] += -_r_d2 * b[0];
+    _d_b[0] += a[1] * -_r_d2;
+    _d_out[2] -= _r_d2;
+    _d_out[2];
+  }
+  {
+    out[1] = _t1;
+    double _r_d1 = _d_out[1];
+    _d_a[2] += _r_d1 * b[0];
+    _d_b[0] += a[2] * _r_d1;
+    _d_a[0] += -_r_d1 * b[2];
+    _d_b[2] += a[0] * -_r_d1;
+    _d_out[1] -= _r_d1;
+    _d_out[1];
+  }
+  {
+    out[0] = _t0;
+    double _r_d0 = _d_out[0];
+    _d_a[1] += _r_d0 * b[2];
+    _d_b[2] += a[1] * _r_d0;
+    _d_a[2] += -_r_d0 * b[1];
+    _d_b[1] += a[2] * -_r_d0;
+    _d_out[0] -= _r_d0;
+    _d_out[0];
+  }
+}
+void rodrigues_rotate_point_pullback(const double *rot, const double *pt,
+                                     double *rotatedPt,
+                                     clad::array_ref<double> _d_rot,
+                                     clad::array_ref<double> _d_pt,
+                                     clad::array_ref<double> _d_rotatedPt) {
+  const double *_t0;
+  double _d_sqtheta = 0;
+  int _d_i0 = 0;
+  int _d_i1 = 0;
+  int _d_i2 = 0;
+  double _d_theta = 0, _d_costheta = 0, _d_sintheta = 0, _d_theta_inverse = 0,
+         _d_tmp = 0;
+  clad::array<double> _d_w(3UL), _d_w_cross_pt(3UL);
+  clad::array<double> _d_rot_cross_pt(3UL);
+  bool _cond0;
+  double _t1;
+  double _t2;
+  double _t3;
+  double _t4;
+  unsigned long _t5;
+  clad::tape<double> _t6 = {};
+  const double *_t7;
+  double _t8;
+  unsigned long _t9;
+  clad::tape<double> _t10 = {};
+  const double *_t11;
+  const double *_t12;
+  unsigned long _t13;
+  clad::tape<double> _t14 = {};
+  _t0 = rot;
+  double sqtheta = sqsum(3, rot);
+  int i0 = 0;
+  int i1 = 0;
+  int i2 = 0;
+  double theta, costheta, sintheta, theta_inverse, tmp;
+  double w[3], w_cross_pt[3];
+  double rot_cross_pt[3];
+  _cond0 = sqtheta != 0;
+  if (_cond0) {
+    _t1 = theta;
+    theta = sqrt(sqtheta);
+    _t2 = costheta;
+    costheta = cos(theta);
+    _t3 = sintheta;
+    sintheta = sin(theta);
+    _t4 = theta_inverse;
+    theta_inverse = 1. / theta;
+    _t5 = 0;
+    for (; i0 < 3; i0++) {
+      _t5++;
+      clad::push(_t6, w[i0]);
+      w[i0] = rot[i0] * theta_inverse;
+    }
+    _t7 = pt;
+    cross(w, pt, w_cross_pt);
+    _t8 = tmp;
+    tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta);
+    _t9 = 0;
+    for (; i1 < 3; i1++) {
+      _t9++;
+      clad::push(_t10, rotatedPt[i1]);
+      rotatedPt[i1] =
+          pt[i1] * costheta + w_cross_pt[i1] * sintheta + w[i1] * tmp;
+    }
+  } else {
+    _t11 = rot;
+    _t12 = pt;
+    cross(rot, pt, rot_cross_pt);
+    _t13 = 0;
+    for (; i2 < 3; i2++) {
+      _t13++;
+      clad::push(_t14, rotatedPt[i2]);
+      rotatedPt[i2] = pt[i2] + rot_cross_pt[i2];
+    }
+  }
+  if (_cond0) {
+    for (; _t9; _t9--) {
+      i1--;
+      rotatedPt[i1] = clad::pop(_t10);
+      double _r_d6 = _d_rotatedPt[i1];
+      _d_pt[i1] += _r_d6 * costheta;
+      _d_costheta += pt[i1] * _r_d6;
+      _d_w_cross_pt[i1] += _r_d6 * sintheta;
+      _d_sintheta += w_cross_pt[i1] * _r_d6;
+      _d_w[i1] += _r_d6 * tmp;
+      _d_tmp += w[i1] * _r_d6;
+      _d_rotatedPt[i1] -= _r_d6;
+      _d_rotatedPt[i1];
+    }
+    {
+      tmp = _t8;
+      double _r_d5 = _d_tmp;
+      _d_w[0] += _r_d5 * (1. - costheta) * pt[0];
+      _d_pt[0] += w[0] * _r_d5 * (1. - costheta);
+      _d_w[1] += _r_d5 * (1. - costheta) * pt[1];
+      _d_pt[1] += w[1] * _r_d5 * (1. - costheta);
+      _d_w[2] += _r_d5 * (1. - costheta) * pt[2];
+      _d_pt[2] += w[2] * _r_d5 * (1. - costheta);
+      _d_costheta += -(w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * _r_d5;
+      _d_tmp -= _r_d5;
+    }
+    {
+      pt = _t7;
+      cross_pullback(w, _t7, w_cross_pt, _d_w, _d_pt, _d_w_cross_pt);
+    }
+    for (; _t5; _t5--) {
+      i0--;
+      w[i0] = clad::pop(_t6);
+      double _r_d4 = _d_w[i0];
+      _d_rot[i0] += _r_d4 * theta_inverse;
+      _d_theta_inverse += rot[i0] * _r_d4;
+      _d_w[i0] -= _r_d4;
+      _d_w[i0];
+    }
+    {
+      theta_inverse = _t4;
+      double _r_d3 = _d_theta_inverse;
+      double _r4 = _r_d3 * -1. / (theta * theta);
+      _d_theta += _r4;
+      _d_theta_inverse -= _r_d3;
+    }
+    {
+      sintheta = _t3;
+      double _r_d2 = _d_sintheta;
+      double _r3 =
+          _r_d2 *
+          clad::custom_derivatives::sin_pushforward(theta, 1.).pushforward;
+      _d_theta += _r3;
+      _d_sintheta -= _r_d2;
+    }
+    {
+      costheta = _t2;
+      double _r_d1 = _d_costheta;
+      double _r2 =
+          _r_d1 *
+          clad::custom_derivatives::cos_pushforward(theta, 1.).pushforward;
+      _d_theta += _r2;
+      _d_costheta -= _r_d1;
+    }
+    {
+      theta = _t1;
+      double _r_d0 = _d_theta;
+      double _r1 =
+          _r_d0 *
+          clad::custom_derivatives::sqrt_pushforward(sqtheta, 1.).pushforward;
+      _d_sqtheta += _r1;
+      _d_theta -= _r_d0;
+    }
+  } else {
+    for (; _t13; _t13--) {
+      i2--;
+      rotatedPt[i2] = clad::pop(_t14);
+      double _r_d7 = _d_rotatedPt[i2];
+      _d_pt[i2] += _r_d7;
+      _d_rot_cross_pt[i2] += _r_d7;
+      _d_rotatedPt[i2] -= _r_d7;
+      _d_rotatedPt[i2];
+    }
+    {
+      rot = _t11;
+      pt = _t12;
+      cross_pullback(_t11, _t12, rot_cross_pt, _d_rot, _d_pt, _d_rot_cross_pt);
+    }
+  }
+  {
+    rot = _t0;
+    int _grad0 = 0;
+    sqsum_pullback(3, _t0, _d_sqtheta, &_grad0, _d_rot);
+    int _r0 = _grad0;
+  }
+}
+void radial_distort_pullback(const double *const rad_params, double *proj,
+                             clad::array_ref<double> _d_rad_params,
+                             clad::array_ref<double> _d_proj) {
+  double _d_rsq = 0, _d_L = 0;
+  double _t0;
+  double *_t1;
+  double _t2;
+  double _t3;
+  double _t4;
+  double rsq, L;
+  _t0 = rsq;
+  _t1 = proj;
+  rsq = sqsum(2, proj);
+  _t2 = L;
+  L = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq;
+  _t3 = proj[0];
+  proj[0] = proj[0] * L;
+  _t4 = proj[1];
+  proj[1] = proj[1] * L;
+  {
+    proj[1] = _t4;
+    double _r_d3 = _d_proj[1];
+    _d_proj[1] += _r_d3 * L;
+    _d_L += proj[1] * _r_d3;
+    _d_proj[1] -= _r_d3;
+    _d_proj[1];
+  }
+  {
+    proj[0] = _t3;
+    double _r_d2 = _d_proj[0];
+    _d_proj[0] += _r_d2 * L;
+    _d_L += proj[0] * _r_d2;
+    _d_proj[0] -= _r_d2;
+    _d_proj[0];
+  }
+  {
+    L = _t2;
+    double _r_d1 = _d_L;
+    _d_rad_params[0] += _r_d1 * rsq;
+    _d_rsq += rad_params[0] * _r_d1;
+    _d_rad_params[1] += _r_d1 * rsq * rsq;
+    _d_rsq += rad_params[1] * _r_d1 * rsq;
+    _d_rsq += rad_params[1] * rsq * _r_d1;
+    _d_L -= _r_d1;
+  }
+  {
+    rsq = _t0;
+    double _r_d0 = _d_rsq;
+    proj = _t1;
+    int _grad0 = 0;
+    sqsum_pullback(2, _t1, _r_d0, &_grad0, _d_proj);
+    int _r0 = _grad0;
+    _d_rsq -= _r_d0;
+  }
+}
+void project_pullback(const double *cam, const double *X, double *proj,
+                      clad::array_ref<double> _d_cam,
+                      clad::array_ref<double> _d_X,
+                      clad::array_ref<double> _d_proj) {
+  double *_d_C = 0;
+  clad::array<double> _d_Xo(3UL), _d_Xcam(3UL);
+  double _t0;
+  double _t1;
+  double _t2;
+  const double *_t3;
+  double _t4;
+  double _t5;
+  const double *_t6;
+  double *_t7;
+  double _t8;
+  double _t9;
+  _d_C = &_d_cam[3];
+  const double *const C = &cam[3];
+  double Xo[3], Xcam[3];
+  _t0 = Xo[0];
+  Xo[0] = X[0] - C[0];
+  _t1 = Xo[1];
+  Xo[1] = X[1] - C[1];
+  _t2 = Xo[2];
+  Xo[2] = X[2] - C[2];
+  _t3 = cam + 0;
+  rodrigues_rotate_point(cam + 0, Xo, Xcam);
+  _t4 = proj[0];
+  proj[0] = Xcam[0] / Xcam[2];
+  _t5 = proj[1];
+  proj[1] = Xcam[1] / Xcam[2];
+  _t6 = cam + 9;
+  _t7 = proj;
+  // CFIX-start
+  double tproj0 = proj[0];
+  double tproj1 = proj[1];
+  // CFIX-end
+  radial_distort(cam + 9, proj);
+  _t8 = proj[0];
+  proj[0] = proj[0] * cam[6] + cam[7];
+  _t9 = proj[1];
+  proj[1] = proj[1] * cam[6] + cam[8];
+  {
+    proj[1] = _t9;
+    double _r_d6 = _d_proj[1];
+    _d_proj[1] += _r_d6 * cam[6];
+    _d_cam[6] += proj[1] * _r_d6;
+    _d_cam[8] += _r_d6;
+    _d_proj[1] -= _r_d6;
+    _d_proj[1];
+  }
+  {
+    proj[0] = _t8;
+    double _r_d5 = _d_proj[0];
+    _d_proj[0] += _r_d5 * cam[6];
+    _d_cam[6] += proj[0] * _r_d5;
+    _d_cam[7] += _r_d5;
+    _d_proj[0] -= _r_d5;
+    _d_proj[0];
+  }
+  {
+    proj = _t7;
+    // CFIX-start
+    proj[0] = tproj0;
+    proj[1] = tproj1;
+    // CFIX-end
+    radial_distort_pullback(_t6, _t7, _d_cam.ptr_ref() + 9, _d_proj);
+  }
+  {
+    proj[1] = _t5;
+    double _r_d4 = _d_proj[1];
+    _d_Xcam[1] += _r_d4 / Xcam[2];
+    double _r1 = _r_d4 * -Xcam[1] / (Xcam[2] * Xcam[2]);
+    _d_Xcam[2] += _r1;
+    _d_proj[1] -= _r_d4;
+    _d_proj[1];
+  }
+  {
+    proj[0] = _t4;
+    double _r_d3 = _d_proj[0];
+    _d_Xcam[0] += _r_d3 / Xcam[2];
+    double _r0 = _r_d3 * -Xcam[0] / (Xcam[2] * Xcam[2]);
+    _d_Xcam[2] += _r0;
+    _d_proj[0] -= _r_d3;
+    _d_proj[0];
+  }
+  rodrigues_rotate_point_pullback(_t3, Xo, Xcam, _d_cam.ptr_ref() + 0, _d_Xo,
+                                  _d_Xcam);
+  {
+    Xo[2] = _t2;
+    double _r_d2 = _d_Xo[2];
+    _d_X[2] += _r_d2;
+    _d_C[2] += -_r_d2;
+    _d_Xo[2] -= _r_d2;
+    _d_Xo[2];
+  }
+  {
+    Xo[1] = _t1;
+    double _r_d1 = _d_Xo[1];
+    _d_X[1] += _r_d1;
+    _d_C[1] += -_r_d1;
+    _d_Xo[1] -= _r_d1;
+    _d_Xo[1];
+  }
+  {
+    Xo[0] = _t0;
+    double _r_d0 = _d_Xo[0];
+    _d_X[0] += _r_d0;
+    _d_C[0] += -_r_d0;
+    _d_Xo[0] -= _r_d0;
+    _d_Xo[0];
+  }
+}
+
+void computeReprojError_pullback(const double *cam, const double *X,
+                                 const double *w, const double *feat,
+                                 double *err, clad::array_ref<double> _d_cam,
+                                 clad::array_ref<double> _d_X,
+                                 clad::array_ref<double> _d_w,
+                                 clad::array_ref<double> _d_feat,
+                                 clad::array_ref<double> _d_err) {
+  clad::array<double> _d_proj(2UL);
+  const double *_t0;
+  const double *_t1;
+  double _t2;
+  double _t3;
+  double proj[2];
+  _t0 = cam;
+  _t1 = X;
+  project(cam, X, proj);
+  _t2 = err[0];
+  err[0] = *w * (proj[0] - feat[0]);
+  _t3 = err[1];
+  err[1] = *w * (proj[1] - feat[1]);
+  {
+    err[1] = _t3;
+    double _r_d1 = _d_err[1];
+    *_d_w += _r_d1 * (proj[1] - feat[1]);
+    _d_proj[1] += *w * _r_d1;
+    _d_feat[1] += -*w * _r_d1;
+    _d_err[1] -= _r_d1;
+    _d_err[1];
+  }
+  {
+    err[0] = _t2;
+    double _r_d0 = _d_err[0];
+    *_d_w += _r_d0 * (proj[0] - feat[0]);
+    _d_proj[0] += *w * _r_d0;
+    _d_feat[0] += -*w * _r_d0;
+    _d_err[0] -= _r_d0;
+    _d_err[0];
+  }
+  {
+    cam = _t0;
+    X = _t1;
+    project_pullback(_t0, _t1, proj, _d_cam, _d_X, _d_proj);
+  }
+}
+void computeReprojError_wrapper_grad(
+    const double *cam, const double *X, const double *w, const double *feat,
+    double *err, clad::array_ref<double> _d_cam, clad::array_ref<double> _d_X,
+    clad::array_ref<double> _d_w, clad::array_ref<double> _d_feat,
+    clad::array_ref<double> _d_err) {
+  const double *_t0;
+  const double *_t1;
+  const double *_t2;
+  const double *_t3;
+  double *_t4;
+  _t0 = cam;
+  _t1 = X;
+  _t2 = w;
+  _t3 = feat;
+  _t4 = err;
+  computeReprojError(cam, X, w, feat, err);
+  goto _label0;
+_label0:
+  _d_err[0] += 1;
+  {
+    cam = _t0;
+    X = _t1;
+    w = _t2;
+    feat = _t3;
+    err = _t4;
+    computeReprojError_pullback(_t0, _t1, _t2, _t3, _t4, _d_cam, _d_X, _d_w,
+                                _d_feat, _d_err);
+  }
+}
+void computeZachWeightError_pullback(const double *const w, double *err,
+                                     clad::array_ref<double> _d_w,
+                                     clad::array_ref<double> _d_err) {
+  double _t0;
+  _t0 = *err;
+  *err = 1 - *w * (*w);
+  {
+    *err = _t0;
+    double _r_d0 = *_d_err;
+    *_d_w += -_r_d0 * (*w);
+    *_d_w += *w * -_r_d0;
+    *_d_err -= _r_d0;
+    *_d_err;
+  }
+}
+
+void computeZachWeightError_wrapper_grad(const double *w, double *err,
+                                         clad::array_ref<double> _d_w,
+                                         clad::array_ref<double> _d_err) {
+  const double *_t0;
+  double *_t1;
+  _t0 = w;
+  _t1 = err;
+  computeZachWeightError(w, err);
+  goto _label0;
+_label0:
+  _d_err[0] += 1;
+  {
+    w = _t0;
+    err = _t1;
+    computeZachWeightError_pullback(_t0, _t1, _d_w, _d_err);
+  }
+}
diff --git a/src/cpp/modules/cladLatest/ba/ba_grad.h b/src/cpp/modules/cladLatest/ba/ba_grad.h
new file mode 100644
index 00000000..de42ade1
--- /dev/null
+++ b/src/cpp/modules/cladLatest/ba/ba_grad.h
@@ -0,0 +1,13 @@
+#include "clad/Differentiator/ArrayRef.h"
+
+void computeReprojError_pullback(const double *cam, const double *X,
+                                 const double *w, const double *feat,
+                                 double *err, clad::array_ref<double> _d_cam,
+                                 clad::array_ref<double> _d_X,
+                                 clad::array_ref<double> _d_w,
+                                 clad::array_ref<double> _d_feat,
+                                 clad::array_ref<double> _d_err);
+
+void computeZachWeightError_pullback(const double *const w, double *err,
+                                     clad::array_ref<double> _d_w,
+                                     clad::array_ref<double> _d_err);
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/gmm/gmm.h b/src/cpp/modules/cladLatest/gmm/gmm.h
new file mode 100644
index 00000000..c7908de3
--- /dev/null
+++ b/src/cpp/modules/cladLatest/gmm/gmm.h
@@ -0,0 +1,299 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include <cmath>
+#include <vector>
+using std::vector;
+#include "defs.h"
+#include "matrix.h"
+
+// CLADFIX-Add-Start
+// namespace clad {
+//     namespace custom_derivatives {
+//         namespace class_functions {
+//             template <typename T>
+//             void resize_pullback(::std::vector<T> *v, size_t n, ::std::vector<T> *d_v, size_t d_n) {
+
+//             }
+//         }
+//     }
+// }
+// CLADFIX-Add-end
+
+////////////////////////////////////////////////////////////
+//////////////////// Declarations //////////////////////////
+////////////////////////////////////////////////////////////
+
+// d: dim
+// k: number of gaussians
+// n: number of points
+// alphas: k logs of mixture weights (unnormalized), so
+//          weights = exp(log_alphas) / sum(exp(log_alphas))
+// means: d*k component means
+// icf: (d*(d+1)/2)*k inverse covariance factors 
+//                  every icf entry stores firstly log of diagonal and then 
+//          columnwise other entris
+//          To generate icf in MATLAB given covariance C :
+//              L = inv(chol(C, 'lower'));
+//              inv_cov_factor = [log(diag(L)); L(au_tril_indices(d, -1))]
+// wishart: wishart distribution parameters
+// x: d*n points
+// err: 1 output
+template<typename T>
+void gmm_objective(int d, int k, int n, const T* const alphas, const T* const means,
+    const T* const icf, const double* const x, Wishart wishart, T* err);
+
+// split of the outer loop over points
+template<typename T>
+void gmm_objective_split_inner(int d, int k,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    const double* const x,
+    Wishart wishart,
+    T* err);
+// other terms which are outside the loop
+template<typename T>
+void gmm_objective_split_other(int d, int k, int n,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    Wishart wishart,
+    T* err);
+
+template<typename T>
+T logsumexp(int n, const T* const x);
+
+// p: dim
+// k: number of components
+// wishart parameters
+// sum_qs: k sums of log diags of Qs
+// Qdiags: d*k
+// icf: (p*(p+1)/2)*k inverse covariance factors
+template<typename T>
+T log_wishart_prior(int p, int k,
+    Wishart wishart,
+    const T* const sum_qs,
+    const T* const Qdiags,
+    const T* const icf);
+
+template<typename T>
+void preprocess_qs(int d, int k,
+    const T* const icf,
+    T* sum_qs,
+    T* Qdiags);
+
+template<typename T>
+void Qtimesx(int d,
+    const T* const Qdiag,
+    const T* const ltri, // strictly lower triangular part
+    const T* const x,
+    T* out);
+
+////////////////////////////////////////////////////////////
+//////////////////// Definitions ///////////////////////////
+////////////////////////////////////////////////////////////
+
+template<typename T>
+T logsumexp(int n, const T* const x)
+{
+    T mx = arr_max(n, x);
+    T semx = 0.;
+    for (int i = 0; i < n; i++)
+    {
+        semx = semx + exp(x[i] - mx);
+    }
+    return log(semx) + mx;
+}
+
+template<typename T>
+T log_wishart_prior(int p, int k,
+    Wishart wishart,
+    const T* const sum_qs,
+    const T* const Qdiags,
+    const T* const icf)
+{
+    int n = p + wishart.m + 1;
+    int icf_sz = p * (p + 1) / 2;
+
+    double C = n * p * (log(wishart.gamma) - 0.5 * log(2)) - log_gamma_distrib(0.5 * n, p);
+
+    T out = 0;
+    for (int ik = 0; ik < k; ik++)
+    {
+        T frobenius = sqnorm(p, &Qdiags[ik * p]) + sqnorm(icf_sz - p, &icf[ik * icf_sz + p]);
+        out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius)
+            -wishart.m * sum_qs[ik];
+    }
+
+    return out - k * C;
+}
+
+template<typename T>
+void preprocess_qs(int d, int k,
+    const T* const icf,
+    T* sum_qs,
+    T* Qdiags)
+{
+    int icf_sz = d * (d + 1) / 2;
+    for (int ik = 0; ik < k; ik++)
+    {
+        sum_qs[ik] = 0.;
+        for (int id = 0; id < d; id++)
+        {
+            T q = icf[ik * icf_sz + id];
+            sum_qs[ik] = sum_qs[ik] + q;
+            Qdiags[ik * d + id] = exp(q);
+        }
+    }
+}
+
+template<typename T>
+void Qtimesx(int d,
+    const T* const Qdiag,
+    const T* const ltri, // strictly lower triangular part
+    const T* const x,
+    T* out)
+{
+    for (int id = 0; id < d; id++)
+        out[id] = Qdiag[id] * x[id];
+
+    int Lparamsidx = 0;
+    for (int i = 0; i < d; i++)
+    {
+        for (int j = i + 1; j < d; j++)
+        {
+            out[j] = out[j] + ltri[Lparamsidx] * x[i];
+            Lparamsidx++;
+        }
+    }
+}
+
+template<typename T>
+void gmm_objective(int d, int k, int n,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    const double* const x,
+    Wishart wishart,
+    T* err)
+{
+    const double CONSTANT = -n * d * 0.5 * log(2 * PI);
+    int icf_sz = d * (d + 1) / 2;
+
+    // CLADFIX-Modify-DeleteStart(VectorInit)
+    // vector<T> Qdiags(d * k);
+    // vector<T> sum_qs(k);
+    // vector<T> xcentered(d);
+    // vector<T> Qxcentered(d);
+    // vector<T> main_term(k);
+    // CLADFIX-Modify-DeleteEnd(VectorInit)
+    // CLADFIX-Modify-AddStart(VectorInit)
+    T Qdiags[d * k];
+    T sum_qs[k];
+    T xcentered[d];
+    T Qxcentered[d];
+    T main_term[k];
+    //CLADFIX-Modify-AddEnd(VectorInit)
+
+    preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]);
+
+    T slse = 0.;
+    for (int ix = 0; ix < n; ix++)
+    {
+        for (int ik = 0; ik < k; ik++)
+        {
+            subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]);
+            Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0]);
+
+            main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * sqnorm(d, &Qxcentered[0]);
+        }
+        slse = slse + logsumexp(k, &main_term[0]);
+    }
+
+    T lse_alphas = logsumexp(k, alphas);
+
+    *err = CONSTANT + slse - n * lse_alphas;
+
+    *err = *err + log_wishart_prior(d, k, wishart, &sum_qs[0], &Qdiags[0], icf);
+
+}
+
+template<typename T>
+void gmm_objective_split_inner(int d, int k,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    const double* const x,
+    Wishart wishart,
+    T* err)
+{
+    int icf_sz = d * (d + 1) / 2;
+
+    T* Ldiag = new T[d];
+    T* xcentered = new T[d];
+    T* mahal = new T[d];
+    T* lse = new T[k];
+
+    for (int ik = 0; ik < k; ik++)
+    {
+        int icf_off = ik * icf_sz;
+        T sumlog_Ldiag(0.);
+        for (int id = 0; id < d; id++)
+        {
+            sumlog_Ldiag = sumlog_Ldiag + icf[icf_off + id];
+            Ldiag[id] = exp(icf[icf_off + id]);
+        }
+
+        for (int id = 0; id < d; id++)
+        {
+            xcentered[id] = x[id] - means[ik * d + id];
+            mahal[id] = Ldiag[id] * xcentered[id];
+        }
+        int Lparamsidx = d;
+        for (int i = 0; i < d; i++)
+        {
+            for (int j = i + 1; j < d; j++)
+            {
+                mahal[j] = mahal[j] + icf[icf_off + Lparamsidx] * xcentered[i];
+                Lparamsidx++;
+            }
+        }
+        T sqsum_mahal(0.);
+        for (int id = 0; id < d; id++)
+        {
+            sqsum_mahal = sqsum_mahal + mahal[id] * mahal[id];
+        }
+
+        lse[ik] = alphas[ik] + sumlog_Ldiag - 0.5 * sqsum_mahal;
+    }
+
+    *err = logsumexp(k, lse);
+
+    delete[] mahal;
+    delete[] xcentered;
+    delete[] Ldiag;
+    delete[] lse;
+}
+
+template<typename T>
+void gmm_objective_split_other(int d, int k, int n,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    Wishart wishart,
+    T* err)
+{
+    const double CONSTANT = -n * d * 0.5 * log(2 * PI);
+
+    T lse_alphas = logsumexp(k, alphas);
+
+    T* sum_qs = new T[k];
+    T* Qdiags = new T[d * k];
+    preprocess_qs(d, k, icf, sum_qs, Qdiags);
+    *err = CONSTANT - n * lse_alphas + log_wishart_prior(d, k, wishart, sum_qs, Qdiags, icf);
+    delete[] sum_qs;
+    delete[] Qdiags;
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/gmm/gmm_grad.cpp b/src/cpp/modules/cladLatest/gmm/gmm_grad.cpp
new file mode 100644
index 00000000..660263b8
--- /dev/null
+++ b/src/cpp/modules/cladLatest/gmm/gmm_grad.cpp
@@ -0,0 +1,553 @@
+#include "gmm.h"
+// CLADFIX-Add-Start
+#include "clad/Differentiator/Differentiator.h"
+// CLADFIX-Add-End
+
+void gmm_objective_pullback(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err);
+void gmm_objective_wrapper_grad(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err) {
+    gmm_objective(d, k, n, alphas, means, icf, x, wishart, err);
+    goto _label0;
+  _label0:
+    _d_err[0] += 1;
+    {
+        int _r0 = 0;
+        int _r1 = 0;
+        int _r2 = 0;
+        Wishart _r3 = {};
+        gmm_objective_pullback(d, k, n, alphas, means, icf, x, wishart, err, &_r0, &_r1, &_r2, _d_alphas, _d_means, _d_icf, _d_x, &_r3, _d_err);
+        *_d_d += _r0;
+        *_d_k += _r1;
+        *_d_n += _r2;
+    }
+}
+void preprocess_qs_pullback(int d, int k, const double *const icf, double *sum_qs, double *Qdiags, int *_d_d, int *_d_k, double *_d_icf, double *_d_sum_qs, double *_d_Qdiags);
+void subtract_pullback(int d, const double *const x, const double *const y, double *out, int *_d_d, double *_d_x, double *_d_y, double *_d_out);
+void Qtimesx_pullback(int d, const double *const Qdiag, const double *const ltri, const double *const x, double *out, int *_d_d, double *_d_Qdiag, double *_d_ltri, double *_d_x, double *_d_out);
+void sqnorm_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x);
+void logsumexp_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x);
+void log_wishart_prior_pullback(int p, int k, Wishart wishart, const double *const sum_qs, const double *const Qdiags, const double *const icf, double _d_y, int *_d_p, int *_d_k, Wishart *_d_wishart, double *_d_sum_qs, double *_d_Qdiags, double *_d_icf);
+void gmm_objective_pullback(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err) {
+    double _t0;
+    double _d_CONSTANT = 0;
+    int _d_icf_sz = 0;
+    double _d_slse = 0;
+    unsigned long _t1;
+    int _d_ix = 0;
+    int ix = 0;
+    clad::tape<unsigned long> _t2 = {};
+    clad::tape<int> _t3 = {};
+    int _d_ik = 0;
+    int ik = 0;
+    clad::tape<double> _t4 = {};
+    clad::tape<double> _t5 = {};
+    clad::tape<double> _t6 = {};
+    double _d_lse_alphas = 0;
+    double _t7;
+    double _t8;
+    _t0 = log(2 * 3.1415926535900001);
+    const double CONSTANT = -n * d * 0.5 * _t0;
+    int icf_sz = d * (d + 1) / 2;
+    double _d_Qdiags[d * k];
+    clad::zero_init(_d_Qdiags, d * k);
+    double Qdiags[d * k];
+    double _d_sum_qs[k];
+    clad::zero_init(_d_sum_qs, k);
+    double sum_qs[k];
+    double _d_xcentered[d];
+    clad::zero_init(_d_xcentered, d);
+    double xcentered[d];
+    double _d_Qxcentered[d];
+    clad::zero_init(_d_Qxcentered, d);
+    double Qxcentered[d];
+    double _d_main_term[k];
+    clad::zero_init(_d_main_term, k);
+    double main_term[k];
+    preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]);
+    double slse = 0.;
+    _t1 = 0;
+    for (ix = 0; ix < n; ix++) {
+        _t1++;
+        clad::push(_t2, 0UL);
+        for (clad::push(_t3, ik) , ik = 0; ik < k; ik++) {
+            clad::back(_t2)++;
+            subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]);
+            Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0]);
+            clad::push(_t4, main_term[ik]);
+            main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * clad::push(_t5, sqnorm(d, &Qxcentered[0]));
+        }
+        clad::push(_t6, slse);
+        slse = slse + logsumexp(k, &main_term[0]);
+    }
+    double lse_alphas = logsumexp(k, alphas);
+    _t7 = *err;
+    *err = CONSTANT + slse - n * lse_alphas;
+    _t8 = *err;
+    *err = *err + log_wishart_prior(d, k, wishart, &sum_qs[0], &Qdiags[0], icf);
+    {
+        *err = _t8;
+        double _r_d3 = *_d_err;
+        *_d_err -= _r_d3;
+        *_d_err += _r_d3;
+        int _r7 = 0;
+        int _r8 = 0;
+        Wishart _r9 = {};
+        log_wishart_prior_pullback(d, k, wishart, &sum_qs[0], &Qdiags[0], icf, _r_d3, &_r7, &_r8, &_r9, &_d_sum_qs[0], &_d_Qdiags[0], _d_icf);
+        *_d_d += _r7;
+        *_d_k += _r8;
+    }
+    {
+        *err = _t7;
+        double _r_d2 = *_d_err;
+        *_d_err -= _r_d2;
+        _d_CONSTANT += _r_d2;
+        _d_slse += _r_d2;
+        *_d_n += -_r_d2 * lse_alphas;
+        _d_lse_alphas += n * -_r_d2;
+    }
+    {
+        int _r6 = 0;
+        logsumexp_pullback(k, alphas, _d_lse_alphas, &_r6, _d_alphas);
+        *_d_k += _r6;
+    }
+    for (; _t1; _t1--) {
+        ix--;
+        {
+            slse = clad::pop(_t6);
+            double _r_d1 = _d_slse;
+            _d_slse -= _r_d1;
+            _d_slse += _r_d1;
+            int _r5 = 0;
+            logsumexp_pullback(k, &main_term[0], _r_d1, &_r5, &_d_main_term[0]);
+            *_d_k += _r5;
+        }
+        {
+            for (; clad::back(_t2); clad::back(_t2)--) {
+                ik--;
+                {
+                    main_term[ik] = clad::pop(_t4);
+                    double _r_d0 = _d_main_term[ik];
+                    _d_main_term[ik] -= _r_d0;
+                    _d_alphas[ik] += _r_d0;
+                    _d_sum_qs[ik] += _r_d0;
+                    int _r4 = 0;
+                    sqnorm_pullback(d, &Qxcentered[0], 0.5 * -_r_d0, &_r4, &_d_Qxcentered[0]);
+                    *_d_d += _r4;
+                }
+                {
+                    int _r3 = 0;
+                    Qtimesx_pullback(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0], &_r3, &_d_Qdiags[ik * d], &_d_icf[ik * icf_sz + d], &_d_xcentered[0], &_d_Qxcentered[0]);
+                    *_d_d += _r3;
+                }
+                {
+                    int _r2 = 0;
+                    subtract_pullback(d, &x[ix * d], &means[ik * d], &xcentered[0], &_r2, &_d_x[ix * d], &_d_means[ik * d], &_d_xcentered[0]);
+                    *_d_d += _r2;
+                }
+            }
+            {
+                _d_ik = 0;
+                ik = clad::pop(_t3);
+            }
+            clad::pop(_t2);
+        }
+    }
+    {
+        int _r0 = 0;
+        int _r1 = 0;
+        preprocess_qs_pullback(d, k, icf, &sum_qs[0], &Qdiags[0], &_r0, &_r1, _d_icf, &_d_sum_qs[0], &_d_Qdiags[0]);
+        *_d_d += _r0;
+        *_d_k += _r1;
+    }
+    {
+        *_d_d += _d_icf_sz / 2 * (d + 1);
+        *_d_d += d * _d_icf_sz / 2;
+    }
+    {
+        *_d_n += -_d_CONSTANT * _t0 * 0.5 * d;
+        *_d_d += -n * _d_CONSTANT * _t0 * 0.5;
+    }
+}
+void preprocess_qs_pullback(int d, int k, const double *const icf, double *sum_qs, double *Qdiags, int *_d_d, int *_d_k, double *_d_icf, double *_d_sum_qs, double *_d_Qdiags) {
+    int _d_icf_sz = 0;
+    unsigned long _t0;
+    int _d_ik = 0;
+    int ik = 0;
+    clad::tape<double> _t1 = {};
+    clad::tape<unsigned long> _t2 = {};
+    clad::tape<int> _t3 = {};
+    int _d_id = 0;
+    int id = 0;
+    clad::tape<double> _t4 = {};
+    double _d_q = 0;
+    double q = 0;
+    clad::tape<double> _t5 = {};
+    clad::tape<double> _t6 = {};
+    int icf_sz = d * (d + 1) / 2;
+    _t0 = 0;
+    for (ik = 0; ik < k; ik++) {
+        _t0++;
+        clad::push(_t1, sum_qs[ik]);
+        sum_qs[ik] = 0.;
+        clad::push(_t2, 0UL);
+        for (clad::push(_t3, id) , id = 0; id < d; id++) {
+            clad::back(_t2)++;
+            clad::push(_t4, q) , q = icf[ik * icf_sz + id];
+            clad::push(_t5, sum_qs[ik]);
+            sum_qs[ik] = sum_qs[ik] + q;
+            clad::push(_t6, Qdiags[ik * d + id]);
+            Qdiags[ik * d + id] = exp(q);
+        }
+    }
+    for (; _t0; _t0--) {
+        ik--;
+        {
+            for (; clad::back(_t2); clad::back(_t2)--) {
+                id--;
+                {
+                    Qdiags[ik * d + id] = clad::pop(_t6);
+                    double _r_d2 = _d_Qdiags[ik * d + id];
+                    _d_Qdiags[ik * d + id] -= _r_d2;
+                    double _r0 = 0;
+                    _r0 += _r_d2 * clad::custom_derivatives::exp_pushforward(q, 1.).pushforward;
+                    _d_q += _r0;
+                }
+                {
+                    sum_qs[ik] = clad::pop(_t5);
+                    double _r_d1 = _d_sum_qs[ik];
+                    _d_sum_qs[ik] -= _r_d1;
+                    _d_sum_qs[ik] += _r_d1;
+                    _d_q += _r_d1;
+                }
+                {
+                    _d_icf[ik * icf_sz + id] += _d_q;
+                    _d_q = 0;
+                    q = clad::pop(_t4);
+                }
+            }
+            {
+                _d_id = 0;
+                id = clad::pop(_t3);
+            }
+            clad::pop(_t2);
+        }
+        {
+            sum_qs[ik] = clad::pop(_t1);
+            double _r_d0 = _d_sum_qs[ik];
+            _d_sum_qs[ik] -= _r_d0;
+        }
+    }
+    {
+        *_d_d += _d_icf_sz / 2 * (d + 1);
+        *_d_d += d * _d_icf_sz / 2;
+    }
+}
+void subtract_pullback(int d, const double *const x, const double *const y, double *out, int *_d_d, double *_d_x, double *_d_y, double *_d_out) {
+    unsigned long _t0;
+    int _d_id = 0;
+    int id = 0;
+    clad::tape<double> _t1 = {};
+    _t0 = 0;
+    for (id = 0; id < d; id++) {
+        _t0++;
+        clad::push(_t1, out[id]);
+        out[id] = x[id] - y[id];
+    }
+    for (; _t0; _t0--) {
+        id--;
+        {
+            out[id] = clad::pop(_t1);
+            double _r_d0 = _d_out[id];
+            _d_out[id] -= _r_d0;
+            _d_x[id] += _r_d0;
+            _d_y[id] += -_r_d0;
+        }
+    }
+}
+void Qtimesx_pullback(int d, const double *const Qdiag, const double *const ltri, const double *const x, double *out, int *_d_d, double *_d_Qdiag, double *_d_ltri, double *_d_x, double *_d_out) {
+    unsigned long _t0;
+    int _d_id = 0;
+    int id = 0;
+    clad::tape<double> _t1 = {};
+    int _d_Lparamsidx = 0;
+    unsigned long _t2;
+    int _d_i = 0;
+    int i = 0;
+    clad::tape<unsigned long> _t3 = {};
+    clad::tape<int> _t4 = {};
+    int _d_j = 0;
+    int j = 0;
+    clad::tape<double> _t5 = {};
+    _t0 = 0;
+    for (id = 0; id < d; id++) {
+        _t0++;
+        clad::push(_t1, out[id]);
+        out[id] = Qdiag[id] * x[id];
+    }
+    int Lparamsidx = 0;
+    _t2 = 0;
+    for (i = 0; i < d; i++) {
+        _t2++;
+        clad::push(_t3, 0UL);
+        for (clad::push(_t4, j) , j = i + 1; j < d; j++) {
+            clad::back(_t3)++;
+            clad::push(_t5, out[j]);
+            out[j] = out[j] + ltri[Lparamsidx] * x[i];
+            Lparamsidx++;
+        }
+    }
+    for (; _t2; _t2--) {
+        i--;
+        {
+            for (; clad::back(_t3); clad::back(_t3)--) {
+                j--;
+                Lparamsidx--;
+                {
+                    out[j] = clad::pop(_t5);
+                    double _r_d1 = _d_out[j];
+                    _d_out[j] -= _r_d1;
+                    _d_out[j] += _r_d1;
+                    _d_ltri[Lparamsidx] += _r_d1 * x[i];
+                    _d_x[i] += ltri[Lparamsidx] * _r_d1;
+                }
+            }
+            {
+                _d_i += _d_j;
+                _d_j = 0;
+                j = clad::pop(_t4);
+            }
+            clad::pop(_t3);
+        }
+    }
+    for (; _t0; _t0--) {
+        id--;
+        out[id] = clad::pop(_t1);
+        double _r_d0 = _d_out[id];
+        _d_out[id] -= _r_d0;
+        _d_Qdiag[id] += _r_d0 * x[id];
+        _d_x[id] += Qdiag[id] * _r_d0;
+    }
+}
+void sqnorm_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x) {
+    double _d_res = 0;
+    unsigned long _t0;
+    int _d_i = 0;
+    int i = 0;
+    clad::tape<double> _t1 = {};
+    double res = x[0] * x[0];
+    _t0 = 0;
+    for (i = 1; i < n; i++) {
+        _t0++;
+        clad::push(_t1, res);
+        res = res + x[i] * x[i];
+    }
+    goto _label0;
+  _label0:
+    _d_res += _d_y;
+    for (; _t0; _t0--) {
+        i--;
+        res = clad::pop(_t1);
+        double _r_d0 = _d_res;
+        _d_res -= _r_d0;
+        _d_res += _r_d0;
+        _d_x[i] += _r_d0 * x[i];
+        _d_x[i] += x[i] * _r_d0;
+    }
+    {
+        _d_x[0] += _d_res * x[0];
+        _d_x[0] += x[0] * _d_res;
+    }
+}
+void arr_max_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x);
+void logsumexp_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x) {
+    double _d_mx = 0;
+    double _d_semx = 0;
+    unsigned long _t0;
+    int _d_i = 0;
+    int i = 0;
+    clad::tape<double> _t1 = {};
+    double mx = arr_max(n, x);
+    double semx = 0.;
+    _t0 = 0;
+    for (i = 0; i < n; i++) {
+        _t0++;
+        clad::push(_t1, semx);
+        semx = semx + exp(x[i] - mx);
+    }
+    goto _label0;
+  _label0:
+    {
+        double _r2 = 0;
+        _r2 += _d_y * clad::custom_derivatives::log_pushforward(semx, 1.).pushforward;
+        _d_semx += _r2;
+        _d_mx += _d_y;
+    }
+    for (; _t0; _t0--) {
+        i--;
+        {
+            semx = clad::pop(_t1);
+            double _r_d0 = _d_semx;
+            _d_semx -= _r_d0;
+            _d_semx += _r_d0;
+            double _r1 = 0;
+            _r1 += _r_d0 * clad::custom_derivatives::exp_pushforward(x[i] - mx, 1.).pushforward;
+            _d_x[i] += _r1;
+            _d_mx += -_r1;
+        }
+    }
+    {
+        int _r0 = 0;
+        arr_max_pullback(n, x, _d_mx, &_r0, _d_x);
+        *_d_n += _r0;
+    }
+}
+inline void log_gamma_distrib_pullback(double a, double p, double _d_y, double *_d_a, double *_d_p);
+void log_wishart_prior_pullback(int p, int k, Wishart wishart, const double *const sum_qs, const double *const Qdiags, const double *const icf, double _d_y, int *_d_p, int *_d_k, Wishart *_d_wishart, double *_d_sum_qs, double *_d_Qdiags, double *_d_icf) {
+    int _d_n = 0;
+    int _d_icf_sz = 0;
+    double _t0;
+    double _t1;
+    double _d_C = 0;
+    double _d_out = 0;
+    unsigned long _t2;
+    int _d_ik = 0;
+    int ik = 0;
+    clad::tape<double> _t3 = {};
+    double _d_frobenius = 0;
+    double frobenius = 0;
+    clad::tape<double> _t4 = {};
+    int n = p + wishart.m + 1;
+    int icf_sz = p * (p + 1) / 2;
+    _t1 = log(2);
+    _t0 = (log(wishart.gamma) - 0.5 * _t1);
+    double C = n * p * _t0 - log_gamma_distrib(0.5 * n, p);
+    double out = 0;
+    _t2 = 0;
+    for (ik = 0; ik < k; ik++) {
+        _t2++;
+        clad::push(_t3, frobenius) , frobenius = sqnorm(p, &Qdiags[ik * p]) + sqnorm(icf_sz - p, &icf[ik * icf_sz + p]);
+        clad::push(_t4, out);
+        out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - wishart.m * sum_qs[ik];
+    }
+    goto _label0;
+  _label0:
+    {
+        _d_out += _d_y;
+        *_d_k += -_d_y * C;
+        _d_C += k * -_d_y;
+    }
+    for (; _t2; _t2--) {
+        ik--;
+        {
+            out = clad::pop(_t4);
+            double _r_d0 = _d_out;
+            _d_out -= _r_d0;
+            _d_out += _r_d0;
+            (*_d_wishart).gamma += 0.5 * _r_d0 * (frobenius) * wishart.gamma;
+            (*_d_wishart).gamma += 0.5 * wishart.gamma * _r_d0 * (frobenius);
+            _d_frobenius += 0.5 * wishart.gamma * wishart.gamma * _r_d0;
+            (*_d_wishart).m += -_r_d0 * sum_qs[ik];
+            _d_sum_qs[ik] += wishart.m * -_r_d0;
+        }
+        {
+            int _r3 = 0;
+            sqnorm_pullback(p, &Qdiags[ik * p], _d_frobenius, &_r3, &_d_Qdiags[ik * p]);
+            *_d_p += _r3;
+            int _r4 = 0;
+            sqnorm_pullback(icf_sz - p, &icf[ik * icf_sz + p], _d_frobenius, &_r4, &_d_icf[ik * icf_sz + p]);
+            _d_icf_sz += _r4;
+            *_d_p += -_r4;
+            _d_frobenius = 0;
+            frobenius = clad::pop(_t3);
+        }
+    }
+    {
+        _d_n += _d_C * _t0 * p;
+        *_d_p += n * _d_C * _t0;
+        double _r0 = 0;
+        _r0 += n * p * _d_C * clad::custom_derivatives::log_pushforward(wishart.gamma, 1.).pushforward;
+        (*_d_wishart).gamma += _r0;
+        double _r1 = 0;
+        double _r2 = 0;
+        log_gamma_distrib_pullback(0.5 * n, p, -_d_C, &_r1, &_r2);
+        _d_n += 0.5 * _r1;
+        *_d_p += _r2;
+    }
+    {
+        *_d_p += _d_icf_sz / 2 * (p + 1);
+        *_d_p += p * _d_icf_sz / 2;
+    }
+    {
+        *_d_p += _d_n;
+        (*_d_wishart).m += _d_n;
+    }
+}
+void arr_max_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x) {
+    double _d_m = 0;
+    unsigned long _t0;
+    int _d_i = 0;
+    int i = 0;
+    clad::tape<bool> _t2 = {};
+    clad::tape<double> _t3 = {};
+    double m = x[0];
+    _t0 = 0;
+    for (i = 1; i < n; i++) {
+        _t0++;
+        bool _t1 = m < x[i];
+        {
+            if (_t1) {
+                clad::push(_t3, m);
+                m = x[i];
+            }
+            clad::push(_t2, _t1);
+        }
+    }
+    goto _label0;
+  _label0:
+    _d_m += _d_y;
+    for (; _t0; _t0--) {
+        i--;
+        if (clad::pop(_t2)) {
+            m = clad::pop(_t3);
+            double _r_d0 = _d_m;
+            _d_m -= _r_d0;
+            _d_x[i] += _r_d0;
+        }
+    }
+    _d_x[0] += _d_m;
+}
+inline void log_gamma_distrib_pullback(double a, double p, double _d_y, double *_d_a, double *_d_p) {
+    double _t0;
+    double _d_out = 0;
+    unsigned long _t1;
+    int _d_j = 0;
+    int j = 0;
+    clad::tape<double> _t2 = {};
+    _t0 = log(3.1415926535900001);
+    double out = 0.25 * p * (p - 1) * _t0;
+    _t1 = 0;
+    for (j = 1; j <= p; j++) {
+        _t1++;
+        clad::push(_t2, out);
+        out = out + lgamma(a + 0.5 * (1 - j));
+    }
+    goto _label0;
+  _label0:
+    _d_out += _d_y;
+    for (; _t1; _t1--) {
+        j--;
+        {
+            out = clad::pop(_t2);
+            double _r_d0 = _d_out;
+            _d_out -= _r_d0;
+            _d_out += _r_d0;
+            double _r0 = 0;
+            _r0 += _r_d0 * numerical_diff::forward_central_difference(lgamma, a + 0.5 * (1 - j), 0, 0, a + 0.5 * (1 - j));
+            *_d_a += _r0;
+            _d_j += -0.5 * _r0;
+        }
+    }
+    {
+        *_d_p += 0.25 * _d_out * _t0 * (p - 1);
+        *_d_p += 0.25 * p * _d_out * _t0;
+    }
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatest/gmm/gmm_grad.h b/src/cpp/modules/cladLatest/gmm/gmm_grad.h
new file mode 100644
index 00000000..c39f2aa1
--- /dev/null
+++ b/src/cpp/modules/cladLatest/gmm/gmm_grad.h
@@ -0,0 +1,3 @@
+#include "gmm.h"
+
+void gmm_objective_pullback(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err);
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/CMakeLists.txt b/src/cpp/modules/cladLatestTBR/CMakeLists.txt
new file mode 100644
index 00000000..7575ef22
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/CMakeLists.txt
@@ -0,0 +1,15 @@
+project("ADBench_CladLatestTBR" CXX)
+
+add_library("CladLatestTBR" MODULE)
+
+SET(clad_install_root "/home/mad-scientist/clubs/open-source/clad/formal/builds/build-llvm-16/inst")
+
+target_sources("CladLatestTBR" PRIVATE "${CMAKE_SOURCE_DIR}/src/cpp/shared/utils.cpp")
+target_sources("CladLatestTBR" PRIVATE "ba/ba_grad.cpp")
+target_sources("CladLatestTBR" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/CladLatestTBRBA.cpp")
+target_sources("CladLatestTBR" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/gmm/gmm_grad_tbr.cpp")
+target_sources("CladLatestTBR" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/CladLatestTBRGMM.cpp")
+
+target_include_directories("CladLatestTBR" PRIVATE ${clad_install_root}/include)
+target_include_directories("CladLatestTBR" PRIVATE ${CMAKE_SOURCE_DIR}/src/cpp/shared)
+# add_compile_definitions("CladLatestTBR" CLAD_NO_NUM_DIFF)
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/CladLatestTBRBA.cpp b/src/cpp/modules/cladLatestTBR/CladLatestTBRBA.cpp
new file mode 100644
index 00000000..5a89e5ee
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/CladLatestTBRBA.cpp
@@ -0,0 +1,115 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "CladLatestTBRBA.h"
+#include "ba/ba_grad.h"
+
+// This function must be called before any other function.
+void CladLatestTBRBA::prepare(BAInput &&input) {
+  this->input = input;
+  result = {std::vector<double>(2 * this->input.p),
+            std::vector<double>(this->input.p),
+            BASparseMat(this->input.n, this->input.m, this->input.p)};
+
+  reproj_err_d = std::vector<double>(2 * (BA_NCAMPARAMS + 3 + 1));
+  reproj_err_d_row = std::vector<double>(BA_NCAMPARAMS + 3 + 1);
+}
+
+BAOutput CladLatestTBRBA::output() { return result; }
+
+void CladLatestTBRBA::calculate_objective(int times) {
+  for (int i = 0; i < times; i++) {
+    ba_objective(input.n, input.m, input.p, input.cams.data(), input.X.data(),
+                 input.w.data(), input.obs.data(), input.feats.data(),
+                 result.reproj_err.data(), result.w_err.data());
+  }
+}
+
+void CladLatestTBRBA::calculate_jacobian(int times) {
+  for (int i = 0; i < times; i++) {
+    result.J.clear();
+    calculate_reproj_error_jacobian_part();
+    calculate_weight_error_jacobian_part();
+  }
+}
+
+void CladLatestTBRBA::calculate_reproj_error_jacobian_part() {
+  double
+      errb[2]; // stores dY
+               // (i-th element equals to 1.0 for calculating i-th jacobian row)
+
+  double err[2]; // stores fictive result
+                 // (Tapenade doesn't calculate an original function in
+                 // reverse mode)
+
+  clad::array_ref<double> cam_gradient_part(reproj_err_d_row.data(),
+                                            reproj_err_d_row.size());
+  clad::array_ref<double> x_gradient_part(
+      reproj_err_d_row.data() + BA_NCAMPARAMS,
+      reproj_err_d_row.size() - BA_NCAMPARAMS);
+  clad::array_ref<double> weight_gradient_part(
+      reproj_err_d_row.data() + BA_NCAMPARAMS + 3,
+      reproj_err_d_row.size() - BA_NCAMPARAMS - 3);
+  std::vector<double> feats_gradient_part(2, 0);
+  clad::array_ref<double> feats_gradient_part_ref(feats_gradient_part.data(),
+                                                  feats_gradient_part.size());
+  clad::array_ref<double> errb_ref(errb, 2);
+
+  for (int i = 0; i < input.p; i++) {
+    std::fill(reproj_err_d_row.begin(), reproj_err_d_row.end(), 0);
+    std::fill(feats_gradient_part.begin(), feats_gradient_part.end(), 0);
+
+    int camIdx = input.obs[2 * i + 0];
+    int ptIdx = input.obs[2 * i + 1];
+
+    // calculate first row
+    errb[0] = 1.0;
+    errb[1] = 0.0;
+    computeReprojError_pullback(
+        &input.cams[camIdx * BA_NCAMPARAMS], &input.X[ptIdx * 3], &input.w[i],
+        &input.feats[i * 2], err, cam_gradient_part, x_gradient_part,
+        weight_gradient_part, feats_gradient_part_ref, errb_ref);
+
+    // fill first row elements
+    for (int j = 0; j < BA_NCAMPARAMS + 3 + 1; j++) {
+      reproj_err_d[2 * j] = reproj_err_d_row[j];
+    }
+
+    std::fill(reproj_err_d_row.begin(), reproj_err_d_row.end(), 0);
+    std::fill(feats_gradient_part.begin(), feats_gradient_part.end(), 0);
+    // calculate second row
+    errb[0] = 0.0;
+    errb[1] = 1.0;
+    computeReprojError_pullback(
+        &input.cams[camIdx * BA_NCAMPARAMS], &input.X[ptIdx * 3], &input.w[i],
+        &input.feats[i * 2], err, cam_gradient_part, x_gradient_part,
+        weight_gradient_part, feats_gradient_part_ref, errb);
+
+    // fill second row elements
+    for (int j = 0; j < BA_NCAMPARAMS + 3 + 1; j++) {
+      reproj_err_d[2 * j + 1] = reproj_err_d_row[j];
+    }
+
+    result.J.insert_reproj_err_block(i, camIdx, ptIdx, reproj_err_d.data());
+  }
+}
+
+void CladLatestTBRBA::calculate_weight_error_jacobian_part() {
+  for (int j = 0; j < input.p; j++) {
+    double wb = 0; // stores calculated derivative
+
+    double err = 0.0; // stores fictive result
+                      // (Tapenade doesn't calculate an original function in
+                      // reverse mode)
+
+    double errb = 1.0; // stores dY
+                       // (equals to 1.0 for derivative calculation)
+
+    computeZachWeightError_pullback(&input.w[j], &err, &wb, &errb);
+    result.J.insert_w_err_block(j, wb);
+  }
+}
+
+extern "C" DLL_PUBLIC ITest<BAInput, BAOutput> *get_ba_test() {
+  return new CladLatestTBRBA();
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/CladLatestTBRBA.h b/src/cpp/modules/cladLatestTBR/CladLatestTBRBA.h
new file mode 100644
index 00000000..b0edce27
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/CladLatestTBRBA.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include "../../shared/ITest.h"
+#include "../../shared/BAData.h"
+#include "../../shared/defs.h"
+
+#include "ba/ba.h"
+
+#include <vector>
+
+class CladLatestTBRBA : public ITest<BAInput, BAOutput>
+{
+private:
+    BAInput input;
+    BAOutput result;
+    std::vector<double> state;
+
+    // buffer for reprojection error jacobian part holding (column-major)
+    std::vector<double> reproj_err_d;
+
+    // buffer for reprojection error jacobian block row holding
+    std::vector<double> reproj_err_d_row;
+public:
+    // This function must be called before any other function.
+    virtual void prepare(BAInput&& input) override;
+
+    virtual void calculate_objective(int times) override;
+    virtual void calculate_jacobian(int times) override;
+    virtual BAOutput output() override;
+
+    ~CladLatestTBRBA() {}
+
+private:
+    void calculate_weight_error_jacobian_part();
+    void calculate_reproj_error_jacobian_part();
+};
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/CladLatestTBRGMM.cpp b/src/cpp/modules/cladLatestTBR/CladLatestTBRGMM.cpp
new file mode 100644
index 00000000..c00c7273
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/CladLatestTBRGMM.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "CladLatestTBRGMM.h"
+
+#include "gmm/gmm.h"
+#include "gmm/gmm_grad_tbr.h"
+
+// This function must be called before any other function.
+void CladLatestTBRGMM::prepare(GMMInput&& input)
+{
+    this->input = input;
+    int Jcols = (this->input.k * (this->input.d + 1) * (this->input.d + 2)) / 2;
+    result = { 0, std::vector<double>(Jcols) };
+}
+
+
+
+GMMOutput CladLatestTBRGMM::output()
+{
+    return result;
+}
+
+
+
+void CladLatestTBRGMM::calculate_objective(int times)
+{
+    for (int i = 0; i < times; i++)
+    {
+        gmm_objective(
+            input.d,
+            input.k,
+            input.n,
+            input.alphas.data(),
+            input.means.data(),
+            input.icf.data(),
+            input.x.data(),
+            input.wishart,
+            &result.objective
+        );
+    }
+}
+
+
+
+void CladLatestTBRGMM::calculate_jacobian(int times)
+{
+    double* alphas_gradient_part = result.gradient.data();
+    double* means_gradient_part = result.gradient.data() + input.alphas.size();
+    double* icf_gradient_part =
+        result.gradient.data() +
+        input.alphas.size() +
+        input.means.size();
+    std::vector<double> d_x(input.x.size(), 0);
+
+    for (int i = 0; i < times; i++)
+    {
+        double tmp = 0.0;       // stores fictive result
+                                // (Tapenade doesn't calculate an original function in reverse mode)
+
+        double errb = 1.0;      // stores dY
+                                // (equals to 1.0 for gradient calculation)
+        
+        int d_d = 0, d_k = 0, d_n = 0;
+        Wishart d_wishart{0, 0};
+        std::fill(result.gradient.begin(), result.gradient.end(), 0);
+        std::fill(d_x.begin(), d_x.end(), 0);
+
+        gmm_objective_pullback(
+            input.d,
+            input.k,
+            input.n,
+            input.alphas.data(),
+            input.means.data(),
+            input.icf.data(),
+            input.x.data(),
+            input.wishart,
+            &tmp,
+            &d_d,
+            &d_k,
+            &d_n,
+            alphas_gradient_part,
+            means_gradient_part,
+            icf_gradient_part,
+            d_x.data(),
+            &d_wishart,
+            &errb
+        );
+    }
+}
+
+
+
+extern "C" DLL_PUBLIC ITest<GMMInput, GMMOutput>* get_gmm_test()
+{
+    return new CladLatestTBRGMM();
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/CladLatestTBRGMM.h b/src/cpp/modules/cladLatestTBR/CladLatestTBRGMM.h
new file mode 100644
index 00000000..390b585a
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/CladLatestTBRGMM.h
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include "../../shared/ITest.h"
+#include "../../shared/GMMData.h"
+
+#include <vector>
+
+class CladLatestTBRGMM : public ITest<GMMInput, GMMOutput>
+{
+private:
+    GMMInput input;
+    GMMOutput result;
+    std::vector<double> state;
+
+public:
+    // This function must be called before any other function.
+    virtual void prepare(GMMInput&& input) override;
+
+    virtual void calculate_objective(int times) override;
+    virtual void calculate_jacobian(int times) override;
+    virtual GMMOutput output() override;
+
+    ~CladLatestTBRGMM() {}
+};
+
diff --git a/src/cpp/modules/cladLatestTBR/ba/ba.h b/src/cpp/modules/cladLatestTBR/ba/ba.h
new file mode 100644
index 00000000..937a686f
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/ba/ba.h
@@ -0,0 +1,192 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+#ifndef CLADLATEST_BA_H
+#define CLADLATEST_BA_H
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "defs.h"
+#include "matrix.h"
+
+////////////////////////////////////////////////////////////
+//////////////////// Declarations //////////////////////////
+////////////////////////////////////////////////////////////
+
+// cam: 11 camera in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2]
+//            r1, r2, r3 are angle - axis rotation parameters(Rodrigues)
+//            [C1 C2 C3]' is the camera center
+//            f is the focal length in pixels
+//            [u0 v0]' is the principal point
+//            k1, k2 are radial distortion parameters
+// X: 3 point
+// feats: 2 feature (x,y coordinates)
+// reproj_err: 2
+// projection function:
+// Xcam = R * (X - C)
+// distorted = radial_distort(projective2euclidean(Xcam), radial_parameters)
+// proj = distorted * f + principal_point
+// err = sqsum(proj - measurement)
+template <typename T>
+void computeReprojError(const T *const cam, const T *const X, const T *const w,
+                        const double *const feat, T *err);
+
+// w: 1
+// w_err: 1
+template <typename T> void computeZachWeightError(const T *const w, T *err);
+
+// n number of cameras
+// m number of points
+// p number of observations
+// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2]
+//            r1, r2, r3 are angle - axis rotation parameters(Rodrigues)
+//            [C1 C2 C3]' is the camera center
+//            f is the focal length in pixels
+//            [u0 v0]' is the principal point
+//            k1, k2 are radial distortion parameters
+// X: 3*m points
+// obs: 2*p observations (pairs cameraIdx, pointIdx)
+// feats: 2*p features (x,y coordinates corresponding to observations)
+// reproj_err: 2*p errors of observations
+// w_err: p weight "error" terms
+// projection function:
+// Xcam = R * (X - C)
+// distorted = radial_distort(projective2euclidean(Xcam), radial_parameters)
+// proj = distorted * f + principal_point
+// err = sqsum(proj - measurement)
+template <typename T>
+void ba_objective(int n, int m, int p, const T *const cams, const T *const X,
+                  const T *const w, const int *const obs,
+                  const double *const feats, T *reproj_err, T *w_err);
+
+// rot: 3 rotation parameters
+// pt: 3 point to be rotated
+// rotatedPt: 3 rotated point
+// this is an efficient evaluation (part of
+// the Ceres implementation)
+// easy to understand calculation in matlab:
+//  theta = sqrt(sum(w. ^ 2));
+//  n = w / theta;
+//  n_x = au_cross_matrix(n);
+//  R = eye(3) + n_x*sin(theta) + n_x*n_x*(1 - cos(theta));
+template <typename T>
+void rodrigues_rotate_point(const T *const rot, const T *const pt,
+                            T *rotatedPt);
+
+////////////////////////////////////////////////////////////
+//////////////////// Definitions ///////////////////////////
+////////////////////////////////////////////////////////////
+
+template <typename T> T sqsum(int n, const T *const x) {
+  T res = 0;
+  int i = 0;
+  for (; i < n; i++)
+    res = res + x[i] * x[i];
+  return res;
+}
+
+template <typename T>
+void rodrigues_rotate_point(const T * rot, const T * pt,
+                            T *rotatedPt) {
+  T sqtheta = sqsum(3, rot);
+  int i0 = 0;
+  int i1 = 0;
+  int i2 = 0;
+  T theta, costheta, sintheta, theta_inverse, tmp;
+  T w[3], w_cross_pt[3];
+  T rot_cross_pt[3];
+  if (sqtheta != 0) {
+    theta = sqrt(sqtheta);
+    costheta = cos(theta);
+    sintheta = sin(theta);
+    theta_inverse = 1.0 / theta;
+
+    for (; i0 < 3; i0++)
+      w[i0] = rot[i0] * theta_inverse;
+
+    cross(w, pt, w_cross_pt);
+
+    tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta);
+
+    for (; i1 < 3; i1++)
+      rotatedPt[i1] = pt[i1] * costheta + w_cross_pt[i1] * sintheta + w[i1] * tmp;
+  } else {
+    cross(rot, pt, rot_cross_pt);
+
+    for (; i2 < 3; i2++)
+      rotatedPt[i2] = pt[i2] + rot_cross_pt[i2];
+  }
+}
+
+template <typename T> void radial_distort(const T *const rad_params, T *proj) {
+  T rsq, L;
+  rsq = sqsum(2, proj);
+  L = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq;
+  proj[0] = proj[0] * L;
+  proj[1] = proj[1] * L;
+}
+
+template <typename T>
+void project(const T * cam, const T * X, T *proj) {
+  const T *const C = &cam[3];
+  T Xo[3], Xcam[3];
+
+  Xo[0] = X[0] - C[0];
+  Xo[1] = X[1] - C[1];
+  Xo[2] = X[2] - C[2];
+
+  rodrigues_rotate_point(cam + 0, Xo, Xcam);
+
+  proj[0] = Xcam[0] / Xcam[2];
+  proj[1] = Xcam[1] / Xcam[2];
+
+  radial_distort(cam + 9, proj);
+
+  proj[0] = proj[0] * cam[6] + cam[7];
+  proj[1] = proj[1] * cam[6] + cam[8];
+}
+
+template <typename T>
+void computeReprojError(const T * cam, const T * X, const T * w,
+                        const double * feat, T *err) {
+  T proj[2];
+  project(cam, X, proj);
+
+  err[0] = (*w) * (proj[0] - feat[0]);
+  err[1] = (*w) * (proj[1] - feat[1]);
+}
+
+template <typename T> void computeZachWeightError(const T *const w, T *err) {
+  *err = 1 - (*w) * (*w);
+}
+
+template <typename T>
+void ba_objective(int n, int m, int p, const T *const cams, const T *const X,
+                  const T *const w, const int *const obs,
+                  const double *const feats, T *reproj_err, T *w_err) {
+  for (int i = 0; i < p; i++) {
+    int camIdx = obs[i * 2 + 0];
+    int ptIdx = obs[i * 2 + 1];
+    computeReprojError(&cams[camIdx * BA_NCAMPARAMS], &X[ptIdx * 3], &w[i],
+                       &feats[i * 2], &reproj_err[2 * i]);
+  }
+
+  for (int i = 0; i < p; i++) {
+    computeZachWeightError(&w[i], &w_err[i]);
+  }
+}
+
+template <typename T>
+T computeReprojError_wrapper(const T * cam, const T * X,
+                             const T * w, const double * feat,
+                             T *err) {
+  computeReprojError(cam, X, w, feat, err);
+  return err[0];
+}
+
+template <typename T>
+T computeZachWeightError_wrapper(const T * w, T *err) {
+    computeZachWeightError(w, err);
+    return err[0];
+}
+#endif
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/ba/ba_grad.cpp b/src/cpp/modules/cladLatestTBR/ba/ba_grad.cpp
new file mode 100644
index 00000000..6a0c58e0
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/ba/ba_grad.cpp
@@ -0,0 +1,452 @@
+#include "ba.h"
+#include "clad/Differentiator/Differentiator.h"
+
+void sqsum_pullback(int n, const double *const x, double _d_y,
+                    clad::array_ref<int> _d_n, clad::array_ref<double> _d_x) {
+  double _d_res = 0;
+  int _d_i = 0;
+  unsigned long _t0;
+  double res = 0;
+  int i = 0;
+  _t0 = 0;
+  for (; i < n; i++) {
+    _t0++;
+    res = res + x[i] * x[i];
+  }
+  goto _label0;
+_label0:
+  _d_res += _d_y;
+  for (; _t0; _t0--) {
+    i--;
+    double _r_d0 = _d_res;
+    _d_res += _r_d0;
+    _d_x[i] += _r_d0 * x[i];
+    _d_x[i] += x[i] * _r_d0;
+    _d_res -= _r_d0;
+  }
+}
+void cross_pullback(const double *const a, const double *const b, double *out,
+                    clad::array_ref<double> _d_a, clad::array_ref<double> _d_b,
+                    clad::array_ref<double> _d_out) {
+  out[0] = a[1] * b[2] - a[2] * b[1];
+  out[1] = a[2] * b[0] - a[0] * b[2];
+  out[2] = a[0] * b[1] - a[1] * b[0];
+  {
+    double _r_d2 = _d_out[2];
+    _d_a[0] += _r_d2 * b[1];
+    _d_b[1] += a[0] * _r_d2;
+    _d_a[1] += -_r_d2 * b[0];
+    _d_b[0] += a[1] * -_r_d2;
+    _d_out[2] -= _r_d2;
+    _d_out[2];
+  }
+  {
+    double _r_d1 = _d_out[1];
+    _d_a[2] += _r_d1 * b[0];
+    _d_b[0] += a[2] * _r_d1;
+    _d_a[0] += -_r_d1 * b[2];
+    _d_b[2] += a[0] * -_r_d1;
+    _d_out[1] -= _r_d1;
+    _d_out[1];
+  }
+  {
+    double _r_d0 = _d_out[0];
+    _d_a[1] += _r_d0 * b[2];
+    _d_b[2] += a[1] * _r_d0;
+    _d_a[2] += -_r_d0 * b[1];
+    _d_b[1] += a[2] * -_r_d0;
+    _d_out[0] -= _r_d0;
+    _d_out[0];
+  }
+}
+void rodrigues_rotate_point_pullback(const double *rot, const double *pt,
+                                     double *rotatedPt,
+                                     clad::array_ref<double> _d_rot,
+                                     clad::array_ref<double> _d_pt,
+                                     clad::array_ref<double> _d_rotatedPt) {
+  const double *_t0;
+  double _d_sqtheta = 0;
+  int _d_i0 = 0;
+  int _d_i1 = 0;
+  int _d_i2 = 0;
+  double _d_theta = 0, _d_costheta = 0, _d_sintheta = 0, _d_theta_inverse = 0,
+         _d_tmp = 0;
+  clad::array<double> _d_w(3UL), _d_w_cross_pt(3UL);
+  clad::array<double> _d_rot_cross_pt(3UL);
+  bool _cond0;
+  unsigned long _t1;
+  const double *_t2;
+  unsigned long _t3;
+  const double *_t4;
+  const double *_t5;
+  unsigned long _t6;
+  _t0 = rot;
+  double sqtheta = sqsum(3, rot);
+  int i0 = 0;
+  int i1 = 0;
+  int i2 = 0;
+  double theta, costheta, sintheta, theta_inverse, tmp;
+  double w[3], w_cross_pt[3];
+  double rot_cross_pt[3];
+  _cond0 = sqtheta != 0;
+  if (_cond0) {
+    theta = sqrt(sqtheta);
+    costheta = cos(theta);
+    sintheta = sin(theta);
+    theta_inverse = 1. / theta;
+    _t1 = 0;
+    for (; i0 < 3; i0++) {
+      _t1++;
+      w[i0] = rot[i0] * theta_inverse;
+    }
+    _t2 = pt;
+    cross(w, pt, w_cross_pt);
+    tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta);
+    _t3 = 0;
+    for (; i1 < 3; i1++) {
+      _t3++;
+      rotatedPt[i1] =
+          pt[i1] * costheta + w_cross_pt[i1] * sintheta + w[i1] * tmp;
+    }
+  } else {
+    _t4 = rot;
+    _t5 = pt;
+    cross(rot, pt, rot_cross_pt);
+    _t6 = 0;
+    for (; i2 < 3; i2++) {
+      _t6++;
+      rotatedPt[i2] = pt[i2] + rot_cross_pt[i2];
+    }
+  }
+  if (_cond0) {
+    for (; _t3; _t3--) {
+      i1--;
+      double _r_d6 = _d_rotatedPt[i1];
+      _d_pt[i1] += _r_d6 * costheta;
+      _d_costheta += pt[i1] * _r_d6;
+      _d_w_cross_pt[i1] += _r_d6 * sintheta;
+      _d_sintheta += w_cross_pt[i1] * _r_d6;
+      _d_w[i1] += _r_d6 * tmp;
+      _d_tmp += w[i1] * _r_d6;
+      _d_rotatedPt[i1] -= _r_d6;
+      _d_rotatedPt[i1];
+    }
+    {
+      double _r_d5 = _d_tmp;
+      _d_w[0] += _r_d5 * (1. - costheta) * pt[0];
+      _d_pt[0] += w[0] * _r_d5 * (1. - costheta);
+      _d_w[1] += _r_d5 * (1. - costheta) * pt[1];
+      _d_pt[1] += w[1] * _r_d5 * (1. - costheta);
+      _d_w[2] += _r_d5 * (1. - costheta) * pt[2];
+      _d_pt[2] += w[2] * _r_d5 * (1. - costheta);
+      _d_costheta += -(w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * _r_d5;
+      _d_tmp -= _r_d5;
+    }
+    {
+      pt = _t2;
+      cross_pullback(w, _t2, w_cross_pt, _d_w, _d_pt, _d_w_cross_pt);
+    }
+    for (; _t1; _t1--) {
+      i0--;
+      double _r_d4 = _d_w[i0];
+      _d_rot[i0] += _r_d4 * theta_inverse;
+      _d_theta_inverse += rot[i0] * _r_d4;
+      _d_w[i0] -= _r_d4;
+      _d_w[i0];
+    }
+    {
+      double _r_d3 = _d_theta_inverse;
+      double _r4 = _r_d3 * -1. / (theta * theta);
+      _d_theta += _r4;
+      _d_theta_inverse -= _r_d3;
+    }
+    {
+      double _r_d2 = _d_sintheta;
+      double _r3 =
+          _r_d2 *
+          clad::custom_derivatives::sin_pushforward(theta, 1.).pushforward;
+      _d_theta += _r3;
+      _d_sintheta -= _r_d2;
+    }
+    {
+      double _r_d1 = _d_costheta;
+      double _r2 =
+          _r_d1 *
+          clad::custom_derivatives::cos_pushforward(theta, 1.).pushforward;
+      _d_theta += _r2;
+      _d_costheta -= _r_d1;
+    }
+    {
+      double _r_d0 = _d_theta;
+      double _r1 =
+          _r_d0 *
+          clad::custom_derivatives::sqrt_pushforward(sqtheta, 1.).pushforward;
+      _d_sqtheta += _r1;
+      _d_theta -= _r_d0;
+    }
+  } else {
+    for (; _t6; _t6--) {
+      i2--;
+      double _r_d7 = _d_rotatedPt[i2];
+      _d_pt[i2] += _r_d7;
+      _d_rot_cross_pt[i2] += _r_d7;
+      _d_rotatedPt[i2] -= _r_d7;
+      _d_rotatedPt[i2];
+    }
+    {
+      rot = _t4;
+      pt = _t5;
+      cross_pullback(_t4, _t5, rot_cross_pt, _d_rot, _d_pt, _d_rot_cross_pt);
+    }
+  }
+  {
+    rot = _t0;
+    int _grad0 = 0;
+    sqsum_pullback(3, _t0, _d_sqtheta, &_grad0, _d_rot);
+    int _r0 = _grad0;
+  }
+}
+void radial_distort_pullback(const double *const rad_params, double *proj,
+                             clad::array_ref<double> _d_rad_params,
+                             clad::array_ref<double> _d_proj) {
+  double _d_rsq = 0, _d_L = 0;
+  double *_t0;
+  double _t1;
+  double _t2;
+  double rsq, L;
+  _t0 = proj;
+  rsq = sqsum(2, proj);
+  L = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq;
+  _t1 = proj[0];
+  proj[0] = proj[0] * L;
+  _t2 = proj[1];
+  proj[1] = proj[1] * L;
+  {
+    proj[1] = _t2;
+    double _r_d3 = _d_proj[1];
+    _d_proj[1] += _r_d3 * L;
+    _d_L += proj[1] * _r_d3;
+    _d_proj[1] -= _r_d3;
+    _d_proj[1];
+  }
+  {
+    proj[0] = _t1;
+    double _r_d2 = _d_proj[0];
+    _d_proj[0] += _r_d2 * L;
+    _d_L += proj[0] * _r_d2;
+    _d_proj[0] -= _r_d2;
+    _d_proj[0];
+  }
+  {
+    double _r_d1 = _d_L;
+    _d_rad_params[0] += _r_d1 * rsq;
+    _d_rsq += rad_params[0] * _r_d1;
+    _d_rad_params[1] += _r_d1 * rsq * rsq;
+    _d_rsq += rad_params[1] * _r_d1 * rsq;
+    _d_rsq += rad_params[1] * rsq * _r_d1;
+    _d_L -= _r_d1;
+  }
+  {
+    double _r_d0 = _d_rsq;
+    proj = _t0;
+    int _grad0 = 0;
+    sqsum_pullback(2, _t0, _r_d0, &_grad0, _d_proj);
+    int _r0 = _grad0;
+    _d_rsq -= _r_d0;
+  }
+}
+void project_pullback(const double *cam, const double *X, double *proj,
+                      clad::array_ref<double> _d_cam,
+                      clad::array_ref<double> _d_X,
+                      clad::array_ref<double> _d_proj) {
+  double *_d_C = 0;
+  clad::array<double> _d_Xo(3UL), _d_Xcam(3UL);
+  const double *_t0;
+  const double *_t1;
+  double *_t2;
+  double _t3;
+  double _t4;
+  _d_C = &_d_cam[3];
+  const double *const C = &cam[3];
+  double Xo[3], Xcam[3];
+  Xo[0] = X[0] - C[0];
+  Xo[1] = X[1] - C[1];
+  Xo[2] = X[2] - C[2];
+  _t0 = cam + 0;
+  rodrigues_rotate_point(cam + 0, Xo, Xcam);
+  proj[0] = Xcam[0] / Xcam[2];
+  proj[1] = Xcam[1] / Xcam[2];
+  _t1 = cam + 9;
+  _t2 = proj;
+  // CFIX-start
+  double tproj0 = proj[0];
+  double tproj1 = proj[1];
+  // CFIX-end
+  radial_distort(cam + 9, proj);
+  _t3 = proj[0];
+  proj[0] = proj[0] * cam[6] + cam[7];
+  _t4 = proj[1];
+  proj[1] = proj[1] * cam[6] + cam[8];
+  {
+    proj[1] = _t4;
+    double _r_d6 = _d_proj[1];
+    _d_proj[1] += _r_d6 * cam[6];
+    _d_cam[6] += proj[1] * _r_d6;
+    _d_cam[8] += _r_d6;
+    _d_proj[1] -= _r_d6;
+    _d_proj[1];
+  }
+  {
+    proj[0] = _t3;
+    double _r_d5 = _d_proj[0];
+    _d_proj[0] += _r_d5 * cam[6];
+    _d_cam[6] += proj[0] * _r_d5;
+    _d_cam[7] += _r_d5;
+    _d_proj[0] -= _r_d5;
+    _d_proj[0];
+  }
+  {
+    proj = _t2;
+    proj[0] = tproj0;
+    proj[1] = tproj1;
+    radial_distort_pullback(_t1, _t2, _d_cam.ptr_ref() + 9, _d_proj);
+  }
+  {
+    double _r_d4 = _d_proj[1];
+    _d_Xcam[1] += _r_d4 / Xcam[2];
+    double _r1 = _r_d4 * -Xcam[1] / (Xcam[2] * Xcam[2]);
+    _d_Xcam[2] += _r1;
+    _d_proj[1] -= _r_d4;
+    _d_proj[1];
+  }
+  {
+    double _r_d3 = _d_proj[0];
+    _d_Xcam[0] += _r_d3 / Xcam[2];
+    double _r0 = _r_d3 * -Xcam[0] / (Xcam[2] * Xcam[2]);
+    _d_Xcam[2] += _r0;
+    _d_proj[0] -= _r_d3;
+    _d_proj[0];
+  }
+  rodrigues_rotate_point_pullback(_t0, Xo, Xcam, _d_cam.ptr_ref() + 0, _d_Xo,
+                                  _d_Xcam);
+  {
+    double _r_d2 = _d_Xo[2];
+    _d_X[2] += _r_d2;
+    _d_C[2] += -_r_d2;
+    _d_Xo[2] -= _r_d2;
+    _d_Xo[2];
+  }
+  {
+    double _r_d1 = _d_Xo[1];
+    _d_X[1] += _r_d1;
+    _d_C[1] += -_r_d1;
+    _d_Xo[1] -= _r_d1;
+    _d_Xo[1];
+  }
+  {
+    double _r_d0 = _d_Xo[0];
+    _d_X[0] += _r_d0;
+    _d_C[0] += -_r_d0;
+    _d_Xo[0] -= _r_d0;
+    _d_Xo[0];
+  }
+}
+void computeReprojError_pullback(const double *cam, const double *X,
+                                 const double *w, const double *feat,
+                                 double *err, clad::array_ref<double> _d_cam,
+                                 clad::array_ref<double> _d_X,
+                                 clad::array_ref<double> _d_w,
+                                 clad::array_ref<double> _d_feat,
+                                 clad::array_ref<double> _d_err) {
+  clad::array<double> _d_proj(2UL);
+  const double *_t0;
+  const double *_t1;
+  double proj[2];
+  _t0 = cam;
+  _t1 = X;
+  project(cam, X, proj);
+  err[0] = *w * (proj[0] - feat[0]);
+  err[1] = *w * (proj[1] - feat[1]);
+  {
+    double _r_d1 = _d_err[1];
+    *_d_w += _r_d1 * (proj[1] - feat[1]);
+    _d_proj[1] += *w * _r_d1;
+    _d_feat[1] += -*w * _r_d1;
+    _d_err[1] -= _r_d1;
+    _d_err[1];
+  }
+  {
+    double _r_d0 = _d_err[0];
+    *_d_w += _r_d0 * (proj[0] - feat[0]);
+    _d_proj[0] += *w * _r_d0;
+    _d_feat[0] += -*w * _r_d0;
+    _d_err[0] -= _r_d0;
+    _d_err[0];
+  }
+  {
+    cam = _t0;
+    X = _t1;
+    project_pullback(_t0, _t1, proj, _d_cam, _d_X, _d_proj);
+  }
+}
+void computeReprojError_wrapper_grad(
+    const double *cam, const double *X, const double *w, const double *feat,
+    double *err, clad::array_ref<double> _d_cam, clad::array_ref<double> _d_X,
+    clad::array_ref<double> _d_w, clad::array_ref<double> _d_feat,
+    clad::array_ref<double> _d_err) {
+  const double *_t0;
+  const double *_t1;
+  const double *_t2;
+  const double *_t3;
+  double *_t4;
+  _t0 = cam;
+  _t1 = X;
+  _t2 = w;
+  _t3 = feat;
+  _t4 = err;
+  computeReprojError(cam, X, w, feat, err);
+  goto _label0;
+_label0:
+  _d_err[0] += 1;
+  {
+    cam = _t0;
+    X = _t1;
+    w = _t2;
+    feat = _t3;
+    err = _t4;
+    computeReprojError_pullback(_t0, _t1, _t2, _t3, _t4, _d_cam, _d_X, _d_w,
+                                _d_feat, _d_err);
+  }
+}
+void computeZachWeightError_pullback(const double *const w, double *err,
+                                     clad::array_ref<double> _d_w,
+                                     clad::array_ref<double> _d_err) {
+  double _t0;
+  _t0 = *err;
+  *err = 1 - *w * (*w);
+  {
+    *err = _t0;
+    double _r_d0 = *_d_err;
+    *_d_w += -_r_d0 * (*w);
+    *_d_w += *w * -_r_d0;
+    *_d_err -= _r_d0;
+    *_d_err;
+  }
+}
+void computeZachWeightError_wrapper_grad(const double *w, double *err,
+                                         clad::array_ref<double> _d_w,
+                                         clad::array_ref<double> _d_err) {
+  const double *_t0;
+  double *_t1;
+  _t0 = w;
+  _t1 = err;
+  computeZachWeightError(w, err);
+  goto _label0;
+_label0:
+  _d_err[0] += 1;
+  {
+    w = _t0;
+    err = _t1;
+    computeZachWeightError_pullback(_t0, _t1, _d_w, _d_err);
+  }
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/ba/ba_grad.h b/src/cpp/modules/cladLatestTBR/ba/ba_grad.h
new file mode 100644
index 00000000..de42ade1
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/ba/ba_grad.h
@@ -0,0 +1,13 @@
+#include "clad/Differentiator/ArrayRef.h"
+
+void computeReprojError_pullback(const double *cam, const double *X,
+                                 const double *w, const double *feat,
+                                 double *err, clad::array_ref<double> _d_cam,
+                                 clad::array_ref<double> _d_X,
+                                 clad::array_ref<double> _d_w,
+                                 clad::array_ref<double> _d_feat,
+                                 clad::array_ref<double> _d_err);
+
+void computeZachWeightError_pullback(const double *const w, double *err,
+                                     clad::array_ref<double> _d_w,
+                                     clad::array_ref<double> _d_err);
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/gmm/gmm.h b/src/cpp/modules/cladLatestTBR/gmm/gmm.h
new file mode 100644
index 00000000..c7908de3
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/gmm/gmm.h
@@ -0,0 +1,299 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include <cmath>
+#include <vector>
+using std::vector;
+#include "defs.h"
+#include "matrix.h"
+
+// CLADFIX-Add-Start
+// namespace clad {
+//     namespace custom_derivatives {
+//         namespace class_functions {
+//             template <typename T>
+//             void resize_pullback(::std::vector<T> *v, size_t n, ::std::vector<T> *d_v, size_t d_n) {
+
+//             }
+//         }
+//     }
+// }
+// CLADFIX-Add-end
+
+////////////////////////////////////////////////////////////
+//////////////////// Declarations //////////////////////////
+////////////////////////////////////////////////////////////
+
+// d: dim
+// k: number of gaussians
+// n: number of points
+// alphas: k logs of mixture weights (unnormalized), so
+//          weights = exp(log_alphas) / sum(exp(log_alphas))
+// means: d*k component means
+// icf: (d*(d+1)/2)*k inverse covariance factors 
+//                  every icf entry stores firstly log of diagonal and then 
+//          columnwise other entris
+//          To generate icf in MATLAB given covariance C :
+//              L = inv(chol(C, 'lower'));
+//              inv_cov_factor = [log(diag(L)); L(au_tril_indices(d, -1))]
+// wishart: wishart distribution parameters
+// x: d*n points
+// err: 1 output
+template<typename T>
+void gmm_objective(int d, int k, int n, const T* const alphas, const T* const means,
+    const T* const icf, const double* const x, Wishart wishart, T* err);
+
+// split of the outer loop over points
+template<typename T>
+void gmm_objective_split_inner(int d, int k,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    const double* const x,
+    Wishart wishart,
+    T* err);
+// other terms which are outside the loop
+template<typename T>
+void gmm_objective_split_other(int d, int k, int n,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    Wishart wishart,
+    T* err);
+
+template<typename T>
+T logsumexp(int n, const T* const x);
+
+// p: dim
+// k: number of components
+// wishart parameters
+// sum_qs: k sums of log diags of Qs
+// Qdiags: d*k
+// icf: (p*(p+1)/2)*k inverse covariance factors
+template<typename T>
+T log_wishart_prior(int p, int k,
+    Wishart wishart,
+    const T* const sum_qs,
+    const T* const Qdiags,
+    const T* const icf);
+
+template<typename T>
+void preprocess_qs(int d, int k,
+    const T* const icf,
+    T* sum_qs,
+    T* Qdiags);
+
+template<typename T>
+void Qtimesx(int d,
+    const T* const Qdiag,
+    const T* const ltri, // strictly lower triangular part
+    const T* const x,
+    T* out);
+
+////////////////////////////////////////////////////////////
+//////////////////// Definitions ///////////////////////////
+////////////////////////////////////////////////////////////
+
+template<typename T>
+T logsumexp(int n, const T* const x)
+{
+    T mx = arr_max(n, x);
+    T semx = 0.;
+    for (int i = 0; i < n; i++)
+    {
+        semx = semx + exp(x[i] - mx);
+    }
+    return log(semx) + mx;
+}
+
+template<typename T>
+T log_wishart_prior(int p, int k,
+    Wishart wishart,
+    const T* const sum_qs,
+    const T* const Qdiags,
+    const T* const icf)
+{
+    int n = p + wishart.m + 1;
+    int icf_sz = p * (p + 1) / 2;
+
+    double C = n * p * (log(wishart.gamma) - 0.5 * log(2)) - log_gamma_distrib(0.5 * n, p);
+
+    T out = 0;
+    for (int ik = 0; ik < k; ik++)
+    {
+        T frobenius = sqnorm(p, &Qdiags[ik * p]) + sqnorm(icf_sz - p, &icf[ik * icf_sz + p]);
+        out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius)
+            -wishart.m * sum_qs[ik];
+    }
+
+    return out - k * C;
+}
+
+template<typename T>
+void preprocess_qs(int d, int k,
+    const T* const icf,
+    T* sum_qs,
+    T* Qdiags)
+{
+    int icf_sz = d * (d + 1) / 2;
+    for (int ik = 0; ik < k; ik++)
+    {
+        sum_qs[ik] = 0.;
+        for (int id = 0; id < d; id++)
+        {
+            T q = icf[ik * icf_sz + id];
+            sum_qs[ik] = sum_qs[ik] + q;
+            Qdiags[ik * d + id] = exp(q);
+        }
+    }
+}
+
+template<typename T>
+void Qtimesx(int d,
+    const T* const Qdiag,
+    const T* const ltri, // strictly lower triangular part
+    const T* const x,
+    T* out)
+{
+    for (int id = 0; id < d; id++)
+        out[id] = Qdiag[id] * x[id];
+
+    int Lparamsidx = 0;
+    for (int i = 0; i < d; i++)
+    {
+        for (int j = i + 1; j < d; j++)
+        {
+            out[j] = out[j] + ltri[Lparamsidx] * x[i];
+            Lparamsidx++;
+        }
+    }
+}
+
+template<typename T>
+void gmm_objective(int d, int k, int n,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    const double* const x,
+    Wishart wishart,
+    T* err)
+{
+    const double CONSTANT = -n * d * 0.5 * log(2 * PI);
+    int icf_sz = d * (d + 1) / 2;
+
+    // CLADFIX-Modify-DeleteStart(VectorInit)
+    // vector<T> Qdiags(d * k);
+    // vector<T> sum_qs(k);
+    // vector<T> xcentered(d);
+    // vector<T> Qxcentered(d);
+    // vector<T> main_term(k);
+    // CLADFIX-Modify-DeleteEnd(VectorInit)
+    // CLADFIX-Modify-AddStart(VectorInit)
+    T Qdiags[d * k];
+    T sum_qs[k];
+    T xcentered[d];
+    T Qxcentered[d];
+    T main_term[k];
+    //CLADFIX-Modify-AddEnd(VectorInit)
+
+    preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]);
+
+    T slse = 0.;
+    for (int ix = 0; ix < n; ix++)
+    {
+        for (int ik = 0; ik < k; ik++)
+        {
+            subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]);
+            Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0]);
+
+            main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * sqnorm(d, &Qxcentered[0]);
+        }
+        slse = slse + logsumexp(k, &main_term[0]);
+    }
+
+    T lse_alphas = logsumexp(k, alphas);
+
+    *err = CONSTANT + slse - n * lse_alphas;
+
+    *err = *err + log_wishart_prior(d, k, wishart, &sum_qs[0], &Qdiags[0], icf);
+
+}
+
+template<typename T>
+void gmm_objective_split_inner(int d, int k,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    const double* const x,
+    Wishart wishart,
+    T* err)
+{
+    int icf_sz = d * (d + 1) / 2;
+
+    T* Ldiag = new T[d];
+    T* xcentered = new T[d];
+    T* mahal = new T[d];
+    T* lse = new T[k];
+
+    for (int ik = 0; ik < k; ik++)
+    {
+        int icf_off = ik * icf_sz;
+        T sumlog_Ldiag(0.);
+        for (int id = 0; id < d; id++)
+        {
+            sumlog_Ldiag = sumlog_Ldiag + icf[icf_off + id];
+            Ldiag[id] = exp(icf[icf_off + id]);
+        }
+
+        for (int id = 0; id < d; id++)
+        {
+            xcentered[id] = x[id] - means[ik * d + id];
+            mahal[id] = Ldiag[id] * xcentered[id];
+        }
+        int Lparamsidx = d;
+        for (int i = 0; i < d; i++)
+        {
+            for (int j = i + 1; j < d; j++)
+            {
+                mahal[j] = mahal[j] + icf[icf_off + Lparamsidx] * xcentered[i];
+                Lparamsidx++;
+            }
+        }
+        T sqsum_mahal(0.);
+        for (int id = 0; id < d; id++)
+        {
+            sqsum_mahal = sqsum_mahal + mahal[id] * mahal[id];
+        }
+
+        lse[ik] = alphas[ik] + sumlog_Ldiag - 0.5 * sqsum_mahal;
+    }
+
+    *err = logsumexp(k, lse);
+
+    delete[] mahal;
+    delete[] xcentered;
+    delete[] Ldiag;
+    delete[] lse;
+}
+
+template<typename T>
+void gmm_objective_split_other(int d, int k, int n,
+    const T* const alphas,
+    const T* const means,
+    const T* const icf,
+    Wishart wishart,
+    T* err)
+{
+    const double CONSTANT = -n * d * 0.5 * log(2 * PI);
+
+    T lse_alphas = logsumexp(k, alphas);
+
+    T* sum_qs = new T[k];
+    T* Qdiags = new T[d * k];
+    preprocess_qs(d, k, icf, sum_qs, Qdiags);
+    *err = CONSTANT - n * lse_alphas + log_wishart_prior(d, k, wishart, sum_qs, Qdiags, icf);
+    delete[] sum_qs;
+    delete[] Qdiags;
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/gmm/gmm_grad_tbr.cpp b/src/cpp/modules/cladLatestTBR/gmm/gmm_grad_tbr.cpp
new file mode 100644
index 00000000..b7067741
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/gmm/gmm_grad_tbr.cpp
@@ -0,0 +1,516 @@
+#include "gmm.h"
+// CLADFIX-Add-Start
+#include "clad/Differentiator/Differentiator.h"
+// CLADFIX-Add-End
+
+void gmm_objective_pullback(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err);
+void gmm_objective_wrapper_grad(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err) {
+    gmm_objective(d, k, n, alphas, means, icf, x, wishart, err);
+    goto _label0;
+  _label0:
+    _d_err[0] += 1;
+    {
+        int _r0 = 0;
+        int _r1 = 0;
+        int _r2 = 0;
+        Wishart _r3 = {};
+        gmm_objective_pullback(d, k, n, alphas, means, icf, x, wishart, err, &_r0, &_r1, &_r2, _d_alphas, _d_means, _d_icf, _d_x, &_r3, _d_err);
+        *_d_d += _r0;
+        *_d_k += _r1;
+        *_d_n += _r2;
+    }
+}
+void preprocess_qs_pullback(int d, int k, const double *const icf, double *sum_qs, double *Qdiags, int *_d_d, int *_d_k, double *_d_icf, double *_d_sum_qs, double *_d_Qdiags);
+void subtract_pullback(int d, const double *const x, const double *const y, double *out, int *_d_d, double *_d_x, double *_d_y, double *_d_out);
+void Qtimesx_pullback(int d, const double *const Qdiag, const double *const ltri, const double *const x, double *out, int *_d_d, double *_d_Qdiag, double *_d_ltri, double *_d_x, double *_d_out);
+void sqnorm_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x);
+void logsumexp_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x);
+void log_wishart_prior_pullback(int p, int k, Wishart wishart, const double *const sum_qs, const double *const Qdiags, const double *const icf, double _d_y, int *_d_p, int *_d_k, Wishart *_d_wishart, double *_d_sum_qs, double *_d_Qdiags, double *_d_icf);
+void gmm_objective_pullback(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err) {
+    double _t0;
+    double _d_CONSTANT = 0;
+    int _d_icf_sz = 0;
+    double _d_slse = 0;
+    unsigned long _t1;
+    int _d_ix = 0;
+    int ix = 0;
+    clad::tape<unsigned long> _t2 = {};
+    clad::tape<int> _t3 = {};
+    int _d_ik = 0;
+    int ik = 0;
+    clad::tape<double> _t4 = {};
+    clad::tape<double> _t5 = {};
+    double _d_lse_alphas = 0;
+    double _t6;
+    double _t7;
+    _t0 = log(2 * 3.1415926535900001);
+    const double CONSTANT = -n * d * 0.5 * _t0;
+    int icf_sz = d * (d + 1) / 2;
+    double _d_Qdiags[d * k];
+    clad::zero_init(_d_Qdiags, d * k);
+    double Qdiags[d * k];
+    double _d_sum_qs[k];
+    clad::zero_init(_d_sum_qs, k);
+    double sum_qs[k];
+    double _d_xcentered[d];
+    clad::zero_init(_d_xcentered, d);
+    double xcentered[d];
+    double _d_Qxcentered[d];
+    clad::zero_init(_d_Qxcentered, d);
+    double Qxcentered[d];
+    double _d_main_term[k];
+    clad::zero_init(_d_main_term, k);
+    double main_term[k];
+    preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]);
+    double slse = 0.;
+    _t1 = 0;
+    for (ix = 0; ix < n; ix++) {
+        _t1++;
+        clad::push(_t2, 0UL);
+        for (clad::push(_t3, ik) , ik = 0; ik < k; ik++) {
+            clad::back(_t2)++;
+            subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]);
+            Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0]);
+            clad::push(_t4, main_term[ik]);
+            main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * clad::push(_t5, sqnorm(d, &Qxcentered[0]));
+        }
+        slse = slse + logsumexp(k, &main_term[0]);
+    }
+    double lse_alphas = logsumexp(k, alphas);
+    _t6 = *err;
+    *err = CONSTANT + slse - n * lse_alphas;
+    _t7 = *err;
+    *err = *err + log_wishart_prior(d, k, wishart, &sum_qs[0], &Qdiags[0], icf);
+    {
+        *err = _t7;
+        double _r_d3 = *_d_err;
+        *_d_err -= _r_d3;
+        *_d_err += _r_d3;
+        int _r7 = 0;
+        int _r8 = 0;
+        Wishart _r9 = {};
+        log_wishart_prior_pullback(d, k, wishart, &sum_qs[0], &Qdiags[0], icf, _r_d3, &_r7, &_r8, &_r9, &_d_sum_qs[0], &_d_Qdiags[0], _d_icf);
+        *_d_d += _r7;
+        *_d_k += _r8;
+    }
+    {
+        *err = _t6;
+        double _r_d2 = *_d_err;
+        *_d_err -= _r_d2;
+        _d_CONSTANT += _r_d2;
+        _d_slse += _r_d2;
+        *_d_n += -_r_d2 * lse_alphas;
+        _d_lse_alphas += n * -_r_d2;
+    }
+    {
+        int _r6 = 0;
+        logsumexp_pullback(k, alphas, _d_lse_alphas, &_r6, _d_alphas);
+        *_d_k += _r6;
+    }
+    for (; _t1; _t1--) {
+        ix--;
+        {
+            double _r_d1 = _d_slse;
+            _d_slse -= _r_d1;
+            _d_slse += _r_d1;
+            int _r5 = 0;
+            logsumexp_pullback(k, &main_term[0], _r_d1, &_r5, &_d_main_term[0]);
+            *_d_k += _r5;
+        }
+        {
+            for (; clad::back(_t2); clad::back(_t2)--) {
+                ik--;
+                {
+                    main_term[ik] = clad::pop(_t4);
+                    double _r_d0 = _d_main_term[ik];
+                    _d_main_term[ik] -= _r_d0;
+                    _d_alphas[ik] += _r_d0;
+                    _d_sum_qs[ik] += _r_d0;
+                    int _r4 = 0;
+                    sqnorm_pullback(d, &Qxcentered[0], 0.5 * -_r_d0, &_r4, &_d_Qxcentered[0]);
+                    *_d_d += _r4;
+                }
+                {
+                    int _r3 = 0;
+                    Qtimesx_pullback(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], &Qxcentered[0], &_r3, &_d_Qdiags[ik * d], &_d_icf[ik * icf_sz + d], &_d_xcentered[0], &_d_Qxcentered[0]);
+                    *_d_d += _r3;
+                }
+                {
+                    int _r2 = 0;
+                    subtract_pullback(d, &x[ix * d], &means[ik * d], &xcentered[0], &_r2, &_d_x[ix * d], &_d_means[ik * d], &_d_xcentered[0]);
+                    *_d_d += _r2;
+                }
+            }
+            {
+                _d_ik = 0;
+                ik = clad::pop(_t3);
+            }
+            clad::pop(_t2);
+        }
+    }
+    {
+        int _r0 = 0;
+        int _r1 = 0;
+        preprocess_qs_pullback(d, k, icf, &sum_qs[0], &Qdiags[0], &_r0, &_r1, _d_icf, &_d_sum_qs[0], &_d_Qdiags[0]);
+        *_d_d += _r0;
+        *_d_k += _r1;
+    }
+    {
+        *_d_d += _d_icf_sz / 2 * (d + 1);
+        *_d_d += d * _d_icf_sz / 2;
+    }
+    {
+        *_d_n += -_d_CONSTANT * _t0 * 0.5 * d;
+        *_d_d += -n * _d_CONSTANT * _t0 * 0.5;
+    }
+}
+void preprocess_qs_pullback(int d, int k, const double *const icf, double *sum_qs, double *Qdiags, int *_d_d, int *_d_k, double *_d_icf, double *_d_sum_qs, double *_d_Qdiags) {
+    int _d_icf_sz = 0;
+    unsigned long _t0;
+    int _d_ik = 0;
+    int ik = 0;
+    clad::tape<unsigned long> _t1 = {};
+    clad::tape<int> _t2 = {};
+    int _d_id = 0;
+    int id = 0;
+    clad::tape<double> _t3 = {};
+    double _d_q = 0;
+    double q = 0;
+    int icf_sz = d * (d + 1) / 2;
+    _t0 = 0;
+    for (ik = 0; ik < k; ik++) {
+        _t0++;
+        sum_qs[ik] = 0.;
+        clad::push(_t1, 0UL);
+        for (clad::push(_t2, id) , id = 0; id < d; id++) {
+            clad::back(_t1)++;
+            clad::push(_t3, q) , q = icf[ik * icf_sz + id];
+            sum_qs[ik] = sum_qs[ik] + q;
+            Qdiags[ik * d + id] = exp(q);
+        }
+    }
+    for (; _t0; _t0--) {
+        ik--;
+        {
+            for (; clad::back(_t1); clad::back(_t1)--) {
+                id--;
+                {
+                    double _r_d2 = _d_Qdiags[ik * d + id];
+                    _d_Qdiags[ik * d + id] -= _r_d2;
+                    double _r0 = 0;
+                    _r0 += _r_d2 * clad::custom_derivatives::exp_pushforward(q, 1.).pushforward;
+                    _d_q += _r0;
+                }
+                {
+                    double _r_d1 = _d_sum_qs[ik];
+                    _d_sum_qs[ik] -= _r_d1;
+                    _d_sum_qs[ik] += _r_d1;
+                    _d_q += _r_d1;
+                }
+                {
+                    _d_icf[ik * icf_sz + id] += _d_q;
+                    _d_q = 0;
+                    q = clad::pop(_t3);
+                }
+            }
+            {
+                _d_id = 0;
+                id = clad::pop(_t2);
+            }
+            clad::pop(_t1);
+        }
+        {
+            double _r_d0 = _d_sum_qs[ik];
+            _d_sum_qs[ik] -= _r_d0;
+        }
+    }
+    {
+        *_d_d += _d_icf_sz / 2 * (d + 1);
+        *_d_d += d * _d_icf_sz / 2;
+    }
+}
+void subtract_pullback(int d, const double *const x, const double *const y, double *out, int *_d_d, double *_d_x, double *_d_y, double *_d_out) {
+    unsigned long _t0;
+    int _d_id = 0;
+    int id = 0;
+    _t0 = 0;
+    for (id = 0; id < d; id++) {
+        _t0++;
+        out[id] = x[id] - y[id];
+    }
+    for (; _t0; _t0--) {
+        id--;
+        {
+            double _r_d0 = _d_out[id];
+            _d_out[id] -= _r_d0;
+            _d_x[id] += _r_d0;
+            _d_y[id] += -_r_d0;
+        }
+    }
+}
+void Qtimesx_pullback(int d, const double *const Qdiag, const double *const ltri, const double *const x, double *out, int *_d_d, double *_d_Qdiag, double *_d_ltri, double *_d_x, double *_d_out) {
+    unsigned long _t0;
+    int _d_id = 0;
+    int id = 0;
+    int _d_Lparamsidx = 0;
+    unsigned long _t1;
+    int _d_i = 0;
+    int i = 0;
+    clad::tape<unsigned long> _t2 = {};
+    clad::tape<int> _t3 = {};
+    int _d_j = 0;
+    int j = 0;
+    _t0 = 0;
+    for (id = 0; id < d; id++) {
+        _t0++;
+        out[id] = Qdiag[id] * x[id];
+    }
+    int Lparamsidx = 0;
+    _t1 = 0;
+    for (i = 0; i < d; i++) {
+        _t1++;
+        clad::push(_t2, 0UL);
+        for (clad::push(_t3, j) , j = i + 1; j < d; j++) {
+            clad::back(_t2)++;
+            out[j] = out[j] + ltri[Lparamsidx] * x[i];
+            Lparamsidx++;
+        }
+    }
+    for (; _t1; _t1--) {
+        i--;
+        {
+            for (; clad::back(_t2); clad::back(_t2)--) {
+                j--;
+                Lparamsidx--;
+                {
+                    double _r_d1 = _d_out[j];
+                    _d_out[j] -= _r_d1;
+                    _d_out[j] += _r_d1;
+                    _d_ltri[Lparamsidx] += _r_d1 * x[i];
+                    _d_x[i] += ltri[Lparamsidx] * _r_d1;
+                }
+            }
+            {
+                _d_i += _d_j;
+                _d_j = 0;
+                j = clad::pop(_t3);
+            }
+            clad::pop(_t2);
+        }
+    }
+    for (; _t0; _t0--) {
+        id--;
+        double _r_d0 = _d_out[id];
+        _d_out[id] -= _r_d0;
+        _d_Qdiag[id] += _r_d0 * x[id];
+        _d_x[id] += Qdiag[id] * _r_d0;
+    }
+}
+void sqnorm_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x) {
+    double _d_res = 0;
+    unsigned long _t0;
+    int _d_i = 0;
+    int i = 0;
+    double res = x[0] * x[0];
+    _t0 = 0;
+    for (i = 1; i < n; i++) {
+        _t0++;
+        res = res + x[i] * x[i];
+    }
+    goto _label0;
+  _label0:
+    _d_res += _d_y;
+    for (; _t0; _t0--) {
+        i--;
+        double _r_d0 = _d_res;
+        _d_res -= _r_d0;
+        _d_res += _r_d0;
+        _d_x[i] += _r_d0 * x[i];
+        _d_x[i] += x[i] * _r_d0;
+    }
+    {
+        _d_x[0] += _d_res * x[0];
+        _d_x[0] += x[0] * _d_res;
+    }
+}
+void arr_max_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x);
+void logsumexp_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x) {
+    double _d_mx = 0;
+    double _d_semx = 0;
+    unsigned long _t0;
+    int _d_i = 0;
+    int i = 0;
+    double mx = arr_max(n, x);
+    double semx = 0.;
+    _t0 = 0;
+    for (i = 0; i < n; i++) {
+        _t0++;
+        semx = semx + exp(x[i] - mx);
+    }
+    goto _label0;
+  _label0:
+    {
+        double _r2 = 0;
+        _r2 += _d_y * clad::custom_derivatives::log_pushforward(semx, 1.).pushforward;
+        _d_semx += _r2;
+        _d_mx += _d_y;
+    }
+    for (; _t0; _t0--) {
+        i--;
+        {
+            double _r_d0 = _d_semx;
+            _d_semx -= _r_d0;
+            _d_semx += _r_d0;
+            double _r1 = 0;
+            _r1 += _r_d0 * clad::custom_derivatives::exp_pushforward(x[i] - mx, 1.).pushforward;
+            _d_x[i] += _r1;
+            _d_mx += -_r1;
+        }
+    }
+    {
+        int _r0 = 0;
+        arr_max_pullback(n, x, _d_mx, &_r0, _d_x);
+        *_d_n += _r0;
+    }
+}
+inline void log_gamma_distrib_pullback(double a, double p, double _d_y, double *_d_a, double *_d_p);
+void log_wishart_prior_pullback(int p, int k, Wishart wishart, const double *const sum_qs, const double *const Qdiags, const double *const icf, double _d_y, int *_d_p, int *_d_k, Wishart *_d_wishart, double *_d_sum_qs, double *_d_Qdiags, double *_d_icf) {
+    int _d_n = 0;
+    int _d_icf_sz = 0;
+    double _t0;
+    double _t1;
+    double _d_C = 0;
+    double _d_out = 0;
+    unsigned long _t2;
+    int _d_ik = 0;
+    int ik = 0;
+    clad::tape<double> _t3 = {};
+    double _d_frobenius = 0;
+    double frobenius = 0;
+    int n = p + wishart.m + 1;
+    int icf_sz = p * (p + 1) / 2;
+    _t1 = log(2);
+    _t0 = (log(wishart.gamma) - 0.5 * _t1);
+    double C = n * p * _t0 - log_gamma_distrib(0.5 * n, p);
+    double out = 0;
+    _t2 = 0;
+    for (ik = 0; ik < k; ik++) {
+        _t2++;
+        clad::push(_t3, frobenius) , frobenius = sqnorm(p, &Qdiags[ik * p]) + sqnorm(icf_sz - p, &icf[ik * icf_sz + p]);
+        out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - wishart.m * sum_qs[ik];
+    }
+    goto _label0;
+  _label0:
+    {
+        _d_out += _d_y;
+        *_d_k += -_d_y * C;
+        _d_C += k * -_d_y;
+    }
+    for (; _t2; _t2--) {
+        ik--;
+        {
+            double _r_d0 = _d_out;
+            _d_out -= _r_d0;
+            _d_out += _r_d0;
+            (*_d_wishart).gamma += 0.5 * _r_d0 * (frobenius) * wishart.gamma;
+            (*_d_wishart).gamma += 0.5 * wishart.gamma * _r_d0 * (frobenius);
+            _d_frobenius += 0.5 * wishart.gamma * wishart.gamma * _r_d0;
+            (*_d_wishart).m += -_r_d0 * sum_qs[ik];
+            _d_sum_qs[ik] += wishart.m * -_r_d0;
+        }
+        {
+            int _r3 = 0;
+            sqnorm_pullback(p, &Qdiags[ik * p], _d_frobenius, &_r3, &_d_Qdiags[ik * p]);
+            *_d_p += _r3;
+            int _r4 = 0;
+            sqnorm_pullback(icf_sz - p, &icf[ik * icf_sz + p], _d_frobenius, &_r4, &_d_icf[ik * icf_sz + p]);
+            _d_icf_sz += _r4;
+            *_d_p += -_r4;
+            _d_frobenius = 0;
+            frobenius = clad::pop(_t3);
+        }
+    }
+    {
+        _d_n += _d_C * _t0 * p;
+        *_d_p += n * _d_C * _t0;
+        double _r0 = 0;
+        _r0 += n * p * _d_C * clad::custom_derivatives::log_pushforward(wishart.gamma, 1.).pushforward;
+        (*_d_wishart).gamma += _r0;
+        double _r1 = 0;
+        double _r2 = 0;
+        log_gamma_distrib_pullback(0.5 * n, p, -_d_C, &_r1, &_r2);
+        _d_n += 0.5 * _r1;
+        *_d_p += _r2;
+    }
+    {
+        *_d_p += _d_icf_sz / 2 * (p + 1);
+        *_d_p += p * _d_icf_sz / 2;
+    }
+    {
+        *_d_p += _d_n;
+        (*_d_wishart).m += _d_n;
+    }
+}
+void arr_max_pullback(int n, const double *const x, double _d_y, int *_d_n, double *_d_x) {
+    double _d_m = 0;
+    unsigned long _t0;
+    int _d_i = 0;
+    int i = 0;
+    clad::tape<bool> _t2 = {};
+    double m = x[0];
+    _t0 = 0;
+    for (i = 1; i < n; i++) {
+        _t0++;
+        bool _t1 = m < x[i];
+        {
+            if (_t1)
+                m = x[i];
+            clad::push(_t2, _t1);
+        }
+    }
+    goto _label0;
+  _label0:
+    _d_m += _d_y;
+    for (; _t0; _t0--) {
+        i--;
+        if (clad::pop(_t2)) {
+            double _r_d0 = _d_m;
+            _d_m -= _r_d0;
+            _d_x[i] += _r_d0;
+        }
+    }
+    _d_x[0] += _d_m;
+}
+inline void log_gamma_distrib_pullback(double a, double p, double _d_y, double *_d_a, double *_d_p) {
+    double _t0;
+    double _d_out = 0;
+    unsigned long _t1;
+    int _d_j = 0;
+    int j = 0;
+    _t0 = log(3.1415926535900001);
+    double out = 0.25 * p * (p - 1) * _t0;
+    _t1 = 0;
+    for (j = 1; j <= p; j++) {
+        _t1++;
+        out = out + lgamma(a + 0.5 * (1 - j));
+    }
+    goto _label0;
+  _label0:
+    _d_out += _d_y;
+    for (; _t1; _t1--) {
+        j--;
+        {
+            double _r_d0 = _d_out;
+            _d_out -= _r_d0;
+            _d_out += _r_d0;
+            double _r0 = 0;
+            _r0 += _r_d0 * numerical_diff::forward_central_difference(lgamma, a + 0.5 * (1 - j), 0, 0, a + 0.5 * (1 - j));
+            *_d_a += _r0;
+            _d_j += -0.5 * _r0;
+        }
+    }
+    {
+        *_d_p += 0.25 * _d_out * _t0 * (p - 1);
+        *_d_p += 0.25 * p * _d_out * _t0;
+    }
+}
\ No newline at end of file
diff --git a/src/cpp/modules/cladLatestTBR/gmm/gmm_grad_tbr.h b/src/cpp/modules/cladLatestTBR/gmm/gmm_grad_tbr.h
new file mode 100644
index 00000000..dceb27a1
--- /dev/null
+++ b/src/cpp/modules/cladLatestTBR/gmm/gmm_grad_tbr.h
@@ -0,0 +1,2 @@
+#include "gmm.h"
+void gmm_objective_pullback(int d, int k, int n, const double *const alphas, const double *const means, const double *const icf, const double *const x, Wishart wishart, double *err, int *_d_d, int *_d_k, int *_d_n, double *_d_alphas, double *_d_means, double *_d_icf, double *_d_x, Wishart *_d_wishart, double *_d_err);
\ No newline at end of file