From c87b9d9dd10fb1e3b3f58310cf478d370d589c52 Mon Sep 17 00:00:00 2001
From: Carl Gold
Date: Tue, 7 Jan 2025 17:14:05 -0500
Subject: [PATCH 01/25] Add a memmap before Parallel

---
 econml/grf/_base_grf.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py
index dac530fa6..fbff9d5f1 100644
--- a/econml/grf/_base_grf.py
+++ b/econml/grf/_base_grf.py
@@ -8,7 +8,7 @@
 #
 # Copyright (c) 2007-2020 The scikit-learn developers.
 # All rights reserved.
-
+import gc
 import numbers
 from warnings import warn
 from abc import ABCMeta, abstractmethod
@@ -27,6 +27,7 @@
 from sklearn.utils import check_X_y
 import scipy.stats
 from scipy.special import erfc
+import tempfile
 
 
 __all__ = ["BaseGRF"]
@@ -384,14 +385,25 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs):
         s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False)
                   for _ in range(n_more_estimators)]
 
+        # Make a memmap for better performance on large number of treatment variables
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file:
+            filename = temp_file.name
+            np.save(filename, yaug)  # Save array to disk
+        # Remove references to (potentially) large data before Parallel
+        del yaug, pointJ
+        gc.collect()
+        # Create the memmap version
+        yaug_mmap = np.load(filename + '.npy', mmap_mode='r')
+
         # Parallel loop: we prefer the threading backend as the Cython code
         # for fitting the trees is internally releasing the Python GIL
         # making threading more efficient than multiprocessing in
         # that case. However, for joblib 0.12+ we respect any
         # parallel_backend contexts set at a higher level,
         # since correctness does not rely on using threads.
+
         trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')(
-            delayed(t.fit)(X[s], yaug[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_,
+            delayed(t.fit)(X[s], yaug_mmap[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_,
                            sample_weight=sample_weight[s] if sample_weight is not None else None,
                            check_input=False)
             for t, s in zip(trees, s_inds))

From 672699fc753fbb342429c6db49f888908b773ab3 Mon Sep 17 00:00:00 2001
From: Carl Gold
Date: Tue, 7 Jan 2025 18:03:51 -0500
Subject: [PATCH 02/25] Filename fix.
Add a test script --- econml/grf/_base_grf.py | 2 +- scripts/memory_test_script.py | 75 +++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 scripts/memory_test_script.py diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index fbff9d5f1..80c5c3c5b 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -393,7 +393,7 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): del yaug, pointJ gc.collect() # Create the memmap version - yaug_mmap = np.load(filename + '.npy', mmap_mode='r') + yaug_mmap = np.load(filename, mmap_mode='r') # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py new file mode 100644 index 000000000..9d477b0fe --- /dev/null +++ b/scripts/memory_test_script.py @@ -0,0 +1,75 @@ +import json +from datetime import datetime + +import sklearn.metrics +import argparse +from econml.dml import SparseLinearDML, CausalForestDML +from econml.validate import DRTester +import collinearity +from itertools import product +import joblib +import numpy as np +import os +import pandas as pd +import scipy +from sklearn.preprocessing import StandardScaler, FunctionTransformer +from sklearn.pipeline import Pipeline +from sklearn.metrics import roc_auc_score +from xgboost import XGBRegressor, XGBClassifier +import sys +import logging +from sklearn.model_selection import KFold + +logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +def causalforestdml_memory_test( + file_name: str +): + """ + Main function for testing DML on high dimensional marketing data sets. Adds pre-processing + like scaling and removing highly collinear features. It can cross validates parameters of either + first stage model, or the final model. Saves out the scores of every model to a file. 
+ + :param file_name: File that contains the data, assumed to be joblib + :param model_name: Name of the model to use, either SparseLinearDML or CausalForestDML + + """ + + root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") + logger.info(f"Using root dir {root_dir} loading data file {file_name}") + data_file = os.path.join(os.path.abspath(root_dir), file_name) + data = joblib.load(data_file) + + # This is specific to Offerfit - names of the data are derived from contextual bandits + X = data['X_no_act'] + T = data['a_processed'] + y = data['real_reward'] + # y[y > 1] = 1 + + logger.info(f"X has a shape of: {X.shape}") + logger.info(f"T has a shape of: {T.shape}") + logger.info(f"y has a shape of: {y.shape}") + + + est = CausalForestDML( + model_t=XGBRegressor(n_estimators=50), + model_y=XGBClassifier(n_estimators=50), + discrete_outcome=True, + n_jobs=1 + ) + est.fit(y,T,X=X) + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") + args = parser.parse_args(sys.argv[1:]) + data_file = args.data_file + + causalforestdml_memory_test( + file_name=data_file + ) From 4e9820a09a0f59d7e1e71df52c0c29acdbe329a4 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 8 Jan 2025 15:15:01 -0500 Subject: [PATCH 03/25] Script update --- scripts/memory_test_script.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 9d477b0fe..f0056bc95 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -27,15 +27,7 @@ def causalforestdml_memory_test( file_name: str ): - """ - Main function for testing DML on high dimensional marketing data sets. Adds pre-processing - like scaling and removing highly collinear features. It can cross validates parameters of either - first stage model, or the final model. Saves out the scores of every model to a file. 
- :param file_name: File that contains the data, assumed to be joblib - :param model_name: Name of the model to use, either SparseLinearDML or CausalForestDML - - """ root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") logger.info(f"Using root dir {root_dir} loading data file {file_name}") @@ -46,7 +38,6 @@ def causalforestdml_memory_test( X = data['X_no_act'] T = data['a_processed'] y = data['real_reward'] - # y[y > 1] = 1 logger.info(f"X has a shape of: {X.shape}") logger.info(f"T has a shape of: {T.shape}") @@ -54,13 +45,14 @@ def causalforestdml_memory_test( est = CausalForestDML( - model_t=XGBRegressor(n_estimators=50), - model_y=XGBClassifier(n_estimators=50), + model_t=XGBRegressor(n_estimators=10), + model_y=XGBClassifier(n_estimators=10), discrete_outcome=True, - n_jobs=1 + n_jobs=2, + n_estimators=20 ) est.fit(y,T,X=X) - + print(est.summary()) From d0d575dfa59ddc55b35f953d5dde120d2e1ff353 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 20:30:55 +0000 Subject: [PATCH 04/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index f0056bc95..fb96cbd72 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,24 +1,11 @@ -import json -from datetime import datetime -import sklearn.metrics import argparse -from econml.dml import SparseLinearDML, CausalForestDML -from econml.validate import DRTester -import collinearity -from itertools import product +from econml.dml import CausalForestDML import joblib -import numpy as np import os -import pandas as pd -import scipy -from sklearn.preprocessing import StandardScaler, FunctionTransformer -from sklearn.pipeline import Pipeline -from sklearn.metrics import roc_auc_score from xgboost import XGBRegressor, XGBClassifier import sys import logging -from sklearn.model_selection import KFold logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) From 5b3db78b9c8cbb191ea7bf4d3a64e085f816e655 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 8 Jan 2025 15:33:30 -0500 Subject: [PATCH 05/25] Remove test script from the PR --- scripts/memory_test_script.py | 67 ----------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 scripts/memory_test_script.py diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py deleted file mode 100644 index f0056bc95..000000000 --- a/scripts/memory_test_script.py +++ /dev/null @@ -1,67 +0,0 @@ -import json -from datetime import datetime - -import sklearn.metrics -import argparse -from econml.dml import SparseLinearDML, CausalForestDML -from econml.validate import DRTester -import collinearity -from itertools import product -import joblib -import numpy as np -import os -import pandas as pd -import scipy -from sklearn.preprocessing import StandardScaler, FunctionTransformer -from sklearn.pipeline import Pipeline -from sklearn.metrics import roc_auc_score -from xgboost import XGBRegressor, XGBClassifier -import sys -import logging -from sklearn.model_selection import KFold - -logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') -logger = logging.getLogger(__name__) - - -def 
causalforestdml_memory_test( - file_name: str -): - - - root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") - logger.info(f"Using root dir {root_dir} loading data file {file_name}") - data_file = os.path.join(os.path.abspath(root_dir), file_name) - data = joblib.load(data_file) - - # This is specific to Offerfit - names of the data are derived from contextual bandits - X = data['X_no_act'] - T = data['a_processed'] - y = data['real_reward'] - - logger.info(f"X has a shape of: {X.shape}") - logger.info(f"T has a shape of: {T.shape}") - logger.info(f"y has a shape of: {y.shape}") - - - est = CausalForestDML( - model_t=XGBRegressor(n_estimators=10), - model_y=XGBClassifier(n_estimators=10), - discrete_outcome=True, - n_jobs=2, - n_estimators=20 - ) - est.fit(y,T,X=X) - print(est.summary()) - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") - args = parser.parse_args(sys.argv[1:]) - data_file = args.data_file - - causalforestdml_memory_test( - file_name=data_file - ) From 362b57ed5f390f71802754e730d4ad0c5c0f6f9a Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 21 Feb 2025 14:52:10 -0800 Subject: [PATCH 06/25] Make memmap an option, and add the reference in the doc string --- econml/grf/_base_grf.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index 80c5c3c5b..68a9cb698 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -197,7 +197,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, T, y, *, sample_weight=None, **kwargs): + def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): """ Build a forest of trees from the training set (X, T, y) and any other auxiliary variables. @@ -214,6 +214,9 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. + use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps + reduce memory overhead for large data sets. For details on memmap see: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html **kwargs : dictionary of array_like items of shape (n_samples, d_var) Auxiliary random variables that go into the moment function (e.g. 
instrument, censoring etc) Any of these variables will be passed on as is to the `get_pointJ` and @@ -385,15 +388,16 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False) for _ in range(n_more_estimators)] - # Make a memmap for better performance on large number of treatment variables - with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: - filename = temp_file.name - np.save(filename, yaug) # Save array to disk - # Remove references to (potentially) large data before Parallel - del yaug, pointJ - gc.collect() - # Create the memmap version - yaug_mmap = np.load(filename, mmap_mode='r') + if use_memmap: + # Make a memmap for better performance on large number of treatment variables + with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: + filename = temp_file.name + np.save(filename, yaug) # Save array to disk + # Remove references to (potentially) large data before Parallel + del yaug, pointJ + gc.collect() + # Create the memmap version + yaug = np.load(filename, mmap_mode='r') # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL @@ -403,7 +407,7 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): # since correctness does not rely on using threads. trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')( - delayed(t.fit)(X[s], yaug_mmap[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_, + delayed(t.fit)(X[s], yaug[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_, sample_weight=sample_weight[s] if sample_weight is not None else None, check_input=False) for t, s in zip(trees, s_inds)) From ebb4348e2ef216d33a6e9ab5c794dfbddc2e6eb0 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 21 Feb 2025 15:00:13 -0800 Subject: [PATCH 07/25] Start a notebook to demonstrate causal forest memory usage, by copying from the existing Causal Forest Demo --- notebooks/Causal Forest Memory Demo.ipynb | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 notebooks/Causal Forest Memory Demo.ipynb diff --git a/notebooks/Causal Forest Memory Demo.ipynb b/notebooks/Causal Forest Memory Demo.ipynb new file mode 100644 index 000000000..54f657bb0 --- /dev/null +++ b/notebooks/Causal Forest Memory Demo.ipynb @@ -0,0 +1,37 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5d694f74668862401622189952c1ac3eb57812cd Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 26 Feb 2025 08:38:30 -0800 Subject: [PATCH 08/25] Set use_memmap in constructor --- econml/dml/causal_forest.py | 11 +++++++++-- econml/grf/_base_grf.py | 16 ++++++++++------ econml/grf/classes.py | 5 +++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 4a1a536bc..33ad2398a 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ 
-578,6 +578,10 @@ class CausalForestDML(_BaseDML): at depth `depth`, is re-weighted by 1 / (1 + `depth`)**2.0. See the method ``feature_importances`` for a method that allows one to change these defaults. + use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps + reduce memory overhead for large data sets. For details on memmap see: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + References ---------- .. [cfdml1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests." @@ -619,7 +623,8 @@ def __init__(self, *, verbose=0, allow_missing=False, use_ray=False, - ray_remote_func_options=None): + ray_remote_func_options=None, + use_memmap=False): # TODO: consider whether we need more care around stateful featurizers, # since we clone it and fit separate copies @@ -647,6 +652,7 @@ def __init__(self, *, self.fit_intercept = fit_intercept self.subforest_size = subforest_size self.n_jobs = n_jobs + self.use_memmap = use_memmap self.verbose = verbose super().__init__(discrete_outcome=discrete_outcome, discrete_treatment=discrete_treatment, @@ -698,7 +704,8 @@ def _gen_model_final(self): n_jobs=self.n_jobs, random_state=self.random_state, verbose=self.verbose, - warm_start=False)) + warm_start=False, + use_memmap=self.use_memmap)) def _gen_rlearner_model_final(self): return _CausalForestFinalWrapper(self._gen_model_final(), self._gen_featurizer(), diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index 68a9cb698..57ea2762c 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -52,6 +52,11 @@ class BaseGRF(BaseEnsemble, metaclass=ABCMeta): Warning: This class should not be used directly. Use derived classes instead. + + + use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps + reduce memory overhead for large data sets. For details on memmap see: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html """ def __init__(self, @@ -74,7 +79,8 @@ def __init__(self, n_jobs=-1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + use_memmap=False): super().__init__( base_estimator=GRFTree(), n_estimators=n_estimators, @@ -104,6 +110,7 @@ def __init__(self, self.verbose = verbose self.warm_start = warm_start self.max_samples = max_samples + self.use_memmap = use_memmap @abstractmethod def _get_alpha_and_pointJ(self, X, T, y, **kwargs): @@ -197,7 +204,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): + def fit(self, X, T, y, *, sample_weight=None, **kwargs): """ Build a forest of trees from the training set (X, T, y) and any other auxiliary variables. @@ -214,9 +221,6 @@ def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. - use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps - reduce memory overhead for large data sets. For details on memmap see: - https://numpy.org/doc/stable/reference/generated/numpy.memmap.html **kwargs : dictionary of array_like items of shape (n_samples, d_var) Auxiliary random variables that go into the moment function (e.g. 
instrument, censoring etc) Any of these variables will be passed on as is to the `get_pointJ` and @@ -388,7 +392,7 @@ def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False) for _ in range(n_more_estimators)] - if use_memmap: + if self.use_memmap: # Make a memmap for better performance on large number of treatment variables with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: filename = temp_file.name diff --git a/econml/grf/classes.py b/econml/grf/classes.py index 46d0a22ab..42e6cd399 100644 --- a/econml/grf/classes.py +++ b/econml/grf/classes.py @@ -359,7 +359,8 @@ def __init__(self, n_jobs=-1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + use_memmap=False): super().__init__(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, @@ -368,7 +369,7 @@ def __init__(self, max_samples=max_samples, min_balancedness_tol=min_balancedness_tol, honest=honest, inference=inference, fit_intercept=fit_intercept, subforest_size=subforest_size, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, use_memmap=use_memmap) def fit(self, X, T, y, *, sample_weight=None): """ From 8df2c15468ee8f541e68381026ed645b6b774e6c Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Thu, 10 Apr 2025 08:18:05 -0400 Subject: [PATCH 09/25] Shell script to load a data file and fit a CausalForestDML with different number of estimators, jobs and memory map option --- scripts/memory_test_script.py | 83 +++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 scripts/memory_test_script.py diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py new file mode 100644 index 000000000..efa15dede --- /dev/null +++ b/scripts/memory_test_script.py @@ -0,0 +1,83 @@ +import json +from datetime import datetime + +import sklearn.metrics +import argparse +from econml.dml import SparseLinearDML, CausalForestDML +from econml.validate import DRTester +import collinearity +from itertools import product +import joblib +import numpy as np +import os +import pandas as pd +import scipy +from sklearn.preprocessing import StandardScaler, FunctionTransformer +from sklearn.pipeline import Pipeline +from sklearn.metrics import roc_auc_score +from xgboost import XGBRegressor, XGBClassifier +import sys +import logging +from sklearn.model_selection import KFold + +logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +def causalforestdml_memory_test( + file_name: str, + n_est_y: int, + n_est_t: int, + n_est_2: int, + n_jobs: int, + use_memmap: bool +): + + + root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") + logger.info(f"Using root dir {root_dir} loading data file {file_name}") + data_file = os.path.join(os.path.abspath(root_dir), file_name) + data = joblib.load(data_file) + + # This is specific to Offerfit - names of the data are derived from contextual bandits + X = data['X'] + T = data['T'] + y = data['Y'] + + logger.info(f"X has a shape of: {X.shape}") + logger.info(f"T has a shape of: {T.shape}") + logger.info(f"y has a shape of: {y.shape}") + + est = CausalForestDML( + model_t=XGBRegressor(n_estimators=n_est_t), + 
model_y=XGBClassifier(n_estimators=n_est_y), + discrete_outcome=True, + n_jobs=n_jobs, + n_estimators=n_est_2, + use_memmap=use_memmap + ) + logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}") + est.fit(y,T,X=X) + print(est.summary()) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") + parser.add_argument("--n_est_y", type=int, help="", default=100) + parser.add_argument("--n_est_t", type=int, help="", default=200) + parser.add_argument("--n_est_2", type=int, help="", default=500) + parser.add_argument("--n_jobs", type=int, help="", default=-1) + parser.add_argument("--memmap", type=bool, help="", default=False) + args = parser.parse_args(sys.argv[1:]) + data_file = args.data_file + + causalforestdml_memory_test( + file_name=data_file, + n_est_y=args.n_est_y, + n_est_t=args.n_est_t, + n_est_2=args.n_est_2, + n_jobs=args.n_jobs, + use_memmap=args.memmap + ) From 5ea336f3578f577f8b19620006ef5d2c5596a3de Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Thu, 10 Apr 2025 08:52:17 -0400 Subject: [PATCH 10/25] Try memory profiler, memory usage --- scripts/memory_test_script.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index efa15dede..d4dfda324 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -5,6 +5,7 @@ import argparse from econml.dml import SparseLinearDML, CausalForestDML from econml.validate import DRTester +from memory_profiler import memory_usage import collinearity from itertools import product import joblib @@ -56,9 +57,23 @@ def causalforestdml_memory_test( n_estimators=n_est_2, use_memmap=use_memmap ) - logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}") - est.fit(y,T,X=X) - print(est.summary()) + logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," + f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") + # est.fit(y,T,X=X) + + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + + # Extract results + max_memory, fitted_model = mem_usage + + print(f"Maximum memory usage: {max_memory} MiB") From 1ef0a53faa7c060ab31844bf26c63f198522113b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 12:52:43 +0000 Subject: [PATCH 11/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index d4dfda324..41fdd5634 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,25 +1,12 @@ -import json -from datetime import datetime -import sklearn.metrics import argparse -from econml.dml import SparseLinearDML, CausalForestDML -from econml.validate import DRTester +from econml.dml import CausalForestDML from memory_profiler import memory_usage -import collinearity -from itertools import product import joblib -import numpy as np import os -import pandas as pd -import scipy -from sklearn.preprocessing import StandardScaler, FunctionTransformer -from sklearn.pipeline import 
Pipeline -from sklearn.metrics import roc_auc_score from xgboost import XGBRegressor, XGBClassifier import sys import logging -from sklearn.model_selection import KFold logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) @@ -57,7 +44,7 @@ def causalforestdml_memory_test( n_estimators=n_est_2, use_memmap=use_memmap ) - logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," + logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") # est.fit(y,T,X=X) From 3cb90d180017a255cd477d36018304c3457e94e8 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Thu, 10 Apr 2025 10:31:06 -0400 Subject: [PATCH 12/25] Add a printout to make sure it is working --- econml/grf/_base_grf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index 57ea2762c..a4a6e03e0 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -396,6 +396,7 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): # Make a memmap for better performance on large number of treatment variables with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: filename = temp_file.name + print(f"BaseGRF.fit Making memmap with temp file {filename}") np.save(filename, yaug) # Save array to disk # Remove references to (potentially) large data before Parallel del yaug, pointJ From 6dbdd52c4eb6af755a5b3cad00855788661783f4 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 08:38:33 -0700 Subject: [PATCH 13/25] Add catboost and save the output of the memory test script --- scripts/memory_test_script.py | 46 +++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 41fdd5634..bb3c2d000 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -4,8 +4,10 @@ from memory_profiler import memory_usage import joblib import os +from catboost import CatBoostRegressor from xgboost import XGBRegressor, XGBClassifier import sys +import time import logging logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') @@ -38,16 +40,16 @@ def causalforestdml_memory_test( est = CausalForestDML( model_t=XGBRegressor(n_estimators=n_est_t), - model_y=XGBClassifier(n_estimators=n_est_y), - discrete_outcome=True, + model_y=XGBRegressor(n_estimators=n_est_y), + discrete_outcome=False, n_jobs=n_jobs, n_estimators=n_est_2, use_memmap=use_memmap ) - logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," + logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") - # est.fit(y,T,X=X) + start_time = time.time() mem_usage = memory_usage( (est.fit, [y,T], {"X":X}), interval=0.1, # Sample every 0.1 seconds @@ -56,13 +58,47 @@ def causalforestdml_memory_test( retval=True, # Return the fitted model too include_children=True # Include joblib's child processes ) + end_time = time.time() # Extract results max_memory, fitted_model = mem_usage - print(f"Maximum memory usage: {max_memory} MiB") + logger.info(f"Maximum memory usage: {max_memory} MiB") + elapsed_time = end_time-start_time + logger.info(f"Time to fit: {elapsed_time} seconds") + est2 = CatBoostRegressor(n_estimators=n_est_2,allow_writing_files=False) + + + logger.info(f"Calling 
CatBoostRegressor fit: n_estimators={n_est_2}") + + start_time2 = time.time() + mem_usage2 = memory_usage( + (est2.fit, [X],{"y":y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + end_time2 = time.time() + + # Extract results + max_memory2, fitted_model2 = mem_usage2 + elapsed_time2 = end_time2-start_time2 + + logger.info(f"Maximum memory usage: {max_memory2} MiB") + logger.info(f"Time to fit: {elapsed_time2} seconds") + + file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(file_name): + with open(file_name,"w") as output: + output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") + + with open(file_name,"a") as output: + output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory},{elapsed_time},{max_memory2},{elapsed_time2}\n") + if __name__ == "__main__": parser = argparse.ArgumentParser() From 27dc80c30ee8643cac6bab4c5e2a06a62b9bb4e9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:38:43 +0000 Subject: [PATCH 14/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index bb3c2d000..7a863862c 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -5,7 +5,7 @@ import joblib import os from catboost import CatBoostRegressor -from xgboost import XGBRegressor, XGBClassifier +from xgboost import XGBRegressor import sys import time import logging From e9a54058039267361a80e506f62dd35cd30e802e Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 08:52:22 -0700 Subject: [PATCH 15/25] Use default estimators --- scripts/memory_test_script.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index bb3c2d000..80465dd35 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -103,9 +103,9 @@ def causalforestdml_memory_test( if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") - parser.add_argument("--n_est_y", type=int, help="", default=100) - parser.add_argument("--n_est_t", type=int, help="", default=200) - parser.add_argument("--n_est_2", type=int, help="", default=500) + parser.add_argument("--n_est_y", type=int, help="", default=None) + parser.add_argument("--n_est_t", type=int, help="", default=None) + parser.add_argument("--n_est_2", type=int, help="", default=None) parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) args = parser.parse_args(sys.argv[1:]) From 6910ed3c649fc750b6e288cbfee11e9908944ad4 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 09:25:44 -0700 Subject: [PATCH 16/25] Fix name conflict between input file and result file --- scripts/memory_test_script.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index b456c2401..ceb7021da 100644 --- a/scripts/memory_test_script.py +++ 
b/scripts/memory_test_script.py @@ -91,12 +91,12 @@ def causalforestdml_memory_test( logger.info(f"Maximum memory usage: {max_memory2} MiB") logger.info(f"Time to fit: {elapsed_time2} seconds") - file_name = os.path.join(root_dir,"mem_test_results.csv") - if not os.path.isfile(file_name): - with open(file_name,"w") as output: + result_file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(result_file_name): + with open(result_file_name,"w") as output: output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") - with open(file_name,"a") as output: + with open(result_file_name,"a") as output: output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory},{elapsed_time},{max_memory2},{elapsed_time2}\n") From bb69fee0f5cb82557b40da37d80e69599e61fc4a Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 10:01:02 -0700 Subject: [PATCH 17/25] Make sure we remove the memory for the causal forest estimator --- scripts/memory_test_script.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index ceb7021da..3714512df 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,5 +1,7 @@ import argparse +import gc + from econml.dml import CausalForestDML from memory_profiler import memory_usage import joblib @@ -67,6 +69,8 @@ def causalforestdml_memory_test( elapsed_time = end_time-start_time logger.info(f"Time to fit: {elapsed_time} seconds") + del est + gc.collect() est2 = CatBoostRegressor(n_estimators=n_est_2,allow_writing_files=False) From fee0757784695007b4a90041bab324dcced316fb Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 10:03:32 -0700 Subject: [PATCH 18/25] Limit digits --- scripts/memory_test_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 3714512df..f2d4c85ba 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -101,7 +101,7 @@ def causalforestdml_memory_test( output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") with open(result_file_name,"a") as output: - output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory},{elapsed_time},{max_memory2},{elapsed_time2}\n") + output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f},{max_memory2:.1f},{elapsed_time2:.1f}\n") if __name__ == "__main__": From e247739bf77f04fa6cc983f494fe3a98d5b104c7 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 14:23:00 -0700 Subject: [PATCH 19/25] Switch to separate runs for catboost and causalforest --- scripts/memory_test_script.py | 98 +++++++++++++++++------------------ 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index f2d4c85ba..07e4e6b70 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,5 +1,6 @@ import argparse +import datetime import gc from econml.dml import CausalForestDML @@ -18,6 +19,7 @@ def causalforestdml_memory_test( file_name: str, + estimator: str, n_est_y: int, n_est_t: int, n_est_2: int, @@ -25,6 +27,8 @@ def causalforestdml_memory_test( use_memmap: bool ): + if not estimator.lower() in ('causalforest','catboost'): + raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got 
{estimator}") root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") logger.info(f"Using root dir {root_dir} loading data file {file_name}") @@ -40,68 +44,58 @@ def causalforestdml_memory_test( logger.info(f"T has a shape of: {T.shape}") logger.info(f"y has a shape of: {y.shape}") - est = CausalForestDML( - model_t=XGBRegressor(n_estimators=n_est_t), - model_y=XGBRegressor(n_estimators=n_est_y), - discrete_outcome=False, - n_jobs=n_jobs, - n_estimators=n_est_2, - use_memmap=use_memmap - ) - logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," - f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") - - start_time = time.time() - mem_usage = memory_usage( - (est.fit, [y,T], {"X":X}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - end_time = time.time() + if estimator == 'causalforest': + est = CausalForestDML( + model_t=XGBRegressor(n_estimators=n_est_t), + model_y=XGBRegressor(n_estimators=n_est_y), + discrete_outcome=False, + n_jobs=n_jobs, + n_estimators=n_est_2, + use_memmap=use_memmap + ) + logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," + f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") + + start_time = time.time() + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + end_time = time.time() + elif estimator == 'catboost': + est = CatBoostRegressor(n_estimators=n_est_2, allow_writing_files=False) + + logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") + + start_time = time.time() + mem_usage = memory_usage( + (est.fit, [X], {"y": y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + end_time = time.time() # Extract results max_memory, fitted_model = mem_usage - - logger.info(f"Maximum memory usage: {max_memory} MiB") elapsed_time = end_time-start_time + logger.info(f"Maximum memory usage: {max_memory} MiB") logger.info(f"Time to fit: {elapsed_time} seconds") - del est - gc.collect() - - est2 = CatBoostRegressor(n_estimators=n_est_2,allow_writing_files=False) - - - logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") - - start_time2 = time.time() - mem_usage2 = memory_usage( - (est2.fit, [X],{"y":y, "silent": True}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - end_time2 = time.time() - - # Extract results - max_memory2, fitted_model2 = mem_usage2 - elapsed_time2 = end_time2-start_time2 - - logger.info(f"Maximum memory usage: {max_memory2} MiB") - logger.info(f"Time to fit: {elapsed_time2} seconds") - result_file_name = os.path.join(root_dir,"mem_test_results.csv") if not os.path.isfile(result_file_name): with open(result_file_name,"w") as output: - output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") + 
output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") + run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") with open(result_file_name,"a") as output: - output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f},{max_memory2:.1f},{elapsed_time2:.1f}\n") + output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") if __name__ == "__main__": @@ -112,11 +106,13 @@ def causalforestdml_memory_test( parser.add_argument("--n_est_2", type=int, help="", default=None) parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) + parser.add_argument("--estimator", type=str, help="", default=None) args = parser.parse_args(sys.argv[1:]) data_file = args.data_file causalforestdml_memory_test( file_name=data_file, + estimator=args.estimator, n_est_y=args.n_est_y, n_est_t=args.n_est_t, n_est_2=args.n_est_2, From c39ef5ae7ae947127b02e8e1891aa8452987be5e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 21:23:10 +0000 Subject: [PATCH 20/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 07e4e6b70..7be6eaa25 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,7 +1,6 @@ import argparse import datetime -import gc from econml.dml import CausalForestDML from memory_profiler import memory_usage @@ -27,7 +26,7 @@ def causalforestdml_memory_test( use_memmap: bool ): - if not estimator.lower() in ('causalforest','catboost'): + if estimator.lower() not in ('causalforest','catboost'): raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got {estimator}") root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") From 2f2491c4e51424e9e4259723ffebf61861c10449 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Tue, 6 May 2025 13:37:27 -0700 Subject: [PATCH 21/25] More memory tests --- scripts/memory_test_script.py | 137 +++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 37 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 07e4e6b70..8cf3a8660 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,23 +1,29 @@ import argparse import datetime -import gc +import joblib +import logging +import numpy as np +import os +import sys +import time +import tracemalloc + +import pandas as pd from econml.dml import CausalForestDML from memory_profiler import memory_usage -import joblib -import os +from pympler import muppy, summary from catboost import CatBoostRegressor from xgboost import XGBRegressor -import sys -import time -import logging + logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) def causalforestdml_memory_test( + test_type:str, file_name: str, estimator: str, n_est_y: int, @@ -26,7 +32,8 @@ def causalforestdml_memory_test( n_jobs: int, use_memmap: bool ): - + if test_type not in ("profile","malloc","object"): + raise ValueError(f"Test type should be 'profile'|'malloc'|'object' but got {test_type}") if not estimator.lower() in 
('causalforest','catboost'): raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got {estimator}") @@ -57,14 +64,22 @@ def causalforestdml_memory_test( f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") start_time = time.time() - mem_usage = memory_usage( - (est.fit, [y,T], {"X":X}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) + if test_type=='profile': + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + elif test_type=='malloc': + tracemalloc.start() + est.fit(y,T, X=X) + malloc_snapshot = tracemalloc.take_snapshot() + else: + est.fit(y,T, X=X) + end_time = time.time() elif estimator == 'catboost': est = CatBoostRegressor(n_estimators=n_est_2, allow_writing_files=False) @@ -72,30 +87,76 @@ def causalforestdml_memory_test( logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") start_time = time.time() - mem_usage = memory_usage( - (est.fit, [X], {"y": y, "silent": True}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) + if test_type=='profile': + mem_usage = memory_usage( + (est.fit, [X], {"y": y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + elif test_type=='malloc': + tracemalloc.start() + est.fit(X,y=y,silent=True) + malloc_snapshot = tracemalloc.take_snapshot() + else: + est.fit(X,y=y,silent=True) + end_time = time.time() # Extract results - max_memory, fitted_model = mem_usage - elapsed_time = end_time-start_time - logger.info(f"Maximum memory usage: {max_memory} MiB") - logger.info(f"Time to fit: {elapsed_time} seconds") - - result_file_name = os.path.join(root_dir,"mem_test_results.csv") - if not os.path.isfile(result_file_name): - with open(result_file_name,"w") as output: - output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") - - run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - with open(result_file_name,"a") as output: - output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") + if test_type == 'malloc': + # Get statistics grouped by filename and line number + top_stats = malloc_snapshot.statistics('lineno') + print("\nTop 10 memory-consuming locations:") + for i, stat in enumerate(top_stats[:10], 1): + print(f"{i}. {stat}") + + # Get statistics grouped by object type (more useful for identifying large objects) + top_types = malloc_snapshot.statistics('traceback') + print("\nTop 10 memory-consuming object types:") + for i, stat in enumerate(top_types[:10], 1): + print(f"{i}. 
{stat}") + # Print the traceback to see where the object was created + for line in stat.traceback.format(): + print(f" {line}") + elif test_type == 'profile': + max_memory, fitted_model = mem_usage + elapsed_time = end_time-start_time + logger.info(f"Maximum memory usage: {max_memory} MiB") + logger.info(f"Time to fit: {elapsed_time} seconds") + + result_file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(result_file_name): + with open(result_file_name,"w") as output: + output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") + + run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + with open(result_file_name,"a") as output: + output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") + elif test_type=='object': + # Collect all objects + all_objects = muppy.get_objects() + # Get a summary of memory usage by type + mem_summary = summary.summarize(all_objects) + summary.print_(mem_summary) + # Look at NumPy arrays + numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] + numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) + + print("\nTop 10 NumPy arrays by memory usage:") + for i, arr in enumerate(numpy_arrays[:10], 1): + memory_mb = arr.nbytes / (1024 * 1024) + print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") + + pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] + pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) + + print("\nTop 10 Pandas DataFrames by memory usage:") + for i, df in enumerate(pandas_dfs[:10], 1): + memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + print(f"{i}. 
Shape: {df.shape}, Memory: {memory_mb:.2f} MB") if __name__ == "__main__": @@ -103,14 +164,16 @@ def causalforestdml_memory_test( parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") parser.add_argument("--n_est_y", type=int, help="", default=None) parser.add_argument("--n_est_t", type=int, help="", default=None) - parser.add_argument("--n_est_2", type=int, help="", default=None) + parser.add_argument("--n_est_2", type=int, help="", default=500) parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) parser.add_argument("--estimator", type=str, help="", default=None) + parser.add_argument("--test", type=str, help="profile or malloc, meaning use memory_profiler or tracemalloc", default="profile") args = parser.parse_args(sys.argv[1:]) data_file = args.data_file causalforestdml_memory_test( + test_type=args.test, file_name=data_file, estimator=args.estimator, n_est_y=args.n_est_y, From 39a9d3e9fe6df83923a9dbfead4816c76fb47ee3 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 7 May 2025 09:21:58 -0700 Subject: [PATCH 22/25] Run all the memory tests Add downsampling function --- scripts/memory_test_script.py | 242 ++++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 100 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 8cf3a8660..d4ff50175 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -22,20 +22,71 @@ logger = logging.getLogger(__name__) +def balanced_downsample(X:pd.DataFrame, y:np.array, T_feat:pd.DataFrame, t_id:np.array, downsample_ratio:float): + """ + Balanced downsampling based on treatment identity + + :param X: + :param y: + :param T_feat: features of the treatments + :param t_id: which treatment was applied + :return: + """ + # Keep all the positive experiences + pos_ex = y > 0 + positive_df = X[pos_ex].reset_index(drop=True) + positive_actions = t_id[pos_ex] + act_feature_positve = T_feat[pos_ex].reset_index(drop=True) + y_positive = y[pos_ex] + positive_df['action_id'] = positive_actions + positive_df['y'] = y_positive + positive_df = pd.concat([positive_df,act_feature_positve],axis=1).reset_index(drop=True) + + negative_df = X[~pos_ex].reset_index(drop=True) + negative_actions = t_id[~pos_ex] + act_feature_negative = T_feat[~pos_ex].reset_index(drop=True) + y_negative = y[~pos_ex] + + negative_df['action_id'] = negative_actions + negative_df['y'] = y_negative + negative_df = pd.concat([negative_df,act_feature_negative],axis=1).reset_index(drop=True) + + df_downsampled = negative_df.groupby('action_id', group_keys=False).apply(lambda x: x.sample(frac=downsample_ratio)) + + logger.info(f'Dowsampled negative examples by {downsample_ratio}') + actions_before = negative_df['action_id'].value_counts() + actions_after = df_downsampled['action_id'].value_counts() + logger.info(f'Before: {actions_before}') + logger.info(f'After: {actions_after}') + + # Final sample + combo_df = pd.concat([positive_df,df_downsampled],axis=0).reset_index(drop=True) + # this shuffles it + combo_df = combo_df.sample(frac=1).reset_index(drop=True) + X = combo_df[X.columns.values] + T_feat= combo_df[T_feat.columns.values] + y = combo_df['y'].to_numpy() + t_id = combo_df['action_id'].to_numpy() + + n_pos_ex2 = y.sum() + logger.info(f"After downsampling non-conversion, Conversion rate now {n_pos_ex2/len(y)}") + logger.info(f"X {X.shape}") + logger.info(f"T {T_feat.shape}") + logger.info(f"y {y.shape}") + + return X, 
y, T_feat, t_id + + def causalforestdml_memory_test( - test_type:str, file_name: str, estimator: str, n_est_y: int, n_est_t: int, n_est_2: int, n_jobs: int, - use_memmap: bool + use_memmap: bool, + downsample:float|None, ): - if test_type not in ("profile","malloc","object"): - raise ValueError(f"Test type should be 'profile'|'malloc'|'object' but got {test_type}") - if not estimator.lower() in ('causalforest','catboost'): - raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got {estimator}") root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") logger.info(f"Using root dir {root_dir} loading data file {file_name}") @@ -46,10 +97,13 @@ def causalforestdml_memory_test( X = data['X'] T = data['T'] y = data['Y'] + i = data['i'] + if downsample: + X, y, T, i = balanced_downsample(X,y,T,i,downsample_ratio=downsample) logger.info(f"X has a shape of: {X.shape}") logger.info(f"T has a shape of: {T.shape}") - logger.info(f"y has a shape of: {y.shape}") + logger.info(f"y has a shape of: {y.shape}, total={np.sum(y)}") if estimator == 'causalforest': est = CausalForestDML( @@ -63,100 +117,88 @@ def causalforestdml_memory_test( logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") - start_time = time.time() - if test_type=='profile': - mem_usage = memory_usage( - (est.fit, [y,T], {"X":X}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - elif test_type=='malloc': - tracemalloc.start() - est.fit(y,T, X=X) - malloc_snapshot = tracemalloc.take_snapshot() - else: - est.fit(y,T, X=X) - - end_time = time.time() elif estimator == 'catboost': est = CatBoostRegressor(n_estimators=n_est_2, allow_writing_files=False) - logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") - start_time = time.time() - if test_type=='profile': - mem_usage = memory_usage( - (est.fit, [X], {"y": y, "silent": True}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - elif test_type=='malloc': - tracemalloc.start() - est.fit(X,y=y,silent=True) - malloc_snapshot = tracemalloc.take_snapshot() - else: - est.fit(X,y=y,silent=True) - - end_time = time.time() - - # Extract results - if test_type == 'malloc': - # Get statistics grouped by filename and line number - top_stats = malloc_snapshot.statistics('lineno') - print("\nTop 10 memory-consuming locations:") - for i, stat in enumerate(top_stats[:10], 1): - print(f"{i}. {stat}") - - # Get statistics grouped by object type (more useful for identifying large objects) - top_types = malloc_snapshot.statistics('traceback') - print("\nTop 10 memory-consuming object types:") - for i, stat in enumerate(top_types[:10], 1): - print(f"{i}. 
{stat}") - # Print the traceback to see where the object was created - for line in stat.traceback.format(): - print(f" {line}") - elif test_type == 'profile': - max_memory, fitted_model = mem_usage - elapsed_time = end_time-start_time - logger.info(f"Maximum memory usage: {max_memory} MiB") - logger.info(f"Time to fit: {elapsed_time} seconds") - - result_file_name = os.path.join(root_dir,"mem_test_results.csv") - if not os.path.isfile(result_file_name): - with open(result_file_name,"w") as output: - output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") - - run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - with open(result_file_name,"a") as output: - output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") - elif test_type=='object': - # Collect all objects - all_objects = muppy.get_objects() - # Get a summary of memory usage by type - mem_summary = summary.summarize(all_objects) - summary.print_(mem_summary) - # Look at NumPy arrays - numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] - numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) - - print("\nTop 10 NumPy arrays by memory usage:") - for i, arr in enumerate(numpy_arrays[:10], 1): - memory_mb = arr.nbytes / (1024 * 1024) - print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") - - pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] - pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) - - print("\nTop 10 Pandas DataFrames by memory usage:") - for i, df in enumerate(pandas_dfs[:10], 1): - memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) - print(f"{i}. Shape: {df.shape}, Memory: {memory_mb:.2f} MB") + tracemalloc.start() + start_time = time.time() + if estimator == 'causalforest': + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + else: + mem_usage = memory_usage( + (est.fit, [X], {"y": y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + + end_time = time.time() + + # mem_usage section + max_memory, fitted_model = mem_usage + elapsed_time = end_time-start_time + logger.info(f"Maximum memory usage using memory_usage: {max_memory} MiB") + logger.info(f"Time to fit: {elapsed_time} seconds") + + result_file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(result_file_name): + with open(result_file_name,"w") as output: + output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") + + run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + with open(result_file_name,"a") as output: + output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") + + + # malloc section + malloc_snapshot = tracemalloc.take_snapshot() + top_stats = malloc_snapshot.statistics('lineno') + print("\nTop 10 *tracemalloc* memory-consuming locations:") + for i, stat in enumerate(top_stats[:10], 1): + print(f"{i}. 
{stat}") + + # Get statistics grouped by object type (more useful for identifying large objects) + top_types = malloc_snapshot.statistics('traceback') + print("\nTop 10 *tracemalloc* memory-consuming object types:") + for i, stat in enumerate(top_types[:10], 1): + print(f"{i}. {stat}") + # Print the traceback to see where the object was created + for line in stat.traceback.format(): + print(f" {line}") + + + # muppy section + all_objects = muppy.get_objects() + # Get a summary of memory usage by type + mem_summary = summary.summarize(all_objects) + print("*muppy* Memory usage summary:") + summary.print_(mem_summary) + # Look at NumPy arrays + numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] + numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) + + print("\nTop 10 NumPy arrays by memory usage from *muppy*:") + for i, arr in enumerate(numpy_arrays[:10], 1): + memory_mb = arr.nbytes / (1024 * 1024) + print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") + + pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] + pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) + + print("\nTop 10 Pandas DataFrames by memory usage *muppy*:") + for i, df in enumerate(pandas_dfs[:10], 1): + memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + print(f"{i}. Shape: {df.shape}, Memory: {memory_mb:.2f} MB") if __name__ == "__main__": @@ -168,17 +210,17 @@ def causalforestdml_memory_test( parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) parser.add_argument("--estimator", type=str, help="", default=None) - parser.add_argument("--test", type=str, help="profile or malloc, meaning use memory_profiler or tracemalloc", default="profile") + parser.add_argument("--downsample", type=float, help="", default=None) args = parser.parse_args(sys.argv[1:]) data_file = args.data_file causalforestdml_memory_test( - test_type=args.test, file_name=data_file, estimator=args.estimator, n_est_y=args.n_est_y, n_est_t=args.n_est_t, n_est_2=args.n_est_2, n_jobs=args.n_jobs, - use_memmap=args.memmap + use_memmap=args.memmap, + downsample=args.downsample ) From c64d911289ef656dca9fd68beeb80f014797f07e Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 9 May 2025 10:55:01 -0700 Subject: [PATCH 23/25] Better memory test that adds up the numpy arrays in the estimator --- scripts/memory_test_script.py | 263 ++++++++++++++++++++++++++++------ 1 file changed, 220 insertions(+), 43 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index d4ff50175..4db74a5e6 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -8,8 +8,12 @@ import sys import time import tracemalloc - import pandas as pd +import gc +import inspect +import types +import weakref +import warnings from econml.dml import CausalForestDML from memory_profiler import memory_usage @@ -18,10 +22,180 @@ from xgboost import XGBRegressor +# More aggressive warning suppression +# Filter all warnings from scipy +warnings.filterwarnings("ignore", module="scipy") +# Also try to catch other possible warning categories +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", message="Please import") + logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = 
logging.getLogger(__name__) +def get_size_of_object(obj): + """Get memory size of an object in bytes""" + if isinstance(obj, np.ndarray): + return obj.nbytes + elif isinstance(obj, pd.DataFrame): + return obj.memory_usage(deep=True).sum() + elif hasattr(obj, 'nbytes'): # Other objects with nbytes attribute + return obj.nbytes + else: + # Approximate size for other objects + return sys.getsizeof(obj) + + +def analyze_object_memory(obj, name="object", max_depth=100): + """ + Analyze a specific object for large arrays and DataFrames + + Args: + obj: The object to analyze + name: Name to use for the object's root path + max_depth: Maximum recursion depth + + Returns: + List of large arrays/DataFrames found in the object + """ + # Force garbage collection + gc.collect() + print(f"Analyzing memory usage of {name}...") + + # Get results using the find_arrays_in_object function + results = find_arrays_in_object(obj, name, max_depth=max_depth) + + # Sort by size + results.sort(key=lambda x: x["size_mb"], reverse=True) + + # Deduplicate by object id + unique_results = [] + seen_ids = set() + for result in results: + obj_id = id(result["object"]) + if obj_id not in seen_ids: + seen_ids.add(obj_id) + unique_results.append(result) + + sums = {} + counts = {} + for item in unique_results: + # Extract the last component of the path (after the last period) + last_component = item["path"].split(".")[-1] + + # Add the size_mb to the sum for this last component + if last_component in sums: + sums[last_component] += item["size_mb"] + counts[last_component] += 1 + else: + sums[last_component] = item["size_mb"] + counts[last_component] = 1 + + sums = dict(sorted(sums.items(), key=lambda x: x[1], reverse=True)) + + df = pd.DataFrame({ + "obj_name": list(sums.keys()), + "total_size_mb": list(sums.values()), + "count": [counts[key] for key in sums.keys()] + }) + + # Sort the DataFrame by total_size_mb in descending order + df = df.sort_values(by="total_size_mb", ascending=False).reset_index(drop=True) + print(df) + + return df + + +def find_arrays_in_object(obj, path="", visited=None, results=None, max_depth=10): + """ + Recursively search an object and its attributes for NumPy arrays and DataFrames + + Args: + obj: The object to search + path: Current attribute path (for tracking) + visited: Set of already visited object IDs + results: List to collect results + max_depth: Maximum recursion depth + """ + if visited is None: + visited = set() + if results is None: + results = [] + + # Stop if we've seen this object before to prevent infinite recursion + obj_id = id(obj) + if obj_id in visited: + return results + + # Add to visited set + visited.add(obj_id) + + # Check if this object is a numpy array or DataFrame + if isinstance(obj, np.ndarray): + size_mb = float(obj.nbytes) / float(1024 * 1024) + results.append({ + "type": "ndarray", + "path": path, + "shape": obj.shape, + "dtype": obj.dtype, + "size_mb": size_mb, + "object": obj + }) + return results + elif isinstance(obj, pd.DataFrame): + size_mb = float(obj.memory_usage(deep=True).sum()) / float(1024 * 1024) + results.append({ + "type": "DataFrame", + "path": path, + "shape": obj.shape, + "size_mb": size_mb, + "object": obj, + "columns": list(obj.columns)[:5] + ['...'] if len(obj.columns) > 5 else list( + obj.columns) + }) + return results + # Skip certain types that don't contain arrays + elif (isinstance(obj, (str, int, float, bool, type, types.FunctionType, types.MethodType, weakref.ref)) or obj is None): + return results + + # For dictionaries + elif 
isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(key, (str, int, float, bool, tuple)): + new_path = f"{path}['{key}']" if path else f"['{key}']" + find_arrays_in_object(value, new_path, visited, results, + max_depth) + return results + + # For lists and tuples + elif isinstance(obj, (list, tuple)): + for i, item in enumerate(obj): + new_path = f"{path}[{i}]" if path else f"[{i}]" + find_arrays_in_object(item, new_path, visited, results, max_depth) + return results + + else: + for attr_name in sorted(list(dir(obj))): + if not (attr_name.startswith('__') or attr_name.startswith('_')): # Skip special methods + try: + attr_value = getattr(obj, attr_name) + if not callable(attr_value): + new_path = f"{path}.{attr_name}" if path else attr_name + find_arrays_in_object(attr_value, new_path, visited, results, + max_depth) + except Exception as e: + continue + return results + + + + + + + + + def balanced_downsample(X:pd.DataFrame, y:np.array, T_feat:pd.DataFrame, t_id:np.array, downsample_ratio:float): """ Balanced downsampling based on treatment identity @@ -56,8 +230,8 @@ def balanced_downsample(X:pd.DataFrame, y:np.array, T_feat:pd.DataFrame, t_id:np logger.info(f'Dowsampled negative examples by {downsample_ratio}') actions_before = negative_df['action_id'].value_counts() actions_after = df_downsampled['action_id'].value_counts() - logger.info(f'Before: {actions_before}') - logger.info(f'After: {actions_after}') + # logger.info(f'Before: {actions_before}') + # logger.info(f'After: {actions_after}') # Final sample combo_df = pd.concat([positive_df,df_downsampled],axis=0).reset_index(drop=True) @@ -159,46 +333,49 @@ def causalforestdml_memory_test( with open(result_file_name,"a") as output: output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") - - # malloc section - malloc_snapshot = tracemalloc.take_snapshot() - top_stats = malloc_snapshot.statistics('lineno') - print("\nTop 10 *tracemalloc* memory-consuming locations:") - for i, stat in enumerate(top_stats[:10], 1): - print(f"{i}. {stat}") - - # Get statistics grouped by object type (more useful for identifying large objects) - top_types = malloc_snapshot.statistics('traceback') - print("\nTop 10 *tracemalloc* memory-consuming object types:") - for i, stat in enumerate(top_types[:10], 1): - print(f"{i}. {stat}") - # Print the traceback to see where the object was created - for line in stat.traceback.format(): - print(f" {line}") - - - # muppy section - all_objects = muppy.get_objects() - # Get a summary of memory usage by type - mem_summary = summary.summarize(all_objects) - print("*muppy* Memory usage summary:") - summary.print_(mem_summary) - # Look at NumPy arrays - numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] - numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) - - print("\nTop 10 NumPy arrays by memory usage from *muppy*:") - for i, arr in enumerate(numpy_arrays[:10], 1): - memory_mb = arr.nbytes / (1024 * 1024) - print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") - - pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] - pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) - - print("\nTop 10 Pandas DataFrames by memory usage *muppy*:") - for i, df in enumerate(pandas_dfs[:10], 1): - memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) - print(f"{i}. 
Shape: {df.shape}, Memory: {memory_mb:.2f} MB") + analyze_object_memory(est,name=estimator, max_depth=100) + + logger.info('Done') + + # # malloc section + # malloc_snapshot = tracemalloc.take_snapshot() + # top_stats = malloc_snapshot.statistics('lineno') + # print("\nTop 10 *tracemalloc* memory-consuming locations:") + # for i, stat in enumerate(top_stats[:10], 1): + # print(f"{i}. {stat}") + # + # # Get statistics grouped by object type (more useful for identifying large objects) + # top_types = malloc_snapshot.statistics('traceback') + # print("\nTop 10 *tracemalloc* memory-consuming object types:") + # for i, stat in enumerate(top_types[:10], 1): + # print(f"{i}. {stat}") + # # Print the traceback to see where the object was created + # for line in stat.traceback.format(): + # print(f" {line}") + # + # + # # muppy section + # all_objects = muppy.get_objects() + # # Get a summary of memory usage by type + # mem_summary = summary.summarize(all_objects) + # print("*muppy* Memory usage summary:") + # summary.print_(mem_summary) + # # Look at NumPy arrays + # numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] + # numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) + # + # print("\nTop 10 NumPy arrays by memory usage from *muppy*:") + # for i, arr in enumerate(numpy_arrays[:10], 1): + # memory_mb = arr.nbytes / (1024 * 1024) + # print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") + # + # pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] + # pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) + # + # print("\nTop 10 Pandas DataFrames by memory usage *muppy*:") + # for i, df in enumerate(pandas_dfs[:10], 1): + # memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + # print(f"{i}. 
Shape: {df.shape}, Memory: {memory_mb:.2f} MB") if __name__ == "__main__": From 5ff58072deb39233605ab080503d5f7cda274abb Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 9 May 2025 12:22:21 -0700 Subject: [PATCH 24/25] Fix print statement --- scripts/memory_test_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 4db74a5e6..630fc12f0 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -321,7 +321,7 @@ def causalforestdml_memory_test( # mem_usage section max_memory, fitted_model = mem_usage elapsed_time = end_time-start_time - logger.info(f"Maximum memory usage using memory_usage: {max_memory} MiB") + logger.info(f"Maximum memory usage during fit: {max_memory} MiB") logger.info(f"Time to fit: {elapsed_time} seconds") result_file_name = os.path.join(root_dir,"mem_test_results.csv") From 999d03080dc7d356920eecdce2180a5d94928760 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 May 2025 19:24:15 +0000 Subject: [PATCH 25/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index b5d0fcb5d..bd66a3488 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -10,7 +10,6 @@ import tracemalloc import pandas as pd import gc -import inspect import types import weakref import warnings @@ -18,7 +17,6 @@ from econml.dml import CausalForestDML from memory_profiler import memory_usage -from pympler import muppy, summary from catboost import CatBoostRegressor from xgboost import XGBRegressor @@ -57,7 +55,8 @@ def analyze_object_memory(obj, name="object", max_depth=100): name: Name to use for the object's root path max_depth: Maximum recursion depth - Returns: + Returns + ------- List of large arrays/DataFrames found in the object """ # Force garbage collection @@ -185,7 +184,7 @@ def find_arrays_in_object(obj, path="", visited=None, results=None, max_depth=10 new_path = f"{path}.{attr_name}" if path else attr_name find_arrays_in_object(attr_value, new_path, visited, results, max_depth) - except Exception as e: + except Exception: continue return results
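
---

A minimal sketch of the profiling flow these later patches converge on (peak-memory measurement via memory_profiler, then the recursive array accounting added in patch 23). This is illustrative only: it assumes econml and memory_profiler are installed, that scripts/memory_test_script.py is importable as a module named memory_test_script (a hypothetical import path), and it uses small synthetic data rather than the marketing data sets the script targets.

    import numpy as np
    from econml.dml import CausalForestDML
    from memory_profiler import memory_usage

    # Hypothetical import path: assumes scripts/memory_test_script.py is on PYTHONPATH
    from memory_test_script import analyze_object_memory

    # Illustrative sizes only
    rng = np.random.default_rng(0)
    n, d_x, d_t = 2000, 20, 3
    X = rng.normal(size=(n, d_x))
    T = rng.normal(size=(n, d_t))
    y = X[:, 0] + T @ np.array([1.0, 0.5, -0.5]) + rng.normal(size=n)

    est = CausalForestDML(n_estimators=100, n_jobs=1)

    # Peak memory of the fit call, sampled every 0.1 s, including joblib children
    max_memory, fitted = memory_usage(
        (est.fit, [y, T], {"X": X}),
        interval=0.1, max_usage=True, retval=True, include_children=True,
    )
    print(f"peak memory during fit: {max_memory:.1f} MiB")

    # Recursively walk the fitted estimator and sum the ndarrays/DataFrames it retains
    sizes = analyze_object_memory(est, name="causalforest", max_depth=100)
    print(sizes.head(10))

analyze_object_memory returns the per-attribute size table as a DataFrame (sorted by total_size_mb), so the largest retained arrays can be inspected or written out after the run.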