carlg/forest-memory #943

Open: wants to merge 32 commits into main

Commits (32)
c87b9d9  Add a memmap before Parallel (carl-offerfit, Jan 7, 2025)
672699f  Filename fix. (carl-offerfit, Jan 7, 2025)
4e9820a  Script update (carl-offerfit, Jan 8, 2025)
d0d575d  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 8, 2025)
5b3db78  Remove test script from the PR (carl-offerfit, Jan 8, 2025)
b235744  Merge branch 'carl/causal-memory' of github.com:carl-offerfit/EconML … (carl-offerfit, Jan 8, 2025)
7c06e0b  Merge branch 'main' into carl/causal-memory (carl-offerfit, Jan 8, 2025)
060d418  Merge branch 'main' into carl/causal-memory (carl-offerfit, Feb 21, 2025)
362b57e  Make memmap an option, and add the reference in the doc string (carl-offerfit, Feb 21, 2025)
ebb4348  Start a notebook to demonstrate causal forest memory usage, by copyin… (carl-offerfit, Feb 21, 2025)
5d694f7  Set use_memmap in constructor (carl-offerfit, Feb 26, 2025)
d5f14ac  Merge branch 'main' into carl/causal-memory (carl-offerfit, Apr 8, 2025)
8df2c15  Shell script to load a data file and fit a CausalForestDML with diffe… (carl-offerfit, Apr 10, 2025)
5ea336f  Try memory profiler, memory usage (carl-offerfit, Apr 10, 2025)
1ef0a53  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 10, 2025)
3cb90d1  Add a printout to make sure it is working (carl-offerfit, Apr 10, 2025)
228c17d  Import cleanup (carl-offerfit, Apr 10, 2025)
6dbdd52  Add catboost and save the output of the memory test script (carl-offerfit, Apr 28, 2025)
27dc80c  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 28, 2025)
e9a5405  Use default estimators (carl-offerfit, Apr 28, 2025)
059c4a0  Merge branches 'carl/causal-memory' and 'carl/causal-memory' of githu… (carl-offerfit, Apr 28, 2025)
6910ed3  Fix name conflict between input file and result file (carl-offerfit, Apr 28, 2025)
bb69fee  Make sure we remove the memory for the causal forest estimator (carl-offerfit, Apr 28, 2025)
fee0757  Limit digits (carl-offerfit, Apr 28, 2025)
e247739  Switch to separate runs for catboost and causalforest (carl-offerfit, Apr 28, 2025)
c39ef5a  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 28, 2025)
2f2491c  More memory tests (carl-offerfit, May 6, 2025)
39a9d3e  Run all the memory tests (carl-offerfit, May 7, 2025)
c64d911  Better memory test that adds up the numpy arrays in the estimator (carl-offerfit, May 9, 2025)
5ff5807  Fix print statement (carl-offerfit, May 9, 2025)
e2baf3e  Merge branch 'carl/causal-memory' of github.com:carl-offerfit/EconML … (carl-offerfit, May 9, 2025)
999d030  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 9, 2025)
11 changes: 9 additions & 2 deletions econml/dml/causal_forest.py
@@ -578,6 +578,10 @@ class CausalForestDML(_BaseDML):
at depth `depth`, is re-weighted by 1 / (1 + `depth`)**2.0. See the method ``feature_importances``
for a method that allows one to change these defaults.

use_memmap : bool, default False
    Whether to use a numpy memmap to pass data to parallel training. Helps
    reduce memory overhead for large data sets. For details on memmap see:
    https://numpy.org/doc/stable/reference/generated/numpy.memmap.html

References
----------
.. [cfdml1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests."
@@ -619,7 +623,8 @@ def __init__(self, *,
verbose=0,
allow_missing=False,
use_ray=False,
ray_remote_func_options=None):
ray_remote_func_options=None,
use_memmap=False):

# TODO: consider whether we need more care around stateful featurizers,
# since we clone it and fit separate copies
@@ -647,6 +652,7 @@ def __init__(self, *,
self.fit_intercept = fit_intercept
self.subforest_size = subforest_size
self.n_jobs = n_jobs
self.use_memmap = use_memmap
self.verbose = verbose
super().__init__(discrete_outcome=discrete_outcome,
discrete_treatment=discrete_treatment,
@@ -698,7 +704,8 @@ def _gen_model_final(self):
n_jobs=self.n_jobs,
random_state=self.random_state,
verbose=self.verbose,
warm_start=False))
warm_start=False,
use_memmap=self.use_memmap))

def _gen_rlearner_model_final(self):
return _CausalForestFinalWrapper(self._gen_model_final(), self._gen_featurizer(),
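To illustrate the new flag end to end, here is a minimal sketch of fitting CausalForestDML with use_memmap=True. The data shapes and hyperparameters are illustrative only, not taken from this PR.

import numpy as np
from econml.dml import CausalForestDML

# Illustrative toy data; many treatment columns is the case memmap targets.
rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 20))   # covariates
T = rng.normal(size=(10_000, 5))    # multiple continuous treatments
y = rng.normal(size=10_000)         # outcome

# With use_memmap=True, fit() saves the augmented outcome array to a
# temporary .npy file and reloads it with mmap_mode='r', so the threaded
# tree-fitting workers share one on-disk copy instead of a large in-RAM array.
est = CausalForestDML(n_estimators=100, use_memmap=True)
est.fit(y, T, X=X)
print(est.effect(X[:5]))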
25 changes: 23 additions & 2 deletions econml/grf/_base_grf.py
@@ -8,7 +8,7 @@
#
# Copyright (c) 2007-2020 The scikit-learn developers.
# All rights reserved.

import gc
import numbers
from warnings import warn
from abc import ABCMeta, abstractmethod
@@ -27,6 +27,7 @@
from sklearn.utils import check_X_y
import scipy.stats
from scipy.special import erfc
import tempfile

__all__ = ["BaseGRF"]

@@ -51,6 +52,11 @@ class BaseGRF(BaseEnsemble, metaclass=ABCMeta):

Warning: This class should not be used directly. Use derived classes
instead.


use_memmap : bool, default False
    Whether to use a numpy memmap to pass data to parallel training. Helps
    reduce memory overhead for large data sets. For details on memmap see:
    https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
"""

def __init__(self,
@@ -73,7 +79,8 @@ def __init__(self,
n_jobs=-1,
random_state=None,
verbose=0,
warm_start=False):
warm_start=False,
use_memmap=False):
super().__init__(
base_estimator=GRFTree(),
n_estimators=n_estimators,
@@ -103,6 +110,7 @@
self.verbose = verbose
self.warm_start = warm_start
self.max_samples = max_samples
self.use_memmap = use_memmap

@abstractmethod
def _get_alpha_and_pointJ(self, X, T, y, **kwargs):
@@ -384,12 +392,25 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs):
s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False)
for _ in range(n_more_estimators)]

if self.use_memmap:
# Make a memmap for better performance with a large number of treatment variables
with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file:
filename = temp_file.name
print(f"BaseGRF.fit Making memmap with temp file {filename}")
np.save(filename, yaug) # Save array to disk
# Remove references to (potentially) large data before Parallel
del yaug, pointJ
gc.collect()
# Create the memmap version
yaug = np.load(filename, mmap_mode='r')

# Parallel loop: we prefer the threading backend as the Cython code
# for fitting the trees is internally releasing the Python GIL
# making threading more efficient than multiprocessing in
# that case. However, for joblib 0.12+ we respect any
# parallel_backend contexts set at a higher level,
# since correctness does not rely on using threads.

trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')(
delayed(t.fit)(X[s], yaug[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_,
sample_weight=sample_weight[s] if sample_weight is not None else None,
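The save-then-reload pattern in fit() can be isolated into a standalone sketch (fit_with_memmap is a hypothetical helper, not part of this PR), showing how a large array is spilled to disk and then shared read-only across joblib threads:

import gc
import tempfile

import numpy as np
from joblib import Parallel, delayed

def fit_with_memmap(arr, n_jobs=4):
    # Mirror BaseGRF.fit: write the array to a temporary .npy file, drop
    # the in-memory reference, then reload it as a read-only memmap that
    # all worker threads can share page by page.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as f:
        filename = f.name
    np.save(filename, arr)
    del arr
    gc.collect()
    mm = np.load(filename, mmap_mode='r')
    return Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(np.sum)(mm[:, j]) for j in range(mm.shape[1]))

Note that, as in the diff above, the temporary file is created with delete=False and is never removed, so the on-disk copy outlives the fit; callers may want to clean it up afterward.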
5 changes: 3 additions & 2 deletions econml/grf/classes.py
@@ -359,7 +359,8 @@ def __init__(self,
n_jobs=-1,
random_state=None,
verbose=0,
warm_start=False):
warm_start=False,
use_memmap=False):
super().__init__(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
@@ -368,7 +369,7 @@
max_samples=max_samples, min_balancedness_tol=min_balancedness_tol,
honest=honest, inference=inference, fit_intercept=fit_intercept,
subforest_size=subforest_size, n_jobs=n_jobs, random_state=random_state, verbose=verbose,
warm_start=warm_start)
warm_start=warm_start, use_memmap=use_memmap)

def fit(self, X, T, y, *, sample_weight=None):
"""
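The classes.py change just threads the flag through to BaseGRF. A short usage sketch at the GRF level, assuming the class edited in this hunk is econml.grf.CausalForest (shapes illustrative):

import numpy as np
from econml.grf import CausalForest

rng = np.random.default_rng(0)
X = rng.normal(size=(5_000, 10))  # covariates
T = rng.normal(size=(5_000, 3))   # continuous treatments
y = rng.normal(size=(5_000, 1))   # outcome

# The constructor forwards use_memmap to BaseGRF.__init__; BaseGRF.fit
# then decides whether to spill its augmented outcome array to disk.
forest = CausalForest(n_estimators=100, use_memmap=True)
forest.fit(X, T, y)
print(forest.predict(X[:5]))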
37 changes: 37 additions & 0 deletions notebooks/Causal Forest Memory Demo.ipynb
@@ -0,0 +1,37 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}