From c87b9d9dd10fb1e3b3f58310cf478d370d589c52 Mon Sep 17 00:00:00 2001
From: Carl Gold
Date: Tue, 7 Jan 2025 17:14:05 -0500
Subject: [PATCH 01/25] Add a memmap before Parallel

---
 econml/grf/_base_grf.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py
index dac530fa6..fbff9d5f1 100644
--- a/econml/grf/_base_grf.py
+++ b/econml/grf/_base_grf.py
@@ -8,7 +8,7 @@
 #
 # Copyright (c) 2007-2020 The scikit-learn developers.
 # All rights reserved.
-
+import gc
 import numbers
 from warnings import warn
 from abc import ABCMeta, abstractmethod
@@ -27,6 +27,7 @@
 from sklearn.utils import check_X_y
 import scipy.stats
 from scipy.special import erfc
+import tempfile
 
 
 __all__ = ["BaseGRF"]
@@ -384,14 +385,25 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs):
         s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False)
                   for _ in range(n_more_estimators)]
 
+        # Make a memmap for better performance on large number of treatment variables
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file:
+            filename = temp_file.name
+            np.save(filename, yaug)  # Save array to disk
+        # Remove references to (potentially) large data before Parallel
+        del yaug, pointJ
+        gc.collect()
+        # Create the memmap version
+        yaug_mmap = np.load(filename + '.npy', mmap_mode='r')
+
         # Parallel loop: we prefer the threading backend as the Cython code
         # for fitting the trees is internally releasing the Python GIL
         # making threading more efficient than multiprocessing in
         # that case. However, for joblib 0.12+ we respect any
         # parallel_backend contexts set at a higher level,
         # since correctness does not rely on using threads.
+
         trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')(
-            delayed(t.fit)(X[s], yaug[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_,
+            delayed(t.fit)(X[s], yaug_mmap[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_,
                            sample_weight=sample_weight[s] if sample_weight is not None else None,
                            check_input=False)
             for t, s in zip(trees, s_inds))

From 672699fc753fbb342429c6db49f888908b773ab3 Mon Sep 17 00:00:00 2001
From: Carl Gold
Date: Tue, 7 Jan 2025 18:03:51 -0500
Subject: [PATCH 02/25] Filename fix.
Add a test script --- econml/grf/_base_grf.py | 2 +- scripts/memory_test_script.py | 75 +++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 scripts/memory_test_script.py diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index fbff9d5f1..80c5c3c5b 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -393,7 +393,7 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): del yaug, pointJ gc.collect() # Create the memmap version - yaug_mmap = np.load(filename + '.npy', mmap_mode='r') + yaug_mmap = np.load(filename, mmap_mode='r') # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py new file mode 100644 index 000000000..9d477b0fe --- /dev/null +++ b/scripts/memory_test_script.py @@ -0,0 +1,75 @@ +import json +from datetime import datetime + +import sklearn.metrics +import argparse +from econml.dml import SparseLinearDML, CausalForestDML +from econml.validate import DRTester +import collinearity +from itertools import product +import joblib +import numpy as np +import os +import pandas as pd +import scipy +from sklearn.preprocessing import StandardScaler, FunctionTransformer +from sklearn.pipeline import Pipeline +from sklearn.metrics import roc_auc_score +from xgboost import XGBRegressor, XGBClassifier +import sys +import logging +from sklearn.model_selection import KFold + +logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +def causalforestdml_memory_test( + file_name: str +): + """ + Main function for testing DML on high dimensional marketing data sets. Adds pre-processing + like scaling and removing highly collinear features. It can cross validates parameters of either + first stage model, or the final model. Saves out the scores of every model to a file. 
+ + :param file_name: File that contains the data, assumed to be joblib + :param model_name: Name of the model to use, either SparseLinearDML or CausalForestDML + + """ + + root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") + logger.info(f"Using root dir {root_dir} loading data file {file_name}") + data_file = os.path.join(os.path.abspath(root_dir), file_name) + data = joblib.load(data_file) + + # This is specific to Offerfit - names of the data are derived from contextual bandits + X = data['X_no_act'] + T = data['a_processed'] + y = data['real_reward'] + # y[y > 1] = 1 + + logger.info(f"X has a shape of: {X.shape}") + logger.info(f"T has a shape of: {T.shape}") + logger.info(f"y has a shape of: {y.shape}") + + + est = CausalForestDML( + model_t=XGBRegressor(n_estimators=50), + model_y=XGBClassifier(n_estimators=50), + discrete_outcome=True, + n_jobs=1 + ) + est.fit(y,T,X=X) + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") + args = parser.parse_args(sys.argv[1:]) + data_file = args.data_file + + causalforestdml_memory_test( + file_name=data_file + ) From 4e9820a09a0f59d7e1e71df52c0c29acdbe329a4 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 8 Jan 2025 15:15:01 -0500 Subject: [PATCH 03/25] Script update --- scripts/memory_test_script.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 9d477b0fe..f0056bc95 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -27,15 +27,7 @@ def causalforestdml_memory_test( file_name: str ): - """ - Main function for testing DML on high dimensional marketing data sets. Adds pre-processing - like scaling and removing highly collinear features. It can cross validates parameters of either - first stage model, or the final model. Saves out the scores of every model to a file. 
- :param file_name: File that contains the data, assumed to be joblib - :param model_name: Name of the model to use, either SparseLinearDML or CausalForestDML - - """ root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") logger.info(f"Using root dir {root_dir} loading data file {file_name}") @@ -46,7 +38,6 @@ def causalforestdml_memory_test( X = data['X_no_act'] T = data['a_processed'] y = data['real_reward'] - # y[y > 1] = 1 logger.info(f"X has a shape of: {X.shape}") logger.info(f"T has a shape of: {T.shape}") @@ -54,13 +45,14 @@ def causalforestdml_memory_test( est = CausalForestDML( - model_t=XGBRegressor(n_estimators=50), - model_y=XGBClassifier(n_estimators=50), + model_t=XGBRegressor(n_estimators=10), + model_y=XGBClassifier(n_estimators=10), discrete_outcome=True, - n_jobs=1 + n_jobs=2, + n_estimators=20 ) est.fit(y,T,X=X) - + print(est.summary()) From d0d575dfa59ddc55b35f953d5dde120d2e1ff353 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 20:30:55 +0000 Subject: [PATCH 04/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index f0056bc95..fb96cbd72 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,24 +1,11 @@ -import json -from datetime import datetime -import sklearn.metrics import argparse -from econml.dml import SparseLinearDML, CausalForestDML -from econml.validate import DRTester -import collinearity -from itertools import product +from econml.dml import CausalForestDML import joblib -import numpy as np import os -import pandas as pd -import scipy -from sklearn.preprocessing import StandardScaler, FunctionTransformer -from sklearn.pipeline import Pipeline -from sklearn.metrics import roc_auc_score from xgboost import XGBRegressor, XGBClassifier import sys import logging -from sklearn.model_selection import KFold logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) From 5b3db78b9c8cbb191ea7bf4d3a64e085f816e655 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 8 Jan 2025 15:33:30 -0500 Subject: [PATCH 05/25] Remove test script from the PR --- scripts/memory_test_script.py | 67 ----------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 scripts/memory_test_script.py diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py deleted file mode 100644 index f0056bc95..000000000 --- a/scripts/memory_test_script.py +++ /dev/null @@ -1,67 +0,0 @@ -import json -from datetime import datetime - -import sklearn.metrics -import argparse -from econml.dml import SparseLinearDML, CausalForestDML -from econml.validate import DRTester -import collinearity -from itertools import product -import joblib -import numpy as np -import os -import pandas as pd -import scipy -from sklearn.preprocessing import StandardScaler, FunctionTransformer -from sklearn.pipeline import Pipeline -from sklearn.metrics import roc_auc_score -from xgboost import XGBRegressor, XGBClassifier -import sys -import logging -from sklearn.model_selection import KFold - -logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') -logger = logging.getLogger(__name__) - - -def 
causalforestdml_memory_test( - file_name: str -): - - - root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") - logger.info(f"Using root dir {root_dir} loading data file {file_name}") - data_file = os.path.join(os.path.abspath(root_dir), file_name) - data = joblib.load(data_file) - - # This is specific to Offerfit - names of the data are derived from contextual bandits - X = data['X_no_act'] - T = data['a_processed'] - y = data['real_reward'] - - logger.info(f"X has a shape of: {X.shape}") - logger.info(f"T has a shape of: {T.shape}") - logger.info(f"y has a shape of: {y.shape}") - - - est = CausalForestDML( - model_t=XGBRegressor(n_estimators=10), - model_y=XGBClassifier(n_estimators=10), - discrete_outcome=True, - n_jobs=2, - n_estimators=20 - ) - est.fit(y,T,X=X) - print(est.summary()) - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") - args = parser.parse_args(sys.argv[1:]) - data_file = args.data_file - - causalforestdml_memory_test( - file_name=data_file - ) From 362b57ed5f390f71802754e730d4ad0c5c0f6f9a Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 21 Feb 2025 14:52:10 -0800 Subject: [PATCH 06/25] Make memmap an option, and add the reference in the doc string --- econml/grf/_base_grf.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index 80c5c3c5b..68a9cb698 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -197,7 +197,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, T, y, *, sample_weight=None, **kwargs): + def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): """ Build a forest of trees from the training set (X, T, y) and any other auxiliary variables. @@ -214,6 +214,9 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. + use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps + reduce memory overhead for large data sets. For details on memmap see: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html **kwargs : dictionary of array_like items of shape (n_samples, d_var) Auxiliary random variables that go into the moment function (e.g. 
instrument, censoring etc) Any of these variables will be passed on as is to the `get_pointJ` and @@ -385,15 +388,16 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False) for _ in range(n_more_estimators)] - # Make a memmap for better performance on large number of treatment variables - with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: - filename = temp_file.name - np.save(filename, yaug) # Save array to disk - # Remove references to (potentially) large data before Parallel - del yaug, pointJ - gc.collect() - # Create the memmap version - yaug_mmap = np.load(filename, mmap_mode='r') + if use_memmap: + # Make a memmap for better performance on large number of treatment variables + with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: + filename = temp_file.name + np.save(filename, yaug) # Save array to disk + # Remove references to (potentially) large data before Parallel + del yaug, pointJ + gc.collect() + # Create the memmap version + yaug = np.load(filename, mmap_mode='r') # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL @@ -403,7 +407,7 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): # since correctness does not rely on using threads. trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')( - delayed(t.fit)(X[s], yaug_mmap[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_, + delayed(t.fit)(X[s], yaug[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_, sample_weight=sample_weight[s] if sample_weight is not None else None, check_input=False) for t, s in zip(trees, s_inds)) From ebb4348e2ef216d33a6e9ab5c794dfbddc2e6eb0 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 21 Feb 2025 15:00:13 -0800 Subject: [PATCH 07/25] Start a notebook to demonstrate causal forest memory usage, by copying from the existing Causal Forest Demo --- notebooks/Causal Forest Memory Demo.ipynb | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 notebooks/Causal Forest Memory Demo.ipynb diff --git a/notebooks/Causal Forest Memory Demo.ipynb b/notebooks/Causal Forest Memory Demo.ipynb new file mode 100644 index 000000000..54f657bb0 --- /dev/null +++ b/notebooks/Causal Forest Memory Demo.ipynb @@ -0,0 +1,37 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5d694f74668862401622189952c1ac3eb57812cd Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 26 Feb 2025 08:38:30 -0800 Subject: [PATCH 08/25] Set use_memmap in constructor --- econml/dml/causal_forest.py | 11 +++++++++-- econml/grf/_base_grf.py | 16 ++++++++++------ econml/grf/classes.py | 5 +++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 4a1a536bc..33ad2398a 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ 
-578,6 +578,10 @@ class CausalForestDML(_BaseDML): at depth `depth`, is re-weighted by 1 / (1 + `depth`)**2.0. See the method ``feature_importances`` for a method that allows one to change these defaults. + use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps + reduce memory overhead for large data sets. For details on memmap see: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + References ---------- .. [cfdml1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests." @@ -619,7 +623,8 @@ def __init__(self, *, verbose=0, allow_missing=False, use_ray=False, - ray_remote_func_options=None): + ray_remote_func_options=None, + use_memmap=False): # TODO: consider whether we need more care around stateful featurizers, # since we clone it and fit separate copies @@ -647,6 +652,7 @@ def __init__(self, *, self.fit_intercept = fit_intercept self.subforest_size = subforest_size self.n_jobs = n_jobs + self.use_memmap = use_memmap self.verbose = verbose super().__init__(discrete_outcome=discrete_outcome, discrete_treatment=discrete_treatment, @@ -698,7 +704,8 @@ def _gen_model_final(self): n_jobs=self.n_jobs, random_state=self.random_state, verbose=self.verbose, - warm_start=False)) + warm_start=False, + use_memmap=self.use_memmap)) def _gen_rlearner_model_final(self): return _CausalForestFinalWrapper(self._gen_model_final(), self._gen_featurizer(), diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index 68a9cb698..57ea2762c 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -52,6 +52,11 @@ class BaseGRF(BaseEnsemble, metaclass=ABCMeta): Warning: This class should not be used directly. Use derived classes instead. + + + use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps + reduce memory overhead for large data sets. For details on memmap see: + https://numpy.org/doc/stable/reference/generated/numpy.memmap.html """ def __init__(self, @@ -74,7 +79,8 @@ def __init__(self, n_jobs=-1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + use_memmap=False): super().__init__( base_estimator=GRFTree(), n_estimators=n_estimators, @@ -104,6 +110,7 @@ def __init__(self, self.verbose = verbose self.warm_start = warm_start self.max_samples = max_samples + self.use_memmap = use_memmap @abstractmethod def _get_alpha_and_pointJ(self, X, T, y, **kwargs): @@ -197,7 +204,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr - def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): + def fit(self, X, T, y, *, sample_weight=None, **kwargs): """ Build a forest of trees from the training set (X, T, y) and any other auxiliary variables. @@ -214,9 +221,6 @@ def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. - use_memmap: Whether to use a numpy memmap to pass data to parallel training. Helps - reduce memory overhead for large data sets. For details on memmap see: - https://numpy.org/doc/stable/reference/generated/numpy.memmap.html **kwargs : dictionary of array_like items of shape (n_samples, d_var) Auxiliary random variables that go into the moment function (e.g. 
instrument, censoring etc) Any of these variables will be passed on as is to the `get_pointJ` and @@ -388,7 +392,7 @@ def fit(self, X, T, y, *, sample_weight=None, use_memmap: bool=False, **kwargs): s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False) for _ in range(n_more_estimators)] - if use_memmap: + if self.use_memmap: # Make a memmap for better performance on large number of treatment variables with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: filename = temp_file.name diff --git a/econml/grf/classes.py b/econml/grf/classes.py index 46d0a22ab..42e6cd399 100644 --- a/econml/grf/classes.py +++ b/econml/grf/classes.py @@ -359,7 +359,8 @@ def __init__(self, n_jobs=-1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + use_memmap=False): super().__init__(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, @@ -368,7 +369,7 @@ def __init__(self, max_samples=max_samples, min_balancedness_tol=min_balancedness_tol, honest=honest, inference=inference, fit_intercept=fit_intercept, subforest_size=subforest_size, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, use_memmap=use_memmap) def fit(self, X, T, y, *, sample_weight=None): """ From 8df2c15468ee8f541e68381026ed645b6b774e6c Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Thu, 10 Apr 2025 08:18:05 -0400 Subject: [PATCH 09/25] Shell script to load a data file and fit a CausalForestDML with different number of estimators, jobs and memory map option --- scripts/memory_test_script.py | 83 +++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 scripts/memory_test_script.py diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py new file mode 100644 index 000000000..efa15dede --- /dev/null +++ b/scripts/memory_test_script.py @@ -0,0 +1,83 @@ +import json +from datetime import datetime + +import sklearn.metrics +import argparse +from econml.dml import SparseLinearDML, CausalForestDML +from econml.validate import DRTester +import collinearity +from itertools import product +import joblib +import numpy as np +import os +import pandas as pd +import scipy +from sklearn.preprocessing import StandardScaler, FunctionTransformer +from sklearn.pipeline import Pipeline +from sklearn.metrics import roc_auc_score +from xgboost import XGBRegressor, XGBClassifier +import sys +import logging +from sklearn.model_selection import KFold + +logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +def causalforestdml_memory_test( + file_name: str, + n_est_y: int, + n_est_t: int, + n_est_2: int, + n_jobs: int, + use_memmap: bool +): + + + root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") + logger.info(f"Using root dir {root_dir} loading data file {file_name}") + data_file = os.path.join(os.path.abspath(root_dir), file_name) + data = joblib.load(data_file) + + # This is specific to Offerfit - names of the data are derived from contextual bandits + X = data['X'] + T = data['T'] + y = data['Y'] + + logger.info(f"X has a shape of: {X.shape}") + logger.info(f"T has a shape of: {T.shape}") + logger.info(f"y has a shape of: {y.shape}") + + est = CausalForestDML( + model_t=XGBRegressor(n_estimators=n_est_t), + 
model_y=XGBClassifier(n_estimators=n_est_y), + discrete_outcome=True, + n_jobs=n_jobs, + n_estimators=n_est_2, + use_memmap=use_memmap + ) + logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}") + est.fit(y,T,X=X) + print(est.summary()) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") + parser.add_argument("--n_est_y", type=int, help="", default=100) + parser.add_argument("--n_est_t", type=int, help="", default=200) + parser.add_argument("--n_est_2", type=int, help="", default=500) + parser.add_argument("--n_jobs", type=int, help="", default=-1) + parser.add_argument("--memmap", type=bool, help="", default=False) + args = parser.parse_args(sys.argv[1:]) + data_file = args.data_file + + causalforestdml_memory_test( + file_name=data_file, + n_est_y=args.n_est_y, + n_est_t=args.n_est_t, + n_est_2=args.n_est_2, + n_jobs=args.n_jobs, + use_memmap=args.memmap + ) From 5ea336f3578f577f8b19620006ef5d2c5596a3de Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Thu, 10 Apr 2025 08:52:17 -0400 Subject: [PATCH 10/25] Try memory profiler, memory usage --- scripts/memory_test_script.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index efa15dede..d4dfda324 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -5,6 +5,7 @@ import argparse from econml.dml import SparseLinearDML, CausalForestDML from econml.validate import DRTester +from memory_profiler import memory_usage import collinearity from itertools import product import joblib @@ -56,9 +57,23 @@ def causalforestdml_memory_test( n_estimators=n_est_2, use_memmap=use_memmap ) - logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}") - est.fit(y,T,X=X) - print(est.summary()) + logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," + f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") + # est.fit(y,T,X=X) + + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + + # Extract results + max_memory, fitted_model = mem_usage + + print(f"Maximum memory usage: {max_memory} MiB") From 1ef0a53faa7c060ab31844bf26c63f198522113b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 12:52:43 +0000 Subject: [PATCH 11/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index d4dfda324..41fdd5634 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,25 +1,12 @@ -import json -from datetime import datetime -import sklearn.metrics import argparse -from econml.dml import SparseLinearDML, CausalForestDML -from econml.validate import DRTester +from econml.dml import CausalForestDML from memory_profiler import memory_usage -import collinearity -from itertools import product import joblib -import numpy as np import os -import pandas as pd -import scipy -from sklearn.preprocessing import StandardScaler, FunctionTransformer -from sklearn.pipeline import 
Pipeline -from sklearn.metrics import roc_auc_score from xgboost import XGBRegressor, XGBClassifier import sys import logging -from sklearn.model_selection import KFold logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) @@ -57,7 +44,7 @@ def causalforestdml_memory_test( n_estimators=n_est_2, use_memmap=use_memmap ) - logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," + logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") # est.fit(y,T,X=X) From 3cb90d180017a255cd477d36018304c3457e94e8 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Thu, 10 Apr 2025 10:31:06 -0400 Subject: [PATCH 12/25] Add a printout to make sure it is working --- econml/grf/_base_grf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/econml/grf/_base_grf.py b/econml/grf/_base_grf.py index 57ea2762c..a4a6e03e0 100644 --- a/econml/grf/_base_grf.py +++ b/econml/grf/_base_grf.py @@ -396,6 +396,7 @@ def fit(self, X, T, y, *, sample_weight=None, **kwargs): # Make a memmap for better performance on large number of treatment variables with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as temp_file: filename = temp_file.name + print(f"BaseGRF.fit Making memmap with temp file {filename}") np.save(filename, yaug) # Save array to disk # Remove references to (potentially) large data before Parallel del yaug, pointJ From 6dbdd52c4eb6af755a5b3cad00855788661783f4 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 08:38:33 -0700 Subject: [PATCH 13/25] Add catboost and save the output of the memory test script --- scripts/memory_test_script.py | 46 +++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 41fdd5634..bb3c2d000 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -4,8 +4,10 @@ from memory_profiler import memory_usage import joblib import os +from catboost import CatBoostRegressor from xgboost import XGBRegressor, XGBClassifier import sys +import time import logging logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') @@ -38,16 +40,16 @@ def causalforestdml_memory_test( est = CausalForestDML( model_t=XGBRegressor(n_estimators=n_est_t), - model_y=XGBClassifier(n_estimators=n_est_y), - discrete_outcome=True, + model_y=XGBRegressor(n_estimators=n_est_y), + discrete_outcome=False, n_jobs=n_jobs, n_estimators=n_est_2, use_memmap=use_memmap ) - logger.info(f"Calling fit: njobs={n_jobs}, MemMap={use_memmap}," + logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") - # est.fit(y,T,X=X) + start_time = time.time() mem_usage = memory_usage( (est.fit, [y,T], {"X":X}), interval=0.1, # Sample every 0.1 seconds @@ -56,13 +58,47 @@ def causalforestdml_memory_test( retval=True, # Return the fitted model too include_children=True # Include joblib's child processes ) + end_time = time.time() # Extract results max_memory, fitted_model = mem_usage - print(f"Maximum memory usage: {max_memory} MiB") + logger.info(f"Maximum memory usage: {max_memory} MiB") + elapsed_time = end_time-start_time + logger.info(f"Time to fit: {elapsed_time} seconds") + est2 = CatBoostRegressor(n_estimators=n_est_2,allow_writing_files=False) + + + logger.info(f"Calling 
CatBoostRegressor fit: n_estimators={n_est_2}") + + start_time2 = time.time() + mem_usage2 = memory_usage( + (est2.fit, [X],{"y":y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + end_time2 = time.time() + + # Extract results + max_memory2, fitted_model2 = mem_usage2 + elapsed_time2 = end_time2-start_time2 + + logger.info(f"Maximum memory usage: {max_memory2} MiB") + logger.info(f"Time to fit: {elapsed_time2} seconds") + + file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(file_name): + with open(file_name,"w") as output: + output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") + + with open(file_name,"a") as output: + output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory},{elapsed_time},{max_memory2},{elapsed_time2}\n") + if __name__ == "__main__": parser = argparse.ArgumentParser() From 27dc80c30ee8643cac6bab4c5e2a06a62b9bb4e9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:38:43 +0000 Subject: [PATCH 14/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index bb3c2d000..7a863862c 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -5,7 +5,7 @@ import joblib import os from catboost import CatBoostRegressor -from xgboost import XGBRegressor, XGBClassifier +from xgboost import XGBRegressor import sys import time import logging From e9a54058039267361a80e506f62dd35cd30e802e Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 08:52:22 -0700 Subject: [PATCH 15/25] Use default estimators --- scripts/memory_test_script.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index bb3c2d000..80465dd35 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -103,9 +103,9 @@ def causalforestdml_memory_test( if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") - parser.add_argument("--n_est_y", type=int, help="", default=100) - parser.add_argument("--n_est_t", type=int, help="", default=200) - parser.add_argument("--n_est_2", type=int, help="", default=500) + parser.add_argument("--n_est_y", type=int, help="", default=None) + parser.add_argument("--n_est_t", type=int, help="", default=None) + parser.add_argument("--n_est_2", type=int, help="", default=None) parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) args = parser.parse_args(sys.argv[1:]) From 6910ed3c649fc750b6e288cbfee11e9908944ad4 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 09:25:44 -0700 Subject: [PATCH 16/25] Fix name conflict between input file and result file --- scripts/memory_test_script.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index b456c2401..ceb7021da 100644 --- a/scripts/memory_test_script.py +++ 
b/scripts/memory_test_script.py @@ -91,12 +91,12 @@ def causalforestdml_memory_test( logger.info(f"Maximum memory usage: {max_memory2} MiB") logger.info(f"Time to fit: {elapsed_time2} seconds") - file_name = os.path.join(root_dir,"mem_test_results.csv") - if not os.path.isfile(file_name): - with open(file_name,"w") as output: + result_file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(result_file_name): + with open(result_file_name,"w") as output: output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") - with open(file_name,"a") as output: + with open(result_file_name,"a") as output: output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory},{elapsed_time},{max_memory2},{elapsed_time2}\n") From bb69fee0f5cb82557b40da37d80e69599e61fc4a Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 10:01:02 -0700 Subject: [PATCH 17/25] Make sure we remove the memory for the causal forest estimator --- scripts/memory_test_script.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index ceb7021da..3714512df 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,5 +1,7 @@ import argparse +import gc + from econml.dml import CausalForestDML from memory_profiler import memory_usage import joblib @@ -67,6 +69,8 @@ def causalforestdml_memory_test( elapsed_time = end_time-start_time logger.info(f"Time to fit: {elapsed_time} seconds") + del est + gc.collect() est2 = CatBoostRegressor(n_estimators=n_est_2,allow_writing_files=False) From fee0757784695007b4a90041bab324dcced316fb Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 10:03:32 -0700 Subject: [PATCH 18/25] Limit digits --- scripts/memory_test_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 3714512df..f2d4c85ba 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -101,7 +101,7 @@ def causalforestdml_memory_test( output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") with open(result_file_name,"a") as output: - output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory},{elapsed_time},{max_memory2},{elapsed_time2}\n") + output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f},{max_memory2:.1f},{elapsed_time2:.1f}\n") if __name__ == "__main__": From e247739bf77f04fa6cc983f494fe3a98d5b104c7 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Mon, 28 Apr 2025 14:23:00 -0700 Subject: [PATCH 19/25] Switch to separate runs for catboost and causalforest --- scripts/memory_test_script.py | 98 +++++++++++++++++------------------ 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index f2d4c85ba..07e4e6b70 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,5 +1,6 @@ import argparse +import datetime import gc from econml.dml import CausalForestDML @@ -18,6 +19,7 @@ def causalforestdml_memory_test( file_name: str, + estimator: str, n_est_y: int, n_est_t: int, n_est_2: int, @@ -25,6 +27,8 @@ def causalforestdml_memory_test( use_memmap: bool ): + if not estimator.lower() in ('causalforest','catboost'): + raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got 
{estimator}") root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") logger.info(f"Using root dir {root_dir} loading data file {file_name}") @@ -40,68 +44,58 @@ def causalforestdml_memory_test( logger.info(f"T has a shape of: {T.shape}") logger.info(f"y has a shape of: {y.shape}") - est = CausalForestDML( - model_t=XGBRegressor(n_estimators=n_est_t), - model_y=XGBRegressor(n_estimators=n_est_y), - discrete_outcome=False, - n_jobs=n_jobs, - n_estimators=n_est_2, - use_memmap=use_memmap - ) - logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," - f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") - - start_time = time.time() - mem_usage = memory_usage( - (est.fit, [y,T], {"X":X}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - end_time = time.time() + if estimator == 'causalforest': + est = CausalForestDML( + model_t=XGBRegressor(n_estimators=n_est_t), + model_y=XGBRegressor(n_estimators=n_est_y), + discrete_outcome=False, + n_jobs=n_jobs, + n_estimators=n_est_2, + use_memmap=use_memmap + ) + logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," + f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") + + start_time = time.time() + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + end_time = time.time() + elif estimator == 'catboost': + est = CatBoostRegressor(n_estimators=n_est_2, allow_writing_files=False) + + logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") + + start_time = time.time() + mem_usage = memory_usage( + (est.fit, [X], {"y": y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + end_time = time.time() # Extract results max_memory, fitted_model = mem_usage - - logger.info(f"Maximum memory usage: {max_memory} MiB") elapsed_time = end_time-start_time + logger.info(f"Maximum memory usage: {max_memory} MiB") logger.info(f"Time to fit: {elapsed_time} seconds") - del est - gc.collect() - - est2 = CatBoostRegressor(n_estimators=n_est_2,allow_writing_files=False) - - - logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") - - start_time2 = time.time() - mem_usage2 = memory_usage( - (est2.fit, [X],{"y":y, "silent": True}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - end_time2 = time.time() - - # Extract results - max_memory2, fitted_model2 = mem_usage2 - elapsed_time2 = end_time2-start_time2 - - logger.info(f"Maximum memory usage: {max_memory2} MiB") - logger.info(f"Time to fit: {elapsed_time2} seconds") - result_file_name = os.path.join(root_dir,"mem_test_results.csv") if not os.path.isfile(result_file_name): with open(result_file_name,"w") as output: - output.write("data,N_examples,N_nuisances,N_treatments,CFDML_max_memory,CFDML_fit_time,CB_max_memory,CB_fit_time\n") + 
output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") + run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") with open(result_file_name,"a") as output: - output.write(f"{file_name},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f},{max_memory2:.1f},{elapsed_time2:.1f}\n") + output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") if __name__ == "__main__": @@ -112,11 +106,13 @@ def causalforestdml_memory_test( parser.add_argument("--n_est_2", type=int, help="", default=None) parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) + parser.add_argument("--estimator", type=str, help="", default=None) args = parser.parse_args(sys.argv[1:]) data_file = args.data_file causalforestdml_memory_test( file_name=data_file, + estimator=args.estimator, n_est_y=args.n_est_y, n_est_t=args.n_est_t, n_est_2=args.n_est_2, From c39ef5ae7ae947127b02e8e1891aa8452987be5e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 21:23:10 +0000 Subject: [PATCH 20/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 07e4e6b70..7be6eaa25 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,7 +1,6 @@ import argparse import datetime -import gc from econml.dml import CausalForestDML from memory_profiler import memory_usage @@ -27,7 +26,7 @@ def causalforestdml_memory_test( use_memmap: bool ): - if not estimator.lower() in ('causalforest','catboost'): + if estimator.lower() not in ('causalforest','catboost'): raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got {estimator}") root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") From 2f2491c4e51424e9e4259723ffebf61861c10449 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Tue, 6 May 2025 13:37:27 -0700 Subject: [PATCH 21/25] More memory tests --- scripts/memory_test_script.py | 137 +++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 37 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 07e4e6b70..8cf3a8660 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -1,23 +1,29 @@ import argparse import datetime -import gc +import joblib +import logging +import numpy as np +import os +import sys +import time +import tracemalloc + +import pandas as pd from econml.dml import CausalForestDML from memory_profiler import memory_usage -import joblib -import os +from pympler import muppy, summary from catboost import CatBoostRegressor from xgboost import XGBRegressor -import sys -import time -import logging + logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) def causalforestdml_memory_test( + test_type:str, file_name: str, estimator: str, n_est_y: int, @@ -26,7 +32,8 @@ def causalforestdml_memory_test( n_jobs: int, use_memmap: bool ): - + if test_type not in ("profile","malloc","object"): + raise ValueError(f"Test type should be 'profile'|'malloc'|'object' but got {test_type}") if not estimator.lower() in 
('causalforest','catboost'): raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got {estimator}") @@ -57,14 +64,22 @@ def causalforestdml_memory_test( f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") start_time = time.time() - mem_usage = memory_usage( - (est.fit, [y,T], {"X":X}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) + if test_type=='profile': + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + elif test_type=='malloc': + tracemalloc.start() + est.fit(y,T, X=X) + malloc_snapshot = tracemalloc.take_snapshot() + else: + est.fit(y,T, X=X) + end_time = time.time() elif estimator == 'catboost': est = CatBoostRegressor(n_estimators=n_est_2, allow_writing_files=False) @@ -72,30 +87,76 @@ def causalforestdml_memory_test( logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") start_time = time.time() - mem_usage = memory_usage( - (est.fit, [X], {"y": y, "silent": True}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) + if test_type=='profile': + mem_usage = memory_usage( + (est.fit, [X], {"y": y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + elif test_type=='malloc': + tracemalloc.start() + est.fit(X,y=y,silent=True) + malloc_snapshot = tracemalloc.take_snapshot() + else: + est.fit(X,y=y,silent=True) + end_time = time.time() # Extract results - max_memory, fitted_model = mem_usage - elapsed_time = end_time-start_time - logger.info(f"Maximum memory usage: {max_memory} MiB") - logger.info(f"Time to fit: {elapsed_time} seconds") - - result_file_name = os.path.join(root_dir,"mem_test_results.csv") - if not os.path.isfile(result_file_name): - with open(result_file_name,"w") as output: - output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") - - run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - with open(result_file_name,"a") as output: - output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") + if test_type == 'malloc': + # Get statistics grouped by filename and line number + top_stats = malloc_snapshot.statistics('lineno') + print("\nTop 10 memory-consuming locations:") + for i, stat in enumerate(top_stats[:10], 1): + print(f"{i}. {stat}") + + # Get statistics grouped by object type (more useful for identifying large objects) + top_types = malloc_snapshot.statistics('traceback') + print("\nTop 10 memory-consuming object types:") + for i, stat in enumerate(top_types[:10], 1): + print(f"{i}. 
{stat}") + # Print the traceback to see where the object was created + for line in stat.traceback.format(): + print(f" {line}") + elif test_type == 'profile': + max_memory, fitted_model = mem_usage + elapsed_time = end_time-start_time + logger.info(f"Maximum memory usage: {max_memory} MiB") + logger.info(f"Time to fit: {elapsed_time} seconds") + + result_file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(result_file_name): + with open(result_file_name,"w") as output: + output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") + + run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + with open(result_file_name,"a") as output: + output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") + elif test_type=='object': + # Collect all objects + all_objects = muppy.get_objects() + # Get a summary of memory usage by type + mem_summary = summary.summarize(all_objects) + summary.print_(mem_summary) + # Look at NumPy arrays + numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] + numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) + + print("\nTop 10 NumPy arrays by memory usage:") + for i, arr in enumerate(numpy_arrays[:10], 1): + memory_mb = arr.nbytes / (1024 * 1024) + print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") + + pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] + pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) + + print("\nTop 10 Pandas DataFrames by memory usage:") + for i, df in enumerate(pandas_dfs[:10], 1): + memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + print(f"{i}. 
Shape: {df.shape}, Memory: {memory_mb:.2f} MB") if __name__ == "__main__": @@ -103,14 +164,16 @@ def causalforestdml_memory_test( parser.add_argument("--data_file", type=str, help="", default="offerfit.joblib") parser.add_argument("--n_est_y", type=int, help="", default=None) parser.add_argument("--n_est_t", type=int, help="", default=None) - parser.add_argument("--n_est_2", type=int, help="", default=None) + parser.add_argument("--n_est_2", type=int, help="", default=500) parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) parser.add_argument("--estimator", type=str, help="", default=None) + parser.add_argument("--test", type=str, help="profile or malloc, meaning use memory_profiler or tracemalloc", default="profile") args = parser.parse_args(sys.argv[1:]) data_file = args.data_file causalforestdml_memory_test( + test_type=args.test, file_name=data_file, estimator=args.estimator, n_est_y=args.n_est_y, From 39a9d3e9fe6df83923a9dbfead4816c76fb47ee3 Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Wed, 7 May 2025 09:21:58 -0700 Subject: [PATCH 22/25] Run all the memory tests Add downsampling function --- scripts/memory_test_script.py | 242 ++++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 100 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 8cf3a8660..d4ff50175 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -22,20 +22,71 @@ logger = logging.getLogger(__name__) +def balanced_downsample(X:pd.DataFrame, y:np.array, T_feat:pd.DataFrame, t_id:np.array, downsample_ratio:float): + """ + Balanced downsampling based on treatment identity + + :param X: + :param y: + :param T_feat: features of the treatments + :param t_id: which treatment was applied + :return: + """ + # Keep all the positive experiences + pos_ex = y > 0 + positive_df = X[pos_ex].reset_index(drop=True) + positive_actions = t_id[pos_ex] + act_feature_positve = T_feat[pos_ex].reset_index(drop=True) + y_positive = y[pos_ex] + positive_df['action_id'] = positive_actions + positive_df['y'] = y_positive + positive_df = pd.concat([positive_df,act_feature_positve],axis=1).reset_index(drop=True) + + negative_df = X[~pos_ex].reset_index(drop=True) + negative_actions = t_id[~pos_ex] + act_feature_negative = T_feat[~pos_ex].reset_index(drop=True) + y_negative = y[~pos_ex] + + negative_df['action_id'] = negative_actions + negative_df['y'] = y_negative + negative_df = pd.concat([negative_df,act_feature_negative],axis=1).reset_index(drop=True) + + df_downsampled = negative_df.groupby('action_id', group_keys=False).apply(lambda x: x.sample(frac=downsample_ratio)) + + logger.info(f'Dowsampled negative examples by {downsample_ratio}') + actions_before = negative_df['action_id'].value_counts() + actions_after = df_downsampled['action_id'].value_counts() + logger.info(f'Before: {actions_before}') + logger.info(f'After: {actions_after}') + + # Final sample + combo_df = pd.concat([positive_df,df_downsampled],axis=0).reset_index(drop=True) + # this shuffles it + combo_df = combo_df.sample(frac=1).reset_index(drop=True) + X = combo_df[X.columns.values] + T_feat= combo_df[T_feat.columns.values] + y = combo_df['y'].to_numpy() + t_id = combo_df['action_id'].to_numpy() + + n_pos_ex2 = y.sum() + logger.info(f"After downsampling non-conversion, Conversion rate now {n_pos_ex2/len(y)}") + logger.info(f"X {X.shape}") + logger.info(f"T {T_feat.shape}") + logger.info(f"y {y.shape}") + + return X, 
y, T_feat, t_id + + def causalforestdml_memory_test( - test_type:str, file_name: str, estimator: str, n_est_y: int, n_est_t: int, n_est_2: int, n_jobs: int, - use_memmap: bool + use_memmap: bool, + downsample:float|None, ): - if test_type not in ("profile","malloc","object"): - raise ValueError(f"Test type should be 'profile'|'malloc'|'object' but got {test_type}") - if not estimator.lower() in ('causalforest','catboost'): - raise NotImplementedError(f"Estimator must be in 'causalforest','catboost', got {estimator}") root_dir = os.environ.get("LOCAL_ANALYSIS_DIR", ".") logger.info(f"Using root dir {root_dir} loading data file {file_name}") @@ -46,10 +97,13 @@ def causalforestdml_memory_test( X = data['X'] T = data['T'] y = data['Y'] + i = data['i'] + if downsample: + X, y, T, i = balanced_downsample(X,y,T,i,downsample_ratio=downsample) logger.info(f"X has a shape of: {X.shape}") logger.info(f"T has a shape of: {T.shape}") - logger.info(f"y has a shape of: {y.shape}") + logger.info(f"y has a shape of: {y.shape}, total={np.sum(y)}") if estimator == 'causalforest': est = CausalForestDML( @@ -63,100 +117,88 @@ def causalforestdml_memory_test( logger.info(f"Calling CausalForestDML fit: njobs={n_jobs}, MemMap={use_memmap}," f"N estimators y={n_est_y}, t={n_est_t}, 2={n_est_2}") - start_time = time.time() - if test_type=='profile': - mem_usage = memory_usage( - (est.fit, [y,T], {"X":X}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - elif test_type=='malloc': - tracemalloc.start() - est.fit(y,T, X=X) - malloc_snapshot = tracemalloc.take_snapshot() - else: - est.fit(y,T, X=X) - - end_time = time.time() elif estimator == 'catboost': est = CatBoostRegressor(n_estimators=n_est_2, allow_writing_files=False) - logger.info(f"Calling CatBoostRegressor fit: n_estimators={n_est_2}") - start_time = time.time() - if test_type=='profile': - mem_usage = memory_usage( - (est.fit, [X], {"y": y, "silent": True}), - interval=0.1, # Sample every 0.1 seconds - timeout=None, # No timeout - max_usage=True, # Get maximum memory used - retval=True, # Return the fitted model too - include_children=True # Include joblib's child processes - ) - elif test_type=='malloc': - tracemalloc.start() - est.fit(X,y=y,silent=True) - malloc_snapshot = tracemalloc.take_snapshot() - else: - est.fit(X,y=y,silent=True) - - end_time = time.time() - - # Extract results - if test_type == 'malloc': - # Get statistics grouped by filename and line number - top_stats = malloc_snapshot.statistics('lineno') - print("\nTop 10 memory-consuming locations:") - for i, stat in enumerate(top_stats[:10], 1): - print(f"{i}. {stat}") - - # Get statistics grouped by object type (more useful for identifying large objects) - top_types = malloc_snapshot.statistics('traceback') - print("\nTop 10 memory-consuming object types:") - for i, stat in enumerate(top_types[:10], 1): - print(f"{i}. 
{stat}") - # Print the traceback to see where the object was created - for line in stat.traceback.format(): - print(f" {line}") - elif test_type == 'profile': - max_memory, fitted_model = mem_usage - elapsed_time = end_time-start_time - logger.info(f"Maximum memory usage: {max_memory} MiB") - logger.info(f"Time to fit: {elapsed_time} seconds") - - result_file_name = os.path.join(root_dir,"mem_test_results.csv") - if not os.path.isfile(result_file_name): - with open(result_file_name,"w") as output: - output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") - - run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - with open(result_file_name,"a") as output: - output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") - elif test_type=='object': - # Collect all objects - all_objects = muppy.get_objects() - # Get a summary of memory usage by type - mem_summary = summary.summarize(all_objects) - summary.print_(mem_summary) - # Look at NumPy arrays - numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] - numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) - - print("\nTop 10 NumPy arrays by memory usage:") - for i, arr in enumerate(numpy_arrays[:10], 1): - memory_mb = arr.nbytes / (1024 * 1024) - print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") - - pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] - pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) - - print("\nTop 10 Pandas DataFrames by memory usage:") - for i, df in enumerate(pandas_dfs[:10], 1): - memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) - print(f"{i}. Shape: {df.shape}, Memory: {memory_mb:.2f} MB") + tracemalloc.start() + start_time = time.time() + if estimator == 'causalforest': + mem_usage = memory_usage( + (est.fit, [y,T], {"X":X}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + else: + mem_usage = memory_usage( + (est.fit, [X], {"y": y, "silent": True}), + interval=0.1, # Sample every 0.1 seconds + timeout=None, # No timeout + max_usage=True, # Get maximum memory used + retval=True, # Return the fitted model too + include_children=True # Include joblib's child processes + ) + + end_time = time.time() + + # mem_usage section + max_memory, fitted_model = mem_usage + elapsed_time = end_time-start_time + logger.info(f"Maximum memory usage using memory_usage: {max_memory} MiB") + logger.info(f"Time to fit: {elapsed_time} seconds") + + result_file_name = os.path.join(root_dir,"mem_test_results.csv") + if not os.path.isfile(result_file_name): + with open(result_file_name,"w") as output: + output.write("data,estimator,run_time,N_examples,N_nuisances,N_treatments,max_memory_mb,fit_time_secs\n") + + run_time_string = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + with open(result_file_name,"a") as output: + output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") + + + # malloc section + malloc_snapshot = tracemalloc.take_snapshot() + top_stats = malloc_snapshot.statistics('lineno') + print("\nTop 10 *tracemalloc* memory-consuming locations:") + for i, stat in enumerate(top_stats[:10], 1): + print(f"{i}. 
{stat}") + + # Get statistics grouped by object type (more useful for identifying large objects) + top_types = malloc_snapshot.statistics('traceback') + print("\nTop 10 *tracemalloc* memory-consuming object types:") + for i, stat in enumerate(top_types[:10], 1): + print(f"{i}. {stat}") + # Print the traceback to see where the object was created + for line in stat.traceback.format(): + print(f" {line}") + + + # muppy section + all_objects = muppy.get_objects() + # Get a summary of memory usage by type + mem_summary = summary.summarize(all_objects) + print("*muppy* Memory usage summary:") + summary.print_(mem_summary) + # Look at NumPy arrays + numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] + numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) + + print("\nTop 10 NumPy arrays by memory usage from *muppy*:") + for i, arr in enumerate(numpy_arrays[:10], 1): + memory_mb = arr.nbytes / (1024 * 1024) + print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") + + pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] + pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) + + print("\nTop 10 Pandas DataFrames by memory usage *muppy*:") + for i, df in enumerate(pandas_dfs[:10], 1): + memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + print(f"{i}. Shape: {df.shape}, Memory: {memory_mb:.2f} MB") if __name__ == "__main__": @@ -168,17 +210,17 @@ def causalforestdml_memory_test( parser.add_argument("--n_jobs", type=int, help="", default=-1) parser.add_argument("--memmap", type=bool, help="", default=False) parser.add_argument("--estimator", type=str, help="", default=None) - parser.add_argument("--test", type=str, help="profile or malloc, meaning use memory_profiler or tracemalloc", default="profile") + parser.add_argument("--downsample", type=float, help="", default=None) args = parser.parse_args(sys.argv[1:]) data_file = args.data_file causalforestdml_memory_test( - test_type=args.test, file_name=data_file, estimator=args.estimator, n_est_y=args.n_est_y, n_est_t=args.n_est_t, n_est_2=args.n_est_2, n_jobs=args.n_jobs, - use_memmap=args.memmap + use_memmap=args.memmap, + downsample=args.downsample ) From c64d911289ef656dca9fd68beeb80f014797f07e Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 9 May 2025 10:55:01 -0700 Subject: [PATCH 23/25] Better memory test that adds up the numpy arrays in the estimator --- scripts/memory_test_script.py | 263 ++++++++++++++++++++++++++++------ 1 file changed, 220 insertions(+), 43 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index d4ff50175..4db74a5e6 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -8,8 +8,12 @@ import sys import time import tracemalloc - import pandas as pd +import gc +import inspect +import types +import weakref +import warnings from econml.dml import CausalForestDML from memory_profiler import memory_usage @@ -18,10 +22,180 @@ from xgboost import XGBRegressor +# More aggressive warning suppression +# Filter all warnings from scipy +warnings.filterwarnings("ignore", module="scipy") +# Also try to catch other possible warning categories +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", message="Please import") + logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = 
logging.getLogger(__name__) +def get_size_of_object(obj): + """Get memory size of an object in bytes""" + if isinstance(obj, np.ndarray): + return obj.nbytes + elif isinstance(obj, pd.DataFrame): + return obj.memory_usage(deep=True).sum() + elif hasattr(obj, 'nbytes'): # Other objects with nbytes attribute + return obj.nbytes + else: + # Approximate size for other objects + return sys.getsizeof(obj) + + +def analyze_object_memory(obj, name="object", max_depth=100): + """ + Analyze a specific object for large arrays and DataFrames + + Args: + obj: The object to analyze + name: Name to use for the object's root path + max_depth: Maximum recursion depth + + Returns: + List of large arrays/DataFrames found in the object + """ + # Force garbage collection + gc.collect() + print(f"Analyzing memory usage of {name}...") + + # Get results using the find_arrays_in_object function + results = find_arrays_in_object(obj, name, max_depth=max_depth) + + # Sort by size + results.sort(key=lambda x: x["size_mb"], reverse=True) + + # Deduplicate by object id + unique_results = [] + seen_ids = set() + for result in results: + obj_id = id(result["object"]) + if obj_id not in seen_ids: + seen_ids.add(obj_id) + unique_results.append(result) + + sums = {} + counts = {} + for item in unique_results: + # Extract the last component of the path (after the last period) + last_component = item["path"].split(".")[-1] + + # Add the size_mb to the sum for this last component + if last_component in sums: + sums[last_component] += item["size_mb"] + counts[last_component] += 1 + else: + sums[last_component] = item["size_mb"] + counts[last_component] = 1 + + sums = dict(sorted(sums.items(), key=lambda x: x[1], reverse=True)) + + df = pd.DataFrame({ + "obj_name": list(sums.keys()), + "total_size_mb": list(sums.values()), + "count": [counts[key] for key in sums.keys()] + }) + + # Sort the DataFrame by total_size_mb in descending order + df = df.sort_values(by="total_size_mb", ascending=False).reset_index(drop=True) + print(df) + + return df + + +def find_arrays_in_object(obj, path="", visited=None, results=None, max_depth=10): + """ + Recursively search an object and its attributes for NumPy arrays and DataFrames + + Args: + obj: The object to search + path: Current attribute path (for tracking) + visited: Set of already visited object IDs + results: List to collect results + max_depth: Maximum recursion depth + """ + if visited is None: + visited = set() + if results is None: + results = [] + + # Stop if we've seen this object before to prevent infinite recursion + obj_id = id(obj) + if obj_id in visited: + return results + + # Add to visited set + visited.add(obj_id) + + # Check if this object is a numpy array or DataFrame + if isinstance(obj, np.ndarray): + size_mb = float(obj.nbytes) / float(1024 * 1024) + results.append({ + "type": "ndarray", + "path": path, + "shape": obj.shape, + "dtype": obj.dtype, + "size_mb": size_mb, + "object": obj + }) + return results + elif isinstance(obj, pd.DataFrame): + size_mb = float(obj.memory_usage(deep=True).sum()) / float(1024 * 1024) + results.append({ + "type": "DataFrame", + "path": path, + "shape": obj.shape, + "size_mb": size_mb, + "object": obj, + "columns": list(obj.columns)[:5] + ['...'] if len(obj.columns) > 5 else list( + obj.columns) + }) + return results + # Skip certain types that don't contain arrays + elif (isinstance(obj, (str, int, float, bool, type, types.FunctionType, types.MethodType, weakref.ref)) or obj is None): + return results + + # For dictionaries + elif 
isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(key, (str, int, float, bool, tuple)): + new_path = f"{path}['{key}']" if path else f"['{key}']" + find_arrays_in_object(value, new_path, visited, results, + max_depth) + return results + + # For lists and tuples + elif isinstance(obj, (list, tuple)): + for i, item in enumerate(obj): + new_path = f"{path}[{i}]" if path else f"[{i}]" + find_arrays_in_object(item, new_path, visited, results, max_depth) + return results + + else: + for attr_name in sorted(list(dir(obj))): + if not (attr_name.startswith('__') or attr_name.startswith('_')): # Skip special methods + try: + attr_value = getattr(obj, attr_name) + if not callable(attr_value): + new_path = f"{path}.{attr_name}" if path else attr_name + find_arrays_in_object(attr_value, new_path, visited, results, + max_depth) + except Exception as e: + continue + return results + + + + + + + + + def balanced_downsample(X:pd.DataFrame, y:np.array, T_feat:pd.DataFrame, t_id:np.array, downsample_ratio:float): """ Balanced downsampling based on treatment identity @@ -56,8 +230,8 @@ def balanced_downsample(X:pd.DataFrame, y:np.array, T_feat:pd.DataFrame, t_id:np logger.info(f'Dowsampled negative examples by {downsample_ratio}') actions_before = negative_df['action_id'].value_counts() actions_after = df_downsampled['action_id'].value_counts() - logger.info(f'Before: {actions_before}') - logger.info(f'After: {actions_after}') + # logger.info(f'Before: {actions_before}') + # logger.info(f'After: {actions_after}') # Final sample combo_df = pd.concat([positive_df,df_downsampled],axis=0).reset_index(drop=True) @@ -159,46 +333,49 @@ def causalforestdml_memory_test( with open(result_file_name,"a") as output: output.write(f"{file_name},{estimator},{run_time_string},{X.shape[0]},{X.shape[1]},{T.shape[1]},{max_memory:.1f},{elapsed_time:.1f}\n") - - # malloc section - malloc_snapshot = tracemalloc.take_snapshot() - top_stats = malloc_snapshot.statistics('lineno') - print("\nTop 10 *tracemalloc* memory-consuming locations:") - for i, stat in enumerate(top_stats[:10], 1): - print(f"{i}. {stat}") - - # Get statistics grouped by object type (more useful for identifying large objects) - top_types = malloc_snapshot.statistics('traceback') - print("\nTop 10 *tracemalloc* memory-consuming object types:") - for i, stat in enumerate(top_types[:10], 1): - print(f"{i}. {stat}") - # Print the traceback to see where the object was created - for line in stat.traceback.format(): - print(f" {line}") - - - # muppy section - all_objects = muppy.get_objects() - # Get a summary of memory usage by type - mem_summary = summary.summarize(all_objects) - print("*muppy* Memory usage summary:") - summary.print_(mem_summary) - # Look at NumPy arrays - numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] - numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) - - print("\nTop 10 NumPy arrays by memory usage from *muppy*:") - for i, arr in enumerate(numpy_arrays[:10], 1): - memory_mb = arr.nbytes / (1024 * 1024) - print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") - - pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] - pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) - - print("\nTop 10 Pandas DataFrames by memory usage *muppy*:") - for i, df in enumerate(pandas_dfs[:10], 1): - memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) - print(f"{i}. 
Shape: {df.shape}, Memory: {memory_mb:.2f} MB") + analyze_object_memory(est,name=estimator, max_depth=100) + + logger.info('Done') + + # # malloc section + # malloc_snapshot = tracemalloc.take_snapshot() + # top_stats = malloc_snapshot.statistics('lineno') + # print("\nTop 10 *tracemalloc* memory-consuming locations:") + # for i, stat in enumerate(top_stats[:10], 1): + # print(f"{i}. {stat}") + # + # # Get statistics grouped by object type (more useful for identifying large objects) + # top_types = malloc_snapshot.statistics('traceback') + # print("\nTop 10 *tracemalloc* memory-consuming object types:") + # for i, stat in enumerate(top_types[:10], 1): + # print(f"{i}. {stat}") + # # Print the traceback to see where the object was created + # for line in stat.traceback.format(): + # print(f" {line}") + # + # + # # muppy section + # all_objects = muppy.get_objects() + # # Get a summary of memory usage by type + # mem_summary = summary.summarize(all_objects) + # print("*muppy* Memory usage summary:") + # summary.print_(mem_summary) + # # Look at NumPy arrays + # numpy_arrays = [obj for obj in all_objects if isinstance(obj, np.ndarray)] + # numpy_arrays.sort(key=lambda arr: arr.nbytes, reverse=True) + # + # print("\nTop 10 NumPy arrays by memory usage from *muppy*:") + # for i, arr in enumerate(numpy_arrays[:10], 1): + # memory_mb = arr.nbytes / (1024 * 1024) + # print(f"{i}. Shape: {arr.shape}, Dtype: {arr.dtype}, Memory: {memory_mb:.2f} MB") + # + # pandas_dfs = [obj for obj in all_objects if isinstance(obj, pd.DataFrame)] + # pandas_dfs.sort(key=lambda df: df.memory_usage(deep=True).sum(), reverse=True) + # + # print("\nTop 10 Pandas DataFrames by memory usage *muppy*:") + # for i, df in enumerate(pandas_dfs[:10], 1): + # memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + # print(f"{i}. 
Shape: {df.shape}, Memory: {memory_mb:.2f} MB") if __name__ == "__main__": From 5ff58072deb39233605ab080503d5f7cda274abb Mon Sep 17 00:00:00 2001 From: Carl Gold Date: Fri, 9 May 2025 12:22:21 -0700 Subject: [PATCH 24/25] Fix print statement --- scripts/memory_test_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index 4db74a5e6..630fc12f0 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -321,7 +321,7 @@ def causalforestdml_memory_test( # mem_usage section max_memory, fitted_model = mem_usage elapsed_time = end_time-start_time - logger.info(f"Maximum memory usage using memory_usage: {max_memory} MiB") + logger.info(f"Maximum memory usage during fit: {max_memory} MiB") logger.info(f"Time to fit: {elapsed_time} seconds") result_file_name = os.path.join(root_dir,"mem_test_results.csv") From 999d03080dc7d356920eecdce2180a5d94928760 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 May 2025 19:24:15 +0000 Subject: [PATCH 25/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/memory_test_script.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/memory_test_script.py b/scripts/memory_test_script.py index b5d0fcb5d..bd66a3488 100644 --- a/scripts/memory_test_script.py +++ b/scripts/memory_test_script.py @@ -10,7 +10,6 @@ import tracemalloc import pandas as pd import gc -import inspect import types import weakref import warnings @@ -18,7 +17,6 @@ from econml.dml import CausalForestDML from memory_profiler import memory_usage -from pympler import muppy, summary from catboost import CatBoostRegressor from xgboost import XGBRegressor @@ -57,7 +55,8 @@ def analyze_object_memory(obj, name="object", max_depth=100): name: Name to use for the object's root path max_depth: Maximum recursion depth - Returns: + Returns + ------- List of large arrays/DataFrames found in the object """ # Force garbage collection @@ -185,7 +184,7 @@ def find_arrays_in_object(obj, path="", visited=None, results=None, max_depth=10 new_path = f"{path}.{attr_name}" if path else attr_name find_arrays_in_object(attr_value, new_path, visited, results, max_depth) - except Exception as e: + except Exception: continue return results
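
---

A minimal sketch of the profiling flow these later patches converge on (peak-memory measurement via memory_profiler, then the recursive array accounting added in patch 23). This is illustrative only: it assumes econml and memory_profiler are installed, that scripts/memory_test_script.py is importable as a module named memory_test_script (a hypothetical import path), and it uses small synthetic data rather than the marketing data sets the script targets.

    import numpy as np
    from econml.dml import CausalForestDML
    from memory_profiler import memory_usage

    # Hypothetical import path: assumes scripts/memory_test_script.py is on PYTHONPATH
    from memory_test_script import analyze_object_memory

    # Illustrative sizes only
    rng = np.random.default_rng(0)
    n, d_x, d_t = 2000, 20, 3
    X = rng.normal(size=(n, d_x))
    T = rng.normal(size=(n, d_t))
    y = X[:, 0] + T @ np.array([1.0, 0.5, -0.5]) + rng.normal(size=n)

    est = CausalForestDML(n_estimators=100, n_jobs=1)

    # Peak memory of the fit call, sampled every 0.1 s, including joblib children
    max_memory, fitted = memory_usage(
        (est.fit, [y, T], {"X": X}),
        interval=0.1, max_usage=True, retval=True, include_children=True,
    )
    print(f"peak memory during fit: {max_memory:.1f} MiB")

    # Recursively walk the fitted estimator and sum the ndarrays/DataFrames it retains
    sizes = analyze_object_memory(est, name="causalforest", max_depth=100)
    print(sizes.head(10))

analyze_object_memory returns the per-attribute size table as a DataFrame (sorted by total_size_mb), so the largest retained arrays can be inspected or written out after the run.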