diff --git a/benchmarks/bench_applytocols.py b/benchmarks/bench_applytocols.py deleted file mode 100644 index 6a64a0c8..00000000 --- a/benchmarks/bench_applytocols.py +++ /dev/null @@ -1,73 +0,0 @@ -import gc -import time -import numpy as np -import pandas as pd -from joblib import parallel_backend - -import stratum as skrub -from skrub import ApplyToCols, StringEncoder - -# Create a synthetic test column -def make_data (n_rows, seed, vocab_size, - avg_words, words_len_range=(3, 10), n_features=1) -> pd.DataFrame: - rng = np.random.default_rng(seed) - - # Create a random lowercase word from ascii characters - def rand_word(): - size = rng.integers(words_len_range[0], words_len_range[1]) - return ''.join(rng.choice(list('abcdefghijklmnopqrstuvwxyz'), size=size)) #use ascii - - # Build a vocabulary of unique words - vocab = [rand_word() for _ in range(vocab_size)] - - # Function to generate a single text series - def gen_series(): - # Randomly generate number of words (around avg_words) in each row - n_per_row = np.maximum(1, rng.poisson(avg_words, size=n_rows)) - rows = [] - for k in n_per_row: - idx = rng.integers(0, vocab_size, size=k) - rows.append(' '.join(vocab[i] for i in idx)) - return pd.Series(rows) - - # Generate n_features columns - data = {f"text_{i+1}": gen_series() for i in range(n_features)} - return pd.DataFrame(data) - -def main(): - n_rows = 100_000 #number of rows (=200K) - vocab_size = 20000 #number of unique words (=5K). Large -> more distinct tokens -> sparser matrix - avg_words = 8 #average number of words per row (=8) - words_len = (3, 10) #length of each word (low to high) - n_features = 2 #number of features - - # Generate synthetic data - print("Generate synthetic data") - X = make_data(n_rows, 42, vocab_size, avg_words, words_len, n_features) - print(X) - - # Build encoder - enc = StringEncoder( - vectorizer="hashing", #hashing->tfidf - analyzer="char", - ngram_range=(3, 4), - n_components= 30, - random_state=0 - ) - - # Main benchmark. 
Run on the entire dataset - print("\nStarting main benchmark") - skrub.set_config(rust_backend=True, debug_timing=True) #sklearn backend - t0 = time.perf_counter() - with parallel_backend('threading'): - enc_cols = ApplyToCols(enc, n_jobs=n_features) #apply one encoder on all columns - Z = enc_cols.fit_transform(X) - t1 = time.perf_counter() - exec_time = t1 - t0 - print(f"Shape = {Z.shape}") - print(f"skrub - Execution time = {exec_time:8.3f}s\n") - del Z #optimize memory, especially for dense outputs - gc.collect() - -if __name__ == '__main__': - main() diff --git a/benchmarks/bench_onehot_encoder.py b/benchmarks/bench_onehot_encoder.py deleted file mode 100644 index f4e43e90..00000000 --- a/benchmarks/bench_onehot_encoder.py +++ /dev/null @@ -1,88 +0,0 @@ -import gc -import time -import numpy as np -import pandas as pd - -import stratum as skrub -from stratum import OneHotEncoder - -# Create synthetic features -def make_categorical_df( - n_rows: int, - n_features: int, - n_dists: int, #nuber of distinct items in each feature - seed: int = 42, -) -> pd.DataFrame: - rng = np.random.default_rng(seed) - - data = {} - #for j, k in enumerate(cardinals): - for j in range(n_features): - # Create category pool: e.g., ['c0_000001', ..., 'c0_00xxxx'] - cats = np.array([f"c{j}_{i:06d}" for i in range(n_dists)], dtype=object) - idx = rng.integers(0, n_dists, size=n_rows) - col = cats[idx].copy() - - data[f"col{j}"] = col - return pd.DataFrame(data) - -def OHE_benchmark(X, sparse_output): - # Build one hot encoder - enc = OneHotEncoder( - drop="if_binary", - dtype=np.float32, - handle_unknown="ignore", - sparse_output=sparse_output, - ) - - # Warm-up small runs - skrub.set_config(rust_backend=False) - X_small = X.iloc[: min(2048, len(X))] - _ = enc.fit_transform(X_small) - gc.collect() - skrub.set_config(rust_backend=True) - X_small = X.iloc[: min(2048, len(X))] - _ = enc.fit_transform(X_small) - gc.collect() - - # Main benchmark. 
Run on the entire dataset - print("\nStarting main benchmark") - skrub.set_config(rust_backend=False, debug_timing=True) #sklearn backend - t0 = time.perf_counter() - Z = enc.fit_transform(X) - t1 = time.perf_counter() - exec_time = t1 - t0 - print(f"Shape = {Z.shape}") - print(f"skrub - Execution time = {exec_time:8.3f}s\n") - del Z #optimize memory, especially for dense outputs - gc.collect() - - skrub.set_config(rust_backend=True, debug_timing=True, num_threads=0) #rust backend - t0 = time.perf_counter() - Z = enc.fit_transform(X) - t1 = time.perf_counter() - exec_time = t1 - t0 - print(f"Shape = {Z.shape}") - print(f"stratum - Execution time = {exec_time:8.3f}s\n") - - - -def main(): - print("Generate synthetic data for sparse output") - n_rows = 2_000_000 - n_features = 4 - n_dists = 200_000 # num of distinct items in each feature - X = make_categorical_df(n_rows=n_rows, n_features=n_features, n_dists=n_dists) - print(X.head(), "\n") - OHE_benchmark(X, sparse_output=True) - - print("Generate synthetic data for dense output") - n_rows = 200_000 - n_features = 4 - n_dists = 10_000 # num of distinct items in each feature - X = make_categorical_df(n_rows=n_rows, n_features=n_features, n_dists=n_dists) - print(X.head(), "\n") - OHE_benchmark(X, sparse_output=False) - -if __name__ == "__main__": - main() diff --git a/benchmarks/bench_string_encoder.py b/benchmarks/bench_string_encoder.py deleted file mode 100644 index a0e638de..00000000 --- a/benchmarks/bench_string_encoder.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -This script runs StringEncoder on a synthetic dataset and compares Sklearn and Rust backends. -It is used to show the performance benefits of the Rust backend (~5.8x w/ 24*2 cores). -This script also shows the use various config flags related to the Rust backend. 
-""" -import gc -import time -import numpy as np -import pandas as pd -import stratum as skrub -from stratum import StringEncoder - -# Create a synthetic test column -def make_series (n_rows, seed, vocab_size, avg_words, words_len_range=(3, 10)) -> pd.Series: - rng = np.random.default_rng(seed) - - # Create a random lowercase word from ascii characters - def rand_word(): - size = rng.integers(words_len_range[0], words_len_range[1]) - return ''.join(rng.choice(list('abcdefghijklmnopqrstuvwxyz'), size=size)) #use ascii - - # Build a vocabulary of unique words - vocab = [rand_word() for _ in range(vocab_size)] - - # Randomly generate number of words (around avg_words) in each row - n_per_row = np.maximum(1, rng.poisson(avg_words, size=n_rows)) - - rows = [] - for k in n_per_row: - idx = rng.integers(0, vocab_size, size=k) - rows.append(' '.join(vocab[i] for i in idx)) - - return pd.Series(rows, name="text") - -def main(): - n_rows = 100_000 #number of rows (=100K) - vocab_size = 20000 #number of unique words (=20K). Large -> more distinct tokens -> sparser matrix - avg_words = 8 #average number of words per row (=8) - words_len = (3, 10) #length of each word (low to high) - - # Generate synthetic data - print("Generate synthetic data") - X = make_series(n_rows, 42, vocab_size, avg_words, words_len) - print(X) - - # Build encoder - enc = StringEncoder( - vectorizer="tfidf", - analyzer="char_wb", - ngram_range=(3, 4), - n_components= 30, - random_state=0 - ) - - # Warm-up small runs to load code paths, JIT caches inside SciPy, etc. 
- skrub.set_config(rust_backend=False) #sklearn backend - X_small = X.iloc[: min(2048, len(X))] - _ = enc.fit_transform(X_small) - gc.collect() - skrub.set_config(rust_backend=True) #rust backend - X_small = X.iloc[: min(2048, len(X))] - _ = enc.fit_transform(X_small) - gc.collect() - - # Main benchmark: Run on the entire dataset - print("\nStarting main benchmark") - skrub.set_config(rust_backend=False) #sklearn - t0 = time.perf_counter() - X_enc = enc.fit_transform(X) - print(f"Shape = {X_enc.shape}") - t1 = time.perf_counter() - exec_time = t1 - t0 - print(f"skrub - Execution time = {exec_time:8.3f}s\n") - - skrub.set_config(rust_backend=True, debug_timing=False, num_threads=0) #rust - t0 = time.perf_counter() - X_enc = enc.fit_transform(X) - print(f"Shape = {X_enc.shape}") - t1 = time.perf_counter() - exec_time = t1 - t0 - print(f"stratum - Execution time = {exec_time:8.3f}s\n") - - -if __name__ == '__main__': - main() diff --git a/benchmarks/bench_tablevectorizer.py b/benchmarks/bench_tablevectorizer.py deleted file mode 100644 index e1f0ab1e..00000000 --- a/benchmarks/bench_tablevectorizer.py +++ /dev/null @@ -1,49 +0,0 @@ -import time -import stratum as skrub -from sklearn.preprocessing import OneHotEncoder -from skrub.datasets import fetch_employee_salaries -from skrub import TableVectorizer, StringEncoder -import pandas as pd -from joblib import parallel_backend - -def main(): - # Load dataset - dataset = fetch_employee_salaries() - employees, salaries = dataset.X, dataset.y - - # Append dataset n times to have a larger dataset - employees = pd.concat([employees] * 10, ignore_index=True) - print(employees.info()) - employees = employees.dropna() #necessary for rusty one-hot encoder - - # Use skrub's vanilla TableVectorizer - skrub.set_config(rust_backend=False, debug_timing=False) - t0 = time.perf_counter() - vectorizer = TableVectorizer(n_jobs=-1) - employees_enc = vectorizer.fit_transform(employees) - t1 = time.perf_counter() - exec_time = t1 - t0 - 
print(f"skrub - Encoding time: {exec_time:8.3f}s\n") - print(f"Encoded data shape: {employees_enc.shape}") - - - # Use stratum's TableVectorizer - t0 = time.perf_counter() - skrub.set_config(rust_backend=True, debug_timing=False, scheduler=True, stats=True) - with parallel_backend('threading'): - vectorizer = TableVectorizer(high_cardinality=StringEncoder(), low_cardinality=OneHotEncoder(), n_jobs=-1) #default setup - employees_enc = vectorizer.fit_transform(employees) - t1 = time.perf_counter() - exec_time = t1 - t0 - print(f"stratum - Encoding time: {exec_time:8.3f}s\n") - print(f"Encoded data shape: {employees_enc.shape}") - - # Explore the encodings - print(vectorizer.kind_to_columns_) - print("Fitted transformers to department column") - print(vectorizer.transformers_["department"]) #low_cardinality - print("Fitted transformers to division column") - print(vectorizer.transformers_["division"]) #high_cardinality - -if __name__ == "__main__": - main() diff --git a/benchmarks/logical_optimizer/end-to-end/20newsgroups.py b/benchmarks/logical_optimizer/end-to-end/20newsgroups.py deleted file mode 100644 index 1b688cc3..00000000 --- a/benchmarks/logical_optimizer/end-to-end/20newsgroups.py +++ /dev/null @@ -1,147 +0,0 @@ -from sklearn.datasets import fetch_20newsgroups -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.model_selection import KFold, ShuffleSplit -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression -from sklearn.svm import LinearSVC - -from stratum.logical_optimizer import apply_cse_on_skrub_ir -from stratum.api.gridsearch import grid_search - -import stratum as skrub -import logging -import tempfile -import numpy as np -import pandas as pd -from time import time - -logging.basicConfig(level=logging.INFO) - -from sklearn.base import BaseEstimator, TransformerMixin - -class PandasTfidfVectorizer(BaseEstimator, TransformerMixin): - """A CountVectorizer that 
returns a pandas DataFrame instead of a sparse matrix.""" - def __init__(self, **kwargs): - self.vectorizer = TfidfVectorizer(**kwargs) - - def fit(self, X: pd.DataFrame, y=None): - X = X.iloc[:,0] - self.vectorizer.fit(X) - return self - - def transform(self, X): - X = X.iloc[:,0] - X_counts = self.vectorizer.transform(X) - df = pd.DataFrame.sparse.from_spmatrix( - X_counts, - columns=self.vectorizer.get_feature_names_out() - ) - return df - - def fit_transform(self, X, y=None, **kwargs): - return self.fit(X).transform(X) - - def get_feature_names_out(self, *args, **kwargs): - return self.vectorizer.get_feature_names_out(*args, **kwargs) - -def tfidf_pipeline(df_path: str, show_graph: bool = False, stratum: bool = False, kfold: bool = False): - path = skrub.as_data_op(df_path) - data = path.skb.apply_func(pd.read_csv).skb.subsample(n=100) - data = data.fillna("") - y = data["y"].skb.mark_as_y() - X = data[["text"]].skb.mark_as_X() - - - vectorizer = PandasTfidfVectorizer() - - pipes = {f"pipeline{i}": X.skb.apply(vectorizer). 
- skb.apply(model, y=y) for i, model in - enumerate( - [LinearRegression(), - Ridge(), - LogisticRegression(max_iter=1000), - LinearSVC(), - MultinomialNB(), - ])} - pred = skrub.choose_from(pipes).as_data_op() - - if show_graph: - pred.skb.draw_graph().open() - print("----------------------------------------") - stats1 = make_gridsearch(pred, kfold=kfold) - print("\nOriginal Pipeline (njobs=1) took: ",sum(stats1)) - print("----------------------------------------") - - stats2 = make_gridsearch(pred, multi=True, kfold=kfold) - print("\nOriginal Pipeline (njobs=-1) took: ",sum(stats2)) - print("----------------------------------------") - - - stats_opt = make_gridsearch(pred, optimize_enabled=True, stratum=stratum, kfold=kfold) - print("\nTotal optimized Pipeline took: ",sum(stats_opt)) - print("----------------------------------------") - if show_graph: - pred.skb.draw_graph().open() - return np.array([stats1, stats2, stats_opt]) - - -def make_gridsearch(pred, random_state=42, optimize_enabled=False, stratum=False, multi=False, kfold=False) -> tuple[list, list]: - if optimize_enabled: - t00 = time() - pred = apply_cse_on_skrub_ir(pred) - t0 = time() - stats = [t0 - t00] - else: - t0 = time() - stats = [0.0] - - cv = KFold(n_splits=3, shuffle=True, random_state=random_state) if kfold else ShuffleSplit(n_splits=1, test_size=0.2, random_state=42) - - if stratum: - search = grid_search(pred, cv=cv, show_stats=True) - else: - search = pred.skb.make_grid_search(fitted=True, cv=cv, n_jobs=-1 if multi else 1) - print("Search results: \n", search.results_) - - t1 = time() - stats.append(t1-t0) - return stats - - -def run_tfidf_pipeline_benchmark(stratum: bool = False, kfold: bool = False): - data = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")) - df = pd.DataFrame({"text": data.data, "y": data.target}) - df["text"].fillna("", inplace=True) - print("df shape: ", df.shape) - list_of_stats = [] - parameters = [100, 500, 1000,10000] - for n_rows in 
parameters: - df_n_rows = df.head(n_rows) - with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8", suffix=".csv", delete=False) as f: - df_n_rows.to_csv(f) - f.flush() - temp_path = f.name - - stats = tfidf_pipeline(temp_path, show_graph = False, stratum=stratum, kfold=kfold) - stats = np.hstack((np.array([n_rows, n_rows, n_rows]).reshape((3, 1)), stats)) - list_of_stats.append(stats) - - stats = np.vstack(list_of_stats) - - columns_results = ["n_rows","optimize", "runtime"] - df = pd.DataFrame(stats, columns=columns_results) - - columns_results.remove("n_rows") - df["total"] = df[columns_results].sum(axis=1).apply(lambda x: "{:.3f}".format(x)) - for col in columns_results: - df[col] = df[col].apply(lambda x: "{:.3f}".format(x)) - df["n_rows"] = df["n_rows"].astype(int) - - print(df) - df.to_csv(f"bench_cse_tfidf_gridsearch.csv", index=False) - -def main(): - run_tfidf_pipeline_benchmark(stratum=True, kfold=False) - -if __name__ == '__main__': - main() diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/README.md b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/README.md deleted file mode 100644 index 0105a651..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Bike Sharing Demand - -This benchmark evaluates data preprocessing pipelines on the bike-sharing demand dataset, comparing different approaches for feature engineering and encoding. - -## Overview - -The benchmark consists of multiple pipeline implementations (`pipeline0.py` through `pipeline4.py`) that demonstrate various preprocessing strategies for the bike-sharing demand prediction task. The goal is to compare the performance and execution time of different pipeline designs. -Before running the benchmark, please download the dataset and augment the dataset, as described below. 
- -## Running the Benchmark - -### Baseline Pipelines - -To run the baseline pipeline comparisons: - -```bash -python run_base_lines.py -``` - -This script executes multiple pipeline variants (pipeline0-4) and measures: -- Training time -- Prediction performance -- Memory usage - -Each pipeline implements different preprocessing strategies, allowing you to compare trade-offs between complexity and performance. - -### Skrubified Pipelines - -To run the optimized stratum/skrub-based pipelines: - -```bash -python skrubified_pipelines.py -``` - -## Data -```bash -kaggle competitions download -c bike-sharing-demand -unzip bike-sharing-demand.zip -d input/ -rm bike-sharing-demand.zip -``` - - -The benchmark uses various versions of the bike-sharing demand dataset: -- `train.csv` - Original training data -- `train_augmented_2x.csv` - 2x augmented dataset -- `train_augmented_3x.csv` - 3x augmented dataset -- `train_augmented_stratified.csv` - Stratified augmentation - -Data augmentation scripts are available in `bike_data_augmentation.py`. -```bash -python bike_data_augmentation.py -``` \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/bike_data_augmentation.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/bike_data_augmentation.py deleted file mode 100644 index f156dd91..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/bike_data_augmentation.py +++ /dev/null @@ -1,282 +0,0 @@ -""" -Data augmentation script for bike-sharing demand dataset. -Creates synthetic samples while avoiding duplicates. -""" - -import pandas as pd -import numpy as np -from pathlib import Path - -def augment_bike_data(input_path, output_path=None, augmentation_factor=2, seed=42): - """ - Augment bike-sharing demand data by creating synthetic samples. - - Parameters: - ----------- - input_path : str or Path - Path to the input CSV file - output_path : str or Path, optional - Path to save augmented data. 
If None, returns DataFrame - augmentation_factor : int - Multiplier for dataset size (2 = double the data) - seed : int - Random seed for reproducibility - - Returns: - -------- - pd.DataFrame - Augmented dataset with original + synthetic samples - """ - np.random.seed(seed) - - # Load original data - df = pd.read_csv(input_path) - print(f"Original dataset size: {len(df)}") - - # Identify column types - categorical_cols = [] - numeric_cols = [] - target_cols = ['casual', 'registered', 'count'] # Don't augment targets directly - - for col in df.columns: - if col in ['datetime']: - continue # Handle datetime separately - elif col in target_cols: - continue # Handle targets separately - elif df[col].dtype in ['int64', 'float64'] and df[col].nunique() < 20: - categorical_cols.append(col) - elif df[col].dtype in ['int64', 'float64']: - numeric_cols.append(col) - - print(f"Categorical columns: {categorical_cols}") - print(f"Numeric columns: {numeric_cols}") - - # Generate synthetic samples - num_synthetic = len(df) * (augmentation_factor - 1) - synthetic_samples = [] - - for i in range(num_synthetic): - # Randomly select two samples to interpolate/combine - idx1, idx2 = np.random.choice(len(df), size=2, replace=False) - sample1 = df.iloc[idx1] - sample2 = df.iloc[idx2] - - new_sample = {} - - # Handle datetime: use one of the existing samples or create nearby time - if 'datetime' in df.columns: - base_datetime = pd.to_datetime(sample1['datetime']) - # Add random time offset (±3 hours) - offset_hours = np.random.randint(-3, 4) - new_datetime = base_datetime + pd.Timedelta(hours=offset_hours) - new_sample['datetime'] = new_datetime.strftime('%Y-%m-%d %H:%M:%S') - - # Handle categorical columns: choose from one sample or pick random valid value - for col in categorical_cols: - if np.random.random() < 0.8: - # 80% chance: use value from one of the parent samples - new_sample[col] = sample1[col] if np.random.random() < 0.5 else sample2[col] - else: - # 20% chance: pick any 
valid value from the distribution - new_sample[col] = np.random.choice(df[col].dropna().values) - - # Handle numeric columns: interpolate with noise - for col in numeric_cols: - # Interpolation factor between the two samples - alpha = np.random.beta(2, 2) # Beta distribution favors middle values - interpolated = alpha * sample1[col] + (1 - alpha) * sample2[col] - - # Add small Gaussian noise (5% of std) - noise = np.random.normal(0, df[col].std() * 0.05) - new_value = interpolated + noise - - # Ensure value stays in valid range - new_value = np.clip(new_value, df[col].min(), df[col].max()) - new_sample[col] = new_value - - # Handle target columns: use relationship-based generation - # For bike sharing, casual + registered = count - if 'temp' in numeric_cols or 'atemp' in numeric_cols: - # Temperature strongly correlates with demand - temp_col = 'atemp' if 'atemp' in numeric_cols else 'temp' - temp_percentile = (new_sample[temp_col] - df[temp_col].min()) / (df[temp_col].max() - df[temp_col].min()) - - # Find similar weather conditions - similar_mask = ( - (df[temp_col] >= new_sample[temp_col] - df[temp_col].std() * 0.5) & - (df[temp_col] <= new_sample[temp_col] + df[temp_col].std() * 0.5) - ) - - if similar_mask.sum() > 0: - similar_samples = df[similar_mask] - base_casual = similar_samples['casual'].mean() - base_registered = similar_samples['registered'].mean() - else: - base_casual = df['casual'].mean() - base_registered = df['registered'].mean() - - # Add variation based on other factors - variation = np.random.normal(1.0, 0.15) - new_sample['casual'] = max(0, int(base_casual * variation)) - new_sample['registered'] = max(0, int(base_registered * variation)) - new_sample['count'] = new_sample['casual'] + new_sample['registered'] - else: - # Fallback: interpolate targets - alpha = np.random.beta(2, 2) - for target in target_cols: - if target in df.columns: - new_sample[target] = int(alpha * sample1[target] + (1 - alpha) * sample2[target]) - - 
synthetic_samples.append(new_sample) - - # Combine original and synthetic data - synthetic_df = pd.DataFrame(synthetic_samples) - augmented_df = pd.concat([df, synthetic_df], ignore_index=True) - - # Remove any exact duplicates (should be extremely rare with this approach) - original_len = len(augmented_df) - augmented_df = augmented_df.drop_duplicates() - duplicates_removed = original_len - len(augmented_df) - - if duplicates_removed > 0: - print(f"Removed {duplicates_removed} exact duplicates") - - print(f"Augmented dataset size: {len(augmented_df)}") - print(f"Augmentation ratio: {len(augmented_df) / len(df):.2f}x") - - # Verify no exact duplicates exist - assert augmented_df.duplicated().sum() == 0, "Duplicates detected in augmented data!" - - # Save if output path provided - if output_path: - augmented_df.to_csv(output_path, index=False) - print(f"Saved augmented data to {output_path}") - - return augmented_df - - -def create_stratified_augmentation(input_path, output_path=None, target_col='count', - bins=5, samples_per_bin=None, seed=42): - """ - Augment data with stratification to maintain target distribution. - Useful for imbalanced datasets. - - Parameters: - ----------- - input_path : str or Path - Path to the input CSV file - output_path : str or Path, optional - Path to save augmented data - target_col : str - Target column name for stratification - bins : int - Number of bins for stratification - samples_per_bin : int, optional - Number of synthetic samples per bin. 
If None, uses size of largest bin - seed : int - Random seed - - Returns: - -------- - pd.DataFrame - Augmented dataset - """ - np.random.seed(seed) - df = pd.read_csv(input_path) - - # Create bins for stratification - df['_bin'] = pd.qcut(df[target_col], q=bins, labels=False, duplicates='drop') - - bin_counts = df['_bin'].value_counts().sort_index() - print(f"Samples per bin: {bin_counts.to_dict()}") - - if samples_per_bin is None: - samples_per_bin = bin_counts.max() - - augmented_dfs = [df.drop('_bin', axis=1)] - - for bin_id in df['_bin'].unique(): - bin_df = df[df['_bin'] == bin_id].drop('_bin', axis=1) - current_count = len(bin_df) - needed = samples_per_bin - current_count - - if needed <= 0: - continue - - print(f"Augmenting bin {bin_id}: adding {needed} samples") - - # Generate synthetic samples within this bin - synthetic_samples = [] - for _ in range(needed): - idx1, idx2 = np.random.choice(len(bin_df), size=2, replace=True) - sample1 = bin_df.iloc[idx1] - sample2 = bin_df.iloc[idx2] - - new_sample = {} - alpha = np.random.uniform(0.3, 0.7) # Interpolation weight - - for col in bin_df.columns: - if col == 'datetime': - new_sample[col] = sample1[col] - elif bin_df[col].dtype == 'object': - new_sample[col] = sample1[col] if np.random.random() < 0.5 else sample2[col] - else: - val = alpha * sample1[col] + (1 - alpha) * sample2[col] - if bin_df[col].dtype in ['int64']: - val = int(round(val)) - new_sample[col] = val - - synthetic_samples.append(new_sample) - - synthetic_df = pd.DataFrame(synthetic_samples) - augmented_dfs.append(synthetic_df) - - result = pd.concat(augmented_dfs, ignore_index=True) - result = result.drop_duplicates() - - print(f"Final augmented size: {len(result)}") - - if output_path: - result.to_csv(output_path, index=False) - print(f"Saved to {output_path}") - - return result - - -if __name__ == "__main__": - # Example usage - input_file = Path(__file__).parent / "input" / "train.csv" - - # Basic augmentation (double the dataset) - 
output_file = Path(__file__).parent / "input" / "train_augmented_2x.csv" - augmented_data = augment_bike_data( - input_file, - output_file, - augmentation_factor=2, - seed=42 - ) - - # Triple the dataset - output_file_3x = Path(__file__).parent / "input" / "train_augmented_3x.csv" - augmented_data_3x = augment_bike_data( - input_file, - output_file_3x, - augmentation_factor=3, - seed=42 - ) - - # Stratified augmentation (maintains target distribution) - output_file_stratified = Path(__file__).parent / "input" / "train_augmented_stratified.csv" - stratified_data = create_stratified_augmentation( - input_file, - output_file_stratified, - target_col='count', - bins=5, - seed=42 - ) - - print("\n✅ Data augmentation complete!") - print("Generated files:") - print(f" - {output_file.name}") - print(f" - {output_file_3x.name}") - print(f" - {output_file_stratified.name}") diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/get_data.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/get_data.py deleted file mode 100644 index e69de29b..00000000 diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline0.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline0.py deleted file mode 100644 index 599c2274..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline0.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import pandas as pd -import numpy as np -import lightgbm as lgb -from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_log_error - -# Load data -_input_filename = os.getenv("BIKE_INPUT_FILE", "train.csv") -train = pd.read_csv(f"./input/{_input_filename}", parse_dates=["datetime"]) - - -# Feature engineering -def add_datetime_features(df): - df["year"] = df["datetime"].dt.year - df["month"] = df["datetime"].dt.month - df["dayofweek"] = df["datetime"].dt.dayofweek - df["hour"] = df["datetime"].dt.hour - return df - - -train = 
add_datetime_features(train) - -# Define features and target -features = [ - "season", - "holiday", - "workingday", - "weather", - "temp", - "atemp", - "humidity", - "windspeed", - "year", - "month", - "dayofweek", - "hour", -] -X = train[features] -y = train["count"] - - -# RMSLE scorer -def rmsle(y_true, y_pred): - return np.sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 0, None))) - - -# 5-fold CV -kf = KFold(n_splits=5, shuffle=True, random_state=42) -scores = [] -for train_idx, val_idx in kf.split(X): - X_train, X_val = X.iloc[train_idx], X.iloc[val_idx] - y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] - model = lgb.LGBMRegressor(random_state=42) - model.fit(X_train, y_train) - preds = model.predict(X_val) - scores.append(rmsle(y_val, preds)) - -print(f"5-fold RMSLE: {np.mean(scores):.5f}") diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline1.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline1.py deleted file mode 100644 index 3a549c7c..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline1.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import pandas as pd -import numpy as np -from sklearn.model_selection import KFold -from xgboost import XGBRegressor -from sklearn.metrics import mean_squared_log_error -import warnings - -warnings.filterwarnings("ignore") - -# Load data -_input_filename = os.getenv("BIKE_INPUT_FILE", "train.csv") -train = pd.read_csv(f"./input/{_input_filename}") - - -# Feature engineering -def fe(df): - df["datetime"] = pd.to_datetime(df["datetime"]) - df["year"] = df["datetime"].dt.year - df["month"] = df["datetime"].dt.month - df["hour"] = df["datetime"].dt.hour - df["weekday"] = df["datetime"].dt.weekday - return df - - -train = fe(train) - -# Define features and target -features = [ - "season", - "holiday", - "workingday", - "weather", - "temp", - "atemp", - "humidity", - "windspeed", - "year", - "month", - "hour", - "weekday", -] -X = 
train[features] -y = train["count"] - - -# RMSLE function -def rmsle(y_true, y_pred): - return np.sqrt(mean_squared_log_error(y_true, y_pred.clip(0, None))) - - -# 5-fold CV -kf = KFold(n_splits=5, shuffle=True, random_state=42) -scores = [] -for train_idx, val_idx in kf.split(X): - X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx] - y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx] - # log1p transform - y_tr_log = np.log1p(y_tr) - model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1) - model.fit(X_tr, y_tr_log) - preds_log = model.predict(X_val) - preds = np.expm1(preds_log) - scores.append(rmsle(y_val, preds)) -print(f"CV RMSLE: {np.mean(scores):.5f}") diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline2.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline2.py deleted file mode 100644 index 27be54a9..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline2.py +++ /dev/null @@ -1,58 +0,0 @@ -import os -import pandas as pd -import numpy as np -from sklearn.ensemble import RandomForestRegressor -from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_error - -# Load data -_input_filename = os.getenv("BIKE_INPUT_FILE", "train.csv") -train = pd.read_csv(f"./input/{_input_filename}") - -# Identify target column from sample submission - -# Feature engineering -for df in [train]: - df["datetime"] = pd.to_datetime(df["datetime"]) - df["year"] = df["datetime"].dt.year - df["month"] = df["datetime"].dt.month - df["dayofweek"] = df["datetime"].dt.dayofweek - df["hour"] = df["datetime"].dt.hour - -features = [ - "season", - "holiday", - "workingday", - "weather", - "temp", - "atemp", - "humidity", - "windspeed", - "year", - "month", - "dayofweek", - "hour", -] - -X = train[features] -y = train["count"].values - -# Log-transform target -y_log = np.log1p(y) - -# 5-fold CV for RMSLE -kf = KFold(n_splits=5, shuffle=True, random_state=42) -rmsle_scores = [] -for 
train_idx, val_idx in kf.split(X): - X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx] - y_tr, y_val = y_log[train_idx], y[val_idx] - model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) - model.fit(X_tr, y_tr) - preds_log = model.predict(X_val) - preds = np.expm1(preds_log) - preds[preds < 0] = 0 - rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds))) - rmsle_scores.append(rmsle) - -cv_rmsle = np.mean(rmsle_scores) -print(f"CV RMSLE: {cv_rmsle:.5f}") \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline3.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline3.py deleted file mode 100644 index 40e762c4..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline3.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import pandas as pd -import numpy as np -from sklearn.linear_model import Ridge -from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_error - -# Load data -_input_filename = os.getenv("BIKE_INPUT_FILE", "train.csv") -train = pd.read_csv(f"./input/{_input_filename}") - - -# Feature engineering -def preprocess(df): - df = df.copy() - df["datetime"] = pd.to_datetime(df["datetime"]) - df["year"] = df["datetime"].dt.year - df["month"] = df["datetime"].dt.month - df["dayofweek"] = df["datetime"].dt.dayofweek - df["hour"] = df["datetime"].dt.hour - return df[ - [ - "year", - "month", - "dayofweek", - "hour", - "season", - "weather", - "temp", - "atemp", - "humidity", - "windspeed", - "workingday", - "holiday", - ] - ] - - -X = preprocess(train) -y = np.log1p(train["count"]) - -# 5-fold CV for RMSLE on log scale -kf = KFold(n_splits=5, shuffle=True, random_state=42) -rmsle_scores = [] -for train_idx, val_idx in kf.split(X): - X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx] - y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx] - model = Ridge() - model.fit(X_tr, y_tr) - y_pred_log = model.predict(X_val) - 
rmsle = np.sqrt(mean_squared_error(y_val, y_pred_log)) - rmsle_scores.append(rmsle) - -cv_rmsle = np.mean(rmsle_scores) -print(f"CV RMSLE: {cv_rmsle:.5f}") diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline4.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline4.py deleted file mode 100644 index d4f43803..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/pipeline4.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import pandas as pd -import numpy as np -from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_log_error -import xgboost as xgb - -# Load data -_input_filename = os.getenv("BIKE_INPUT_FILE", "train.csv") -train = pd.read_csv(f"./input/{_input_filename}") - -# Identify target from sample submission -target_col = "count" - - -# Feature engineering -def prepare(df): - df = df.copy() - df["datetime"] = pd.to_datetime(df["datetime"]) - df["year"] = df["datetime"].dt.year - df["month"] = df["datetime"].dt.month - df["day_of_week"] = df["datetime"].dt.dayofweek - df["hour"] = df["datetime"].dt.hour - return df - - -train_p = prepare(train) - -features = [ - "season", - "weather", - "temp", - "atemp", - "humidity", - "windspeed", - "workingday", - "holiday", - "year", - "month", - "day_of_week", - "hour", -] -X = train_p[features] -y = np.log1p(train_p[target_col]) - -# 5-fold CV evaluation -kf = KFold(n_splits=5, shuffle=True, random_state=42) -rmsle_scores = [] -for train_idx, val_idx in kf.split(X): - X_train, X_val = X.iloc[train_idx], X.iloc[val_idx] - y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] - model = xgb.XGBRegressor( - objective="reg:squarederror", - n_estimators=100, - learning_rate=0.1, - max_depth=6, - random_state=42, - n_jobs=-1, - ) - model.fit(X_train, y_train) - y_pred = model.predict(X_val) - score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred))) - rmsle_scores.append(score) - -print(f"CV RMSLE: 
{np.mean(rmsle_scores):.5f}") diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/run_base_lines.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/run_base_lines.py deleted file mode 100644 index e38dd0be..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/run_base_lines.py +++ /dev/null @@ -1,92 +0,0 @@ -import time -import subprocess -import sys -import os - - -def run_pipeline(pipeline_name, input_filename): - """Run a pipeline script and measure its execution time.""" - print(f"\n{'=' * 60}") - print(f"Running {pipeline_name}...") - print(f"{'=' * 60}") - - start_time = time.time() - - try: - # Run the pipeline script - # Prepare environment with selected input filename - env = dict(os.environ) - env["BIKE_INPUT_FILE"] = input_filename - - result = subprocess.run( - [sys.executable, pipeline_name], - capture_output=True, - text=True, - check=True, - env=env, - ) - - elapsed_time = time.time() - start_time - - # Print the output - print(result.stdout) - if result.stderr: - print("STDERR:", result.stderr) - - print(f"\n✓ {pipeline_name} completed in {elapsed_time:.2f} seconds") - - return elapsed_time, True - - except subprocess.CalledProcessError as e: - elapsed_time = time.time() - start_time - print(f"\n✗ {pipeline_name} failed after {elapsed_time:.2f} seconds") - print("STDOUT:", e.stdout) - print("STDERR:", e.stderr) - return elapsed_time, False - except Exception as e: - elapsed_time = time.time() - start_time - print(f"\n✗ {pipeline_name} error after {elapsed_time:.2f} seconds: {e}") - return elapsed_time, False - - -def main(): - """Run all pipelines sequentially and report timing results.""" - # Allow selecting input file variant via CLI arg; fallback to 'train.csv' - input_filename = sys.argv[1] if len(sys.argv) > 1 else "train_augmented_3x.csv" - pipelines = [f"pipeline{i}.py" for i in range(5)] - - results = {} - total_start = time.time() - - print("Starting pipeline execution...") - 
print(f"Total pipelines to run: {len(pipelines)}") - - for pipeline in pipelines: - elapsed, success = run_pipeline(pipeline, input_filename) - results[pipeline] = { - 'time': elapsed, - 'success': success - } - - total_time = time.time() - total_start - - # Print summary - print(f"\n{'=' * 60}") - print("EXECUTION SUMMARY") - print(f"{'=' * 60}") - - for pipeline, result in results.items(): - status = "✓ SUCCESS" if result['success'] else "✗ FAILED" - print(f"{pipeline:20s} - {result['time']:8.2f}s - {status}") - - print(f"{'-' * 60}") - print(f"{'Total time:':20s} {total_time:8.2f}s") - print(f"{'=' * 60}") - - # Count successes - successful = sum(1 for r in results.values() if r['success']) - print(f"\nCompleted: {successful}/{len(pipelines)} pipelines successful") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/skrubified_pipelines.py b/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/skrubified_pipelines.py deleted file mode 100644 index 8e918eef..00000000 --- a/benchmarks/logical_optimizer/end-to-end/bike-sharing-demand/skrubified_pipelines.py +++ /dev/null @@ -1,125 +0,0 @@ -import skrub -import pandas as pd -import numpy as np -import lightgbm as lgb -from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import Ridge -from xgboost import XGBRegressor -from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_log_error, make_scorer -import time - -from stratum.logical_optimizer import apply_cse_on_skrub_ir - -t0 = time.time() - -# Load data -train = pd.read_csv("./input/train_augmented_3x.csv", parse_dates=["datetime"]) - -# Skrub DataOps plan -data = skrub.var("data", train).skb.subsample(n=1000) -X = data.drop("count", axis=1).skb.mark_as_X() -y = data["count"].skb.mark_as_y() -mode = skrub.eval_mode() - -# Pipeline 0 -datetime_col = X["datetime"].dt -X_feat_pipe0 = X.assign( - year=datetime_col.year, - 
month=datetime_col.month, - dayofweek=datetime_col.dayofweek, - hour=datetime_col.hour) - -X_feat_pipe0 = X_feat_pipe0.drop(["datetime", "casual", "registered"], axis=1, errors="ignore") -model_pipe0 = lgb.LGBMRegressor(random_state=42) -pred_pipe0 = X_feat_pipe0.skb.apply(model_pipe0, y=y).skb.set_name("Pipeline 0") - -# Pipeline 1 -X_feat_pipe1 = X.assign( - year=datetime_col.year, - month=datetime_col.month, - weekday=datetime_col.weekday, - hour=datetime_col.hour) - -X_feat_pipe1 = X_feat_pipe1.drop(["datetime", "casual", "registered"], axis=1, errors="ignore") - -model_pipe1 = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1) -pred_pipe1 = X_feat_pipe1.skb.apply(model_pipe1, y=y.skb.apply_func(np.log1p)).skb.set_name("Pipeline 1") -pred_final_pipe1 = pred_pipe1.skb.apply_func( - lambda a,b: np.expm1(a) if b=="predict" else a, - mode).skb.set_name("Reverse log for prediction1") - -# Pipeline 2 -X_feat_pipe2 = X.assign( - year=datetime_col.year, - month=datetime_col.month, - dayofweek=datetime_col.dayofweek, - hour=datetime_col.hour) -X_feat_pipe2 = X_feat_pipe2.drop(["datetime", "casual", "registered"], axis=1, errors="ignore") - -model_pipe2 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) -pred_pipe2 = X_feat_pipe2.skb.apply(model_pipe2, y=y.skb.apply_func(np.log1p)).skb.set_name("Pipeline 2") -pred_final_pipe2 = pred_pipe2.skb.apply_func( - lambda a,b: np.expm1(a) if b=="predict" else a, - mode).skb.set_name("Reverse log for prediction2") - -# Pipeline 3 -X_feat_pipe3 = X.assign( - year=datetime_col.year, - month=datetime_col.month, - dayofweek=datetime_col.dayofweek, - hour=datetime_col.hour) -X_feat_pipe3 = X_feat_pipe3.drop(["datetime", "casual", "registered"], axis=1, errors="ignore") - -model_pipe3 = Ridge() -pred_pipe3 = X_feat_pipe3.skb.apply(model_pipe3, y=y.skb.apply_func(np.log1p)).skb.set_name("Pipeline 3") -pred_final_pipe3 = pred_pipe3.skb.apply_func( - lambda a,b: np.expm1(a) if b=="predict" else a, - 
mode).skb.set_name("Reverse log for prediction3") - -# Pipeline 4 -X_feat_pipe4 = X.assign( - year=datetime_col.year, - month=datetime_col.month, - dayofweek=datetime_col.dayofweek, - hour=datetime_col.hour) -X_feat_pipe4 = X_feat_pipe4.drop(["datetime", "casual", "registered"], axis=1, errors="ignore") - -model_pipe4 = XGBRegressor( - objective="reg:squarederror", - n_estimators=100, - learning_rate=0.1, - max_depth=6, - random_state=42, - n_jobs=-1, -) -pred_pipe4 = X_feat_pipe4.skb.apply(model_pipe4, y=y.skb.apply_func(np.log1p)).skb.set_name("Pipeline 4") -pred_final_pipe4 = pred_pipe4.skb.apply_func( - lambda a,b: np.expm1(a) if b=="predict" else a, - mode).skb.set_name("Reverse log for prediction4") - -merged_pipelines = skrub.choose_from({ - "pipeline0": pred_pipe0, - "pipeline1": pred_final_pipe1, - "pipeline2": pred_final_pipe2, - "pipeline3": pred_final_pipe3, - "pipeline4": pred_final_pipe4, -}, name="merged pipelines").as_data_op().skb.set_name("GridSearchCV") - -# merged_pipelines.skb.draw_graph().open() -merged_pipelines = apply_cse_on_skrub_ir(merged_pipelines) -# merged_pipelines.skb.draw_graph().open() - -# RMSLE scorer -def rmsle(y_true, y_pred): - return np.sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 0, None))) -scorer = make_scorer(rmsle) - -cv = KFold(n_splits=5, shuffle=True, random_state=42) -t0_ = time.time() -search = merged_pipelines.skb.make_grid_search(fitted=True, cv=cv, scoring=scorer, n_jobs=-1) -print(search.results_) - -t1 = time.time() -print(t1 - t0) -print(t1 - t0_) diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/bar_plot.py b/benchmarks/logical_optimizer/end-to-end/california-housing/bar_plot.py deleted file mode 100644 index eeabd1b3..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/bar_plot.py +++ /dev/null @@ -1,39 +0,0 @@ -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np - -base_path = 
"benchmarks/logical_optimizer/end-to-end/california-housing/" -data = pd.read_csv(base_path + "california_housing_pipelines_benchmark.csv", sep=";") -data["time"] = data["time"].apply(np.round, decimals=2) - -# Prepare data in desired order: non-optimized, optimized -non_optimized = data.iloc[:3]["time"].values -optimized = data.iloc[3:6]["time"].values - -labels = ["skrub-njobs=1", "skrub-njobs=-1", "stratum-njobs=1"] -data = pd.DataFrame({"non_optimized": non_optimized, "optimized": optimized, "labels": labels}) - -# Publication-quality colorblind-friendly colors -# Using a 4-color palette: blue, orange, green, red (ColorBrewer inspired) -pub_colors = ['#F18F01', '#C73E1D', '#6A994E'] # Blue, Orange, Red, Green -exp_names = ("w/o Logical Rewrites", "w/ Logical Rewrites") -x = np.arange(len(exp_names)) # the label locations -width = 0.25 # the width of the bars - -multiplier = 0 - -fig, ax = plt.subplots(figsize=(5, 5), dpi=100, layout='constrained') -for i, row in data.iterrows(): - offset = width * multiplier - rects = ax.bar(x + offset, row[:2], width=width, label=labels[i], color=pub_colors[i]) - ax.bar_label(rects, padding=3) - multiplier += 1 - -ax.set_xticks(x + width, exp_names) -ax.set_yscale("log") -ax.set_ylabel("Time (s)") -ax.legend(loc="upper right", ncols=2) -plt.ylim(0.01, 30) -ax.grid(axis='y', alpha=0.3, linestyle='--') -plt.tight_layout() -plt.savefig(base_path + "california_housing_pipelines_benchmark_bar_plot.pdf") \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline0.py b/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline0.py deleted file mode 100644 index 4917903d..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline0.py +++ /dev/null @@ -1,55 +0,0 @@ -import numpy as np -import pandas as pd -from sklearn.model_selection import KFold, cross_val_score -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import Pipeline 
-from sklearn.compose import ColumnTransformer -from sklearn.linear_model import LinearRegression - -# Load dataset -df = pd.read_csv("input/train.csv") -target = "MedHouseVal" - -# Feature engineering -def feat_eng(X): - return X.assign( - BedroomsPerRoom=X["AveBedrms"] / X["AveRooms"], - IncomeSquared=X["MedInc"] ** 2, - IncomeRoomInteraction=X["MedInc"] * X["AveRooms"], - Density=X["Population"] / X["AveOccup"], - LatitudeLongitude=X["Latitude"] * X["Longitude"], - MedInc3=X["MedInc"] ** 3, - RoomDensity=X["AveRooms"] / X["Population"] - ) - -# Prepare features and target -X = df.drop(columns=[target]) -X = feat_eng(X) -y = df[target] - -numeric_features = X.columns.tolist() - -# Build preprocessing pipeline -numeric_transformer = Pipeline(steps=[ - ("scaler", StandardScaler()), -]) - -preprocessor = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features) - ] -) - -# Build model pipeline -model = Pipeline(steps=[ - ("preprocess", preprocessor), - ("regressor", LinearRegression()) -]) - -# Cross-validation -cv = KFold(n_splits=5, shuffle=True, random_state=42) -cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error') -mse_scores = cv_scores.mean() - -print(f"Pipeline 0 (LinearRegression) MSE: {mse_scores:.4f}") - diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline1.py b/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline1.py deleted file mode 100644 index 41d8dc91..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline1.py +++ /dev/null @@ -1,54 +0,0 @@ -import pandas as pd -from sklearn.model_selection import KFold, cross_val_score -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import Ridge - -# Load dataset -df = pd.read_csv("input/train.csv") -target = "MedHouseVal" - -# Feature engineering -def feat_eng(X): - return 
X.assign( - BedroomsPerRoom=X["AveBedrms"] / X["AveRooms"], - IncomeSquared=X["MedInc"] ** 2, - IncomeRoomInteraction=X["MedInc"] * X["AveRooms"], - Density=X["Population"] / X["AveOccup"], - LatitudeLongitude=X["Latitude"] * X["Longitude"], - MedInc3=X["MedInc"] ** 3, - RoomDensity=X["AveRooms"] / X["Population"] - ) - -# Prepare features and target -X = df.drop(columns=[target]) -X = feat_eng(X) -y = df[target] - -numeric_features = X.columns.tolist() - -# Build preprocessing pipeline -numeric_transformer = Pipeline(steps=[ - ("scaler", StandardScaler()), -]) - -preprocessor = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features) - ] -) - -# Build model pipeline -model = Pipeline(steps=[ - ("preprocess", preprocessor), - ("regressor", Ridge()) -]) - -# Cross-validation -cv = KFold(n_splits=5, shuffle=True, random_state=42) -cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error') -mse_scores = cv_scores.mean() - -print(f"Pipeline 1 (Ridge) MSE: {mse_scores:.4f}") - diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline2.py b/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline2.py deleted file mode 100644 index 1ec3af7a..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline2.py +++ /dev/null @@ -1,54 +0,0 @@ -import pandas as pd -from sklearn.model_selection import KFold, cross_val_score -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import Lasso - -# Load dataset -df = pd.read_csv("input/train.csv") -target = "MedHouseVal" - -# Feature engineering -def feat_eng(X): - return X.assign( - BedroomsPerRoom=X["AveBedrms"] / X["AveRooms"], - IncomeSquared=X["MedInc"] ** 2, - IncomeRoomInteraction=X["MedInc"] * X["AveRooms"], - Density=X["Population"] / X["AveOccup"], - LatitudeLongitude=X["Latitude"] * X["Longitude"], - 
MedInc3=X["MedInc"] ** 3, - RoomDensity=X["AveRooms"] / X["Population"] - ) - -# Prepare features and target -X = df.drop(columns=[target]) -X = feat_eng(X) -y = df[target] - -numeric_features = X.columns.tolist() - -# Build preprocessing pipeline -numeric_transformer = Pipeline(steps=[ - ("scaler", StandardScaler()), -]) - -preprocessor = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features) - ] -) - -# Build model pipeline -model = Pipeline(steps=[ - ("preprocess", preprocessor), - ("regressor", Lasso()) -]) - -# Cross-validation -cv = KFold(n_splits=5, shuffle=True, random_state=42) -cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error') -mse_scores = cv_scores.mean() - -print(f"Pipeline 2 (Lasso) MSE: {mse_scores:.4f}") - diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline3.py b/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline3.py deleted file mode 100644 index 0b4d7520..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/pipeline3.py +++ /dev/null @@ -1,54 +0,0 @@ -import pandas as pd -from sklearn.model_selection import KFold, cross_val_score -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import ElasticNet - -# Load dataset -df = pd.read_csv("input/train.csv") -target = "MedHouseVal" - -# Feature engineering -def feat_eng(X): - return X.assign( - BedroomsPerRoom=X["AveBedrms"] / X["AveRooms"], - IncomeSquared=X["MedInc"] ** 2, - IncomeRoomInteraction=X["MedInc"] * X["AveRooms"], - Density=X["Population"] / X["AveOccup"], - LatitudeLongitude=X["Latitude"] * X["Longitude"], - MedInc3=X["MedInc"] ** 3, - RoomDensity=X["AveRooms"] / X["Population"] - ) - -# Prepare features and target -X = df.drop(columns=[target]) -X = feat_eng(X) -y = df[target] - -numeric_features = X.columns.tolist() - -# Build preprocessing pipeline 
-numeric_transformer = Pipeline(steps=[ - ("scaler", StandardScaler()), -]) - -preprocessor = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features) - ] -) - -# Build model pipeline -model = Pipeline(steps=[ - ("preprocess", preprocessor), - ("regressor", ElasticNet()) -]) - -# Cross-validation -cv = KFold(n_splits=5, shuffle=True, random_state=42) -cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error') -mse_scores = cv_scores.mean() - -print(f"Pipeline 3 (ElasticNet) MSE: {mse_scores:.4f}") - diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/run_base_lines.py b/benchmarks/logical_optimizer/end-to-end/california-housing/run_base_lines.py deleted file mode 100644 index 6fb38ff3..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/run_base_lines.py +++ /dev/null @@ -1,91 +0,0 @@ -import time -import subprocess -import sys -import os - - -def run_pipeline(pipeline_name, input_filename=None): - """Run a pipeline script and measure its execution time.""" - print(f"\n{'=' * 60}") - print(f"Running {pipeline_name}...") - print(f"{'=' * 60}") - - start_time = time.time() - - try: - # Run the pipeline script - # Note: california-housing pipelines don't use input files, - # but we keep the interface consistent with bike-sharing - env = dict(os.environ) - - result = subprocess.run( - [sys.executable, pipeline_name], - capture_output=True, - text=True, - check=True, - env=env, - ) - - elapsed_time = time.time() - start_time - - # Print the output - print(result.stdout) - if result.stderr: - print("STDERR:", result.stderr) - - print(f"\n✓ {pipeline_name} completed in {elapsed_time:.2f} seconds") - - return elapsed_time, True - - except subprocess.CalledProcessError as e: - elapsed_time = time.time() - start_time - print(f"\n✗ {pipeline_name} failed after {elapsed_time:.2f} seconds") - print("STDOUT:", e.stdout) - print("STDERR:", e.stderr) - return elapsed_time, False - except 
Exception as e: - elapsed_time = time.time() - start_time - print(f"\n✗ {pipeline_name} error after {elapsed_time:.2f} seconds: {e}") - return elapsed_time, False - - -def main(): - """Run all pipelines sequentially and report timing results.""" - pipelines = [f"pipeline{i}.py" for i in range(4)] - - results = {} - total_start = time.time() - - print("Starting pipeline execution...") - print(f"Total pipelines to run: {len(pipelines)}") - - for pipeline in pipelines: - elapsed, success = run_pipeline(pipeline) - results[pipeline] = { - 'time': elapsed, - 'success': success - } - - total_time = time.time() - total_start - - # Print summary - print(f"\n{'=' * 60}") - print("EXECUTION SUMMARY") - print(f"{'=' * 60}") - - for pipeline, result in results.items(): - status = "✓ SUCCESS" if result['success'] else "✗ FAILED" - print(f"{pipeline:20s} - {result['time']:8.2f}s - {status}") - - print(f"{'-' * 60}") - print(f"{'Total time:':20s} {total_time:8.2f}s") - print(f"{'=' * 60}") - - # Count successes - successful = sum(1 for r in results.values() if r['success']) - print(f"\nCompleted: {successful}/{len(pipelines)} pipelines successful") - - -if __name__ == "__main__": - main() - diff --git a/benchmarks/logical_optimizer/end-to-end/california-housing/skrubified_merged_pipelines.py b/benchmarks/logical_optimizer/end-to-end/california-housing/skrubified_merged_pipelines.py deleted file mode 100644 index 82b4e4b8..00000000 --- a/benchmarks/logical_optimizer/end-to-end/california-housing/skrubified_merged_pipelines.py +++ /dev/null @@ -1,166 +0,0 @@ -import numpy as np -import pandas as pd -from sklearn.model_selection import KFold -import skrub -from sklearn.preprocessing import StandardScaler -from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge - -from stratum.logical_optimizer import apply_cse_on_skrub_ir -from stratum.api.gridsearch import grid_search -from time import time - -def pipeline_definition(show_graph=False): - # csv file contains the 
data from sklearn.datasets.fetch_california_housing - df_path = "input/train.csv" - target = "MedHouseVal" - - df_path = skrub.as_data_op(df_path) - df = df_path.skb.apply_func(pd.read_csv).skb.subsample(n=100) - - y = df[target].skb.mark_as_y() - X = df.drop(columns=[target]).skb.mark_as_X() - - def feat_eng(X): - return X.assign(BedroomsPerRoom=X["AveBedrms"] / X["AveRooms"], - IncomeSquared=X["MedInc"] ** 2, - IncomeRoomInteraction=X["MedInc"] * X["AveRooms"], - Density=X["Population"] / X["AveOccup"], - LatitudeLongitude=X["Latitude"] * X["Longitude"], - MedInc3=X["MedInc"] ** 3, - RoomDensity=X["AveRooms"] / X["Population"] - ) - - # pipeline 0 - X2 = feat_eng(X) - scaler = StandardScaler() - X_scaled = X2.skb.apply(scaler) - pred0 = X_scaled.skb.apply(LinearRegression(), y=y) - - # Pipeline 1 - X2 = feat_eng(X) - scaler = StandardScaler() - X_scaled = X2.skb.apply(scaler) - pred1 = X_scaled.skb.apply(Ridge(), y=y) - - # Pipeline 2 - X2 = feat_eng(X) - scaler = StandardScaler() - X_scaled = X2.skb.apply(scaler) - pred2 = X_scaled.skb.apply(Lasso(), y=y) - - # Pipeline 3 - X2 = feat_eng(X) - scaler = StandardScaler() - X_scaled = X2.skb.apply(scaler) - pred3 = X_scaled.skb.apply(ElasticNet(), y=y) - - preds = { - "pipeline0": pred0, - "pipeline1": pred1, - "pipeline2": pred2, - "pipeline3": pred3, - } - pred = skrub.choose_from(preds, name="predictions").as_data_op() - if show_graph: - pred.skb.draw_graph().open() - - - return pred - - -def run_experiment(pred, show_graph=False): - cv = KFold(n_splits=5, shuffle=True, random_state=42) - runs = 1 - - def run_and_average(name, search_func, print_results=True): - times = [] - search_result = None - for run_idx in range(runs): - t0 = time() - search_result = search_func() - t1 = time() - times.append(t1 - t0) - if runs > 1: - print(f" Run {run_idx + 1}/{runs}: {t1 - t0:.4f}s") - - avg_time = np.mean(times) - std_time = np.std(times) if runs > 1 else 0 - print(f"Gridsearch time (avg over {runs} runs): 
{avg_time:.4f}s" + - (f" (std: {std_time:.4f}s)" if runs > 1 else "")) - - if print_results and search_result is not None: - if hasattr(search_result, 'results_'): - print(search_result.results_) - else: - print(search_result) - print("----------------------------------------") - - return {"impl": name, "time": avg_time} - - df_vals = [] - - # Skrub - non optimized gridsearch (n_jobs=1) - print("Skrub - non optimized gridsearch (n_jobs=1)") - df_vals.append(run_and_average( - "skrub-njobs=1", - lambda: pred.skb.make_grid_search(fitted=True, cv=cv, n_jobs=1, scoring="neg_mean_squared_error", refit=False) - )) - - # Skrub - non optimized gridsearch (n_jobs=-1) - print("Skrub - non optimized gridsearch (n_jobs=-1)") - df_vals.append(run_and_average( - "skrub-njobs=-1", - lambda: pred.skb.make_grid_search(fitted=True, cv=cv, n_jobs=-1, scoring="neg_mean_squared_error", refit=False) - )) - - # Stratum - gridsearch (n_jobs=1) - print("Stratum - optimized gridsearch (n_jobs=1)") - df_vals.append(run_and_average( - "stratum-njobs=1", - lambda: grid_search(pred, cv=cv, scoring="neg_mean_squared_error") - )) - - # Optimization step (only run once) - t00 = time() - pred_optimized = apply_cse_on_skrub_ir(pred) - t01 = time() - print("Optimization time: ", t01 - t00) - - if show_graph: - pred_optimized.skb.draw_graph().open() - - # Skrub - optimized gridsearch (n_jobs=1) - print("Skrub - optimized gridsearch (n_jobs=1)") - df_vals.append(run_and_average( - "skrub-optimized-njobs=1", - lambda: pred_optimized.skb.make_grid_search(fitted=True, cv=cv, n_jobs=1, scoring="neg_mean_squared_error", refit=False) - )) - - # Skrub - optimized gridsearch (n_jobs=-1) - print("Skrub - optimized gridsearch (n_jobs=-1)") - df_vals.append(run_and_average( - "skrub-optimized-njobs=-1", - lambda: pred_optimized.skb.make_grid_search(fitted=True, cv=cv, n_jobs=-1, scoring="neg_mean_squared_error", refit=False) - )) - - # Stratum - optimized gridsearch (n_jobs=1) - print("Stratum - optimized 
gridsearch (n_jobs=1)") - df_vals.append(run_and_average( - "stratum-optimized-njobs=1", - lambda: grid_search(pred_optimized, cv=cv, scoring="neg_mean_squared_error") - )) - - df_vals.append({"impl": "baseline", "time": 3.81}) - - df = pd.DataFrame(df_vals) - df.to_csv("california_housing_pipelines_benchmark.csv", index=False, header=True, sep=";") - print("\nSummary:") - print(df) - - -show_graph = False -t0 = time() -pred = pipeline_definition(show_graph=show_graph) -t1 = time() -print("Pipeline definition time: ", t1 - t0) -run_experiment(pred, show_graph=show_graph) \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/plot_20newsgroup_results.py b/benchmarks/logical_optimizer/end-to-end/plot_20newsgroup_results.py deleted file mode 100644 index de96e41a..00000000 --- a/benchmarks/logical_optimizer/end-to-end/plot_20newsgroup_results.py +++ /dev/null @@ -1,36 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np - -base_path = "benchmarks/logical_optimizer/end-to-end/" - -data = pd.read_csv(base_path + 'bench_cse_tfidf_gridsearch.csv') -data["total"] = data["total"].apply(np.round, decimals=2) - -labels = ["skrub-njobs=1", "skrub-njobs=-1", "stratum-njobs=1"] -exp_names = (100, 500, 1000, 1000) - -# Publication-quality colorblind-friendly colors -# Using a 4-color palette: blue, orange, green, red (ColorBrewer inspired) -pub_colors = ['#F18F01', '#C73E1D', '#6A994E'] # Blue, Orange, Red, Green -exp_names = (100, 500, 1000, 1000) -x = np.arange(len(exp_names)) # the label locations -width = 0.85 # the width of the bars -x = x* width*(len(labels)+1) -multiplier = 0 - -fig, ax = plt.subplots(figsize=(9, 5), dpi=100) -for scheduler, group in data.groupby("scheduler"): - offset = width * multiplier - rects = ax.bar(x + offset, group["total"], width=width, label=scheduler, color=pub_colors[multiplier]) - ax.bar_label(rects, padding=3) - multiplier += 1 - -ax.set_xticks(x + width, exp_names) -ax.set_yscale("log") 
-ax.set_ylabel("Time (s)") -ax.legend(loc="upper right", ncols=len(labels)) -plt.ylim(0.1, 300) -ax.grid(axis='y', alpha=0.3, linestyle='--') -plt.tight_layout() -plt.savefig(base_path + "20newsgroup_results_bar_plot.pdf") \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/uk-house-price/feature_transform.py b/benchmarks/logical_optimizer/end-to-end/uk-house-price/feature_transform.py deleted file mode 100644 index 591b3cae..00000000 --- a/benchmarks/logical_optimizer/end-to-end/uk-house-price/feature_transform.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np -import pandas as pd -import stratum as skrub -from sklearn.preprocessing import OneHotEncoder -from skrub import TableVectorizer, StringEncoder - -file_path = "input/price_paid_records_small.csv" -df = pd.read_csv(file_path) -df = df.rename(columns={"Town/City": "Town"}, inplace=False) -df.drop("Price", axis=1, inplace=True) -print(df.info()) - -skrub.set_config(rust_backend=True, debug_timing=True) -enc = TableVectorizer(high_cardinality=StringEncoder(), low_cardinality=OneHotEncoder(), n_jobs=-1) #default setup -X_cat_enc = enc.fit_transform(df) -print(X_cat_enc) - diff --git a/benchmarks/logical_optimizer/end-to-end/uk-house-price/setup.sh b/benchmarks/logical_optimizer/end-to-end/uk-house-price/setup.sh deleted file mode 100755 index 6af5e83e..00000000 --- a/benchmarks/logical_optimizer/end-to-end/uk-house-price/setup.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# download dataset from kaggle -curl -L -o uk-housing-prices-paid.zip\ - https://www.kaggle.com/api/v1/datasets/download/hm-land-registry/uk-housing-prices-paid - -unzip uk-housing-prices-paid.zip -d tmp -mkdir -p input -mv tmp/* input/ -rm -rf tmp -rm uk-housing-prices-paid.zip - -# downsample for testing: -head -100000 input/price_paid_records.csv > input/price_paid_records_small.csv diff --git a/benchmarks/logical_optimizer/end-to-end/uk-house-price/tabvec_lightgbm.py 
b/benchmarks/logical_optimizer/end-to-end/uk-house-price/tabvec_lightgbm.py deleted file mode 100644 index eae7cdf4..00000000 --- a/benchmarks/logical_optimizer/end-to-end/uk-house-price/tabvec_lightgbm.py +++ /dev/null @@ -1,66 +0,0 @@ -from time import perf_counter -import pandas as pd -import polars -from joblib import parallel_backend -from sklearn.metrics import make_scorer, r2_score - -#import skrub -import stratum as skrub -from lightgbm import LGBMRegressor -from sklearn.model_selection import train_test_split, ShuffleSplit -from sklearn.preprocessing import OneHotEncoder, StandardScaler -from skrub import StringEncoder, TableVectorizer -import cProfile -import pstats -pr = cProfile.Profile() - -# 1. Load Data -dtypes = { - "Transaction unique identifier": "category", - "Price": "int32", - "Property Type": "category", - "Old/New": "category", - "Duration": "category", - "Town/City": "category", - "District": "category", - "Country": "category", - "County": "category", - "PPDCategory Type": "category", - "Record Status - monthly file only": "category" -} -file_path = "input/price_paid_records_small.csv" -#df_raw = pd.read_csv(file_path, dtype=dtypes) #setting datatypes reduces size and speeds up -df_raw = pd.read_csv(file_path) #setting datatypes reduces size and speeds up -#print(df_raw.memory_usage(deep=True).sum() / 1024**2) #in-memory size in MB -print(df_raw.info()) -df = skrub.as_data_op(df_raw) - -y = df["Price"].skb.mark_as_y() -X = df.drop("Price", axis=1).skb.mark_as_X() - -# 3. Pre-processing (pre_process_2 logic) -vec = TableVectorizer(n_jobs=1, - high_cardinality=StringEncoder(), - low_cardinality=OneHotEncoder(drop='if_binary', dtype='float32', handle_unknown='ignore', sparse_output=False) -) -X_enc = X.skb.apply(vec) -X_vec = X_enc.skb.apply(StandardScaler()) - -# 4. Modeling -model = LGBMRegressor(random_state=42) -preds = X_vec.skb.apply(model, y=y) - -# 5. 
Grid search -skrub.set_config(rust_backend=True, debug_timing=False, scheduler=True, stats=True) -cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42) -scorer = make_scorer(r2_score) -t0 = perf_counter() -#pr.enable() -search = preds.skb.make_grid_search(cv=cv, n_jobs=1, scoring=scorer, fitted=True) -#pr.disable() -t1 = perf_counter() -print(f"Time taken: {t1 - t0} seconds") -print(search.results_) - -#stats = pstats.Stats(pr).sort_stats("tottime") -#stats.print_stats(60) diff --git a/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp.py b/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp.py deleted file mode 100644 index bbe6cf3d..00000000 --- a/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp.py +++ /dev/null @@ -1,131 +0,0 @@ -from sklearn.metrics import make_scorer, mean_squared_error, r2_score -from sklearn.model_selection import KFold, ShuffleSplit -import pandas as pd -from xgboost import XGBRegressor -from lightgbm import LGBMRegressor -from sklearn.linear_model import ElasticNet, Ridge - -from time import perf_counter -import numpy as np -from sklearn.preprocessing import StandardScaler -import stratum as skrub -test=True - -import logging - -logging.basicConfig(level=logging.DEBUG) - -file_path = "price_paid_records_1M.csv" if test else "input/price_paid_records.csv" -df = skrub.as_data_op(file_path).skb.apply_func(pd.read_csv).skb.subsample(n=1000) -print(df.columns.skb.preview()) -df = df.rename(columns={"Town/City": "Town"}, inplace=False) -y = df["Price"].skb.mark_as_y() -X = df.drop("Price", axis=1).skb.mark_as_X() - -from sklearn.base import BaseEstimator, TransformerMixin -class TargetEncoder(BaseEstimator, TransformerMixin): - def fit(self, X, y=None): - print("fit target encoder") - self.global_mean_ = y.mean() - tmp = pd.concat([X, y], axis=1) - self.cols = X.columns - self.means = {} - for col in self.cols: - self.means[col] = 
tmp.groupby(col)[tmp.columns[-1]].mean() - return self - - def transform(self, X): - print("transform target encoder") - X_out = X.copy() - for col in self.cols: - X_out[col] = X_out[col].map(self.means[col]).fillna(self.global_mean_) - return X_out - - def fit_transform(self, X, y=None): - self.fit(X, y) - return self.transform(X) - - def get_feature_names_out(self): - return self.cols - - -def pre_process_1(X, y): - date = X["Date of Transfer"].skb.apply_func(pd.to_datetime) - X = X.assign( - year=date.dt.year, - month=date.dt.month, - day=date.dt.day, - dayofweek=date.dt.dayofweek, - hour=date.dt.hour) - X = X.assign( - month_sin=(date.dt.month * (2 * np.pi / 12)).apply(np.sin), - month_cos=(date.dt.month * (2 * np.pi / 12)).apply(np.cos), - day_sin=(date.dt.day * (2 * np.pi / 30)).apply(np.sin), - day_cos=(date.dt.day * (2 * np.pi / 30)).apply(np.cos), - dayofweek_sin=(date.dt.dayofweek * (2 * np.pi / 7)).apply(np.sin), - dayofweek_cos=(date.dt.dayofweek * (2 * np.pi / 7)).apply(np.cos), - hour_sin=(date.dt.hour * (2 * np.pi / 24)).apply(np.sin), - hour_cos=(date.dt.hour * (2 * np.pi / 24)).apply(np.cos), - ) - X = X.drop([ - "Date of Transfer", - 'Duration', - 'Transaction unique identifier', - 'PPDCategory Type', - 'Record Status - monthly file only'], axis=1) - - cat_selector = skrub.selectors.filter(lambda col: col.dtype == "object") - X_cat = X.skb.select(cat_selector) - X_cat_enc = X_cat.skb.apply(skrub.StringEncoder()) - num_selector = skrub.selectors.filter(lambda col: col.dtype != "object") - - X_te = X[["District", "County", "Town"]].skb.apply(TargetEncoder(), y=y) - X_te = X_te.rename(columns={"District": "district_te", "County": "county_te", "Town": "town_te"}) - X_num = X.skb.select(num_selector) - X_num = X_num.skb.concat([X_te], axis=1) - - X_num_scaled = X_num.skb.apply(StandardScaler()) - X_vec = X_num_scaled.skb.concat([X_cat_enc], axis=1) - return X_vec - -def pre_process_2(X): - X_enc = X.skb.apply(skrub.TableVectorizer()) - return X_enc - 
-X_1 = pre_process_1(X,y) -X_2 = pre_process_2(X) -X_enc = skrub.choose_from({ - "1": X_1, - "2": X_2 - }, name="feat_eng").as_data_op() - -models = { - "Ridge": Ridge(random_state=42), - "XGBoost": XGBRegressor(random_state=42), - "LightGBM": LGBMRegressor(random_state=42), - "ElasticNet": ElasticNet(random_state=42), -} -preds = {k: X_enc.skb.apply(model, y=y) for k,model in models.items()} -preds = skrub.choose_from(preds, name="models").as_data_op() -preds = preds.skb.apply_func(lambda a, m: (a, print(m))[0], skrub.eval_mode()) - -# play with cvs -cv = 3 -cv = ShuffleSplit(n_splits=1,test_size=0.2,random_state=42) if cv == 1 else KFold(n_splits=cv, shuffle=True, random_state=42) -scorer = make_scorer(r2_score) -t0 = perf_counter() -with skrub.config(scheduler=True, stats=20, rust_backend=True): - search_stratum = preds.skb.make_grid_search(cv=cv, n_jobs=1, fitted=True, scoring=scorer) -t1 = perf_counter() -print("="*80) -print(f"Stratum gridsearch scheduler time: {t1 - t0} seconds") -print("="*80) -search = preds.skb.make_grid_search(cv=cv, n_jobs=1, fitted=True, scoring=scorer, refit=False) -t2 = perf_counter() -print("="*80) -print(f"Skrub default gridsearch time: {t2 - t1} seconds") -print("="*80) -print("Results:") -print(search.results_) -print(search_stratum.results_) -print("="*80) \ No newline at end of file diff --git a/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp2.py b/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp2.py deleted file mode 100644 index 63112ab5..00000000 --- a/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp2.py +++ /dev/null @@ -1,132 +0,0 @@ -import cProfile -from joblib import parallel_backend -from sklearn.metrics import r2_score, make_scorer - -import stratum as skrub -from skrub import StringEncoder -from sklearn.model_selection import KFold -#import skrub -import pandas as pd -from xgboost import XGBRegressor -from lightgbm import 
LGBMRegressor -from sklearn.linear_model import ElasticNet, Ridge - -from time import perf_counter -import numpy as np -from sklearn.preprocessing import StandardScaler, OneHotEncoder -import pstats - -pr = cProfile.Profile() - -test=True - -file_path = "input/price_paid_records_small.csv" if test else "input/price_paid_records.csv" -df = skrub.as_data_op(file_path).skb.apply_func(pd.read_csv).skb.subsample(n=1000) -print(df.columns.skb.preview()) -df = df.rename(columns={"Town/City": "Town"}, inplace=False) -y = df["Price"].skb.mark_as_y() -X = df.drop("Price", axis=1).skb.mark_as_X() - -from sklearn.base import BaseEstimator, TransformerMixin -class TargetEncoder(BaseEstimator, TransformerMixin): - def fit(self, X, y=None): - print("fit target encoder") - self.global_mean_ = y.mean() - tmp = pd.concat([X, y], axis=1) - self.cols = X.columns - self.means = {} - for col in self.cols: - self.means[col] = tmp.groupby(col)[tmp.columns[-1]].mean() - return self - - def transform(self, X): - print("transform target encoder") - X_out = X.copy() - for col in self.cols: - X_out[col] = X_out[col].map(self.means[col]).fillna(self.global_mean_) - return X_out - - def fit_transform(self, X, y=None): - self.fit(X, y) - return self.transform(X) - - def get_feature_names_out(self): - return self.cols - - -def pre_process_1(X, y): - date = X["Date of Transfer"].skb.apply_func(pd.to_datetime) - X = X.assign( - year=date.dt.year, - month=date.dt.month, - day=date.dt.day, - dayofweek=date.dt.dayofweek, - hour=date.dt.hour) - X = X.assign( - month_sin=(date.dt.month * (2 * np.pi / 12)).apply(np.sin), - month_cos=(date.dt.month * (2 * np.pi / 12)).apply(np.cos), - day_sin=(date.dt.day * (2 * np.pi / 30)).apply(np.sin), - day_cos=(date.dt.day * (2 * np.pi / 30)).apply(np.cos), - dayofweek_sin=(date.dt.dayofweek * (2 * np.pi / 7)).apply(np.sin), - dayofweek_cos=(date.dt.dayofweek * (2 * np.pi / 7)).apply(np.cos), - hour_sin=(date.dt.hour * (2 * np.pi / 24)).apply(np.sin), - 
hour_cos=(date.dt.hour * (2 * np.pi / 24)).apply(np.cos), - ) - X = X.drop([ - "Date of Transfer", - 'Duration', - 'Transaction unique identifier', - 'PPDCategory Type', - 'Record Status - monthly file only'], axis=1) - - cat_selector = skrub.selectors.filter(lambda col: col.dtype == "object") - X_cat = X.skb.select(cat_selector) - X_cat_enc = X_cat.skb.apply(skrub.StringEncoder()) - num_selector = skrub.selectors.filter(lambda col: col.dtype != "object") - - X_te = X[["District", "County", "Town"]].skb.apply(TargetEncoder(), y=y) - X_te = X_te.rename(columns={"District": "district_te", "County": "county_te", "Town": "town_te"}) - X_num = X.skb.select(num_selector) - X_num = X_num.skb.concat([X_te], axis=1) - - X_num_scaled = X_num.skb.apply(StandardScaler()) - X_vec = X_num_scaled.skb.concat([X_cat_enc], axis=1) - return X_vec - -def pre_process_2(X): - X_enc = X.skb.apply(skrub.TableVectorizer(high_cardinality=StringEncoder(), low_cardinality=OneHotEncoder())) - # Scaling is necessary for ElasticNet and Ridge (converge quick and fast) - X_vec = X_enc.skb.apply(StandardScaler()) - return X_vec - -X_1 = pre_process_1(X,y) -X_2 = pre_process_2(X) -X_enc = skrub.choose_from({ - "data engineering 1": X_1, - "data engineering 2": X_2 - }, name="X_enc").as_data_op() - -X_enc = X_enc.skb.apply_func(lambda x, m: (x, print(m))[0], skrub.eval_mode()) - -models = { - "Ridge": Ridge(random_state=42), - "XGBoost": XGBRegressor(random_state=42), - "LightGBM": LGBMRegressor(random_state=42), - "ElasticNet": ElasticNet(random_state=42), -} -preds = {k: X_enc.skb.apply(model, y=y) for k,model in models.items()} -preds = skrub.choose_from(preds, name="preds").as_data_op() - -skrub.set_config(rust_backend=True, debug_timing=False, scheduler=True, stats=True) -cv = KFold(n_splits=3, shuffle=True, random_state=42) -scorer = make_scorer(r2_score) -t0 = perf_counter() -#pr.enable() -#with parallel_backend('threading'): -search = preds.skb.make_grid_search(cv=cv, scoring=scorer, 
n_jobs=1, fitted=True, refit=True) -#pr.disable() -t1 = perf_counter() -print(f"Time taken: {t1 - t0} seconds") -print(search.results_) -#stats = pstats.Stats(pr).sort_stats("tottime") -#stats.print_stats(60) diff --git a/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp_polars.py b/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp_polars.py deleted file mode 100644 index 15363571..00000000 --- a/benchmarks/logical_optimizer/end-to-end/uk-house-price/two_level_gridsearch_exp_polars.py +++ /dev/null @@ -1,144 +0,0 @@ -from sklearn.metrics import make_scorer, mean_squared_error, r2_score -from sklearn.model_selection import KFold, ShuffleSplit -import polars as pl -from xgboost import XGBRegressor -from lightgbm import LGBMRegressor -from sklearn.linear_model import ElasticNet, Ridge - -from time import perf_counter -import numpy as np -from sklearn.preprocessing import StandardScaler -import stratum as skrub -test=True - -import logging - -logging.basicConfig(level=logging.INFO) - -file_path = "input/price_paid_records_small.csv" if test else "input/price_paid_records.csv" -df = skrub.as_data_op(file_path).skb.apply_func(pl.read_csv).skb.subsample(n=1000) -df = df.rename({"Town/City": "Town"}) -y = df["Price"].skb.mark_as_y() -X = df.drop("Price").skb.mark_as_X() - -from sklearn.base import BaseEstimator, TransformerMixin -class TargetEncoder(BaseEstimator, TransformerMixin): - def fit(self, X, y=None): - print("fit target encoder") - self.global_mean_ = y.mean() - y_name = y.name if isinstance(y, pl.Series) and y.name else 'target' - # Handle both Polars Series and numpy arrays - if isinstance(y, pl.Series): - tmp = X.with_columns(y.alias(y_name)) - else: - tmp = X.with_columns(pl.Series(y_name, y)) - self.cols = X.columns - self.means = {} - for col in self.cols: - # Store as DataFrame with column name and mean for efficient join - self.means[col] = 
tmp.group_by(col).agg(pl.col(y_name).mean().alias(f"{col}_mean")) - return self - - def transform(self, X): - print("transform target encoder") - X_out = X.clone() - for col in self.cols: - # Use join instead of map for better performance - mean_col_name = f"{col}_mean" - X_out = X_out.join( - self.means[col], - on=col, - how="left" - ).with_columns( - pl.col(mean_col_name).fill_null(self.global_mean_).alias(col) - ).drop(mean_col_name) - return X_out - - def fit_transform(self, X, y=None): - self.fit(X, y) - return self.transform(X) - - def get_feature_names_out(self): - return self.cols - - -def pre_process_1(X, y): - date = X["Date of Transfer"].str.to_datetime() - X = X.with_columns( - year=date.dt.year(), - month=date.dt.month(), - day=date.dt.day(), - dayofweek=date.dt.weekday(), - hour=date.dt.hour()) - X = X.with_columns( - month_sin=(date.dt.month() * (2 * np.pi / 12)).sin(), - month_cos=(date.dt.month() * (2 * np.pi / 12)).cos(), - day_sin=(date.dt.day() * (2 * np.pi / 30)).sin(), - day_cos=(date.dt.day() * (2 * np.pi / 30)).cos(), - dayofweek_sin=(date.dt.weekday() * (2 * np.pi / 7)).sin(), - dayofweek_cos=(date.dt.weekday() * (2 * np.pi / 7)).cos(), - hour_sin=(date.dt.hour() * (2 * np.pi / 24)).sin(), - hour_cos=(date.dt.hour() * (2 * np.pi / 24)).cos(), - ) - X = X.drop([ - "Date of Transfer", - 'Duration', - 'Transaction unique identifier', - 'PPDCategory Type', - 'Record Status - monthly file only']) - - cat_selector = skrub.selectors.filter(lambda col: col.dtype == pl.String) - X_cat = X.skb.select(cat_selector) - X_cat_enc = X_cat.skb.apply(skrub.StringEncoder()) - num_selector = skrub.selectors.filter(lambda col: col.dtype != pl.String) - - X_te = X[["District", "County", "Town"]].skb.apply(TargetEncoder(), y=y) - X_te = X_te.rename({"District": "district_te", "County": "county_te", "Town": "town_te"}) - X_num = X.skb.select(num_selector) - X_num = X_num.skb.concat([X_te], axis=1) - - X_num_scaled = X_num.skb.apply(StandardScaler()) - X_vec = 
X_num_scaled.skb.concat([X_cat_enc], axis=1) - return X_vec - -def pre_process_2(X): - X_enc = X.skb.apply(skrub.TableVectorizer()) - return X_enc -X_1 = pre_process_1(X,y) -print(X_1.skb.preview()) -X_2 = pre_process_2(X) -X_enc = skrub.choose_from({ - "1": X_1, - "2": X_2 - }, name="feat_eng").as_data_op() - -models = { - "Ridge": Ridge(random_state=42), - "XGBoost": XGBRegressor(random_state=42), - "LightGBM": LGBMRegressor(random_state=42), - "ElasticNet": ElasticNet(random_state=42), -} -preds = {k: X_enc.skb.apply(model, y=y) for k,model in models.items()} -preds = skrub.choose_from(preds, name="models").as_data_op() -preds = preds.skb.apply_func(lambda a, m: (a, print(m))[0], skrub.eval_mode()) - -# play with cvs -cv = 1 -cv = ShuffleSplit(n_splits=1,test_size=0.2,random_state=42) if cv == 1 else KFold(n_splits=cv, shuffle=True, random_state=42) -scorer = make_scorer(mean_squared_error) -t0 = perf_counter() -with skrub.config(scheduler=True, stats=True): - search_stratum = preds.skb.make_grid_search(cv=cv, n_jobs=1, fitted=True, scoring=scorer) -t1 = perf_counter() -print("="*80) -print(f"Stratum gridsearch scheduler time: {t1 - t0} seconds") -print("="*80) -search = preds.skb.make_grid_search(cv=cv, n_jobs=-1, fitted=True, scoring=scorer, refit=False) -t2 = perf_counter() -print("="*80) -print(f"Skrub default gridsearch time: {t2 - t1} seconds") -print("="*80) -print("Results:") -print(search.results_) -print(search_stratum.results_) -print("="*80) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 79da8e4b..a0030c02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,13 @@ classifiers = [ requires-python = ">=3.11" dependencies = [ "scikit-learn==1.8", - "skrub>=0.3", + "skrub==0.6.2", + "pandas==2.3.3", "polars", "graphviz", "pyarrow>=22.0.0", + "joblib", + "psutil", ] [project.optional-dependencies] diff --git a/stratum/_api.py b/stratum/_api.py index 687471af..a5fd7413 100644 --- a/stratum/_api.py +++ 
b/stratum/_api.py @@ -3,18 +3,35 @@ from stratum._config import FLAGS from stratum.logical_optimizer._optimize import optimize -from stratum.runtime._scheduler import Scheduler +from stratum.runtime._caching import Cache +from stratum.runtime._physical_planning import physical_planning +from stratum.runtime._scheduler import ParallelScheduler, SequentialScheduler +from time import perf_counter - -def grid_search(dag: DataOp, cv=None, scoring=None, return_predictions=False): +def grid_search(dag: DataOp, cv=None, scoring=None, return_predictions=False, env=None): """Perform grid search with cross-validation on a DataOp DAG.""" - + t0 = perf_counter() show_stats = FLAGS.stats is not None - ops_ordered = optimize(dag) - sched = Scheduler(ops_ordered, show_stats) + env_extra = env if env else {} + env = dag.skb.get_data() + for k, v in env_extra.items(): + env[k] = v + cache = None + if FLAGS.caching: + cache = Cache() + dag = optimize(dag) + if FLAGS.scheduler_parallelism is not None: + dag = physical_planning(dag) + sched = ParallelScheduler(dag, {}, show_stats, backend=FLAGS.scheduler_parallelism, cache=cache, env=env) + else: + sched = SequentialScheduler(dag, show_stats, cache=cache, env=env, t0=t0) preds = sched.grid_search(cv, scoring, return_predictions) + if FLAGS.caching: + # persist cache to disk + cache.persist() + # Heavy hitters if show_stats: table = pd.DataFrame(sched.timings, columns=["Op", "time"]) @@ -22,13 +39,30 @@ def grid_search(dag: DataOp, cv=None, scoring=None, return_predictions=False): table.columns = ["Time", "Count"] table = table.reset_index().sort_values(by="Time", ascending=False) print("\n" + "=" * 80) - print(f"Heavy hitters (sorted by time spent in DataOp evaluation):") + print(f"Heavy hitters (sorted by time spent in DataOp evaluation):\n") print(table.head(FLAGS.stats).to_string(index=False)) + table.head(FLAGS.stats).to_csv("heavy_hitters.csv", index=False) print("=" * 80 + "\n") + if FLAGS.caching and cache is not None: + 
print("\n" + "=" * 80) + print("Cache timing statistics:\n") + cache_stats = [] + for op_name, duration in cache.timings: + cache_stats.append({"Operation": op_name, "Time (s)": f"{duration:.4f}", "Count": "1"}) + if cache.hit_count > 0: + cache_stats.append({"Operation": "cache_hits", "Time (s)": f"{cache.hit_time:.4f}", "Count": f"{cache.hit_count}"}) + if cache.miss_count > 0: + cache_stats.append({"Operation": "cache_misses", "Time (s)": "-", "Count": f"{cache.miss_count}"}) + if cache.set_count > 0: + cache_stats.append({"Operation": "cache_sets", "Time (s)": f"{cache.set_time:.4f}", "Count": f"{cache.set_count}"}) + if cache_stats: + cache_table = pd.DataFrame(cache_stats) + print(cache_table.to_string(index=False)) + print("=" * 80 + "\n") + return (sched,preds) if return_predictions else sched - def evaluate(dag: DataOp, seed: int = 42, test_size = 0.2, cse: bool = False): """Evaluate a DataOp DAG with train/test split.""" ops_ordered = optimize(dag) - return Scheduler(ops_ordered).evaluate(seed, test_size) \ No newline at end of file + return SequentialScheduler(ops_ordered).evaluate(seed, test_size) \ No newline at end of file diff --git a/stratum/_config.py b/stratum/_config.py index 8468be2e..2389d6b9 100644 --- a/stratum/_config.py +++ b/stratum/_config.py @@ -5,6 +5,9 @@ import logging logger = logging.getLogger(__name__) +# Sentinel to detect if scheduler_parallelism was explicitly provided +_UNSET = object() + def _env_bool(name, default=False): val = os.getenv(name) if val is None: @@ -20,6 +23,15 @@ def _env_int(name, default=0): v = os.getenv(name) return int(v) if v is not None else int(default) +def _env_str(name, default=None): + v = os.getenv(name) + if v is None: + return default + s = str(v).strip().lower() + if s in ("", "none", "null"): + return None + return s + @dataclass class _Flags: rust_backend: bool = _env_bool("SKRUB_RUST", False) @@ -28,9 +40,12 @@ class _Flags: allow_patch: bool = _env_bool("SKRUB_RUST_ALLOW_PATCH", True) 
scheduler: bool = False stats: int | None = None # TODO if we want to use that flag on other runtimes we need to set envirenment variable as well - open_graph: bool = True, + open_graph: bool = False, + cse: bool = True, DEBUG: bool = False + scheduler_parallelism: str | None = _env_str("STRATUM_SCHEDULER_PARALLELISM", None) force_polars: bool = _env_bool("STRATUM_FORCE_POLARS", False) + caching: bool = _env_bool("STRATUM_CACHING", False) FLAGS = _Flags() @@ -42,7 +57,10 @@ def set_config(rust_backend: bool | None = None, scheduler: bool | None = None, open_graph: bool | None = None, DEBUG: bool | None = None, - force_polars: bool | None = None) -> None: + force_polars: bool | None = None, + scheduler_parallelism: str | None = _UNSET, + caching: bool | None = None, + cse: bool = True) -> None: """Runtime toggles (synced env for Rust to read). Parameter: @@ -72,6 +90,13 @@ def set_config(rust_backend: bool | None = None, force_polars: bool, default false Force use of Polars instead of Pandas for dataframe operations. + + scheduler_parallelism: str | None, default None + Scheduler parallelism mode. None uses SequentialScheduler, "threading" or "process" + uses ParallelScheduler with the specified backend. + + caching: bool, default false + Enable/disable caching for DataOp operations. 
""" if rust_backend is not None: FLAGS.rust_backend = bool(rust_backend) @@ -102,6 +127,22 @@ def set_config(rust_backend: bool | None = None, if force_polars is not None: FLAGS.force_polars = bool(force_polars) os.environ["STRATUM_FORCE_POLARS"] = "1" if FLAGS.force_polars else "0" + if scheduler_parallelism is not _UNSET: + if scheduler_parallelism is not None: + if scheduler_parallelism not in ("threading", "process", "auto"): + raise ValueError(f"scheduler_parallelism must be None, 'threading', 'process', or 'auto', got {scheduler_parallelism}") + FLAGS.scheduler_parallelism = scheduler_parallelism + os.environ["STRATUM_SCHEDULER_PARALLELISM"] = scheduler_parallelism + else: + # Explicitly set to None + FLAGS.scheduler_parallelism = None + if "STRATUM_SCHEDULER_PARALLELISM" in os.environ: + del os.environ["STRATUM_SCHEDULER_PARALLELISM"] + if caching is not None: + FLAGS.caching = bool(caching) + os.environ["STRATUM_CACHING"] = "1" if FLAGS.caching else "0" + if cse is not None: + FLAGS.cse = bool(cse) def get_config() -> dict: @@ -116,6 +157,9 @@ def get_config() -> dict: "open_graph": FLAGS.open_graph, "DEBUG" : FLAGS.DEBUG, "force_polars": FLAGS.force_polars, + "scheduler_parallelism": FLAGS.scheduler_parallelism, + "caching": FLAGS.caching, + "cse": FLAGS.cse, } @contextmanager diff --git a/stratum/logical_optimizer/_dataframe_ops.py b/stratum/logical_optimizer/_dataframe_ops.py index 763590e1..693a3e4e 100644 --- a/stratum/logical_optimizer/_dataframe_ops.py +++ b/stratum/logical_optimizer/_dataframe_ops.py @@ -1,18 +1,21 @@ -from stratum.logical_optimizer._ops import DATA_OP_PLACEHOLDER, BinOp, CallOp, GetAttrOp, GetItemOp, MethodCallOp, Op, ValueOp +from stratum.logical_optimizer._ops import DATA_OP_PLACEHOLDER, BaseEstimatorOp, BinOp, CallOp, GetAttrOp, GetItemOp, MethodCallOp, Op, ValueOp, VariableOp from pandas import DataFrame import pandas as pd import polars as pl from stratum.logical_optimizer._op_utils import topological_iterator from 
stratum._config import FLAGS - -POLARS = FLAGS.force_polars +from stratum.runtime._hash_utils import stable_hash +from skrub._data_ops._data_ops import DataOp +import logging +from numpy import sin, cos +logger = logging.getLogger(__name__) class DataSourceOp(Op): def __init__(self, data: DataFrame = None, file_path: str = None, _format: str = None, - read_args: tuple | list = None, read_kwargs: dict = None, is_X=False, is_y=False, outputs: list[Op] = None): + read_args: tuple | list = None, read_kwargs: dict = None, is_X=False, is_y=False, outputs: list[Op] = None, inputs: list[Op] = None): if outputs is None: outputs = [] - super().__init__(name="Frame" if data is not None else f"read_{_format}", is_X=is_X, is_y=is_y, outputs=outputs, inputs=None) + super().__init__(name="Frame" if data is not None else f"read_{_format}", is_X=is_X, is_y=is_y, outputs=outputs, inputs=inputs) if read_kwargs is not None: self.check_kwargs(read_kwargs) self.data = data @@ -22,18 +25,26 @@ def __init__(self, data: DataFrame = None, file_path: str = None, _format: str = self.read_kwargs = read_kwargs self.is_dataframe_op = True + def simple_hash(self): + if self.data is not None: + raise NotImplementedError("Hashing is not implemented for DataSourceOp with data yet") + else: + return stable_hash((self.file_path, self.format, self.read_args, self.read_kwargs)) + def process(self, mode: str, environment: dict): + logger.debug(f"Using Polars: {FLAGS.force_polars}") if self.data is not None: - if POLARS: + if FLAGS.force_polars: self.intermediate = pl.DataFrame(self.data) else: self.intermediate = self.data else: - if POLARS: - self.intermediate = pl.read_csv(self.file_path, *self.read_args, **self.read_kwargs) + file_path = self.inputs[0].intermediate if self.file_path is DATA_OP_PLACEHOLDER else self.file_path + if FLAGS.force_polars: + self.intermediate = pl.read_csv(file_path, *self.read_args, **self.read_kwargs) else: - self.intermediate = pd.read_csv(self.file_path, *self.read_args, 
**self.read_kwargs) + self.intermediate = pd.read_csv(file_path, *self.read_args, **self.read_kwargs) def clone(self): raise ValueError(f"We should not clone DataSourceOp objects.") @@ -50,12 +61,15 @@ def __init__(self, func: str, args: tuple | list = None, kwargs: dict = None, in self.kwargs = kwargs self.is_dataframe_op = True + def simple_hash(self): + return stable_hash((self.func, self.args, self.kwargs)) + def process(self, mode: str, environment: dict): iter_ins = iter(self.inputs) _obj = next(iter_ins).intermediate _args = [next(iter_ins).intermediate if arg is DATA_OP_PLACEHOLDER else arg for arg in self.args] _kwargs = {k: next(iter_ins).intermediate if v is DATA_OP_PLACEHOLDER else v for k, v in self.kwargs.items()} - if POLARS: + if FLAGS.force_polars: if "columns" in _kwargs: _args.append(_kwargs["columns"]) self.intermediate = getattr(_obj, self.func)(*_args) @@ -90,7 +104,7 @@ def _extract_args_and_kwargs(self): def process(self, mode: str, environment: dict): _obj, _args, _kwargs = self._extract_args_and_kwargs() if self.is_method: - if POLARS: + if FLAGS.force_polars: raise ValueError(f"Unsupported method: {self.func}") else: self.intermediate = getattr(_obj, self.func)(*_args, **_kwargs) @@ -98,14 +112,18 @@ def process(self, mode: str, environment: dict): self.intermediate = self.func(_obj, *_args, **_kwargs) class DropOp(ProjectionOp): + fields = ["args", "kwargs", "columns"] def __init__(self, args: tuple | list = (), kwargs: dict = {}, inputs: list[Op] = None, outputs: list[Op] = None, columns: list[str] = None): super().__init__(args=args, kwargs=kwargs, inputs=inputs, outputs=outputs, columns=columns) + def simple_hash(self): + return stable_hash((self.args, self.kwargs)) + def process(self, mode: str, environment: dict): _obj, _args, _kwargs = self._extract_args_and_kwargs() - if POLARS: + if FLAGS.force_polars: if "columns" in _kwargs: _args.append(_kwargs["columns"]) if "ignore_errors" in _kwargs: @@ -131,16 +149,26 @@ def process(self, 
mode: str, environment: dict): else: n_cols = len(self.columns) - if POLARS: + if FLAGS.force_polars: if isinstance(_obj, pl.Series): n_cols = 1 if n_cols == 1: - self.intermediate = _obj.map_elements(*_args, **_kwargs) + if _args[0] == sin: + logger.debug("Rewrite UDF sin to polars sin") + self.intermediate = _obj.sin() + elif _args[0] == cos: + logger.debug("Rewrite UDF cos to polars cos") + self.intermediate = _obj.cos() + else: + self.intermediate = _obj.map_elements(*_args, **_kwargs) else: self.intermediate = _obj.map_rows(*_args, **_kwargs) else: self.intermediate = _obj.apply(*_args, **_kwargs) + def simple_hash(self): + return stable_hash((self.args, self.kwargs, "apply_udf")) + class AssignOp(ProjectionOp): def __init__(self, args: tuple | list = (), kwargs: dict = {}, inputs: list[Op] = None, outputs: list[Op] = None, columns: list[str] = None): @@ -148,26 +176,42 @@ def __init__(self, args: tuple | list = (), kwargs: dict = {}, def process(self, mode: str, environment: dict): _obj, _args, _kwargs = self._extract_args_and_kwargs() - if POLARS: - self.intermediate = _obj.with_columns(*_args, **_kwargs) + if FLAGS.force_polars: + checked_kwargs = {} + for k, v in _kwargs.items(): + if v is DATA_OP_PLACEHOLDER: + raise NotImplementedError("Is not yet suppoerted, please report this issue") + elif isinstance(v, pd.Series) or isinstance(v, pd.DataFrame): + logger.warning(f"Converting pandas object to polars object for column {k}") + checked_kwargs[k] = pl.from_pandas(v) + else: + checked_kwargs[k] = v + self.intermediate = _obj.with_columns(*_args, **checked_kwargs) else: self.intermediate = _obj.assign(*_args, **_kwargs) + def simple_hash(self): + return stable_hash((self.args, self.kwargs, "assign")) + class DatetimeConversionOp(ProjectionOp): def __init__(self, args: tuple | list = (), kwargs: dict = {}, inputs: list[Op] = None, outputs: list[Op] = None, columns: list[str] = None): - super().__init__(args=args, kwargs=kwargs, inputs=inputs, outputs=outputs, 
columns=columns) + super().__init__(args=args, inputs=inputs, outputs=outputs, columns=columns) + self.strict = kwargs.get("errors", "raise") == "raise" def process(self, mode: str, environment: dict): - if POLARS: - self.intermediate = self.inputs[0].intermediate.str.to_datetime(*self.args, **self.kwargs) + if FLAGS.force_polars: + self.intermediate = self.inputs[0].intermediate.str.to_datetime(*self.args, strict=self.strict) else: - self.intermediate = pd.to_datetime(self.inputs[0].intermediate, *self.args, **self.kwargs) + self.intermediate = pd.to_datetime(self.inputs[0].intermediate, *self.args, errors="raise" if self.strict else "coerce") + + def simple_hash(self): + return stable_hash((self.args, self.kwargs, "datetime_conversion")) class GetAttrProjectionOp(Op): fields = ["attr_name"] - POLARS_ATTR_NAME_MAP = {"dayofweek": "weekday"} + POLARS_ATTR_NAME_MAP = {"dayofweek": "weekday","dayofyear": "ordinal_day"} def __init__(self, attr_name: list[str] | str = None, inputs: list[Op] = None, outputs: list[Op] = None): if attr_name is None: @@ -188,17 +232,24 @@ def __str__(self): def process(self, mode: str, environment: dict): self.intermediate = self.inputs[0].intermediate - if POLARS: + tmp = self.intermediate + if FLAGS.force_polars: for attr in self.attr_name: attr = self.POLARS_ATTR_NAME_MAP.get(attr, attr) + + # TODO find better way to handle this + if attr == "is_month_end": + self.intermediate = (self.intermediate.dt.month_end() == self.intermediate) + return + # polars implements dt.day as a method, not an attribute # use getattr to handle both attributes and methods - self.intermediate = getattr(self.intermediate, attr) - self.intermediate = self.intermediate() + tmp = getattr(tmp, attr) + self.intermediate = tmp() else: for attr in self.attr_name: - self.intermediate = self.intermediate.__getattribute__(attr) - + tmp = getattr(tmp, attr) + self.intermediate = tmp class GroupedDataframeOp(Op): def __init__(self, ops: list[Op]): 
super().__init__(name="GROUPED_DATAFRAME", is_X=False, is_y=False) @@ -210,6 +261,30 @@ def process(self, mode: str, environment: dict): op.process(mode, environment) self.intermediate = self.ops[-1].intermediate +class ConcatOp(Op): + fields = ["first", "others", "axis"] # Add more if needed + + axis_map = { + 0: "diagonal_relaxed", + 1: "horizontal", + } + def __init__(self, first: Op, others: list[Op], axis: int): + super().__init__(name="CONCAT", is_X=False, is_y=False) + self.first = DATA_OP_PLACEHOLDER if isinstance(first, DataOp) else first + self.others = [DATA_OP_PLACEHOLDER if isinstance(other, DataOp) else other for other in others] + self.axis = DATA_OP_PLACEHOLDER if isinstance(axis, DataOp) else axis + self.is_dataframe_op = True + + def process(self, mode: str, environment: dict): + input_iter = iter(self.inputs) + first = next(input_iter).intermediate if self.first is DATA_OP_PLACEHOLDER else self.first + others = [next(input_iter).intermediate if other is DATA_OP_PLACEHOLDER else other for other in self.others] + axis = next(input_iter).intermediate if self.axis is DATA_OP_PLACEHOLDER else self.axis + if FLAGS.force_polars: + self.intermediate = pl.concat([first, *others], how=self.axis_map[axis]) + else: + self.intermediate = pd.concat([first, *others], axis=axis) + def rewrite_fuse_get_item_ops(op: Op) -> Op: pass @@ -249,6 +324,9 @@ def process(self, mode: str, environment: dict): else: raise ValueError(f"Unsupported dataframe type: {type(x)}") + def simple_hash(self): + return 1 + class SplitOutput(Op): def __init__(self, inputs: list[Op]=None, outputs: list[Op]=None, is_x = True, ): name = "X" if is_x else "y" @@ -262,6 +340,9 @@ def process(self, mode: str, environment: dict): else: self.intermediate = self.inputs[0].intermediate[1] + def simple_hash(self): + return 2 if self.is_x else 3 + def add_splitting_op(sink: Op) -> Op: x_op = None y_op = None @@ -330,9 +411,10 @@ def rewrite_dataframe_ops(sink: Op) -> Op: op.is_dataframe_op = True # 
mark as dataframe op - elif isinstance(op, GetItemOp): + elif isinstance(op, GetItemOp) or isinstance(op, BaseEstimatorOp): op.is_dataframe_op = True + if new_op is not None: op.replace_input_of_outputs(new_op) if sink is op: @@ -356,11 +438,33 @@ def make_datetime_conversion_op(new_op: DatetimeConversionOp, op: CallOp) -> Dat def make_read_op(new_op: DataSourceOp, op: CallOp) -> DataSourceOp: input_iter = iter(op.inputs) # assume all inputs are ValueOps - assert all(isinstance(arg, ValueOp) for arg in op.inputs), "All inputs must be ValueOps" - args = [next(input_iter).value if arg is DATA_OP_PLACEHOLDER else arg for arg in op.args] - kwargs = {k: next(input_iter).value if v is DATA_OP_PLACEHOLDER else v for k, v in op.kwargs.items()} - new_op = DataSourceOp(file_path=args[0], _format="csv", read_args=args[1:], read_kwargs=kwargs) - new_op.outputs = op.outputs + assert all(isinstance(arg, ValueOp) or isinstance(arg, VariableOp) for arg in op.inputs), "All inputs must be ValueOps or VariableOps" + inputs = [] + args = [] + for arg in op.args: + if arg is DATA_OP_PLACEHOLDER: + actual_input_op = next(input_iter) + if isinstance(actual_input_op, VariableOp): + args.append(DATA_OP_PLACEHOLDER) + inputs.append(actual_input_op) + else: + args.append(actual_input_op.value) + else: + args.append(arg) + kwargs = {} + for k, v in op.kwargs.items(): + if v is DATA_OP_PLACEHOLDER: + actual_input_op = next(input_iter) + if isinstance(actual_input_op, VariableOp): + kwargs[k] = DATA_OP_PLACEHOLDER + inputs.append(actual_input_op) + else: + kwargs[k] = actual_input_op.value + else: + kwargs[k] = v + new_op = DataSourceOp(file_path=args[0], _format="csv", read_args=args[1:], read_kwargs=kwargs, inputs=inputs, outputs=op.outputs) + for in_ in inputs: + in_.replace_output(op, new_op) return new_op diff --git a/stratum/logical_optimizer/_op_comparison.py b/stratum/logical_optimizer/_op_comparison.py index 0ad18b24..e63d3269 100644 --- a/stratum/logical_optimizer/_op_comparison.py 
+++ b/stratum/logical_optimizer/_op_comparison.py @@ -1,9 +1,11 @@ from typing import Iterable from sklearn.base import BaseEstimator +from skrub import SelectCols from skrub._data_ops import DataOp from skrub._data_ops._choosing import Choice -from skrub._data_ops._data_ops import Call, GetItem, CallMethod, GetAttr, Apply, Value, BinOp -from skrub.selectors._base import All +from skrub._data_ops._data_ops import Call, GetItem, CallMethod, GetAttr, Apply, Value, BinOp, Concat +from skrub.selectors._base import All, Filter, Inv +from pandas import isna def equals_data_op(op1: DataOp, op2: DataOp): """ @@ -54,6 +56,10 @@ def equals_skrub_impl(impl1, impl2): # TODO also match All with set(cols) if cols contains all columns of the input frame if set(cols1) == set(cols2): return estimator_equality_check(est1, est2) + elif isinstance(impl1, Concat): + # op1 = col1.skb.concat(col2, axis=1) + # op2 = col1.skb.concat(col2, axis=1) + return _stable_id(impl1.first) == _stable_id(impl2.first) and _stable_id(impl1.others) == _stable_id(impl2.others) elif isinstance(impl1, BinOp): # op1 = col1 / col2 # op2 = col1 / col2 @@ -72,7 +78,7 @@ def estimator_equality_check(est1: BaseEstimator, est2: BaseEstimator) -> bool: params2 = est2.get_params() for key, value in params1.items(): value2 = params2.get(key) - if value2 != value and ( + if value2 != value and not isna(value) and not isna(value2) and ( type(value) != type(value2) or not isinstance(value, BaseEstimator) or not estimator_equality_check(value, value2)): @@ -129,7 +135,10 @@ def hash_skrub_impl(impl) -> int: return hash((t, id(impl.X), col_ids, est_type, est_params)) elif isinstance(impl, BinOp): return hash((t, impl.op, _stable_id(impl.left), _stable_id(impl.right))) - + elif isinstance(impl, Concat): + # op1 = col1.skb.concat(col2, axis=1) + # op2 = col1.skb.concat(col2, axis=1) + return hash((_stable_id(impl.first), _stable_id(impl.others))) else: # Fallback for unknown DataOp types return hash((t, id(impl))) @@ -159,6 
+168,10 @@ def _stable_id(obj): return frozenset(_stable_id(x) for x in obj) elif isinstance(obj, dict): return frozenset((k, _stable_id(v)) for k, v in obj.items()) + elif isinstance(obj, Filter): + return id(obj.predicate) + elif isinstance(obj, Inv): + return _stable_id(obj.complement)*-1 elif hasattr(obj, "__hash__") and not isinstance(obj, DataOp): # hashable primitive or object return hash(obj) @@ -234,6 +247,14 @@ def update_data_op(op: DataOp, old_input: DataOp, new_input: DataOp): elif impl.right is old_input: impl.right = new_input return + elif isinstance(impl, Concat): + if impl.first is old_input: + impl.first = new_input + return + for i, other in enumerate(impl.others): + if other is old_input: + impl.others[i] = new_input + return raise Exception(f"Could not find old DataOp {old_input} during input update for {op}") diff --git a/stratum/logical_optimizer/_op_utils.py b/stratum/logical_optimizer/_op_utils.py index 7e492fb7..3f6ee4ea 100644 --- a/stratum/logical_optimizer/_op_utils.py +++ b/stratum/logical_optimizer/_op_utils.py @@ -2,10 +2,12 @@ from collections import deque from typing import Iterator from graphviz import Digraph -from stratum.logical_optimizer._ops import Op, ChoiceOp +from stratum.logical_optimizer._ops import DATA_OP_PLACEHOLDER, Op, ChoiceOp from stratum._config import get_config import os +bfs = False + def replace_op_in_outputs(op: Op, replacement: Op): """Replace op in all its outputs with a replacement op.""" @@ -110,18 +112,42 @@ def topological_iterator(sink: Op) -> Iterator[Op]: else: for in_op in op.inputs: if in_op not in indegree: - indegree[in_op] = 0 if not in_op.inputs else len(in_op.inputs) + if in_op is DATA_OP_PLACEHOLDER: + raise RuntimeError(f"Encountered DATA_OP_PLACEHOLDER as input of op {op}, which should not happen.") + curr_indegree = len(in_op.inputs) + (0 if in_op.additional_inputs is None else len(in_op.additional_inputs)) + indegree[in_op] = curr_indegree queue1.append(in_op) # now we can do 
topological traversal - while queue2: - op = queue2.popleft() + if bfs: + return topological_iterator_bfs(sink, queue2, indegree) + else: + return topological_iterator_dfs(sink, queue2, indegree) + +def topological_iterator_bfs(sink: Op, queue, indegree) -> Iterator[Op]: + while queue: + op = queue.popleft() yield op - for out_op in op.outputs: + op_outputs = op.outputs + (op.additional_outputs if op.additional_outputs is not None else []) + for out_op in op_outputs: + if out_op not in indegree: + raise RuntimeError(f"Encountered op {out_op} which should not exist in the DAG. Probably due to a buggy rewrite, which did not updated the its inputs / outputs correctly.") indegree[out_op] -= 1 if indegree[out_op] == 0: - queue2.append(out_op) + queue.append(out_op) +def topological_iterator_dfs(sink: Op, queue, indegree) -> Iterator[Op]: + stack = list(queue) + while stack: + op = stack.pop() + yield op + op_outputs = op.outputs + (op.additional_outputs if op.additional_outputs is not None else []) + for out_op in op_outputs: + if out_op not in indegree: + raise RuntimeError(f"Encountered op {out_op} which should not exist in the DAG. 
Probably due to a buggy rewrite, which did not updated the its inputs / outputs correctly.") + indegree[out_op] -= 1 + if indegree[out_op] == 0: + stack.append(out_op) def show_graph(sink: Op, filename: str = 'plan'): """Show the runtime plan of the DataOp DAG.""" @@ -134,6 +160,9 @@ def show_graph(sink: Op, filename: str = 'plan'): dot.node(str(id(current_op)), name) for outputs in current_op.outputs: dot.edge(str(id(current_op)), str(id(outputs))) + if current_op.additional_outputs is not None: + for additional_output in current_op.additional_outputs: + dot.edge(str(id(current_op)), str(id(additional_output)), color='blue') filename = "graphs/" + filename # make sure folder exists os.makedirs(os.path.dirname(filename), exist_ok=True) diff --git a/stratum/logical_optimizer/_ops.py b/stratum/logical_optimizer/_ops.py index 8c37968b..2531192f 100644 --- a/stratum/logical_optimizer/_ops.py +++ b/stratum/logical_optimizer/_ops.py @@ -1,13 +1,18 @@ from __future__ import annotations +import sys from types import SimpleNamespace from typing import Callable +from joblib import parallel_config from sklearn import clone from sklearn.base import BaseEstimator from skrub._data_ops._choosing import Choice -from skrub._data_ops._data_ops import DataOp, Apply, Value, CallMethod, Call, GetAttr, GetItem, BinOp as SkrubBinOp, _wrap_estimator -from pandas import DataFrame +from skrub._data_ops._data_ops import DataOp, Apply, Value, CallMethod, Call, GetAttr, GetItem, BinOp as SkrubBinOp, Concat, Var, _wrap_estimator +from pandas import DataFrame, Series from polars import DataFrame as PlDataFrame, Series as PlSeries +from stratum.runtime._hash_utils import stable_hash +import logging +logger = logging.getLogger(__name__) class PlaceHolder(): def __init__(self, name: str): @@ -27,17 +32,24 @@ def __init__(self, inputs=None,outputs=None, name=None, is_X=False, is_y=False): self.name = name self.outputs = outputs if outputs is not None else [] self.inputs = inputs if inputs is not 
None else [] + self.additional_inputs = None + self.additional_outputs = None self.intermediate = None self.is_X = is_X self.is_y = is_y self.is_dataframe_op = False self.is_split_op = False self.was_cloned = False + self.parallel_group = None + self.cached_hash = None def to_str_helper(self): class_name = self.__class__.__name__ is_df = " [df]" if self.is_dataframe_op else "" name = f"({self.name})" if self.name and len(self.name) > 0 else "" + # truncate name if it is too long + if len(name) > 50: + name = name[:50] + "..." return class_name, name, is_df def __str__(self): @@ -103,6 +115,28 @@ def check_kwargs(self, kwargs): f" {type(kwargs).__name__!r} instead: {kwargs!r}" ) + def simple_hash(self): + raise NotImplementedError(f"Simple_hash must be implemented in {self.__class__.__name__}") + + def get_hash(self): + if self.cached_hash is not None: + return self.cached_hash + sub_dag_hash = [op.get_hash() for op in self.inputs] + sub_dag_hash.append(self.simple_hash()) + self.cached_hash = stable_hash(sub_dag_hash) + return self.cached_hash + + + def get_intermediate_size(self): + if isinstance(self.intermediate, DataFrame): + return self.intermediate.memory_usage(deep=True).sum() + elif isinstance(self.intermediate, Series): + return self.intermediate.memory_usage(deep=True) + elif isinstance(self.intermediate, PlDataFrame) or isinstance(self.intermediate, PlSeries): + return self.intermediate.estimated_size() + else: + return sys.getsizeof(self.intermediate) + def clone_value(value): if isinstance(value, dict): return {k:clone_value(v) for k,v in value.items()} @@ -162,7 +196,22 @@ def process(self, mode: str, environment: dict): ns = self.replace_fields_with_values() self.intermediate = self.skrub_impl.compute(ns, mode, environment) -class EstimatorOp(Op): +class VariableOp(Op): + def __init__(self, name: str, value = None): + super().__init__(name=name) + self.name = name + if value is not None: + self.value = value + else: + self.value = "EMPTY_VARIABLE" + 
+ def clone(self): + return VariableOp(name=self.name) + + def process(self, mode: str, environment: dict): + self.intermediate = environment[self.name] + +class BaseEstimatorOp(Op): fields = ["estimator", "y", "cols", "how", "allow_reject", "unsupervised", "kwargs"] def __init__(self, estimator: BaseEstimator, y=None, cols=None, how="no-wrap", allow_reject=False, unsupervised=False, kwargs=None): @@ -171,18 +220,25 @@ def __init__(self, estimator: BaseEstimator, y=None, cols=None, how="no-wrap", a kwargs = {} self.check_kwargs(kwargs) self.estimator = estimator + place_holders = {k: v for k, v in self.estimator.get_params().items() if isinstance(v, DataOp)} + self.estimator.set_params(**place_holders) + self.original_estimator = clone(self.estimator) self.y = DATA_OP_PLACEHOLDER if isinstance(y, DataOp) else y self.cols = DATA_OP_PLACEHOLDER if isinstance(cols, DataOp) else cols self.how = how self.allow_reject = allow_reject self.unsupervised = unsupervised self.kwargs = remove_datops_from_args(kwargs) if kwargs is not None else kwargs + self.parallelism = 8 + + def simple_hash(self): + return stable_hash((self.estimator, self.y, self.cols, self.how, self.allow_reject, self.unsupervised, self.kwargs)) def clone(self): params = self.estimator.get_params() estimator_new = clone(self.estimator) estimator_new.set_params(**params) - new_op = EstimatorOp( + new_op = self.__class__( estimator=estimator_new, y=self.y, cols=self.cols, @@ -193,35 +249,119 @@ def clone(self): ) new_op.was_cloned = True return new_op - - def process(self, mode: str, environment: dict): + + def extract_args_from_inputs(self, mode: str): + """ + Extract all necessary data from an EstimatorOp to make it picklable for multiprocessing. + + Returns a tuple of picklable data that can be sent to worker processes. 
+ """ input_iter = iter(self.inputs) x = next(input_iter).intermediate - if isinstance(x, PlDataFrame): - x = x.to_pandas() - y = next(input_iter).intermediate if self.y == DATA_OP_PLACEHOLDER else self.y - if isinstance(y, PlSeries): - y = y.to_pandas() + assert x is not None, f"X is None for {self}" + y = None if mode == 'predict' else next(input_iter).intermediate if self.y == DATA_OP_PLACEHOLDER else self.y + estm = self.estimator if mode == "predict" else self.original_estimator + place_holders = {k: next(input_iter).intermediate for k, v in estm.get_params().items() if isinstance(v, DataOp)} + estm.set_params(**place_holders) cols = next(input_iter).intermediate if self.cols == DATA_OP_PLACEHOLDER else self.cols + return ( + estm, + x, + y, + cols, + self.how, + self.allow_reject, + self.unsupervised, + self.kwargs, + mode, + self.parallelism + ) + + def process(self, mode: str, environment: dict): + # we use a separate function to process the estimator to allow reuse for multiprocessing + task_data = self.extract_args_from_inputs(mode) + process_task = self.get_process_task() + self.intermediate, self.estimator = process_task(task_data) + + def get_process_task(self): + raise NotImplementedError(f"get_process_task must be implemented in {self.__class__.__name__}") + +class EstimatorOp(BaseEstimatorOp): + def get_process_task(self): + return process_estimator_task + +class TransformerOp(BaseEstimatorOp): + def get_process_task(self): + return process_transformer_task + +class DummyConfigManager: + """A no-op context manager that does nothing.""" + def __enter__(self): + return self + + def __exit__(self, *args): + return False + +def estimator_parallel_config(n_jobs: int = None): + if n_jobs is not None: + logger.debug(f"Using threading backend with {n_jobs} jobs") + return parallel_config(backend='threading', n_jobs=n_jobs) + else: + return DummyConfigManager() + +def estm_supports_polars(estimator): + is_sklearn = 
estimator.__class__.__module__.startswith("sklearn.") or estimator.__class__.__module__.startswith("skrub.") + is_stratum = estimator.__class__.__module__.startswith("stratum.") and estimator.__class__.__name__.startswith("Rusty") + # other_frameworks = estimator.__class__.__module__.startswith("xgboost.") + return is_sklearn or is_stratum #or other_frameworks + +def check_estm_inputs(estimator, mode, x, y): + input_is_polars = type(x) == PlDataFrame + converted = False + if estimator.__class__.__module__.startswith("skrub."): + if estimator.__class__.__name__.startswith("ApplyTo"): + estimator = estimator.transformer + if input_is_polars and not estm_supports_polars(estimator): + converted = True + logger.debug(f"Estimator {estimator.__class__.__name__} does not support Polars DataFrame. Converting to Pandas DataFrame.") + x = x.to_pandas() + if y is not None and mode == "fit_transform": + y = y.to_pandas() + return converted, x, y + +def process_estimator_task(task_data): + """ Process a predictor (EstimatorOp) task in a worker process. """ + (estimator, x, y, cols, how, allow_reject, unsupervised, kwargs, mode, parallelism) = task_data + _, x, y = check_estm_inputs(estimator, mode, x, y) + if mode == "fit_transform": + estimator = _wrap_estimator(estimator, cols, how=how, allow_reject=allow_reject, X=x) + y_arg = () if unsupervised else (y,) + estimator.fit(x, *y_arg, **kwargs) + result = estimator.predict(x, **kwargs) + # Return both result and fitted estimator (in case of multi-processing) + return result, estimator + elif mode == "predict": + result = estimator.predict(x, **kwargs) + return result, estimator + else: + raise ValueError(f"Mode {mode} not supported for EstimatorOp.") + +def process_transformer_task(task_data): + """ Process a transformer (TransformerOp) task in a worker process. 
""" + (estimator, x, y, cols, how, allow_reject, unsupervised, kwargs, mode, parallelism) = task_data + converted, x, y = check_estm_inputs(estimator, mode, x, y) + with estimator_parallel_config(parallelism): if mode == "fit_transform": - self.estimator = _wrap_estimator(self.estimator, cols, how=self.how, allow_reject=self.allow_reject, X=x) - y_arg = () if self.unsupervised else (y,) - if not hasattr(self.estimator, mode): - # Predictors - self.estimator.fit(x, *y_arg, **self.kwargs) - self.intermediate = self.estimator.predict(x, **self.kwargs) - else: - # Transformers - self.intermediate = self.estimator.fit_transform(x, *y_arg, **self.kwargs) + estimator = _wrap_estimator(estimator, cols, how=how, allow_reject=allow_reject, X=x) + y_arg = () if unsupervised else (y,) + result = estimator.fit_transform(x, *y_arg, **kwargs) elif mode == "predict": - if not hasattr(self.estimator, mode): - # Transformers - self.intermediate = self.estimator.transform(x, **self.kwargs) - else: - # Predictors - self.intermediate = self.estimator.predict(x, **self.kwargs) + result = estimator.transform(x, **kwargs) else: - raise ValueError(f"Mode {mode} not supported for EstimatorOp.") + raise ValueError(f"Mode {mode} not supported for TransformerOp.") + if converted: + result = PlDataFrame(result) + return result, estimator class ChoiceOp(Op): @@ -248,7 +388,11 @@ def make_outcome_names(self): ) for combi in self.outcome_names] def update_name(self): - self.name = " | ".join(self.make_outcome_names()) + opts = " | ".join(self.make_outcome_names()) + max_len = 50 + if len(opts) > max_len: + opts = opts[:max_len] + "..." 
+ self.name = opts def clone(self): new_op = ChoiceOp(outcome_names=self.outcome_names, append_choice_name=False) @@ -287,6 +431,8 @@ def __init__(self, method_name: str, args = None, kwargs = None): def process(self, mode: str, environment: dict): iter_ins = iter(self.inputs) _obj = next(iter_ins).intermediate + if isinstance(_obj, PlDataFrame) or isinstance(_obj, PlSeries): + _obj = _obj.to_pandas() _args = [next(iter_ins).intermediate if arg is DATA_OP_PLACEHOLDER else arg for arg in self.args] _kwargs = {k: next(iter_ins).intermediate if v is DATA_OP_PLACEHOLDER else v for k, v in self.kwargs.items()} self.intermediate = _obj.__getattribute__(self.method_name)(*_args, **_kwargs) @@ -294,7 +440,9 @@ def process(self, mode: str, environment: dict): class CallOp(Op): fields = ["func", "args", "kwargs"] - def __init__(self, name: str = "CallOp", func=None, args=None, kwargs=None): + def __init__(self, name=None, func=None, args=None, kwargs=None): + if name is None: + name = "CallOp" if func is None else func.__name__ super().__init__(name=name) if kwargs is not None: self.check_kwargs(kwargs) @@ -319,19 +467,30 @@ def process(self, mode: str, environment: dict): if self.is_dataframe_op: self.intermediate = self.inputs[0].intermediate for attr in self.attr_name: - self.intermediate = self.intermediate.__getattribute__(attr) + self.intermediate = getattr(self.intermediate, attr) else: - self.intermediate = self.inputs[0].intermediate.__getattribute__(self.attr_name) + self.intermediate = getattr(self.inputs[0].intermediate, self.attr_name) class GetItemOp(Op): fields = ["key"] def __init__(self, key=None): - super().__init__(name=str(key) if key is not None else '?') - self.key = key + self.key = DATA_OP_PLACEHOLDER if isinstance(key, DataOp) else key + name = key._skrub_impl.__class__.__name__ if isinstance(key, DataOp) else str(self.key) + super().__init__(name=name) + def process(self, mode: str, environment: dict): - self.intermediate = 
self.inputs[0].intermediate[self.key] + key = self.key + if key is DATA_OP_PLACEHOLDER: + key = self.inputs[1].intermediate + self.intermediate = self.inputs[0].intermediate[key] + + def simple_hash(self): + if isinstance(self.key, str) or isinstance(self.key, list): + return stable_hash(self.key) + else: + raise NotImplementedError(f"Hashing is nt implemented for key type: {type(self.key)}") class BinOp(Op): fields = ["op", "left", "right"] @@ -343,7 +502,7 @@ def __init__(self, op: Callable, left, right): self.right = DATA_OP_PLACEHOLDER if isinstance(right, DataOp) else right - def process(self, mode: str, environment: dict): + def process(self, mode: str, environment: dict, cv_id = None): i = 0 if self.left is DATA_OP_PLACEHOLDER: left = self.inputs[i].intermediate @@ -411,14 +570,20 @@ def as_op(data_op: DataOp): elif isinstance(impl, SkrubBinOp): return_op = BinOp(op=impl.op, left=impl.left, right=impl.right) elif isinstance(impl, Apply): - return_op = EstimatorOp( + estimator_class = EstimatorOp if hasattr(impl.estimator, "predict") else TransformerOp + return_op = estimator_class( y=impl.y, estimator=impl.estimator, cols=impl.cols, how=impl.how, allow_reject=impl.allow_reject, unsupervised=impl.unsupervised, - kwargs=impl.kwargs if hasattr(impl, "kwargs") else {}) + kwargs= {}) + elif isinstance(impl, Var): + return_op = VariableOp(name=impl.name, value=impl.value) + elif isinstance(impl, Concat): + from stratum.logical_optimizer._dataframe_ops import ConcatOp + return_op = ConcatOp(first=impl.first, others=impl.others, axis=impl.axis) else: return_op = ImplOp(skrub_impl=impl, name=data_op.__skrub_short_repr__()) diff --git a/stratum/logical_optimizer/_optimize.py b/stratum/logical_optimizer/_optimize.py index 72d1a6a9..de3ee26c 100644 --- a/stratum/logical_optimizer/_optimize.py +++ b/stratum/logical_optimizer/_optimize.py @@ -1,13 +1,12 @@ -from numpy import True_ -from skrub._data_ops._evaluation import _Graph from skrub._data_ops import DataOp from 
skrub._data_ops._subsampling import SubsamplePreviews from collections import deque from ._cse import apply_cse -from ._dataframe_ops import rewrite_dataframe_ops, add_splitting_op +from ._dataframe_ops import add_splitting_op from ._dataframe_ops import rewrite_dataframe_ops, group_dataframe_ops from ._ops import ChoiceOp, ImplOp, Op, SearchEvalOp, as_op from ._op_utils import clone_sub_dag, find_choice_naive, replace_op_in_outputs, show_graph, topological_iterator +from ._skrub_graph import build_graph from time import perf_counter import logging from stratum._config import FLAGS @@ -38,7 +37,7 @@ def topological_traverse(nodes, parents, children): def apply_cse_on_skrub_ir(dag: DataOp): """ Apply CSE on a Skrub DataOp DAG and return the deduplicated DAG. (Deprecated versio of optimize function)""" - graph = _Graph().run(dag) + graph = build_graph(dag) nodes = graph["nodes"] parents = graph["parents"] children = graph["children"] @@ -48,6 +47,7 @@ def apply_cse_on_skrub_ir(dag: DataOp): return dag class OptConfig(): + # TODO we should move this class to the _config.py file def __init__(self, cse: bool = True, unroll_choices: bool = True, dataframe_ops: bool = True): self.cse = cse self.dataframe_ops = dataframe_ops @@ -57,58 +57,83 @@ def _debug_show_graph(sink: Op, name: str): if FLAGS.DEBUG: show_graph(sink, name) -def optimize(dag: DataOp, config: OptConfig = None): +def optimize(dag_sink: DataOp, config: OptConfig = None): """ Entry point for the logical optimizer. 
Takes a Skrub DataOp DAG, applies logical optimizations - and returns a topologically sorted list of Op nodes.""" + and returns an Op sink node.""" t0 = perf_counter() if config is None: config = OptConfig() - graph = _Graph().run(dag) - nodes = graph["nodes"] - parents = graph["parents"] - children = graph["children"] - + t0_graph = perf_counter() + g = build_graph(dag_sink) + nodes = g["nodes"] + parents = g["parents"] + children = g["children"] + t1_graph = perf_counter() + logger.info(f"Graph construction took {t1_graph - t0_graph:.2f} seconds") order = topological_traverse(nodes, parents, children) - if config.cse: - apply_cse(dag, nodes, order, parents) + if FLAGS.cse: + t0_cse = perf_counter() + apply_cse(dag_sink, nodes, order, parents) # TODO cse should direcly return the new list of ops ordered so we dont have to iterate again + t1_cse = perf_counter() + logger.info(f"CSE took {t1_cse - t0_cse:.2f} seconds") - sink = convert_to_ops(dag) + t0_convert = perf_counter() + sink = convert_to_ops(dag_sink) + t1_convert = perf_counter() + logger.info(f"Conversion took {t1_convert - t0_convert:.2f} seconds") + + t0_splitting = perf_counter() sink = add_splitting_op(sink) - _debug_show_graph(sink, "convertion") + t1_splitting = perf_counter() + logger.info(f"Splitting took {t1_splitting - t0_splitting:.2f} seconds") + + _debug_show_graph(sink, "convertion") + t1_splitting = perf_counter() + logger.info(f"Splitting took {t1_splitting - t0_splitting:.2f} seconds") # Rewrites: # Parsing of dataframe ops if config.dataframe_ops: + t0_dataframe = perf_counter() sink = rewrite_dataframe_ops(sink) sink = group_dataframe_ops(sink) _debug_show_graph(sink, "dataframe_rewrite") - + t1_dataframe = perf_counter() + logger.info(f"Dataframe rewrite took {t1_dataframe - t0_dataframe:.2f} seconds") # Unrolling of choices to a dag wit only a single choice op at the end if config.unroll_choices: + t0_choices = perf_counter() sink = choice_unrolling(sink) + _debug_show_graph(sink, 
"unrolled") + t1_choices = perf_counter() + logger.info(f"Choices unrolling took {t1_choices - t0_choices:.2f} seconds") # Final optimized DAG - _debug_show_graph(sink, "optimized") - output = [op for op in topological_iterator(sink)] + t1 = perf_counter() - logger.info("="*100 + f"\nOptimization took {t1 - t0:.2f} seconds\n" + "="*100) - return output + logger.info(f"Optimization took in total {t1 - t0:.2f} seconds") + return sink def convert_to_ops(dag: DataOp) -> Op: """ Convert a Skrub DataOp DAG to a stratum's logical IR (Op DAG)""" - graph = _Graph().run(dag) - nodes = graph["nodes"] - parents = graph["parents"] - children = graph["children"] - + t0_convert = perf_counter() + g = build_graph(dag) + nodes = g["nodes"] + parents = g["parents"] + children = g["children"] + t1_convert = perf_counter() + logger.info(f"Conversion dag took {t1_convert - t0_convert:.2f} seconds") order = topological_traverse(nodes, parents, children) sink_id = order[-1] + # make logical IR: + # we start by making unconnected ops ids_to_ops = {node: as_op(nodes[node]) for node in order} + # we then connect the ops to a graph for node in order: op = ids_to_ops[node] if isinstance(op, ImplOp) and isinstance(op.skrub_impl, SubsamplePreviews): @@ -169,7 +194,9 @@ def choice_unrolling(sink: Op): else: assert sink is last_op, "Sink should be the last op in the dag" # we reached the end of the dag + logger.debug(f"Unrolling simple choice: {op}") sink = unroll_simple_choice(sink, op, outcomes) + logger.debug(f"New sink after unrolling: {sink}") # if FLAGS.DEBUG: # show_graph(sink, f"choice-unrolled={i}") diff --git a/stratum/logical_optimizer/_skrub_graph.py b/stratum/logical_optimizer/_skrub_graph.py new file mode 100644 index 00000000..84d7525a --- /dev/null +++ b/stratum/logical_optimizer/_skrub_graph.py @@ -0,0 +1,92 @@ +"""Fast graph extraction from a skrub DataOp DAG. 
+ +Drop-in replacement for ``skrub._data_ops._evaluation._Graph().run(dag)`` +that avoids the heavyweight generator-based ``_DataOpTraversal`` machinery. +We only need the DataOp-to-DataOp adjacency; choices, estimators, slices etc. +are irrelevant for graph structure and can be skipped. +""" + +from collections import defaultdict +from skrub._data_ops import DataOp +from skrub._data_ops._choosing import BaseChoice, Choice, Match + + +_BUILTIN_SEQ = (list, tuple, frozenset, set) + + +def _collect_child_data_ops(value): + """Yield all DataOp objects reachable from *value*. + + Handles DataOps stored directly in a field, or nested inside the built-in + container types that skrub uses (tuple, list, dict, set, frozenset), + as well as skrub Choice/Match wrappers. + """ + if isinstance(value, DataOp): + yield value + elif isinstance(value, Match): + yield from _collect_child_data_ops(value.choice) + yield from _collect_child_data_ops(value.outcome_mapping) + elif isinstance(value, Choice): + for outcome in value.outcomes: + yield from _collect_child_data_ops(outcome) + elif isinstance(value, BaseChoice): + pass + elif isinstance(value, dict): + for v in value.values(): + yield from _collect_child_data_ops(v) + elif isinstance(value, _BUILTIN_SEQ): + for item in value: + yield from _collect_child_data_ops(item) + + +def _unique(seq): + """Deduplicate while preserving order.""" + return list(dict.fromkeys(seq)) + + +def build_graph(data_op): + """Build the graph dict for a DataOp DAG. + + Returns the same ``{"nodes", "children", "parents"}`` dict produced by + ``skrub._data_ops._evaluation._Graph().run()``, with integer ids starting + from 0. + + Uses an iterative stack-based DFS that only visits DataOp nodes, + skipping the generator protocol and all non-DataOp node types. 
+ """ + raw_nodes = {} + raw_children = defaultdict(list) + raw_parents = defaultdict(list) + + stack = [data_op] + visited = set() + + while stack: + node = stack.pop() + node_id = id(node) + if node_id in visited: + continue + visited.add(node_id) + raw_nodes[node_id] = node + + impl = node._skrub_impl + for field_name in impl._fields: + attr = getattr(impl, field_name) + for child in _collect_child_data_ops(attr): + child_id = id(child) + raw_children[node_id].append(child_id) + raw_parents[child_id].append(node_id) + if child_id not in visited: + stack.append(child) + + short = {obj_id: i for i, obj_id in enumerate(raw_nodes)} + nodes = {short[k]: v for k, v in raw_nodes.items()} + children = { + short[k]: [short[c] for c in _unique(v)] + for k, v in raw_children.items() + } + parents = { + short[k]: [short[p] for p in _unique(v)] + for k, v in raw_parents.items() + } + return {"nodes": nodes, "children": children, "parents": parents} diff --git a/stratum/patching/_gridsearch.py b/stratum/patching/_gridsearch.py index 18148354..c827b132 100644 --- a/stratum/patching/_gridsearch.py +++ b/stratum/patching/_gridsearch.py @@ -19,6 +19,7 @@ def _stratum_make_grid_search(self, *, fitted=False, keep_subsampling=False, **k cv = kwargs.get("cv", None) scoring = kwargs.get("scoring", None) return_predictions = kwargs.get("return_predictions", False) + env = kwargs.get("environment", {}) # Get the DataOp from the namespace instance dag = self._data_op @@ -27,7 +28,8 @@ def _stratum_make_grid_search(self, *, fitted=False, keep_subsampling=False, **k dag=dag, cv=cv, scoring=scoring, - return_predictions=return_predictions + return_predictions=return_predictions, + env=env ) else: # Fall back to original implementation diff --git a/stratum/runtime/_caching.py b/stratum/runtime/_caching.py new file mode 100644 index 00000000..48d4c497 --- /dev/null +++ b/stratum/runtime/_caching.py @@ -0,0 +1,112 @@ +from fileinput import filename +import json +import os +import logging 
+import polars as pl +import pandas as pd +from time import perf_counter + + +logger = logging.getLogger(__name__) + +CACHE_DIR = os.path.join(os.path.expanduser("~"), ".stratum", "cache") +if not os.path.exists(CACHE_DIR): + os.makedirs(CACHE_DIR) +INTERMEDIATES_DIR = os.path.join(CACHE_DIR, "intermediates") +if not os.path.exists(INTERMEDIATES_DIR): + os.makedirs(INTERMEDIATES_DIR) + + +class Cache: + def __init__(self): + self.cache = {} + self.timings = [] + # try to load cache from file + if os.path.exists(os.path.join(CACHE_DIR, "cache.json")): + logger.info(f"Loading cache from {os.path.join(CACHE_DIR, 'cache.json')}") + t0 = perf_counter() + with open(os.path.join(CACHE_DIR, "cache.json"), "r") as f: + tmp_cache = json.load(f) + for key,(file_name, converted) in tmp_cache.items(): + key = int(key) + self.cache[key] = read_value(file_name, converted) + t1 = perf_counter() + duration = t1 - t0 + logger.info(f"Cache loaded in {duration} seconds") + self.timings.append(("load_cache", duration)) + + # Cache operation counters + self.hit_count = 0 + self.miss_count = 0 + self.set_count = 0 + self.hit_time = 0.0 + self.set_time = 0.0 + + def get(self, key): + t0 = perf_counter() + result = self.cache.get(key) + t1 = perf_counter() + duration = t1 - t0 + + if result is not None: + self.hit_count += 1 + self.hit_time += duration + else: + self.miss_count += 1 + + return result + + def set(self, key, value): + t0 = perf_counter() + self.cache[key] = value + t1 = perf_counter() + duration = t1 - t0 + self.set_count += 1 + self.set_time += duration + + def persist(self): + logger.info(f"Saving cache to {os.path.join(CACHE_DIR, 'cache.json')}") + t0 = perf_counter() + file_name_cache = {} + for key, value in self.cache.items(): + converted = isinstance(value, pd.DataFrame) + if not check_if_intermediate_exists(key): + write_value(key, value) + else: + logger.debug(f"Intermediate {key} already exists, skipping write") + file_name_cache[key] = 
(make_intermediate_file_name(key), converted) + # clear existing cache file + if os.path.exists(os.path.join(CACHE_DIR, "cache.json")): + os.remove(os.path.join(CACHE_DIR, "cache.json")) + # write new cache file + with open(os.path.join(CACHE_DIR, "cache.json"), "w") as f: + json.dump(file_name_cache, f) + t1 = perf_counter() + duration = t1 - t0 + logger.info(f"Cache saved in {duration} seconds") + self.timings.append(("save_cache", duration)) + del self.cache + + +def make_intermediate_file_name(key): + return os.path.join(INTERMEDIATES_DIR, f"{key}.parquet") + +def check_if_intermediate_exists(key): + return os.path.exists(make_intermediate_file_name(key)) + +def read_value(file_name, converted=False): + if not os.path.exists(file_name): + raise RuntimeError(f"Intermediate {file_name} not found. Cache is corrupted. Please do 'rm -rf {CACHE_DIR}' and run your code again.") + df = pl.read_parquet(file_name) + if converted: + df = df.to_pandas() + return df + +def write_value(key, value): + if isinstance(value, pd.DataFrame): + value = pl.from_pandas(value) + if isinstance(value, pl.DataFrame): + with open(make_intermediate_file_name(key), "wb") as f: + value.write_parquet(f) + else: + raise ValueError(f"Unsupported value type: {type(value)}") \ No newline at end of file diff --git a/stratum/runtime/_hash_utils.py b/stratum/runtime/_hash_utils.py new file mode 100644 index 00000000..159454a5 --- /dev/null +++ b/stratum/runtime/_hash_utils.py @@ -0,0 +1,54 @@ +import hashlib +from sklearn.base import BaseEstimator +from skrub import TableVectorizer + +def _stable_hash_tuple(items): + """ + Hash a tuple/sequence of items deterministically across processes. 
+ """ + # Create a deterministic hash by hashing the stable hashes of each item + hash_values = tuple(stable_hash(item) for item in items) + # Convert tuple of integers to bytes for hashing + # Use struct.pack or a simple byte representation + # For simplicity, use a delimiter-separated string representation + byte_data = b'|'.join(str(h).encode('utf-8') for h in hash_values) + return int.from_bytes(hashlib.sha256(byte_data).digest()[:8], byteorder='big') + + +def hash_estimator(est: BaseEstimator) -> int: + """ + Hash an estimator. + """ + param_hashes = [] + items = list(est.get_params().items()) + for key, value in items: + if key != "fitted_": + if isinstance(value, BaseEstimator): + param_hashes.append((key, hash_estimator(value))) + else: + param_hashes.append(((key, stable_hash(value)))) + if "fitted_" in items: + param_hashes.append(("fitted_", stable_hash(est.fitted_))) + return _stable_hash_tuple(param_hashes) + +def stable_hash(obj): + if isinstance(obj, str): + # Use SHA256 for stable hashing across processes + return int.from_bytes(hashlib.sha256(obj.encode('utf-8')).digest()[:8], byteorder='big') + elif isinstance(obj, BaseEstimator): + return hash_estimator(obj) + elif isinstance(obj, (int, float, bool, type(None))): + # These types have stable representations + # Convert to string and hash for consistency + return int.from_bytes(hashlib.sha256(repr(obj).encode('utf-8')).digest()[:8], byteorder='big') + elif isinstance(obj, list): + return _stable_hash_tuple(obj) + elif isinstance(obj, tuple): + return _stable_hash_tuple(obj) + elif isinstance(obj, dict): + # Sort items by key hash for deterministic ordering + sorted_items = sorted(obj.items(), key=lambda x: stable_hash(x[0])) + return _stable_hash_tuple((stable_hash(key), stable_hash(value)) for key, value in sorted_items) + else: + # For other types, use repr() to get a stable string representation + return int.from_bytes(hashlib.sha256(repr(obj).encode('utf-8')).digest()[:8], byteorder='big') \ No 
newline at end of file diff --git a/stratum/runtime/_physical_planning.py b/stratum/runtime/_physical_planning.py new file mode 100644 index 00000000..a3b66e5e --- /dev/null +++ b/stratum/runtime/_physical_planning.py @@ -0,0 +1,85 @@ +from skrub import StringEncoder, TableVectorizer +from stratum.logical_optimizer._op_utils import topological_iterator +from stratum.logical_optimizer._ops import EstimatorOp, Op, TransformerOp +from skrub._data_ops._data_ops import _wrap_estimator +from time import perf_counter +import uuid +import logging +logger = logging.getLogger(__name__) + +def get_estimator_memory_estimate(op: Op, size = 1) -> int | None: + if isinstance(op, TransformerOp): + estm = op.estimator + if isinstance(estm, TableVectorizer): + return 10*size + elif isinstance(estm, StringEncoder): + return 3*size + return None + # elif isinstance(op, EstimatorOp): + # return 10*size + else: + return 1 + +def get_independent_set(ops: list[Op], ancestors: dict[Op]) -> list[Op]: + # Find the largest subset of ops that don't depend on each other + # Two ops conflict if one is an ancestor of the other + def have_dependency(est1: Op, est2: Op) -> bool: + """Check if est1 and est2 have a dependency (one is ancestor of the other).""" + return est1 in ancestors.get(est2, set()) or est2 in ancestors.get(est1, set()) + + # Greedily find the largest independent set + # TODO instead of greedily finding the largest independent set, we should decide based of mem and compute estimates + # Sort by number of conflicts (fewer conflicts first) to maximize the set size + conflict_counts = {est: sum(1 for other in ops if have_dependency(est, other)) + for est in ops} + + # Sort by conflict count (ascending) - estimators with fewer conflicts are prioritized + # prefer string encoder and table vectorizer over other estimators if they have the same conflict count + sorted_ests = sorted(ops, key=lambda e: (conflict_counts[e], not (isinstance(e.estimator, StringEncoder) or 
isinstance(e.estimator, TableVectorizer)))) + + # Greedily build the largest independent set + independent_set = [] + for est in sorted_ests: + # Check if this estimator conflicts with any already in the set + if not any(have_dependency(est, added) for added in independent_set): + independent_set.append(est) + + return independent_set + +def mark_ops_for_parallelization(ops: list[Op], ancestors: dict[Op]): + par_group_id = uuid.uuid4() + ops = [op for op in ops if get_estimator_memory_estimate(op) is not None] + selected_ops = get_independent_set(ops, ancestors) + if len(selected_ops) > 1: + selected_ops_str = ",".join(op.name for op in selected_ops) + logger.debug(f"Selected {len(selected_ops)} ops for parallelization: [{selected_ops_str}]") + for op in selected_ops: + op.parallel_group = par_group_id + else: + logger.debug(f"No ops selected for parallelization. Not enough ops to parallelize: {len(selected_ops)}.") + + +def compute_ancestors(sink: Op) -> dict[Op]: + """ Compute the ancestors of each op in the DAG. """ + ancestors = {op: set() for op in topological_iterator(sink)} + for op in topological_iterator(sink): + ancestors[op] = set() + for in_ in op.inputs: + ancestors[op].update(ancestors[in_]) + ancestors[op].add(in_) + return ancestors + +def physical_planning(sink: Op) -> Op: + """ Apply physical planning to the DAG. 
""" + t0 = perf_counter() + ancestors = compute_ancestors(sink) + + estimators = [op for op in topological_iterator(sink) if isinstance(op, EstimatorOp)] + mark_ops_for_parallelization(estimators, ancestors) + transformers = [op for op in topological_iterator(sink) if isinstance(op, TransformerOp)] + mark_ops_for_parallelization(transformers, ancestors) + # make_parallel_block(estimators, ancestors) + # make_parallel_block(transformers, ancestors) + t1 = perf_counter() + logger.info(f"Physical planning took: {t1 - t0:.2f} seconds") + return sink \ No newline at end of file diff --git a/stratum/runtime/_scheduler.py b/stratum/runtime/_scheduler.py index 1fed012c..2cc3cf8a 100644 --- a/stratum/runtime/_scheduler.py +++ b/stratum/runtime/_scheduler.py @@ -1,55 +1,122 @@ +import ctypes +import gc +import sys from time import perf_counter +from numpy import int32 +import psutil from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split, check_cv -from sklearn.metrics._scorer import _Scorer +from sklearn.metrics._scorer import _Scorer, get_scorer from skrub._data_ops._data_ops import EvalMode from stratum.logical_optimizer._dataframe_ops import SplitOp -from stratum.logical_optimizer._ops import ImplOp, Op +from stratum.logical_optimizer._op_utils import show_graph, topological_iterator +from stratum.logical_optimizer._ops import EstimatorOp, ImplOp, Op, TransformerOp +from joblib import Parallel, delayed +from concurrent.futures import ThreadPoolExecutor import polars as pl +from stratum._config import FLAGS +import os +from dataclasses import dataclass import logging + +from stratum.runtime._hash_utils import stable_hash logger = logging.getLogger(__name__) + +@dataclass +class _SchedulerFlags: + show_memory_usage: bool = False + stratum_gc: bool = True + stratum_malloc_trim: bool = False + +SchedulerFlags = _SchedulerFlags() + + +_libc = None +if sys.platform == "linux": + try: + _libc = ctypes.CDLL("libc.so.6") + except 
OSError: + pass + +def _malloc_trim(): + """Ask glibc to return free heap pages to the OS.""" + if _libc is not None: + _libc.malloc_trim(0) + + +def measure_memory_usage(): + memory_usage = psutil.Process().memory_info().rss + return format_bytes(memory_usage) + +def format_bytes(bytes: int32): + l = ["B", "KB", "MB", "GB"] + for i in range(len(l)): + if bytes < 1024: + return f"{bytes:.2f} {l[i]}" + bytes /= 1024 + return f"{bytes:.2f} {l[-1]}" + +def get_scoring_func(scoring): + """Get scoring function from str or _Scorer object.""" + if type(scoring) == str: + scoring = get_scorer(scoring) + if type(scoring) == _Scorer: + logger.info(f"Using scorer: {scoring}") + greater_is_better = scoring._sign > 0 + scoring_func = scoring._score_func + else: + greater_is_better = False + scoring_func = mean_squared_error + return scoring_func, greater_is_better + class Scheduler: """Scheduler for executing DataOpDAGs in topological order.""" - def __init__(self, ops_ordered: list[Op], print_heavy_hitters=False): + def __init__(self, print_heavy_hitters=False, cache=None, env=None, t0 = None): """Initialize scheduler with a data operations DAG.""" - self.ops_ordered = ops_ordered self.mode = "fit_transform" - self.env = {} + self.env = env if env else {} self.flagged_for_recomputation = [] self.pos_split_op = None self.timings = [] if print_heavy_hitters else None self.results_ = None + self.cv_id = -1 + self.cache = cache + self.intermediate_dependencies = {} + self.t0 = t0 if t0 is not None else perf_counter() - def evaluate(self, seed: int = 42, test_size = 0.2): - """Evaluate the pipeline with a train/test split and return predictions.""" - try: - split_op = self.compute_xy() - except RuntimeError as e: - if "X and y nodes not found in the DAG" in str(e): - logger.warning("X and y nodes not found in the DAG, returning the last node") - return self.ops_ordered[-1].intermediate - else: - raise e + def run_gc(self): + if SchedulerFlags.stratum_gc: + freed_any = False + kv = 
list(self.intermediate_dependencies.items()) + for k, v in kv: + if v == 0: + logger.debug(f"GC: deleting {k}") + k.intermediate = None + del self.intermediate_dependencies[k] + freed_any = True - train_index, test_index = train_test_split(range(len(split_op.inputs[0].intermediate)), test_size=test_size, random_state=seed) - split_op.indices = train_index - self.compute(self.pos_split_op) - split_op.indices = test_index - pred = self.compute(self.pos_split_op, mode="predict") - return pred["vals"][0] + if freed_any and SchedulerFlags.stratum_malloc_trim: + gc.collect() + _malloc_trim() def grid_search(self, cv=None, scoring=None, return_predictions=False): - """Perform grid search with cross-validation on the DataOp DAG in a sequential top-down manner.""" + """Perform grid search with cross-validation on the logical DAG.""" # default to scikit-learn's CV cv = check_cv(cv) - # start with computing till X and y node + if SchedulerFlags.show_memory_usage: + memory_usage = measure_memory_usage() + logger.debug(f"Memory usage at start of grid search: {memory_usage}") + + # start with computing till we reach the split op logger.debug("\n" + "="*100 + "\n" + "Starting grid search" + "\n" + "="*100 + "\n") split_op = self.compute_xy() + for in_ in split_op.inputs: + self.intermediate_dependencies[in_] *= cv.get_n_splits()*2 results, predictions = [], [] logger.debug("\n" + "="*100 + "\n" + "XY computed" + "\n" + "="*100 + "\n") @@ -57,23 +124,13 @@ def grid_search(self, cv=None, scoring=None, return_predictions=False): self.results_ = results return predictions if return_predictions else None - - def get_scoring_func(self, scoring): - """Get scoring function from str or _Scorer object.""" - if type(scoring) == str: - coeff = -1 if scoring.startswith("neg_") else 1 - scoring_func = lambda test, pred: mean_squared_error(test, pred) * coeff - elif type(scoring) == _Scorer: - scoring_func = scoring._score_func - else: - scoring_func = mean_squared_error - return scoring_func - 
def cross_validate(self, split_op, cv, scoring, predictions: list, results: list, return_predictions: bool): - scoring_func = self.get_scoring_func(scoring) + """Perform cross-validation on the logical DAG.""" + scoring_func, greater_is_better = get_scoring_func(scoring) # TODO we can parallelize over the folds for i, (train_index, test_index) in enumerate(cv.split(split_op.inputs[0].intermediate)): + self.cv_id = i logger.debug(f"CV Fold Nr. {i + 1}") # fit and predict the pipeline @@ -81,21 +138,105 @@ def cross_validate(self, split_op, cv, scoring, predictions: list, results: list self.compute(self.pos_split_op) logger.debug("\n" + "="*100 + "\n" + "Training done for fold " + str(i+1) + "\n" + "="*100 + "\n") split_op.indices = test_index - df = self.compute(self.pos_split_op, mode="predict") + df, y_test = self.compute(self.pos_split_op, mode="predict") logger.debug("\n" + "="*100 + "\n" + "Predicting done for fold " + str(i+1) + "\n" + "="*100 + "\n") if return_predictions: predictions.append(df) # scoring - y_test = split_op.intermediate[1] df = df.with_columns(df["vals"].map_elements(lambda pred: scoring_func(y_test, pl.Series(pred))).alias("scores")) df = df.drop("vals") results.append(df) results = pl.concat(results) - results = results.group_by("id").mean().sort("scores", descending=True) + results = results.group_by("id").mean().sort("scores", descending=greater_is_better) return results + def process_op(self, op: Op): + """Process a single DataOp node and return its output.""" + if SchedulerFlags.stratum_gc: + for in_ in op.inputs: + self.intermediate_dependencies[in_] -= 1 + logger.debug(f"[{perf_counter() - self.t0:.2f}s] Processing op: {op}") + + try: + # cache lookup + cache_key = None + if self.cache is not None and isinstance(op, TransformerOp) and op.name == "TableVectorizer": + cache_key = stable_hash((op.get_hash(), self.cv_id, self.mode)) + logger.debug(f"Cache lookup for op: {op} with key: {cache_key}") + cache_value = 
self.cache.get(cache_key) + if cache_value is not None: + logger.debug(f"Cache hit for op: {op}") + op.intermediate = cache_value + return op + + t0 = perf_counter() if self.timings is not None else 0 + op.process(mode=self.mode, environment=self.env) + if self.timings is not None: + duration = perf_counter() - t0 + self.timings.append((str(op), duration)) + + # cache write + if self.cache is not None and isinstance(op, TransformerOp) and op.name == "TableVectorizer": + cache_value = op.intermediate + self.cache.set(cache_key, cache_value) + logger.debug(f"Cached result of op: {op} with key: {cache_key}") + + except Exception as e: + raise RuntimeError(f"[{self.mode}] Error processing '{op}': {e}") + + self.run_gc() + self.intermediate_dependencies[op] = len(op.outputs) + + if SchedulerFlags.show_memory_usage: + gc.collect() + memory_usage = measure_memory_usage() + logger.debug(f"[{(perf_counter() - self.t0):.2f}s] Memory usage after processing {op}: {memory_usage}") + logger.debug(f"Memory usage of intermediate of {op}: {format_bytes(op.get_intermediate_size())}") + + return op + + def _format_predict_result(self, pred): + """Helper method to format prediction results consistently.""" + if isinstance(pred, list): + return pl.DataFrame(pred) + elif isinstance(pred, dict) and "id" in pred and "vals" in pred: + return pl.DataFrame([pred]) + else: + return pl.DataFrame({"vals": [pred], "id": ["default"]}) + + def _flag_op_for_recomputation_if_needed(self, op: Op): + """Helper method to flag an op for recomputation if it's an ImplOp with EvalMode.""" + if isinstance(op, ImplOp) and isinstance(op.skrub_impl, EvalMode): + self.flagged_for_recomputation.append(op) + +class SequentialScheduler(Scheduler): + def __init__(self, dag_sink: Op, print_heavy_hitters=False, cache=None, env=None, t0 = None): + super().__init__(print_heavy_hitters, cache=cache, env=env, t0=t0) + self.ops_ordered = [op for op in topological_iterator(dag_sink)] + + def evaluate(self, seed: int = 42, 
test_size = 0.2): + """Evaluate the pipeline with a train/test split and return predictions.""" + try: + split_op = self.compute_xy() + except RuntimeError as e: + if "X and y nodes not found in the DAG" in str(e): + logger.warning("X and y nodes not found in the DAG, returning the last node") + return self.ops_ordered[-1].intermediate + else: + raise e + + train_index, test_index = train_test_split(range(len(split_op.inputs[0].intermediate)), test_size=test_size, random_state=seed) + split_op.indices = train_index + for in_ in split_op.inputs: + self.intermediate_dependencies[in_] *= 2 + self.compute(self.pos_split_op) + split_op.indices = test_index + pred, _ = self.compute(self.pos_split_op, mode="predict") + return pred["vals"][0] + + def compute(self, start_pos: int, mode="fit_transform"): """Compute the pipeline from start_pos onwards with given inputs.""" ops_to_compute = self.ops_ordered[start_pos:] @@ -103,15 +244,15 @@ def compute(self, start_pos: int, mode="fit_transform"): ops_to_compute = self.flagged_for_recomputation + ops_to_compute self.mode = mode + y_true = None for node in ops_to_compute: self.process_op(node) + if mode == "predict" and isinstance(node, SplitOp): + y_true = node.intermediate[1] if mode == "predict": pred = self.ops_ordered[-1].intermediate - if isinstance(pred, list): - return pl.DataFrame(pred) - else: - return pl.DataFrame({"vals": [pred], "id": ["default"]}) + return self._format_predict_result(pred), y_true return None def compute_xy(self) -> SplitOp: @@ -121,20 +262,124 @@ def compute_xy(self) -> SplitOp: self.pos_split_op = i return op self.process_op(op) - if isinstance(op, ImplOp) and isinstance(op.skrub_impl, EvalMode): - self.flagged_for_recomputation.append(op) + self._flag_op_for_recomputation_if_needed(op) raise RuntimeError("X and y nodes not found in the DAG") - def process_op(self, op: Op): - """Process a single DataOp node and return its output.""" - logger.debug(f"Processing op: {op}") - t0 = perf_counter() if 
self.timings is not None else 0 +class ParallelScheduler(Scheduler): + def __init__(self, dag_sink: Op, parallel_groups: dict[int, (int, list[Op])], print_heavy_hitters=False, backend="threading", max_workers=None, cache=None, env=None): + super().__init__(print_heavy_hitters, cache=cache, env=env) + self.linearize_dag(dag_sink) + self.backend = backend + if max_workers is None: + max_workers = os.cpu_count() or 8 + self.max_workers = max_workers + + def linearize_dag(self, dag_sink: Op): + parallel_groups = {} + for op in topological_iterator(dag_sink): + if op.parallel_group is not None: + group = parallel_groups.get(op.parallel_group, []) + group.append(op) + parallel_groups[op.parallel_group] = group + groups_str = "\n".join(" ["+",".join(op.name for op in g) +"]" for g in parallel_groups.values()) #cant use f-string because of py3.11 + logger.debug(f"Parallel groups:\n{groups_str}\n") + for group in parallel_groups.values(): + inputs_union = set() + for op in group: + inputs_union.update(op.inputs) + for op in group: + # add additional dependencies s.t. 
all ops in the group are ready to compute + for in_ in inputs_union: + if in_ not in op.inputs: + if op.additional_inputs is None: + op.additional_inputs = [] + op.additional_inputs.append(in_) + if in_.additional_outputs is None: + in_.additional_outputs = [] + in_.additional_outputs.append(op) + if FLAGS.DEBUG: + show_graph(dag_sink, "parallel_process_plan") + + blocks = [] + group_added = {} + for op in topological_iterator(dag_sink): + + if op.parallel_group is None: + blocks.append(op) + else: + group = parallel_groups[op.parallel_group] + if not group_added.get(op.parallel_group, False): + blocks.append(group) + group_added[op.parallel_group] = True + + + self.blocks = blocks + + def compute(self, start_pos: int, mode="fit_transform"): + """Compute the pipeline from start_pos onwards with given inputs.""" + blocks_to_compute = self.blocks[start_pos:] + if len(self.flagged_for_recomputation) != 0: + # Add flagged ops as individual blocks before the rest + blocks_to_compute = [op for op in self.flagged_for_recomputation] + blocks_to_compute + self.mode = mode + + y_true = None + for block in blocks_to_compute: + self.process_block(block) + if mode == "predict" and isinstance(block, SplitOp): + y_true = block.intermediate[1] + if mode == "predict": + # Get the last block's output + last_block = self.blocks[-1] + return self._format_predict_result(last_block.intermediate), y_true + return None + + def compute_xy(self) -> SplitOp: + """Compute blocks until X and y nodes are found and store them.""" + for i, block in enumerate(self.blocks): + if block.is_split_op: + self.pos_split_op = i + return block + self.process_block(block) + self._flag_op_for_recomputation_if_needed(block) + raise RuntimeError("X and y nodes not found in the DAG") + + def process_block(self, block): + """Process a single block - either an Op or a list of Ops (parallel group).""" + if isinstance(block, list): + # Parallel group - process ops in parallel + ops = block + 
logger.debug(f"Processing parallel block with {len(ops)} ops") + t0 = perf_counter() if self.timings is not None else 0 + + if self.backend == "process" or (self.backend == "auto" and all(isinstance(op, EstimatorOp) for op in ops)): + logger.debug(f"Using process-based parallel processing with joblib)") + results = Parallel(n_jobs=len(ops), backend="loky")( + delayed(op.get_process_task())(op.extract_args_from_inputs(self.mode)) + for op in ops + ) + + for i, (result, fitted_estimator) in enumerate(results): + ops[i].intermediate = result + ops[i].estimator = fitted_estimator + else: + logger.debug(f"Using thread-based parallel processing with ThreadPoolExecutor") + with ThreadPoolExecutor(max_workers=8) as executor: + futures = [executor.submit(self._process_op_task, op, self.mode, self.env) for op in ops] + for i, future in enumerate(futures): + ops[i].intermediate = future.result() + + if self.timings is not None: + duration = perf_counter() - t0 + self.timings.append((f"ParallelBlock({len(ops)} ops)", duration)) + else: + # Single op - process sequentially + self.process_op(block) + + def _process_op_task(self, op: Op, mode: str, environment: dict): + """Helper task for thread-based parallel processing.""" try: - op.process(mode=self.mode, environment=self.env) + op.process(mode=mode, environment=environment) + return op.intermediate except Exception as e: - raise RuntimeError(f"[{self.mode}] Error processing '{op}': {e}") - - if self.timings is not None: - duration = perf_counter() - t0 - self.timings.append((str(op), duration)) - return op \ No newline at end of file + raise RuntimeError(f"[{mode}] Error processing '{op}': {e}") \ No newline at end of file diff --git a/stratum/tests/application/test_multi_level_choice_graph.py b/stratum/tests/application/test_multi_level_choice_graph.py index bca02997..a04a6787 100644 --- a/stratum/tests/application/test_multi_level_choice_graph.py +++ b/stratum/tests/application/test_multi_level_choice_graph.py @@ -1,4 +1,5 
@@ import os +import pickle import tempfile import unittest import uuid @@ -13,10 +14,14 @@ from xgboost import XGBRegressor from sklearn.base import BaseEstimator, TransformerMixin from sklearn.metrics import make_scorer, mean_squared_error, r2_score +from sklearn.model_selection import KFold from stratum.logical_optimizer._optimize import optimize - - +import polars as pl +import logging +logging.basicConfig(level=logging.DEBUG) +from stratum.runtime._scheduler import SchedulerFlags class TargetEncoder(BaseEstimator, TransformerMixin): + def fit(self, X, y=None): print("fit target encoder") self.global_mean_ = y.mean() @@ -87,44 +92,65 @@ def is_numeric_column(col): def df2(X): return X.skb.apply(TableVectorizer()) - X_vec = skrub.choose_from({"1": df1(X,y), "2": df2(X)}, name = "data engineering").as_data_op() + X_vec = skrub.choose_from({"1": df1(X,y), "2": df2(X)}, name = "pre").as_data_op() models = { "Ridge": Ridge(random_state=42), - "XGBoost": XGBRegressor(random_state=42), - "LightGBM": LGBMRegressor(random_state=42), - "ElasticNet": ElasticNet(random_state=42), + "xgb": XGBRegressor(random_state=42), + "lgbm": LGBMRegressor(random_state=42), + "elastic": ElasticNet(random_state=42), } preds = {name: X_vec.skb.apply(m, y=y) for name, m in models.items()} - return skrub.choose_from(preds, name="models").as_data_op() - # model = skrub.choose_from(models, name="models").as_data_op() - # preds = X_vec.skb.apply(model, y=y) - return preds + return skrub.choose_from(preds, name="m").as_data_op() -def make_data(n: int = 1000): +def make_data(n: int = 1000, seed: int = 42): + np.random.seed(seed) + rng = np.random.default_rng(seed) df = pd.DataFrame({ "Transaction unique identifier": [str(uuid.uuid4()) for _ in range(n)], - "Price": np.random.randint(50000, 2_000_000, size=n), + "Price": rng.integers(50000, 2_000_000, size=n), "Date of Transfer": pd.to_datetime( - np.random.choice(pd.date_range("2010-01-01", "2024-12-31"), size=n) + 
rng.choice(pd.date_range("2010-01-01", "2024-12-31"), size=n) ).astype(str), - "Property Type": np.random.choice(list("DSTFO"), size=n), - "Old/New": np.random.choice(["Y", "N"], size=n), - "Duration": np.random.choice(["F", "L"], size=n), - "Town/City": np.random.choice( + "Property Type": rng.choice(list("DSTFO"), size=n), + "Old/New": rng.choice(["Y", "N"], size=n), + "Duration": rng.choice(["F", "L"], size=n), + "Town/City": rng.choice( ["London", "Manchester", "Birmingham", "Leeds", "Bristol"], size=n ), - "District": np.random.choice( + "District": rng.choice( ["District A", "District B", "District C"], size=n ), - "County": np.random.choice( + "County": rng.choice( ["Greater London", "West Midlands", "Greater Manchester"], size=n ), - "PPDCategory Type": np.random.choice(["A", "B"], size=n), - "Record Status - monthly file only": np.random.choice(["A", "C"], size=n), + "PPDCategory Type": rng.choice(["A", "B"], size=n), + "Record Status - monthly file only": rng.choice(["A", "C"], size=n), }) return df class TestMultiLevelChoiceGraph(unittest.TestCase): + expected_results = pl.DataFrame({ + "id": [ + "m:elastic, pre:2", + "m:elastic, pre:1", + "m:Ridge, pre:2", + "m:Ridge, pre:1", + "m:xgb, pre:2", + "m:lgbm, pre:2", + "m:lgbm, pre:1", + "m:xgb, pre:1" + ], + "scores": [ + -0.000779, + -0.028774, + -0.021469, + -0.040625, + -0.156263, + -0.174555, + -0.172825, + -0.251869 + ] + }) def test_application(self): tmp_path = tempfile.mkdtemp() @@ -133,6 +159,89 @@ def test_application(self): df.to_csv(os.path.join(tmp_path, "data.csv"), index=False) preds = define_pipeline(os.path.join(tmp_path, "data.csv")) scorer = make_scorer(r2_score) - with skrub.config(DEBUG=True, open_graph=False, scheduler=True, rust_backend=False): - search = preds.skb.make_grid_search(fitted=True, cv = 2, scoring=scorer) - print(search.results_) + cv = KFold(n_splits=2, shuffle=True, random_state=42) + with skrub.config(DEBUG=True, open_graph=False, scheduler=True, rust_backend=False, 
scheduler_parallelism=None, stats=20): + search = preds.skb.make_grid_search(fitted=True, cv = cv, scoring=scorer) + print(search.results_) + + + def run_application(self, sched_par: str = None): + tmp_path = tempfile.mkdtemp() + df = make_data() + df.to_csv(os.path.join(tmp_path, "data.csv"), index=False) + preds = define_pipeline(os.path.join(tmp_path, "data.csv")) + preds = preds.skb.apply_func(lambda a, m: a, m=skrub.eval_mode()) + scorer = make_scorer(r2_score) + cv = KFold(n_splits=2, shuffle=True, random_state=42) + with skrub.config(DEBUG=True, open_graph=False, scheduler=True, rust_backend=False, scheduler_parallelism=sched_par, stats=20): + search = preds.skb.make_grid_search(fitted=True, cv = cv, scoring=scorer) + print(search.results_) + return search.results_ + + def test_application_no_parallelism(self): + actual_results = self.run_application() + # Convert to pandas for comparison + # TODO: pre:2 is non-deterministic right now, so we need to filter it out + filter_expr = pl.col("id").str.contains("pre:1") & ~pl.col("id").str.contains("xgb|lgbm") + actual_df = actual_results.sort("id").filter(filter_expr).to_pandas() + expected_df = self.expected_results.sort("id").filter(filter_expr).to_pandas() + print(actual_df) + pd.testing.assert_frame_equal( + actual_df, + expected_df, + atol=1e-6, + check_dtype=False + ) + + def test_application_threading(self): + SchedulerFlags.stratum_gc = False + actual_results = self.run_application(sched_par="threading") + # Convert to pandas for comparison + # TODO: pre:2 is non-deterministic right now, so we need to filter it out + filter_expr = pl.col("id").str.contains("pre:1") & ~pl.col("id").str.contains("xgb|lgbm") + actual_df = actual_results.sort("id").filter(filter_expr).to_pandas() + expected_df = self.expected_results.sort("id").filter(filter_expr).to_pandas() + print(actual_df) + pd.testing.assert_frame_equal( + actual_df, + expected_df, + atol=1e-6, + check_dtype=False + ) + SchedulerFlags.stratum_gc = True + 
+ def test_application_process(self): + SchedulerFlags.stratum_gc = False + actual_results = self.run_application(sched_par="process") + # Convert to pandas for comparison + # TODO: pre:2 is non-deterministic right now, so we need to filter it out + filter_expr = pl.col("id").str.contains("pre:1") & ~pl.col("id").str.contains("xgb|lgbm") + actual_df = actual_results.sort("id").filter(filter_expr).to_pandas() + expected_df = self.expected_results.sort("id").filter(filter_expr).to_pandas() + print(actual_df) + pd.testing.assert_frame_equal( + actual_df, + expected_df, + atol=1e-6, + check_dtype=False + ) + SchedulerFlags.stratum_gc = True + + def test_application_auto(self): + SchedulerFlags.stratum_gc = False + actual_results = self.run_application(sched_par="auto") + # Convert to pandas for comparison + # TODO: pre:2 is non-deterministic right now, so we need to filter it out + filter_expr = pl.col("id").str.contains("pre:1") & ~pl.col("id").str.contains("xgb|lgbm") + actual_df = actual_results.sort("id").filter(filter_expr).to_pandas() + expected_df = self.expected_results.sort("id").filter(filter_expr).to_pandas() + print(actual_df) + pd.testing.assert_frame_equal( + actual_df, + expected_df, + atol=1e-6, + check_dtype=False + ) + SchedulerFlags.stratum_gc = True +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/stratum/tests/logical_optimizer/test_dataframe_ops.py b/stratum/tests/logical_optimizer/test_dataframe_ops.py index 2857931d..d1201e4d 100644 --- a/stratum/tests/logical_optimizer/test_dataframe_ops.py +++ b/stratum/tests/logical_optimizer/test_dataframe_ops.py @@ -2,11 +2,15 @@ import tempfile from stratum.logical_optimizer._dataframe_ops import AssignOp, DataSourceOp, DatetimeConversionOp, GetAttrProjectionOp, ProjectionOp from stratum.logical_optimizer._ops import GetItemOp, MethodCallOp -from stratum.logical_optimizer._optimize import OptConfig, optimize +from stratum.logical_optimizer._optimize import OptConfig, 
optimize as optimize_ +from stratum.logical_optimizer._op_utils import topological_iterator import stratum as skrub import pandas as pd import unittest +def optimize(dag, conf=None): + return list(topological_iterator(optimize_(dag, conf))) + class TestDataframeOps(unittest.TestCase): def setUp(self): self.df = pd.DataFrame({ @@ -53,8 +57,9 @@ def test_projection_fused_get_item_rewrite_df2(self): sub_dag2 = data sink = skrub.choose_from([sub_dag1, sub_dag2]).as_data_op() ops = optimize(sink) + print(ops) self.assertEqual(5, len(ops)) - self.assertTrue(isinstance(ops[2], GetItemOp)) + self.assertTrue(isinstance(ops[1], GetItemOp)) self.assertTrue(isinstance(ops[3], ProjectionOp)) def test_fused_get_attr_rewrite_df(self): diff --git a/stratum/tests/logical_optimizer/test_op_utils.py b/stratum/tests/logical_optimizer/test_op_utils.py index 1f44af8d..b57f1453 100644 --- a/stratum/tests/logical_optimizer/test_op_utils.py +++ b/stratum/tests/logical_optimizer/test_op_utils.py @@ -1,11 +1,14 @@ #from curses import flash import unittest import stratum as skrub -from stratum.logical_optimizer._optimize import optimize, OptConfig, choice_unrolling -from stratum.logical_optimizer._op_utils import show_graph, clone_sub_dag +from stratum.logical_optimizer._optimize import optimize as optimize_, OptConfig, choice_unrolling +from stratum.logical_optimizer._op_utils import show_graph, clone_sub_dag, topological_iterator from stratum._config import config graph = False +def optimize(dag, conf=None): + return list(topological_iterator(optimize_(dag, conf))) + class TestOpUtils(unittest.TestCase): def setUp(self): pass diff --git a/stratum/tests/logical_optimizer/test_ops.py b/stratum/tests/logical_optimizer/test_ops.py index a276d9a4..388d566f 100644 --- a/stratum/tests/logical_optimizer/test_ops.py +++ b/stratum/tests/logical_optimizer/test_ops.py @@ -1,11 +1,11 @@ import unittest import pandas as pd import stratum as skrub +from stratum.logical_optimizer._op_utils import 
def test_var_ops(self):
    """Optimize a DAG that starts at a ``skrub.var`` node and check it yields ops.

    The DAG is a variable named ``"test"`` piped through ``pd.read_csv`` via
    ``apply_func``. Optimizing and iterating it must not raise, and the
    result must contain at least one op.
    """
    var = skrub.var("test")
    out = var.skb.apply_func(pd.read_csv)
    ops = list(topological_iterator(optimize_(out)))
    # Beyond "does not raise": an optimized DAG built from two nodes must
    # not come back empty.
    self.assertGreater(len(ops), 0)
class SearchTest(RuntimeTest):
    """Hash-stability and cached grid-search tests for a small CSV-backed pipeline.

    The pipeline reads ``data.csv`` (written next to this file from
    ``self.df``), marks ``["x", "datetime"]`` as X and ``"y"`` as y, applies a
    ``TableVectorizer`` and fits a ``DummyRegressor``.
    """

    # Expected ``op.simple_hash()`` values, one per op in the order produced by
    # ``topological_iterator``, keyed by the mode from ``_detect_mode``.
    expected_simple_hashes = {
        "local": [
            14466646976231713574,
            4283753923329093683,
            11672455255761944456,
            1,
            3,
            2,
            17673706173561179344,
            6346118744052152261,
        ],
        "linux": [
            17843118638478979946,
            4283753923329093683,
            11672455255761944456,
            1,
            3,
            2,
            17673706173561179344,
            6346118744052152261,
        ],
        "macos": [
            17237841316323807291,
            4283753923329093683,
            11672455255761944456,
            1,
            3,
            2,
            17673706173561179344,
            6346118744052152261,
        ],
        "windows": [
            9534843511007154554,
            4283753923329093683,
            11672455255761944456,
            1,
            3,
            2,
            17673706173561179344,
            6346118744052152261,
        ],
    }
    # Expected ``op.get_hash()`` values, same layout as above.
    expected_hashes = {
        "local": [
            17214955316726503821,
            11824152000386466899,
            18298532774759976535,
            7513694150800269850,
            5537892472318521177,
            168195864670644233,
            1997578848421863092,
            14476433947220053316,
        ],
        "linux": [
            7056806754431583388,
            7639690250793122720,
            17532383718078189923,
            10707031619699354836,
            7435966898669112865,
            8941144976148573683,
            16675763945336090482,
            13801223252098341323,
        ],
        "macos": [
            11800167861632073492,
            13894009875302220469,
            2903296657173264096,
            4207835120194851649,
            11109528315728706675,
            17956785590977498290,
            10919015601046973997,
            12242410082145359458,
        ],
        "windows": [
            6235675172187585043,
            6926154378978508485,
            3072755605188723418,
            18085496009191016749,
            15337519220874548500,
            10614601562615527768,
            15722255280919350770,
            508214583577843344,
        ],
    }

    @classmethod
    def _detect_mode(cls) -> str | None:
        """Detect environment to pick the right expected hash set.

        - "local": developer machine at /Users/elias/PycharmProjects/stratum/
        - "linux" / "macos" / "windows": GitHub runners on the respective OS.

        Returns ``None`` when neither the local path nor a known
        ``sys.platform`` prefix matches.
        """
        file_path = os.path.abspath(__file__)
        local_root = "/Users/elias/PycharmProjects/stratum/"
        # The local checkout is tested first so it wins over the OS check.
        if file_path.startswith(local_root):
            return "local"
        if sys.platform.startswith("linux"):
            return "linux"
        if sys.platform.startswith("darwin"):
            return "macos"
        if sys.platform.startswith(("win32", "cygwin")):
            return "windows"
        return None

    def _require_mode(self):
        """Return the detected mode, skipping the test when no hashes exist for it."""
        mode = self._detect_mode()
        if mode not in self.expected_simple_hashes or mode not in self.expected_hashes:
            self.skipTest(f"No expected hashes defined for mode={mode!r}")
        return mode

    def _build_pipeline(self):
        """Write ``self.df`` to ``data.csv`` next to this file and build the pipeline on it.

        Returns the data op produced by applying ``DummyRegressor`` to the
        vectorized X with y as target.
        """
        file_path = os.path.join(os.path.dirname(__file__), "data.csv")
        self.df.to_csv(file_path, index=False)
        data = skrub.as_data_op(file_path).skb.apply_func(pd.read_csv)
        X = data[["x", "datetime"]].skb.mark_as_X()
        y = data["y"].skb.mark_as_y()

        x_vec = X.skb.apply(skrub.TableVectorizer())
        return x_vec.skb.apply(DummyRegressor(), y=y)

    def compare_hashes(self, op, expected_hash, simple=False):
        """Assert that ``op`` hashes to ``expected_hash``.

        Uses ``op.simple_hash()`` when ``simple`` is true and
        ``op.get_hash()`` otherwise.
        """
        hash_val = op.simple_hash() if simple else op.get_hash()
        self.assertEqual(expected_hash, hash_val, f"Hash mismatch for {op}")

    def test_hashes(self):
        """Check simple and full hashes of every optimized op against the recorded values."""
        mode = self._require_mode()
        pred = optimize(self._build_pipeline())
        ops = list(topological_iterator(pred))

        simple_expected = self.expected_simple_hashes[mode]
        full_expected = self.expected_hashes[mode]
        # Fail with a clear message when the op count drifts: positional
        # indexing would otherwise raise a bare IndexError (too many ops) or
        # silently ignore stale expected entries (too few ops).
        self.assertEqual(
            len(simple_expected), len(ops), "Unexpected number of ops (simple hashes)"
        )
        self.assertEqual(
            len(full_expected), len(ops), "Unexpected number of ops (hashes)"
        )

        for op, expected in zip(ops, simple_expected):
            self.compare_hashes(op, expected, simple=True)
        for op, expected in zip(ops, full_expected):
            self.compare_hashes(op, expected)

    def test_search(self):
        """Smoke-test a fitted grid search with scheduler, stats and caching enabled."""
        # Decide about skipping before any global state is touched, so a
        # skipped test cannot leave ``stratum_gc`` disabled.
        self._require_mode()
        SchedulerFlags.stratum_gc = False
        try:
            pred = self._build_pipeline()
            cv = KFold(n_splits=3, shuffle=True, random_state=42)
            with skrub.config(scheduler=True, stats=20, caching=True):
                pred.skb.make_grid_search(cv=cv, fitted=True,scoring="neg_mean_squared_error")
        finally:
            # Restore the flag even when building or fitting raises.
            SchedulerFlags.stratum_gc = True
def test_evaluate_no_X_y(self):
    """``evaluate`` on a DAG without marked X/y nodes logs a warning saying so."""
    # Wrap the raw frame without marking any X or y node.
    unmarked = skrub.as_data_op(self.df)
    with self.assertLogs(logger, level=logging.WARNING) as captured:
        evaluate(unmarked, seed=self.seed, test_size=self.test_size)
    first_message = captured.output[0]
    self.assertIn("X and y nodes not found in the DAG", first_message)


def test_search_with_no_y_parrel_scheduler(self):
    """``grid_search`` without a y node raises ``RuntimeError`` under the threading scheduler."""
    source = skrub.as_data_op(True)
    # Only X is marked; the DAG deliberately has no y node.
    x_only = source.skb.apply_func(lambda a: a).skb.mark_as_X()

    try:
        with skrub.config(stats=20, scheduler_parallelism="threading"):
            grid_search(x_only, return_predictions=True)
    except RuntimeError as err:
        self.assertEqual("X and y nodes not found in the DAG", str(err))
    else:
        self.fail("Expected RuntimeError")
def test_config_scheduler_parallelism():
    """Check ``scheduler_parallelism`` handling in ``stratum.config`` and ``_env_str``.

    Covers the three accepted string values, the ``ValueError`` raised for an
    unknown value, and reading ``STRATUM_SCHEDULER_PARALLELISM`` from the
    environment (where ``"none"`` is read back as ``None``).
    """
    for value in ("threading", "process", "auto"):
        with stratum.config(scheduler_parallelism=value):
            assert FLAGS.scheduler_parallelism == value

    try:
        with stratum.config(scheduler_parallelism="invalid"):
            # Reaching the body means the invalid value was accepted.
            assert False
    except ValueError as e:
        assert str(e) == "scheduler_parallelism must be None, 'threading', 'process', or 'auto', got invalid"

    env_key = "STRATUM_SCHEDULER_PARALLELISM"
    # Remember any value set by the caller's environment so the test neither
    # destroys it nor leaks its own value when an assertion fails.
    previous = os.environ.get(env_key)
    try:
        os.environ[env_key] = "threading"
        assert _env_str(env_key) == "threading"
        os.environ[env_key] = "none"
        assert _env_str(env_key) is None
    finally:
        if previous is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous
sys_platform != 'win32'", + "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version < '3.12' and sys_platform == 'win32'", + "python_full_version < '3.12' and sys_platform == 'emscripten'", + "python_full_version < '3.12' and sys_platform != 'emscripten' and sys_platform != 'win32'", ] [[package]] @@ -1637,7 +1643,7 @@ name = "pexpect" version = "4.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "ptyprocess" }, + { name = "ptyprocess", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } wheels = [ @@ -1982,11 +1988,11 @@ wheels = [ [[package]] name = "pytz" -version = "2025.2" +version = "2026.1.post1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } +sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, + { url = "https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, ] [[package]] @@ -2491,7 +2497,10 @@ version = "0.0.0.dev0" source = { editable = "." } dependencies = [ { name = "graphviz" }, + { name = "joblib" }, + { name = "pandas" }, { name = "polars" }, + { name = "psutil" }, { name = "pyarrow" }, { name = "scikit-learn" }, { name = "skrub" }, @@ -2518,14 +2527,17 @@ dev = [ requires-dist = [ { name = "coverage", extras = ["toml"], marker = "extra == 'test'" }, { name = "graphviz" }, + { name = "joblib" }, { name = "jupyter", marker = "extra == 'dev'" }, { name = "lightgbm", marker = "extra == 'test'", specifier = ">=4.6.0" }, + { name = "pandas", specifier = "==2.3.3" }, { name = "polars" }, + { name = "psutil" }, { name = "pyarrow", specifier = ">=22.0.0" }, { name = "pytest", marker = "extra == 'test'" }, { name = "pytest-cov", marker = "extra == 'test'" }, { name = "scikit-learn", specifier = "==1.8" }, - { name = "skrub", specifier = ">=0.3" }, + { name = "skrub", specifier = "==0.6.2" }, { name = "xgboost", marker = "extra == 'test'", specifier = ">=3.1.1" }, ] provides-extras = ["test", "benchmark", "dev"]