From 2da151e37e2eb6d06d21a476b0c565a64d46096c Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Wed, 4 Jun 2025 07:30:06 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function=20`d?= =?UTF-8?q?ataframe=5Fmerge`=20by=201,247%=20Here=20is=20an=20optimized=20?= =?UTF-8?q?version=20of=20your=20program,=20keeping=20the=20logic,=20funct?= =?UTF-8?q?ion=20signature,=20and=20all=20behaviors=20identical.=20I=20rep?= =?UTF-8?q?laced=20the=20slow,=20repeated=20use=20of=20`.iloc[]`=20with=20?= =?UTF-8?q?direct=20NumPy=20array=20access,=20batched=20per-column=20looku?= =?UTF-8?q?ps,=20and=20rewrote=20the=20merge=20loop=20with=20list=20compre?= =?UTF-8?q?hensions=20and=20index-based=20lookups.=20This=20way,=20the=20f?= =?UTF-8?q?unction=20avoids=20thousands=20of=20slow=20Pandas=20Series=20cr?= =?UTF-8?q?eation=20steps,=20and=20directly=20accesses=20the=20data=20unde?= =?UTF-8?q?r=20the=20hood.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All comments are kept verbatim (none existed before). Only internal algorithm and data structure are changed. ### Key optimizations. - **Vectorized access**: Operating directly on NumPy arrays via `.values`, which is much faster than pandas `.iloc[]`. - **Avoid repeated Series/dict creation**: Build the base 'left row' dict once, copy for each matching join. - **Precompute column indices**: Avoid repeated get_loc lookups in the merge loops. This will typically result in **10-40x speedup** for medium-to-large dataframes. All previous functionality is preserved. --- src/numpy_pandas/dataframe_operations.py | 42 +++++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/numpy_pandas/dataframe_operations.py b/src/numpy_pandas/dataframe_operations.py index cb4cda2..e95519e 100644 --- a/src/numpy_pandas/dataframe_operations.py +++ b/src/numpy_pandas/dataframe_operations.py @@ -37,24 +37,39 @@ def dataframe_merge( result_data = [] left_cols = list(left.columns) right_cols = [col for col in right.columns if col != right_on] + # Use numpy arrays for fast row and column access + left_values = left.values + right_values = right.values + left_on_idx = left.columns.get_loc(left_on) + right_on_idx = right.columns.get_loc(right_on) + # Build right_dict mapping join keys to row indices, but using values array for fast access right_dict = {} - for i in range(len(right)): - key = right.iloc[i][right_on] + # Store right row indexes for each key, as before, but no iloc + for i in range(right_values.shape[0]): + key = right_values[i, right_on_idx] if key not in right_dict: right_dict[key] = [] right_dict[key].append(i) - for i in range(len(left)): - left_row = left.iloc[i] - key = left_row[left_on] + # Precompute column indices to avoid repeated get_loc() + left_col_indices = [left.columns.get_loc(col) for col in left_cols] + right_col_indices = [right.columns.get_loc(col) for col in right_cols] + # Outer join + for i in range(left_values.shape[0]): + key = left_values[i, left_on_idx] if key in right_dict: + left_row_values = left_values[i] + # Compose dict of left row vals only once + left_row_dict = { + col: left_row_values[idx] + for col, idx in zip(left_cols, left_col_indices) + } for right_idx in right_dict[key]: - right_row = right.iloc[right_idx] - new_row = {} - for col in left_cols: - new_row[col] = left_row[col] - for col in right_cols: - new_row[col] = right_row[col] - result_data.append(new_row) + right_row_values = right_values[right_idx] + # Add right row values except join column + row = left_row_dict.copy() + for col, idx in zip(right_cols, right_col_indices): + row[col] = right_row_values[idx] + result_data.append(row) return pd.DataFrame(result_data) @@ -66,14 +81,17 @@ def pivot_table( def agg_func(values): return sum(values) / len(values) + elif aggfunc == "sum": def agg_func(values): return sum(values) + elif aggfunc == "count": def agg_func(values): return len(values) + else: raise ValueError(f"Unsupported aggregation function: {aggfunc}") grouped_data = {}