diff --git a/README.md b/README.md
index 474913b2..f610c51b 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,12 @@ Allowing powerful and straightforward operations, like:
```python
# Compute the mean flux for each row of "object_nf"
import numpy as np
- object_nf.reduce(np.mean, "nested_sources.flux")
+
+ def mean_flux(row):
+ """Calculates the mean flux for each object"""
+ return np.mean(row["nested_sources.flux"])
+
+ object_nf.map_rows(mean_flux, output_names="mean_flux")
```
diff --git a/docs/gettingstarted/quickstart.ipynb b/docs/gettingstarted/quickstart.ipynb
index 8a565787..acfbfc56 100644
--- a/docs/gettingstarted/quickstart.ipynb
+++ b/docs/gettingstarted/quickstart.ipynb
@@ -282,9 +282,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Reduce Function\n",
+ "## The `map_rows` Function\n",
"\n",
- "Finally, we'll end with the flexible `reduce` function. `reduce` functions similarly to pandas' `apply` but flattens (reduces) the inputs from nested layers into array inputs to the given apply function. For example, let's find the mean flux for each dataframe in \"nested\":"
+ "Finally, we'll end with the flexible `map_rows` function. `map_rows` functions similarly to pandas' `apply` but applies row by row and flattens the inputs from nested layers into array inputs to the given apply function. For example, let's find the mean flux for each dataframe in \"nested\":"
]
},
{
@@ -297,7 +297,8 @@
"\n",
"# use hierarchical column names to access the flux column\n",
"# passed as an array to np.mean\n",
- "nf.reduce(np.mean, \"lightcurve.brightness\")"
+ "# row_container signals how to pass the data to the function, in this case as direct arguments\n",
+ "nf.map_rows(np.mean, \"lightcurve.brightness\", row_container=\"args\")"
]
},
{
@@ -313,15 +314,15 @@
"metadata": {},
"outputs": [],
"source": [
- "def show_inputs(*args):\n",
- " return args"
+ "def show_inputs(row):\n",
+ " return row"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Applying some inputs via reduce, we see how it sends inputs to a given function. The output frame `nf_inputs` consists of two columns containing the output of the “ra” column and the “lightcurve.time” column."
+ "Applying some inputs via `map_rows`, we see how it sends inputs to a given function. The output frame `nf_inputs` consists of two columns containing the output of the “ra” column and the “lightcurve.time” column."
]
},
{
@@ -330,8 +331,12 @@
"metadata": {},
"outputs": [],
"source": [
- "nf_inputs = nf.reduce(show_inputs, \"ra\", \"lightcurve.time\")\n",
- "nf_inputs"
+ "# row_container=\"dict\" passes the data as a dictionary to the function\n",
+ "nf_inputs = nf.map_rows(show_inputs, columns=[\"ra\", \"lightcurve.time\"], row_container=\"dict\")\n",
+ "nf_inputs\n",
+ "\n",
+ "# map_rows returns a dataframe view of the dicts, but inside show_inputs the two columns can be accessed as\n",
+ "# row[\"ra\"] and row[\"lightcurve.time\"]"
]
},
{
@@ -343,6 +348,23 @@
"nf_inputs.loc[0]"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# row_container=\"args\" passes the data as arguments to the function\n",
+ "\n",
+ "\n",
+ "def show_inputs(*args):\n",
+ " return args\n",
+ "\n",
+ "\n",
+ "nf_inputs = nf.map_rows(show_inputs, columns=[\"ra\", \"lightcurve.time\"], row_container=\"args\")\n",
+ "nf_inputs"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/docs/pre_executed/nested_spectra.ipynb b/docs/pre_executed/nested_spectra.ipynb
index 3f974c7b..789b158d 100644
--- a/docs/pre_executed/nested_spectra.ipynb
+++ b/docs/pre_executed/nested_spectra.ipynb
@@ -280,7 +280,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -452,7 +452,7 @@
}
],
"source": [
- "spec_ndf = xid_ndf.add_nested(flat_spec, \"coadd_spectrum\").set_index(\"objid\")\n",
+ "spec_ndf = xid_ndf.join_nested(flat_spec, \"coadd_spectrum\").set_index(\"objid\")\n",
"spec_ndf"
]
},
diff --git a/docs/pre_executed/performance.ipynb b/docs/pre_executed/performance.ipynb
index 18161671..e65df5f9 100644
--- a/docs/pre_executed/performance.ipynb
+++ b/docs/pre_executed/performance.ipynb
@@ -98,7 +98,7 @@
"# Read in parquet data\n",
"# nesting sources into objects\n",
"nf = npd.read_parquet(\"objects.parquet\")\n",
- "nf = nf.add_nested(npd.read_parquet(\"ztf_sources.parquet\"), \"ztf_sources\")\n",
+ "nf = nf.join_nested(npd.read_parquet(\"ztf_sources.parquet\"), \"ztf_sources\")\n",
"\n",
"# Filter on object\n",
"nf = nf.query(\"ra > 10.0\")\n",
diff --git a/docs/reference/accessor.rst b/docs/reference/accessor.rst
index 8f01a4d6..092f6794 100644
--- a/docs/reference/accessor.rst
+++ b/docs/reference/accessor.rst
@@ -18,12 +18,9 @@ Functions
NestSeriesAccessor.to_lists
NestSeriesAccessor.to_flat
NestSeriesAccessor.to_flatten_inner
- NestSeriesAccessor.with_field
- NestSeriesAccessor.with_flat_field
- NestSeriesAccessor.with_list_field
- NestSeriesAccessor.with_filled_field
- NestSeriesAccessor.without_field
- NestSeriesAccessor.query_flat
- NestSeriesAccessor.get_flat_index
- NestSeriesAccessor.get_flat_series
- NestSeriesAccessor.get_list_series
+ NestSeriesAccessor.set_column
+ NestSeriesAccessor.set_flat_column
+ NestSeriesAccessor.set_list_column
+ NestSeriesAccessor.set_filled_column
+ NestSeriesAccessor.drop
+ NestSeriesAccessor.query
diff --git a/docs/reference/nesteddtype.rst b/docs/reference/nesteddtype.rst
index ecb6f611..798eabe5 100644
--- a/docs/reference/nesteddtype.rst
+++ b/docs/reference/nesteddtype.rst
@@ -17,6 +17,6 @@ Functions
NestedDtype.construct_array_type
NestedDtype.construct_from_string
- NestedDtype.from_fields
+ NestedDtype.from_columns
NestedDtype.from_pandas_arrow_dtype
NestedDtype.to_pandas_arrow_dtype
\ No newline at end of file
diff --git a/docs/reference/nestedframe.rst b/docs/reference/nestedframe.rst
index 9409f886..f7c6b867 100644
--- a/docs/reference/nestedframe.rst
+++ b/docs/reference/nestedframe.rst
@@ -10,12 +10,21 @@ Constructor
NestedFrame
+Helpful Properties
+~~~~~~~~~~~~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ NestedFrame.nested_columns
+ NestedFrame.base_columns
+ NestedFrame.all_columns
+
Nesting
~~~~~~~~~
.. autosummary::
:toctree: api/
- NestedFrame.add_nested
+ NestedFrame.join_nested
NestedFrame.nest_lists
NestedFrame.from_flat
NestedFrame.from_lists
@@ -25,7 +34,8 @@ Extended Pandas.DataFrame Interface
.. note::
The NestedFrame extends the Pandas.DataFrame interface, so all methods
- of Pandas.DataFrame are available. The following methods are extended
+ of Pandas.DataFrame are available. The following methods are a mix of
+ newly added methods and extended methods from Pandas DataFrame
to support NestedFrame functionality. Please reference the Pandas
documentation for more information.
https://pandas.pydata.org/docs/reference/frame.html
@@ -33,11 +43,12 @@ Extended Pandas.DataFrame Interface
.. autosummary::
:toctree: api/
+ NestedFrame.get_subcolumns
NestedFrame.eval
NestedFrame.query
NestedFrame.dropna
NestedFrame.sort_values
- NestedFrame.reduce
+ NestedFrame.map_rows
NestedFrame.drop
NestedFrame.min
NestedFrame.max
diff --git a/docs/reference/nestedseries.rst b/docs/reference/nestedseries.rst
index 0ebd519c..2aaafbc5 100644
--- a/docs/reference/nestedseries.rst
+++ b/docs/reference/nestedseries.rst
@@ -16,4 +16,4 @@ Functions
:toctree: api/
NestedSeries.to_lists
- NestedSeries.to_flat
\ No newline at end of file
+ NestedSeries.explode
\ No newline at end of file
diff --git a/docs/tutorials/data_loading_notebook.ipynb b/docs/tutorials/data_loading_notebook.ipynb
index a09a4052..37b01e15 100644
--- a/docs/tutorials/data_loading_notebook.ipynb
+++ b/docs/tutorials/data_loading_notebook.ipynb
@@ -141,7 +141,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can then create an additional pandas dataframes for the nested columns and pack them into our `NestedFrame` with `NestedFrame.add_nested()` function. `add_nested` will align the nest based on the index by default (a column may be selected instead via the `on` kwarg), as we see the `nested` `DataFrame` has a repeated index corresponding to the `nf` `NestedFrame`."
+ "We can then create additional pandas dataframes for the nested columns and pack them into our `NestedFrame` with the `NestedFrame.join_nested()` function. `join_nested` aligns the nest on the index by default (a column may be selected instead via the `on` kwarg); note that the `nested` `DataFrame` below has a repeated index whose values correspond to the index of the `nf` `NestedFrame`."
]
},
{
@@ -158,7 +158,7 @@
" index=[0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n",
")\n",
"\n",
- "nf = nf.add_nested(nested, \"nested\")\n",
+ "nf = nf.join_nested(nested, \"nested\")\n",
"nf"
]
},
@@ -182,7 +182,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We could add other nested columns by creating new sub-tables and adding them with `add_nested()`. Note that while the tables added with each `add_nested()` must be rectangular, they do not need to have the same dimensions between calls. We could add another nested row with a different number of observations."
+ "We could add other nested columns by creating new sub-tables and adding them with `join_nested()`. Note that while the tables added with each `join_nested()` must be rectangular, they do not need to have the same dimensions between calls. We could add another nested column with a different number of observations."
]
},
{
@@ -199,7 +199,7 @@
" index=[0, 0, 1, 1, 1, 2],\n",
")\n",
"\n",
- "nf = nf.add_nested(nested, \"nested2\")\n",
+ "nf = nf.join_nested(nested, \"nested2\")\n",
"nf"
]
},
diff --git a/docs/tutorials/data_manipulation.ipynb b/docs/tutorials/data_manipulation.ipynb
index f0383532..fbd8e2fa 100644
--- a/docs/tutorials/data_manipulation.ipynb
+++ b/docs/tutorials/data_manipulation.ipynb
@@ -105,13 +105,6 @@
"## Adding or Replacing Nested Columns"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "> *A Note on Performance: These operations involve full reconstruction of the nested columns so expect impacted performance when doing this at scale. It may be appropriate to do these operations within reduce functions directly (e.g. subtracting a value from a column) if performance is key.*"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -210,7 +203,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This is functionally equivalent to using `add_nested`:"
+ "This is functionally equivalent to using `join_nested`:"
]
},
{
@@ -224,7 +217,7 @@
},
"outputs": [],
"source": [
- "ndf.add_nested(ndf[\"nested.band\"].to_frame(), \"bands_from_add_nested\")"
+ "ndf.join_nested(ndf[\"nested.band\"].to_frame(), \"bands_from_add_nested\")"
]
},
{
@@ -254,7 +247,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The above again being shorthand for the following `add_nested` call:"
+ "The above again being shorthand for the following `join_nested` call:"
]
},
{
@@ -263,7 +256,7 @@
"metadata": {},
"outputs": [],
"source": [
- "ndf.add_nested(flat_df, \"example_from_add_nested\")"
+ "ndf.join_nested(flat_df, \"example_from_add_nested\")"
]
},
{
diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb
index 6d17cb23..57fb5f09 100644
--- a/docs/tutorials/low_level.ipynb
+++ b/docs/tutorials/low_level.ipynb
@@ -111,7 +111,7 @@
"id": "33d8caacf0bf042e",
"metadata": {},
"source": [
- "You can also get a list of fields with `.fields` attribute:"
+ "You can also get a list of columns with the `.columns` attribute:"
]
},
{
@@ -126,7 +126,7 @@
},
"outputs": [],
"source": [
- "nested_series.nest.fields"
+ "nested_series.nest.columns"
]
},
{
@@ -205,23 +205,23 @@
"new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n",
"\n",
"# Create a new series with a new column\n",
- "new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n",
+ "new_series = new_series.nest.set_column(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n",
"\n",
"# Create a new series with a column removed, you can also pass a list of columns to remove\n",
- "new_series = new_series.nest.without_field(\"band\")\n",
+ "new_series = new_series.nest.drop(\"band\")\n",
"\n",
"# Add a new column with a python list instead of a Series\n",
- "new_series = new_series.nest.with_field(\n",
+ "new_series = new_series.nest.set_column(\n",
" \"new_column\",\n",
" [1, 2] * (new_series.nest.flat_length // 2),\n",
")\n",
"\n",
"# Add a new column repeating values for each nested element\n",
"# It can be useful when you want to move some metadata to the nested data\n",
- "new_series = new_series.nest.with_filled_field(\"index_mult_100\", new_series.index * 100)\n",
+ "new_series = new_series.nest.set_filled_column(\"index_mult_100\", new_series.index * 100)\n",
"\n",
"# Create a new series, with a column dtype changed\n",
- "new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n",
+ "new_series = new_series.nest.set_column(\"t\", new_series.nest[\"t\"].astype(np.int8))\n",
"\n",
"new_series.nest.to_flat()"
]
@@ -293,7 +293,7 @@
"source": [
"# Adjust each time to be relative to the first observation\n",
"dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n",
- "new_series = new_series.nest.with_list_field(\"dt\", dt)\n",
+ "new_series = new_series.nest.set_list_column(\"dt\", dt)\n",
"new_series.nest.to_flat()"
]
},
@@ -367,7 +367,7 @@
"We have already seen how `.nest` accessor could be used to get different views on the nested data: \"flat\" dataframe, and list-array dataframe with columns of `pd.ArrowDtype`.\n",
"\n",
"This section is about converting nested Series to and from other data types.\n",
- "If you just need to add a nested column to a `NestedFrame`, you can do it with `.add_nested()` method."
+ "If you just need to add a nested column to a `NestedFrame`, you can do it with the `.join_nested()` method."
]
},
{
@@ -542,7 +542,7 @@
" {\"t\": [4, 5], \"flux\": [0.4, 0.5]},\n",
" None,\n",
" ],\n",
- " dtype=NestedDtype.from_fields({\"t\": pa.float64(), \"flux\": pa.float32()}),\n",
+ " dtype=NestedDtype.from_columns({\"t\": pa.float64(), \"flux\": pa.float32()}),\n",
")\n",
"series_from_pack"
]
@@ -588,7 +588,7 @@
" pd.DataFrame({\"t\": [1, 2, 3], \"band\": [\"g\", \"r\", \"r\"]}),\n",
" {\"t\": np.array([4, 5]), \"band\": [None, \"r\"]},\n",
" ],\n",
- " dtype=NestedDtype.from_fields({\"t\": pa.float64(), \"band\": pa.string()}),\n",
+ " dtype=NestedDtype.from_columns({\"t\": pa.float64(), \"band\": pa.string()}),\n",
")\n",
"series_from_dtype"
]
diff --git a/pyproject.toml b/pyproject.toml
index 73ba92d5..c77bf625 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,9 @@ dependencies = [
# We use internal pd._libs.missing and experimental ArrowExtensionArray
"pandas>=2.2.3,<2.4",
"pyarrow>=16", # remove struct_field_names() and struct_fields() when upgraded to 18+
-
+ "Deprecated>=1.2.0",
+ "wrapt>=1.12.1",
+
# NOTE: package PINNED at <0.3.0, see https://github.com/astronomy-commons/lsdb/issues/1047
"universal_pathlib>=0.2,<0.3.0",
]
@@ -43,6 +45,7 @@ dev = [
"aiohttp",
"requests",
"s3fs",
+ "types-Deprecated", # Needed for mypy type checking of Deprecated package
]
[build-system]
diff --git a/src/nested_pandas/datasets/generation.py b/src/nested_pandas/datasets/generation.py
index b8f263ec..888cf540 100644
--- a/src/nested_pandas/datasets/generation.py
+++ b/src/nested_pandas/datasets/generation.py
@@ -50,7 +50,7 @@ def generate_data(n_base, n_layer, seed=None) -> NestedFrame:
"index": np.arange(layer_size * n_base) % n_base,
}
layer_nf = NestedFrame(data=layer_data).set_index("index")
- base_nf = base_nf.add_nested(layer_nf, key)
+ base_nf = base_nf.join_nested(layer_nf, key)
return base_nf
else:
raise TypeError("Input to n_layer is not an int or dict.")
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index bfb63fb5..b0ed818a 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -1,14 +1,15 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations
-import warnings
from collections import defaultdict
-from collections.abc import Sequence
+from collections.abc import Callable
+from typing import Literal
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
+from deprecated import deprecated
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, Hashable, IndexLabel, Mapping
from pandas.api.extensions import no_default
@@ -89,7 +90,7 @@ def all_columns(self) -> dict:
all_columns = {"base": self.columns}
for column in self.columns:
if isinstance(self.dtypes[column], NestedDtype):
- nest_cols = self[column].nest.fields
+ nest_cols = self[column].columns
all_columns[column] = nest_cols
return all_columns
@@ -102,6 +103,11 @@ def nested_columns(self) -> list:
nest_cols.append(column)
return nest_cols
+ @property
+ def base_columns(self) -> list[str]:
+ """Returns the list of base (non-nested) column names, i.e. every column not in `nested_columns`"""
+ return [col for col in self.columns if col not in self.nested_columns]  # NOTE(review): nested_columns appears to be re-evaluated for every column; consider hoisting it
+
def _repr_html_(self) -> str | None:
"""Override html representation"""
@@ -204,7 +210,7 @@ def _is_known_hierarchical_column(self, components: list[str] | str) -> bool:
base_name = components[0]
if self._is_nested_column(base_name):
nested_name = ".".join(components[1:])
- return nested_name in self.dtypes[base_name].fields
+ return nested_name in self.dtypes[base_name].column_dtypes
return False
def _is_nested_column(self, col: str):
@@ -247,7 +253,7 @@ def _getitem_str(self, item):
if self._is_known_hierarchical_column(components):
nested = components[0]
field = ".".join(components[1:])
- return self[nested].nest.get_flat_series(field)
+ return self[nested].nest.to_flat(columns=[field])[field]
else:
raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns")
@@ -287,16 +293,15 @@ def __setitem__(self, key, value):
if i == 0:
new_nested = value[col]
else:
- # there must be a better way than through list fields
- for field in value[col].nest.fields:
- new_nested = new_nested.nest.with_list_field(
- field, value[col].nest.get_list_series(field)
- )
+ # there must be a better way than through list columns
+ list_cols = value[col].to_lists()
+ for column in value[col].columns:
+ new_nested = new_nested.nest.set_list_column(column, list_cols[column])
value = new_nested
# Assign a DataFrame as a new column, auto-nesting it
elif key not in self.columns:
- # Note this uses the default approach for add_nested, which is a left join on index
- new_df = self.add_nested(value, name=key)
+ # Note this uses the default approach for join_nested, which is a left join on index
+ new_df = self.join_nested(value, name=key)
self._update_inplace(new_df)
return
@@ -315,9 +320,9 @@ def __setitem__(self, key, value):
# Support a special case of embedding a base column into a nested column, with values being
# repeated in each nested list-array.
if isinstance(value, pd.Series) and self.index.equals(value.index):
- new_nested_series = self[nested].nest.with_filled_field(field, value)
+ new_nested_series = self[nested].nest.set_filled_column(field, value)
else:
- new_nested_series = self[nested].nest.with_flat_field(field, value)
+ new_nested_series = self[nested].nest.set_flat_column(field, value)
return super().__setitem__(nested, new_nested_series)
# Adding a new nested structure from a column
@@ -327,7 +332,7 @@ def __setitem__(self, key, value):
if isinstance(value, pd.Series):
value.name = field
value = value.to_frame()
- new_df = self.add_nested(value, name=new_nested)
+ new_df = self.join_nested(value, name=new_nested)
self._update_inplace(new_df)
return None
@@ -338,6 +343,47 @@ def __delitem__(self, key):
"""Delete a column or a nested field using dot notation (e.g., del nf['nested.x'])"""
self.drop([key], axis=1, inplace=True)
+ def get_subcolumns(self, nested_columns="all") -> list[str]:
+ """Returns a list of all subcolumn names, in dot notation, for the given nested columns
+
+ Parameters
+ ----------
+ nested_columns : 'all' or str or list of str, optional
+ The nested column name(s) to get subcolumns from. Default is 'all', which means all nested columns.
+
+ Returns
+ -------
+ list of str
+ A list of subcolumn names in dot notation, e.g. 'nested.a'
+
+ Examples
+ --------
+ >>> from nested_pandas.datasets import generate_data
+
+ >>> nf = generate_data(5,10, seed=1)
+ >>> nf["nested2"] = nf["nested"] # create a second nested column for demonstration
+ >>> nf.get_subcolumns()
+ ['nested.t', 'nested.flux', 'nested.band', 'nested2.t', 'nested2.flux', 'nested2.band']
+
+ >>> nf.get_subcolumns("nested")
+ ['nested.t', 'nested.flux', 'nested.band']
+ """
+ # The "all" sentinel expands to every nested column of this frame; a single name is wrapped in a list
+ if nested_columns == "all":
+ nested_columns = self.nested_columns
+ if isinstance(nested_columns, str):
+ nested_columns = [nested_columns]
+ subcols = []
+ for nested_column in nested_columns:
+ subcols += [f"{nested_column}.{col}" for col in self[nested_column].columns]
+
+ # No explicit error is raised for an empty result: a wrong column name is expected to fail
+ # upstream (presumably in the self[nested_column] lookup above -- TODO confirm)
+ return subcols
+
+ @deprecated(
+ version="0.6.0", reason="`add_nested` will be removed in version 0.7.0, " "use `join_nested` instead."
+ )
def add_nested(
self,
obj,
@@ -402,6 +448,72 @@ def add_nested(
1 2 5 [{c: 4}; …] (3 rows)
2 3 6 [{c: 7}; …] (3 rows)
"""
+ return self.join_nested(obj, name, how=how, on=on, dtype=dtype)
+
+ def join_nested(
+ self,
+ obj,
+ name: str,
+ *,
+ how: str = "left",
+ on: None | str | list[str] = None,
+ dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
+ ) -> Self: # type: ignore[name-defined] # noqa: F821
+ """Packs input object to a nested column and adds it to the NestedFrame
+
+ This method returns a new NestedFrame with the added nested column.
+
+ Parameters
+ ----------
+ obj : pd.DataFrame or a sequence of items convertible to nested structures
+ The object to be packed into nested pd.Series and added to
+ the NestedFrame. If a DataFrame is passed, it must have non-unique
+ index values, which are used to pack the DataFrame. If a sequence
+ of elements is passed, it is packed into a nested pd.Series.
+ Sequence elements may be individual pd.DataFrames, dictionaries
+ (keys are nested column names, values are arrays of the same
+ length), or any other object convertible to pa.StructArray.
+ Additionally, None and pd.NA are allowed as elements to represent
+ missing values.
+ name : str
+ The name of the nested column to be joined to the NestedFrame.
+ how : {'left', 'right', 'outer', 'inner'}, default: 'left'
+ How to handle the operation of the two objects:
+
+ - left: use calling frame's index.
+ - right: use the calling frame's index and order but drop values
+ not in the other frame's index.
+ - outer: form union of calling frame's index with other frame's
+ index, and sort it lexicographically.
+ - inner: form intersection of calling frame's index with other
+ frame's index, preserving the order of the calling index.
+ on : str, default: None
+ Name of a single column to align on instead of the index. If None, the index is used.
+ dtype : dtype or None
+ NestedDtype to use for the nested column; pd.ArrowDtype or
+ pa.DataType can also be used to specify the nested dtype. If None,
+ the dtype is inferred from the input object.
+
+ Returns
+ -------
+ NestedFrame
+ A new NestedFrame with the joined nested column.
+
+ Examples
+ --------
+
+ >>> import nested_pandas as npd
+ >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
+ ... index=[0,1,2])
+ >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
+ ... index=[0,0,0,1,1,1,2,2,2])
+ >>> # By default, aligns on the index
+ >>> nf.join_nested(nf2, "nested")
+ a b nested
+ 0 1 4 [{c: 1}; …] (3 rows)
+ 1 2 5 [{c: 4}; …] (3 rows)
+ 2 3 6 [{c: 7}; …] (3 rows)
+ """
if on is not None and not isinstance(on, str):
raise ValueError("Currently we only support a single column for 'on'")
# Add sources to objects
@@ -450,18 +562,6 @@ def nest_lists(self, columns: list[str], name: str) -> NestedFrame:
2 3 6 [{e: 7}; …] (3 rows)
"""
- # Check if `name` is actually a list and `columns` is a string
- if isinstance(name, Sequence) and not isinstance(name, str) and isinstance(columns, str):
- warnings.warn(
- "DeprecationWarning: The argument order for `nest_lists` has changed: "
- "`nest_lists(name, columns)` is now `nest_lists(columns, name)`. "
- "Please update your code.",
- DeprecationWarning,
- stacklevel=2,
- )
- # Swap the arguments
- name, columns = columns, name
-
return NestedFrame.from_lists(self.copy(), list_columns=columns, name=name)
@classmethod
@@ -527,7 +627,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None,
# add nested
if nested_columns is None:
nested_columns = [col for col in df.columns if col not in base_columns]
- return out_df.add_nested(df[nested_columns], name=name)
+ return out_df.join_nested(df[nested_columns], name=name)
@classmethod
def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
@@ -590,7 +690,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
if len(df) == 0:
# if the dataframe is empty, just return an empty nested column
# since there are no iterable values to pack
- packed_df = NestedFrame().add_nested(df[list_columns], name=name)
+ packed_df = NestedFrame().join_nested(df[list_columns], name=name)
else:
# Check that each column has iterable elements
for col in list_columns:
@@ -694,9 +794,9 @@ def drop(
for col in nested_cols:
sub_cols = [label.split(".")[1] for label in nested_labels if label.split(".")[0] == col]
if inplace:
- self[col] = self[col].nest.without_field(sub_cols)
+ self[col] = self[col].nest.drop(sub_cols)
else:
- self = self.assign(**{f"{col}": self[col].nest.without_field(sub_cols)})
+ self = self.assign(**{f"{col}": self[col].nest.drop(sub_cols)})
# drop remaining base columns
if len(base_labels) > 0:
@@ -786,7 +886,7 @@ def min(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs):
# handle nested columns
nested_mins = []
for nest_col in self.nested_columns:
- nested_df = self[nest_col].nest.to_flat()
+ nested_df = self[nest_col].explode()
nested_df.columns = [f"{nest_col}.{col}" for col in nested_df.columns]
nested_mins.append(nested_df.min(numeric_only=numeric_only, **kwargs))
@@ -860,7 +960,7 @@ def max(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs):
# handle nested columns
nested_maxs = []
for nest_col in self.nested_columns:
- nested_df = self[nest_col].nest.to_flat()
+ nested_df = self[nest_col].explode()
nested_df.columns = [f"{nest_col}.{col}" for col in nested_df.columns]
nested_maxs.append(nested_df.max(numeric_only=numeric_only, **kwargs))
@@ -965,7 +1065,7 @@ def describe(self, exclude_nest: bool = False, percentiles=None, include=None, e
# check the nested columns
else:
- nested_df = self[checkable].nest.to_flat()
+ nested_df = self[checkable].explode()
nested_df.columns = [f"{checkable}.{col}" for col in nested_df.columns]
try:
nested_desc = nested_df.describe(
@@ -1100,7 +1200,7 @@ def explode(self, column: IndexLabel, ignore_index: bool = False):
raise ValueError(
f"One or few rows of {nested_col} have different element counts from {nested_columns[0]}"
)
- flat = w_ordinal_idx[nested_col].nest.to_flat()
+ flat = w_ordinal_idx[nested_col].explode()
# Check if counts (lengths) of this nested column mismatch with one of the list columns.
if is_base_exploded and not base_exploded.index.equals(flat.index):
raise ValueError(
@@ -1173,7 +1273,7 @@ def fillna(
... data={"d": [np.nan, np.nan, np.nan], "e": [np.nan, 1, np.nan]},
... index=[0, 1, 2]
... )
- >>> nf = nf.add_nested(nested, "nested")
+ >>> nf = nf.join_nested(nested, "nested")
>>> nf.fillna(0)
a b c nested
@@ -1190,7 +1290,7 @@ def fillna(
filled_df = super().__getitem__(base_cols).fillna(value=value, axis=axis, inplace=False, limit=limit)
for nest_col in self.nested_columns:
- nested_df = self[nest_col].nest.to_flat()
+ nested_df = self[nest_col].explode()
nested_value: Any
if isinstance(value, Mapping):
nested_value = {}
@@ -1201,7 +1301,7 @@ def fillna(
else:
nested_value = value
nested_df = nested_df.fillna(value=nested_value, axis=axis, inplace=False, limit=None)
- filled_df = filled_df.add_nested(nested_df, nest_col)
+ filled_df = filled_df.join_nested(nested_df, nest_col)
if inplace:
self._update_inplace(filled_df)
@@ -1576,7 +1676,7 @@ def dropna(
raise ValueError("ignore_index is not supported for nested columns")
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
- target_flat = self[target].nest.to_flat()
+ target_flat = self[target].explode()
target_flat = target_flat.set_index(self[target].array.get_list_index())
if inplace:
target_flat.dropna(
@@ -1693,7 +1793,7 @@ def sort_values(
key=key,
)
else: # target is a nested column
- target_flat = self[target].nest.to_flat()
+ target_flat = self[target].explode()
target_flat = target_flat.set_index(self[target].array.get_list_index())
if target_flat.index.name is None: # set name if not present
@@ -1726,6 +1826,9 @@ def sort_values(
return None
return new_df
+ @deprecated(
+ version="0.6.0", reason="`reduce` will be removed in version 0.7.0, " "use `map_rows` instead."
+ )
def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs) -> NestedFrame: # type: ignore[override]
"""
Takes a function and applies it to each top-level row of the NestedFrame.
@@ -1886,6 +1989,280 @@ def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs
# Otherwise, return the results as a new NestedFrame
return results_nf
+ def map_rows(
+ self,
+ func: Callable[..., Any],
+ columns: None | str | list[str] = None,
+ row_container: Literal["dict"] | Literal["args"] = "dict",
+ output_names: None | str | list[str] = None,
+ infer_nesting: bool = True,
+ append_columns: bool = False,
+ **kwargs,
+ ) -> NestedFrame: # type: ignore[override]
+ """
+ Takes a function and applies it to each top-level row of the NestedFrame.
+
+ Nested columns are packaged alongside base columns and available for function use, where base columns
+ are passed as scalars and nested columns are passed as numpy arrays. The way in which the row data is
+ packaged is configurable (by default, a dictionary) and controlled by the `row_container` argument.
+
+ Parameters
+ ----------
+ func : callable
+ Function to apply to each top-level row. It is called with the row data packaged as
+ described by `row_container`, plus any `kwargs`. See the Notes for recommendations
+ on writing func outputs.
+ columns : None | str | list of str
+ Specifies which columns to pass to the function in the row_container format.
+ If None, all columns are passed. If list of str, those columns are passed.
+ If str, a single column is passed or if the string is a nested column, then all nested sub-columns
+ are passed (e.g. columns="nested" passes all columns of the nested dataframe "nested"). To pass
+ individual nested sub-columns, use the hierarchical column name (e.g. columns=["nested.t",...]).
+ row_container : 'dict' or 'args', default 'dict'
+ Specifies how the row data will be packaged when passed as an input to the function.
+ If 'dict', the function will be called as `func({"col1": value, ...}, **kwargs)`, so func should
+ expect a single dictionary input with keys corresponding to column names.
+ If 'args', the function will be called as `func(value, ..., **kwargs)`, so func should expect
+ positional arguments corresponding to the columns specified in `columns`.
+ output_names : None | str | list of str
+ Specifies the names of the output columns in the resulting NestedFrame. If None, the output
+ keeps whatever names the user function returns. If specified, it overrides any names
+ returned by the user function; a ValueError is raised if the number of names does not match
+ the number of outputs. When not specified and the user function returns values without names
+ (e.g. a list or tuple), the output columns will be enumerated (e.g. "0", "1", ...).
+ infer_nesting : bool, default True
+ If True, the function will pack output columns into nested
+ structures based on column names adhering to a nested naming
+ scheme. E.g. "nested.b" and "nested.c" will be packed into a column
+ called "nested" with columns "b" and "c". If False, all outputs
+ will be returned as base columns. Note that this will trigger off of names specified in
+ `output_names` in addition to names returned by the user function.
+ append_columns : bool, default False
+ If True, the output columns are appended to those of the original NestedFrame.
+ kwargs : keyword arguments, optional
+ Keyword arguments to pass to the function.
+
+ Returns
+ -------
+ `NestedFrame`
+ `NestedFrame` with the results of the function applied to the columns of the frame.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> import numpy as np
+ >>> nf = generate_data(5,5, seed=1)
+ >>> # define a custom user function
+ >>> # map_rows will return a NestedFrame with two columns
+ >>> def example_func(row):
+ ... return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"]
+
+ >>> # apply the function
+ >>> nf.map_rows(example_func, output_names=["mean", "mean_minus_base"])
+ mean mean_minus_base
+ 0 11.533440 11.116418
+ 1 10.307751 9.587426
+ 2 8.294042 8.293928
+ 3 9.655291 9.352958
+ 4 10.687591 10.540836
+
+ We can pass along only the columns we need for the function using the `columns` argument, which
+ removes the performance overhead of packaging all columns for each row:
+
+ >>> nf.map_rows(example_func, columns=["a", "nested.t"], output_names=["mean", "mean_minus_base"])
+ mean mean_minus_base
+ 0 11.533440 11.116418
+ 1 10.307751 9.587426
+ 2 8.294042 8.293928
+ 3 9.655291 9.352958
+ 4 10.687591 10.540836
+
+ Alternatively, we can pass along the row data as positional arguments
+ instead of a dictionary by setting `row_container="args"` and adjusting
+ our function signature accordingly:
+
+ >>> def example_func(a, time):
+ ... return np.mean(time), np.mean(time) - a
+
+ >>> nf.map_rows(example_func,
+ ... columns=["a", "nested.t"],
+ ... output_names=["mean", "mean_minus_base"],
+ ... row_container="args")
+ mean mean_minus_base
+ 0 11.533440 11.116418
+ 1 10.307751 9.587426
+ 2 8.294042 8.293928
+ 3 9.655291 9.352958
+ 4 10.687591 10.540836
+
+ Additional arguments that don't depend on row data can be passed as kwargs:
+
+ >>> def example_func(row, scale):
+ ... return np.mean(row["nested.t"]) * scale
+
+ >>> nf.map_rows(example_func, columns=["nested.t"], output_names="mean", scale=1)
+ mean
+ 0 11.533440
+ 1 10.307751
+ 2 8.294042
+ 3 9.655291
+ 4 10.687591
+
+ Functions that target a single nested structure can just pass along
+ the nested column name and all sub-columns will be available:
+
+ >>> def first_val(row):
+ ... return {"first_"+key.split(".")[1]:row[key][0] for key in row.keys()}
+
+ >>> nf.map_rows(first_val, columns="nested")
+ first_t first_flux first_band
+ 0 8.383890 31.551563 r
+ 1 13.704390 68.650093 g
+ 2 4.089045 83.462567 g
+ 3 17.562349 1.828828 g
+ 4 0.547752 75.014431 g
+
+ You may want the result of a `map_rows` call to have nested structure;
+ this can be achieved by using the `infer_nesting` kwarg:
+
+ >>> # define a custom user function that returns nested structure
+ >>> def example_func(row):
+ ... '''map_rows will return a NestedFrame with nested structure'''
+ ... return {"offsets.t_a": row["nested.t"] - row["a"],
+ ... "offsets.t_b": row["nested.t"] - row["b"]}
+
+ By giving both output columns the prefix "offsets.", we signal
+ to map_rows to infer that these should be packed into a nested column
+ called "offsets".
+
+ >>> # apply the function with `infer_nesting` (True by default)
+ >>> nf.map_rows(example_func, columns=["a", "b", "nested.t"], infer_nesting=True)
+ offsets
+ 0 [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows)
+ 1 [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows)
+ 2 [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows)
+ 3 [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows)
+ 4 [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows)
+
+ Notes
+ -----
+ If concerned about performance, specify `columns` to only include the columns
+ needed for the function, as this will avoid the overhead of packaging
+ all columns for each row.
+
+ When `func` returns unnamed values (e.g. a tuple), `map_rows` will produce a `NestedFrame`
+ with enumerated column names, one for each returned value. It's recommended
+ to either specify `output_names` or have `func` return a dictionary
+ where each key is an output column of the dataframe returned by
+ `map_rows` (as shown above).
+
+ >>> def example_func(row):
+ ... return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"]
+
+ >>> # first output column will be named "0", second "1"
+ >>> nf.map_rows(example_func)
+ 0 1
+ 0 11.533440 11.116418
+ 1 10.307751 9.587426
+ 2 8.294042 8.293928
+ 3 9.655291 9.352958
+ 4 10.687591 10.540836
+ """
+ # Determine args
+ if columns is None:
+ # If None, pass all columns, with nested columns expanded to sub-columns
+ columns = self.base_columns + self.get_subcolumns(nested_columns="all")
+ elif isinstance(columns, str):
+ # If it's a nested column, grab all sub-columns
+ columns = self.get_subcolumns(columns) if columns in self.nested_columns else [columns]
+
+ # Check arg validity
+ requested_columns = []
+ for arg in columns:
+ if not isinstance(arg, str):
+ raise TypeError(
+ f"Received an argument '{arg}' that is not a string. "
+ "All arguments to `map_rows` must be strings corresponding to"
+ " column names to pass along to the function."
+ )
+ components = self._parse_hierarchical_components(arg)
+ if not self._is_known_column(components):
+ raise ValueError(
+ f"Received a string argument '{arg}' that was not found in the columns list. "
+ "All arguments to `map_rows` must be strings corresponding to"
+ " column names to pass along to the function."
+ )
+ layer = "base" if len(components) < 2 else components[0]
+ col = components[-1]
+ requested_columns.append((layer, col))
+
+ # Construct row containers and apply
+ if row_container == "dict":
+ arg_dict = {}
+ for layer, col in requested_columns:
+ if layer == "base":
+ arg_dict[col] = self[col]
+ else:
+ arg_dict[".".join([layer, col])] = self[layer].array.iter_field_lists(col)
+ results = [
+ func({col: val for col, val in zip(arg_dict.keys(), row, strict=True)}, **kwargs)
+ for row in zip(*arg_dict.values(), strict=True)
+ ]
+
+ elif row_container == "args":
+ # Build iterators for each column
+ iterators = []
+ for layer, col in requested_columns:
+ if layer == "base":
+ iterators.append(self[col])
+ else:
+ iterators.append(self[layer].array.iter_field_lists(col))
+
+ results = [func(*cols, **kwargs) for cols in zip(*iterators, strict=True)]
+
+ results_nf = NestedFrame(results, index=self.index)
+
+ # Override output names if specified
+ if output_names is not None:
+ if isinstance(output_names, str):
+ output_names = [output_names]
+ if len(output_names) != len(results_nf.columns):
+ raise ValueError(
+ f"Number of output names ({len(output_names)}) does not match "
+ f"the number of outputs from the function ({len(results_nf.columns)})"
+ )
+ results_nf.columns = output_names
+
+ if infer_nesting:
+ # find potential nested structures from columns
+ nested_cols = list(
+ np.unique(
+ [
+ column.split(".", 1)[0]
+ for column in results_nf.columns
+ if isinstance(column, str) and "." in column
+ ]
+ )
+ )
+
+ # pack results into nested structures
+ for layer in nested_cols:
+ layer_cols = [col for col in results_nf.columns if col.startswith(f"{layer}.")]
+ rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1])
+ nested_col = pack_lists(rename_df, name=layer)
+ results_nf = results_nf[
+ [col for col in results_nf.columns if not col.startswith(f"{layer}.")]
+ ]
+ results_nf[layer] = nested_col
+
+ if append_columns:
+ # Append the results to the original NestedFrame
+ return pd.concat([self, results_nf], axis=1)
+
+ # Otherwise, return the results as a new NestedFrame
+ return results_nf
+
def to_pandas(self, list_struct=False) -> pd.DataFrame:
"""Convert to an ordinal pandas DataFrame, with no NestedDtype series.
diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py
index edd293c4..25511455 100644
--- a/src/nested_pandas/series/accessor.py
+++ b/src/nested_pandas/series/accessor.py
@@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
+from deprecated import deprecated
from numpy.typing import ArrayLike
from pandas.api.extensions import register_series_accessor
@@ -40,13 +41,13 @@ def _check_series(series):
if not isinstance(dtype, NestedDtype):
raise AttributeError(f"Can only use .nest accessor with a Series of NestedDtype, got {dtype}")
- def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
+ def to_lists(self, columns: list[str] | None = None) -> pd.DataFrame:
"""Convert nested series into dataframe of list-array columns
Parameters
----------
- fields : list[str] or None, optional
- Names of the fields to include. Default is None, which means all fields.
+ columns : list[str] or None, optional
+ Names of the columns to include. Default is None, which means all columns.
Returns
-------
@@ -67,22 +68,22 @@ def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
3 [17.56234873 2.80773877] [69.23226157 16.98304196] ['r' 'r']
4 [0.54775186 3.96202978] [87.63891523 87.81425034] ['g' 'r']
"""
- fields = fields if fields is not None else list(self._series.array.field_names)
- if len(fields) == 0:
+ columns = columns if columns is not None else list(self._series.array.field_names)
+ if len(columns) == 0:
raise ValueError("Cannot convert a struct with no fields to lists")
- list_df = self._series.array.pa_table.select(fields).to_pandas(types_mapper=nested_types_mapper)
+ list_df = self._series.array.pa_table.select(columns).to_pandas(types_mapper=nested_types_mapper)
list_df.index = self._series.index
return list_df
- def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
+ def to_flat(self, columns: list[str] | None = None) -> pd.DataFrame:
"""Convert nested series into dataframe of flat arrays
Parameters
----------
- fields : list[str] or None, optional
- Names of the fields to include. Default is None, which means all fields.
+ columns : list[str] or None, optional
+ Names of the columns to include. Default is None, which means all columns.
Returns
-------
@@ -109,32 +110,33 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
4 3.96203 87.81425 r
"""
- fields = fields if fields is not None else list(self._series.array.field_names)
- if len(fields) == 0:
- raise ValueError("Cannot flatten a struct with no fields")
+ columns = columns if columns is not None else list(self._series.array.field_names)
+ if len(columns) == 0:
+ raise ValueError("Cannot flatten a struct with no columns")
- index = self.get_flat_index()
+ index = self.flat_index
- flat_chunks: dict[str, list[pa.Array]] = {field: [] for field in fields}
+ flat_chunks: dict[str, list[pa.Array]] = {column: [] for column in columns}
for chunk in self._series.array.struct_array.iterchunks():
struct_array = cast(pa.StructArray, chunk)
- for field in fields:
- list_array = cast(pa.ListArray, struct_array.field(field))
+ for column in columns:
+ list_array = cast(pa.ListArray, struct_array.field(column))
flat_array = list_array.flatten()
- flat_chunks[field].append(flat_array)
+ flat_chunks[column].append(flat_array)
flat_series = {}
- for field, chunks in flat_chunks.items():
- dtype = self._series.dtype.field_dtype(field)
- chunked_array = pa.chunked_array(chunks, type=self._series.dtype.fields[field])
- flat_series[field] = pd.Series(
+ for column, chunks in flat_chunks.items():
+ dtype = self._series.dtype.column_dtype(column)
+ chunked_array = pa.chunked_array(chunks, type=self._series.dtype.column_dtypes[column])
+ flat_series[column] = pd.Series(
chunked_array,
index=index,
- name=field,
+ name=column,
copy=False,
dtype=dtype,
)
+ # TODO: Consider returning a NestedSeries if only one column is present
return pd.DataFrame(flat_series)
@property
@@ -148,10 +150,29 @@ def flat_length(self) -> int:
return self._series.array.flat_length
@property
+ @deprecated(
+ version="0.6.0", reason="`fields` will be removed in version 0.7.0, " "use `columns` instead."
+ )
def fields(self) -> list[str]:
+ """Names of the nested columns"""
+ return self.columns
+
+ @property
+ def columns(self) -> list[str]:
"""Names of the nested columns"""
return self._series.array.field_names
+ @property
+ def flat_index(self) -> pd.Index:
+ """Index of the flattened arrays"""
+ flat_index = np.repeat(self._series.index, np.diff(self._series.array.list_offsets))
+ # pd.Index supports np.repeat, so flat_index is the same type as self._series.index
+ flat_index = cast(pd.Index, flat_index)
+ return flat_index
+
+ @deprecated(
+ version="0.6.0", reason="`with_field` will be removed in version 0.7.0, " "use `set_column` instead."
+ )
def with_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field from flat-array of values and return a new series
@@ -183,8 +204,45 @@ def with_field(self, field: str, value: ArrayLike) -> NestedSeries:
0 8.38389 80.074457 r 50.0
1 13.40935 89.460666 g 50.0
"""
- return self.with_flat_field(field, value)
+ return self.set_column(field, value)
+
+ def set_column(self, column: str, value: ArrayLike) -> NestedSeries:
+ """Set the column from a flat-array of values and return a new series
+
+ It is an alias for `.nest.set_flat_column`.
+
+ Parameters
+ ----------
+ column : str
+ Name of the column to set. If not present, it will be added.
+ value : ArrayLike
+ Array of values to set. It must be a scalar or have the same length
+ as the flat arrays, e.g. `self.flat_length`.
+
+ Returns
+ -------
+ NestedSeries
+ The new series with the column set.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(5, 2, seed=1)
+
+ >>> nested_with_avg = nf["nested"].nest.set_column("avg_flux", 50.0)
+ >>> # Look at one row of the series
+ >>> nested_with_avg[0]
+ t flux band avg_flux
+ 0 8.38389 80.074457 r 50.0
+ 1 13.40935 89.460666 g 50.0
+ """
+ return self.set_flat_column(column, value)
+ @deprecated(
+ version="0.6.0",
+ reason="`with_flat_field` will be removed in version 0.7.0, " "use `set_flat_column` instead.",
+ )
def with_flat_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field from flat-array of values and return a new series
@@ -215,10 +273,46 @@ def with_flat_field(self, field: str, value: ArrayLike) -> NestedSeries:
0 8.38389 80.074457 r 50.0
1 13.40935 89.460666 g 50.0
"""
+ return self.set_flat_column(field, value)
+
+ def set_flat_column(self, column: str, value: ArrayLike) -> NestedSeries:
+ """Set the column from flat-array of values and return a new series
+
+ Parameters
+ ----------
+ column : str
+ Name of the column to set. If not present, it will be added.
+ value : ArrayLike
+ Array of values to set. It must be a scalar or have the same length
+ as the flat arrays, e.g. `self.flat_length`.
+
+ Returns
+ -------
+ NestedSeries
+ The new series with the column set.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(5, 2, seed=1)
+
+ >>> nested_with_avg = nf["nested"].nest.set_flat_column("avg_flux",
+ ... 50.0)
+ >>> # Look at one row of the series
+ >>> nested_with_avg[0]
+ t flux band avg_flux
+ 0 8.38389 80.074457 r 50.0
+ 1 13.40935 89.460666 g 50.0
+ """
new_array = self._series.array.copy()
- new_array.set_flat_field(field, value)
+ new_array.set_flat_field(column, value)
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
+ @deprecated(
+ version="0.6.0",
+ reason="`with_list_field` will be removed in version 0.7.0, " "use `set_list_column` instead.",
+ )
def with_list_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field from list-array of values and return a new series
@@ -250,11 +344,49 @@ def with_list_field(self, field: str, value: ArrayLike) -> NestedSeries:
0 2.935118 39.676747 g g
1 3.725204 41.919451 r g
+ """
+ return self.set_list_column(field, value)
+
+ def set_list_column(self, column: str, value: ArrayLike) -> NestedSeries:
+ """Set the column from list-array of values and return a new series
+
+ Parameters
+ ----------
+ column : str
+ Name of the column to set. If not present, it will be added.
+ value : ArrayLike
+ Array of values to set. It must be a list-array of the same length
+ as the series.
+
+ Returns
+ -------
+ NestedSeries
+ The new series with the column set.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(2, 2, seed=1)
+
+ >>> nf_new_band = nf["nested"].nest.set_list_column("new_band",
+ ... [["g","g"],
+ ... ["r","r"]])
+ >>> # Look at one row of the series
+ >>> nf_new_band[0]
+ t flux band new_band
+ 0 2.935118 39.676747 g g
+ 1 3.725204 41.919451 r g
+
"""
new_array = self._series.array.copy()
- new_array.set_list_field(field, value)
+ new_array.set_list_field(column, value)
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
+ @deprecated(
+ version="0.6.0",
+ reason="`with_filled_field` will be removed in version 0.7.0, " "use `set_filled_column` instead.",
+ )
def with_filled_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field by repeating values and return a new series
@@ -284,6 +416,43 @@ def with_filled_field(self, field: str, value: ArrayLike) -> NestedSeries:
>>> nf_filled = nf["nested"].nest.with_filled_field("a", [1,2,3])
+ >>> # Look at one row of the series
+ >>> nf_filled[0]
+ t flux band a
+ 0 3.725204 20.445225 g 1
+ 1 10.776335 67.046751 r 1
+ """
+ return self.set_filled_column(field, value)
+
+ def set_filled_column(self, column: str, value: ArrayLike) -> NestedSeries:
+ """Set the column by repeating values and return a new series
+
+ The input value array must have as many elements as the Series,
+ each of them will be repeated in the corresponding list.
+
+ .nest.set_filled_column("a", [1, 2, 3]) will create a nested column
+ "a" with values [[1, 1, ...], [2, 2, ...], [3, 3, ...]].
+
+ Parameters
+ ----------
+ column : str
+ Name of the column to set. If not present, it will be added.
+ value : ArrayLike
+ Array of values to set. It must have the same length as the series.
+
+ Returns
+ -------
+ NestedSeries
+ The new series with the column set.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(3, 2, seed=1)
+
+ >>> nf_filled = nf["nested"].nest.set_filled_column("a", [1,2,3])
+
>>> # Look at one row of the series
>>> nf_filled[0]
t flux band a
@@ -291,9 +460,12 @@ def with_filled_field(self, field: str, value: ArrayLike) -> NestedSeries:
1 10.776335 67.046751 r 1
"""
new_array = self._series.array.copy()
- new_array.fill_field_lists(field, value)
+ new_array.fill_field_lists(column, value)
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
+ @deprecated(
+ version="0.6.0", reason="`without_field` will be removed in version 0.7.0, " "use `drop` instead."
+ )
def without_field(self, field: str | list[str]) -> NestedSeries:
"""Remove the field(s) from the series and return a new series
@@ -323,13 +495,47 @@ def without_field(self, field: str | list[str]) -> NestedSeries:
4 [{t: 0.547752, band: 'g'}; …] (2 rows)
Name: nested, dtype: nested
"""
- if isinstance(field, str):
- field = [field]
+ return self.drop(field)
+
+ def drop(self, column: str | list[str]) -> NestedSeries:
+ """Remove the column(s) from the series and return a new series
+
+ Note, that at least one nested column must be left in the series.
+
+ Parameters
+ ----------
+ column : str or list[str]
+ Name of the column(s) to remove.
+
+ Returns
+ -------
+ NestedSeries
+ The new series without the column(s).
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(5, 2, seed=1)
+
+ >>> nf["nested"].nest.drop("flux")
+ 0 [{t: 8.38389, band: 'r'}; …] (2 rows)
+ 1 [{t: 13.70439, band: 'g'}; …] (2 rows)
+ 2 [{t: 4.089045, band: 'g'}; …] (2 rows)
+ 3 [{t: 17.562349, band: 'r'}; …] (2 rows)
+ 4 [{t: 0.547752, band: 'g'}; …] (2 rows)
+ Name: nested, dtype: nested
+ """
+ if isinstance(column, str):
+ column = [column]
new_array = self._series.array.copy()
- new_array.pop_fields(field)
+ new_array.pop_fields(column)
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
+ @deprecated(
+ version="0.6.0", reason="`query_flat` will be removed in version 0.7.0, " "use `query` instead."
+ )
def query_flat(self, query: str) -> NestedSeries:
"""Query the flat arrays with a boolean expression
@@ -360,6 +566,38 @@ def query_flat(self, query: str) -> NestedSeries:
4 [{t: 0.547752, flux: 75.014431, band: 'g'}; …]...
dtype: nested
"""
+ return self.query(query)
+
+ def query(self, query: str) -> NestedSeries:
+ """Query the flat arrays with a boolean expression
+
+ Currently, it will remove empty rows from the output series.
+ # TODO: preserve the index keeping empty rows
+
+ Parameters
+ ----------
+ query : str
+ Boolean expression to filter the rows.
+
+ Returns
+ -------
+ NestedSeries
+ The filtered series.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(5, 5, seed=1)
+
+ >>> nf["nested"].nest.query("flux > 50")
+ 0 [{t: 13.40935, flux: 98.886109, band: 'g'}]
+ 1 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
+ 2 [{t: 4.089045, flux: 83.462567, band: 'g'}]
+ 3 [{t: 2.807739, flux: 78.927933, band: 'r'}; …]...
+ 4 [{t: 0.547752, flux: 75.014431, band: 'g'}; …]...
+ dtype: nested
+ """
flat = self.to_flat().query(query)
if len(flat) == 0:
@@ -370,6 +608,10 @@ def query_flat(self, query: str) -> NestedSeries:
)
return pack_sorted_df_into_struct(flat)
+ @deprecated(
+ version="0.6.0",
+ reason="`get_flat_index` will be removed in version 0.7.0, " "use the `flat_index` property instead.",
+ )
def get_flat_index(self) -> pd.Index:
"""Index of the flat arrays
@@ -389,11 +631,12 @@ def get_flat_index(self) -> pd.Index:
>>> nf["nested"].nest.get_flat_index()
Index([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype='int64')
"""
- flat_index = np.repeat(self._series.index, np.diff(self._series.array.list_offsets))
- # pd.Index supports np.repeat, so flat_index is the same type as self._series.index
- flat_index = cast(pd.Index, flat_index)
- return flat_index
+ return self.flat_index
+ @deprecated(
+ version="0.6.0",
+ reason="`get_flat_series` will be removed in version 0.7.0, " "use `to_flat()[column]` instead.",
+ )
def get_flat_series(self, field: str) -> pd.Series:
"""Get the flat-array field as a pd.Series
@@ -426,7 +669,6 @@ def get_flat_series(self, field: str) -> pd.Series:
4 87.81425
Name: flux, dtype: double[pyarrow]
"""
-
flat_chunks = []
for nested_chunk in self._series.array.struct_array.iterchunks():
struct_array = cast(pa.StructArray, nested_chunk)
@@ -434,19 +676,24 @@ def get_flat_series(self, field: str) -> pd.Series:
flat_array = list_array.flatten()
flat_chunks.append(flat_array)
- flat_chunked_array = pa.chunked_array(flat_chunks, type=self._series.dtype.fields[field])
+ flat_chunked_array = pa.chunked_array(flat_chunks, type=self._series.dtype.column_dtypes[field])
flat_series = pd.Series(
flat_chunked_array,
- dtype=self._series.dtype.field_dtype(field),
- index=self.get_flat_index(),
+ dtype=self._series.dtype.column_dtype(field),
+ # index=self.get_flat_index(),
+ index=self.flat_index,
name=field,
copy=False,
)
- if isinstance(self._series.dtype.field_dtype(field), NestedDtype):
+ if isinstance(self._series.dtype.column_dtype(field), NestedDtype):
return NestedSeries(flat_series, copy=False)
return flat_series
+ @deprecated(
+ version="0.6.0",
+ reason="`get_list_series` will be removed in version 0.7.0, " "use `to_lists()[column]` instead.",
+ )
def get_list_series(self, field: str) -> pd.Series:
"""Get the list-array field as a Series
@@ -474,7 +721,6 @@ def get_list_series(self, field: str) -> pd.Series:
4 [87.63891523 87.81425034]
Name: flux, dtype: list[pyarrow]
"""
-
list_chunked_array = self._series.array.pa_table[field]
return pd.Series(
list_chunked_array,
@@ -504,7 +750,26 @@ def __getitem__(self, key: str | list[str]) -> NestedSeries:
return NestedSeries(new_array, index=self._series.index, name=self._series.name)
# If the key is a single string, return the flat series for that field
- return self.get_flat_series(key)
+ flat_chunks = []
+ for nested_chunk in self._series.array.struct_array.iterchunks():
+ struct_array = cast(pa.StructArray, nested_chunk)
+ list_array = cast(pa.ListArray, struct_array.field(key))
+ flat_array = list_array.flatten()
+ flat_chunks.append(flat_array)
+
+ flat_chunked_array = pa.chunked_array(flat_chunks, type=self._series.dtype.column_dtypes[key])
+
+ flat_series = pd.Series(
+ flat_chunked_array,
+ dtype=self._series.dtype.column_dtype(key),
+ # index=self.get_flat_index(),
+ index=self.flat_index,
+ name=key,
+ copy=False,
+ )
+ if isinstance(self._series.dtype.column_dtype(key), NestedDtype):
+ return NestedSeries(flat_series, copy=False)
+ return flat_series
def __setitem__(self, key: str, value: ArrayLike) -> None:
"""Replace the field values from flat-array of values
@@ -529,7 +794,7 @@ def __setitem__(self, key: str, value: ArrayLike) -> None:
self._series.array.set_flat_field(key, value, keep_dtype=True)
return
- if isinstance(value, pd.Series) and not self.get_flat_index().equals(value.index):
+ if isinstance(value, pd.Series) and not self.flat_index.equals(value.index):
raise ValueError("Cannot set field with a Series of different index")
pa_array = pa.array(value, from_pandas=True)
@@ -650,9 +915,9 @@ def to_flatten_inner(self, field: str) -> NestedSeries:
1 0.146756 b 0.547752 87.638915 g
1 0.146756 b 3.96203 87.81425 r
"""
- if not isinstance(self._series.dtype.field_dtype(field), NestedDtype):
+ if not isinstance(self._series.dtype.column_dtype(field), NestedDtype):
raise ValueError(
- f"Field '{field}' dtype must be NestedDtype, got '{self._series.dtype.field_dtype(field)}'"
+ f"Field '{field}' dtype must be NestedDtype, got '{self._series.dtype.column_dtype(field)}'"
)
# Copy series and make an "ordinal" index
diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py
index 6ecb6de7..c5495a40 100644
--- a/src/nested_pandas/series/dtype.py
+++ b/src/nested_pandas/series/dtype.py
@@ -7,6 +7,7 @@
import pandas as pd
import pyarrow as pa
+from deprecated import deprecated
from pandas import ArrowDtype
from pandas.api.extensions import register_extension_dtype
from pandas.core.arrays import ExtensionArray
@@ -27,9 +28,36 @@ class NestedDtype(ExtensionDtype):
Parameters
----------
- pyarrow_dtype : pyarrow.StructType or pd.ArrowDtype
- The pyarrow data type to use for the nested type. It must be a struct
- type where all fields are list types.
+ pyarrow_dtype : pyarrow.StructType, pd.ArrowDtype, or Mapping[str, pa.DataType]
+ The pyarrow data type to use for the nested type. It may be provided as
+ a pyarrow.StructType, a pandas.ArrowDtype, or a mapping (such as a dictionary) of column
+ names to pyarrow item types, each of which is wrapped in a list type.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> from nested_pandas import NestedDtype
+
+ From pa.StructType:
+
+ >>> dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64())),
+ ... pa.field("b", pa.list_(pa.float64()))]))
+ >>> dtype
+ nested
+
+ From pd.ArrowDtype:
+
+ >>> import pandas as pd
+ >>> dtype = NestedDtype(pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64())),
+ ... pa.field("b", pa.list_(pa.float64()))])))
+ >>> dtype
+ nested
+
+ From mapping of column names to pyarrow data types:
+
+ >>> dtype = NestedDtype({"a": pa.int64(), "b": pa.float64()})
+ >>> dtype
+ nested
"""
# ExtensionDtype overrides #
@@ -49,9 +77,10 @@ def na_value(self) -> Type[pd.NA]: # type: ignore[valid-type]
def name(self) -> str:
"""The string representation of the nested type"""
# Replace pd.ArrowDtype with pa.DataType, because it has nicer __str__
+ field_dtypes = {field: self.column_dtype(field) for field in list(self.column_dtypes.keys())}
nice_dtypes = {
field: dtype.pyarrow_dtype if isinstance(dtype, pd.ArrowDtype) else dtype
- for field, dtype in self.field_dtypes.items()
+ for field, dtype in field_dtypes.items()
}
fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
return f"nested<{fields}>"
@@ -123,12 +152,12 @@ def construct_from_string(cls, string: str) -> Self: # type: ignore[name-define
except ValueError as e:
raise TypeError(
f"Parsing pyarrow specific parameters in the string is not supported yet: {value_type}. "
- "Please use NestedDtype() or NestedDtype.from_fields() instead."
+ "Please use NestedDtype() or NestedDtype.from_columns() instead."
) from e
fields[field_name] = pa_value_type
- return cls.from_fields(fields)
+ return cls.from_columns(fields)
# ArrowDtype would return None so we do
def _get_common_dtype(self, dtypes: list) -> None:
@@ -158,14 +187,37 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray:
pyarrow_dtype: pa.StructType
def __init__(self, pyarrow_dtype: pa.DataType) -> None:
+ # Allow pd.ArrowDtypes on init
+ if isinstance(pyarrow_dtype, pd.ArrowDtype):
+ pyarrow_dtype = pyarrow_dtype.pyarrow_dtype
+
+ # Allow from_columns-style mapping inputs
+ if isinstance(pyarrow_dtype, Mapping):
+ pyarrow_dtype = pa.struct({col: pa.list_(pa_type) for col, pa_type in pyarrow_dtype.items()})
+ pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
+
self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype)
@property
+ @deprecated(
+ version="0.6.0",
+ reason="`struct_list_pa_dtype` will be removed in version 0.7.0, "
+ "use `_struct_list_pa_dtype` instead.",
+ )
def struct_list_pa_dtype(self) -> pa.StructType:
+ """Struct-list pyarrow type representing the nested type."""
+ return self._struct_list_pa_dtype
+
+ @property
+ def _struct_list_pa_dtype(self) -> pa.StructType:
"""Struct-list pyarrow type representing the nested type."""
return self.pyarrow_dtype
@classmethod
+ @deprecated(
+ version="0.6.0",
+ reason="`from_fields` will be removed in version 0.7.0, " "use `from_columns` instead.",
+ )
def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821
"""Make NestedDtype from a mapping of field names and list item types.
@@ -190,7 +242,34 @@ def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore
... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
... )
"""
- pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()})
+ return cls.from_columns(fields)
+
+ @classmethod
+ def from_columns(cls, columns: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821
+ """Make NestedDtype from a mapping of column names and list item types.
+
+ Parameters
+ ----------
+ columns : Mapping[str, pa.DataType]
+ A mapping of column names and their item types. Since all columns are lists, the item types are
+ inner types of the lists, not the list types themselves.
+
+ Returns
+ -------
+ NestedDtype
+ The constructed NestedDtype.
+
+ Examples
+ --------
+ >>> dtype = NestedDtype.from_columns({"a": pa.float64(), "b": pa.int64()})
+ >>> dtype
+ nested<a: [double], b: [int64]>
+ >>> assert (
+ ... dtype.pyarrow_dtype
+ ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
+ ... )
+ """
+ pyarrow_dtype = pa.struct({column: pa.list_(pa_type) for column, pa_type in columns.items()})
pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
return cls(pyarrow_dtype=pyarrow_dtype)
@@ -224,11 +303,20 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListT
)
@property
+ @deprecated(
+ version="0.6.0", reason="`fields` will be removed in version 0.7.0, " "use `column_dtypes` instead."
+ )
def fields(self) -> dict[str, pa.DataType]:
"""The mapping of field names and their item types."""
- return {field.name: field.type.value_type for field in self.pyarrow_dtype}
+ return self.column_dtypes
@property
+ def column_dtypes(self) -> dict[str, pa.DataType]:
+ """The mapping of column names and their item types."""
+ return {column.name: column.type.value_type for column in self.pyarrow_dtype}
+
+ @property
+ @deprecated(version="0.6.0", reason="`field_names` will be removed in version 0.7.0.")
def field_names(self) -> list[str]:
"""The list of field names of the nested type"""
return [field.name for field in self.pyarrow_dtype]
@@ -273,6 +361,10 @@ def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype:
return ArrowDtype(self.list_struct_pa_dtype)
return ArrowDtype(self.pyarrow_dtype)
+ @deprecated(
+ version="0.6.0",
+ reason="`field_dtype` will be removed in version 0.7.0, " "use `column_dtype` instead.",
+ )
def field_dtype(self, field: str) -> pd.ArrowDtype | Self: # type: ignore[name-defined] # noqa: F821
"""Pandas dtype of a field, pd.ArrowDType or NestedDtype.
@@ -287,13 +379,33 @@ def field_dtype(self, field: str) -> pd.ArrowDtype | Self: # type: ignore[name-
If the field is a list-struct, return NestedDtype, else wrap it
as a pd.ArrowDtype.
"""
- list_type = self.pyarrow_dtype.field(field).type
+ return self.column_dtype(field)
+
+ def column_dtype(self, column: str) -> pd.ArrowDtype | Self: # type: ignore[name-defined] # noqa: F821
+ """Pandas dtype of a column, pd.ArrowDType or NestedDtype.
+
+ Parameters
+ ----------
+ column : str
+ Column name
+
+ Returns
+ -------
+ pd.ArrowDtype | NestedDtype
+ If the column is a list-struct, return NestedDtype, else wrap it
+ as a pd.ArrowDtype.
+ """
+ list_type = self.pyarrow_dtype.field(column).type
value_type = list_type.value_type
if is_pa_type_is_list_struct(value_type):
return type(self)(value_type)
return pd.ArrowDtype(value_type)
@property
+ @deprecated(
+ version="0.6.0",
+ reason="`field_dtypes` will be removed in version 0.7.0, " "use `column_dtype` instead.",
+ )
def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]: # type: ignore[name-defined] # noqa: F821
"""Pandas dtypes of this dtype's fields."""
return {field: self.field_dtype(field) for field in self.field_names}
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
index 86de6f94..41fb6cec 100644
--- a/src/nested_pandas/series/ext_array.py
+++ b/src/nested_pandas/series/ext_array.py
@@ -768,7 +768,7 @@ def _convert_struct_scalar_to_df(
return na_value
series = {}
for name, list_scalar in value.items():
- dtype: pd.ArrowDtype | NestedDtype | None = self.dtype.field_dtype(name)
+ dtype: pd.ArrowDtype | NestedDtype | None = self.dtype.column_dtype(name)
# It gave pd.ArrowDtype for non-NestedDtype fields,
# make it None if we'd like to use pandas "ordinary" dtypes.
if not pyarrow_dtypes and not isinstance(dtype, NestedDtype):
diff --git a/src/nested_pandas/series/nestedseries.py b/src/nested_pandas/series/nestedseries.py
index dd4b54cb..4d9b5c21 100644
--- a/src/nested_pandas/series/nestedseries.py
+++ b/src/nested_pandas/series/nestedseries.py
@@ -1,6 +1,7 @@
from functools import wraps
import pandas as pd
+from deprecated import deprecated
from nested_pandas.series.dtype import NestedDtype
@@ -32,9 +33,20 @@ def __init__(self, *args, **kwargs):
@property
@nested_only
+ @deprecated(
+ version="0.6.0",
+ reason="The `fields` property is deprecated and will be removed in version 0.7.0, "
+ "use `columns` instead.",
+ )
def fields(self):
"""Returns the fields of the nested series as a list."""
- return self.nest.fields
+ return self.columns
+
+ @property
+ @nested_only
+ def columns(self):
+ """Returns the names of the nested columns of the nested series as a list."""
+ return self.nest.columns
@property
@nested_only
@@ -55,8 +67,8 @@ def __getitem__(self, key):
if not isinstance(self.dtype, NestedDtype):
return super().__getitem__(key)
- # Return a flattened series for a single field
- if isinstance(key, str) and key in self.fields:
+ # Return a flattened series for a single column
+ if isinstance(key, str) and key in self.columns:
return self.nest[key]
# For list-like keys, perform sub-column selection
@@ -77,13 +89,16 @@ def __setitem__(self, key, value):
return super().__setitem__(key, value)
# Use nest setitem when setting on a single field
- if isinstance(key, str) and key in self.fields:
+ if isinstance(key, str) and key in self.columns:
self.nest[key] = value
return
return super().__setitem__(key, value)
@nested_only
+ @deprecated(
+ version="0.6.0", reason="`to_flat` will be removed in version 0.7.0, " "use `explode` instead."
+ )
def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
"""Convert nested series into dataframe of flat arrays.
@@ -117,16 +132,52 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
4 3.96203 87.81425 r
"""
- return self.nest.to_flat(fields=fields)
+ return self.explode(columns=fields)
@nested_only
- def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
+ def explode(self, columns: list[str] | None = None) -> pd.DataFrame:
+ """Unpack nested series into dataframe of flat arrays.
+
+ Parameters
+ ----------
+ columns : list[str] or None, optional
+ Names of the columns to include. Default is None, which means all columns.
+
+ Returns
+ -------
+ pd.DataFrame
+ Dataframe of flat arrays.
+
+ Examples
+ --------
+
+ >>> from nested_pandas.datasets.generation import generate_data
+ >>> nf = generate_data(5, 2, seed=1)
+
+ >>> nf["nested"].explode()
+ t flux band
+ 0 8.38389 80.074457 r
+ 0 13.40935 89.460666 g
+ 1 13.70439 96.826158 g
+ 1 8.346096 8.504421 g
+ 2 4.089045 31.342418 g
+ 2 11.173797 3.905478 g
+ 3 17.562349 69.232262 r
+ 3 2.807739 16.983042 r
+ 4 0.547752 87.638915 g
+ 4 3.96203 87.81425 r
+
+ """
+ return self.nest.to_flat(columns=columns)
+
+ @nested_only
+ def to_lists(self, columns: list[str] | None = None) -> pd.DataFrame:
"""Convert nested series into dataframe of list-array columns.
Parameters
----------
- fields : list[str] or None, optional
- Names of the fields to include. Default is None, which means all fields.
+ columns : list[str] or None, optional
+ Names of the columns to include. Default is None, which means all columns.
Returns
-------
@@ -147,4 +198,4 @@ def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
3 [17.56234873 2.80773877] [69.23226157 16.98304196] ['r' 'r']
4 [0.54775186 3.96202978] [87.63891523 87.81425034] ['g' 'r']
"""
- return self.nest.to_lists(fields=fields)
+ return self.nest.to_lists(columns=columns)
diff --git a/src/nested_pandas/utils/utils.py b/src/nested_pandas/utils/utils.py
index 16b0012d..df76f40b 100644
--- a/src/nested_pandas/utils/utils.py
+++ b/src/nested_pandas/utils/utils.py
@@ -60,8 +60,10 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame:
if by is None:
counts = pd.Series(df[nested].nest.list_lengths, name=f"n_{nested}", index=df.index)
else:
- counts = df.reduce(
- lambda x: dict(zip(*np.unique(x, return_counts=True), strict=False)), f"{nested}.{by}"
+ counts = df.map_rows(
+ lambda x: dict(zip(*np.unique(x, return_counts=True), strict=False)),
+ columns=f"{nested}.{by}",
+ row_container="args",
)
counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})
counts = counts.reindex(sorted(counts.columns), axis=1)
diff --git a/tests/nested_pandas/e2e_tests/test_issue89.py b/tests/nested_pandas/e2e_tests/test_issue89.py
index f6a96167..459d5d8b 100644
--- a/tests/nested_pandas/e2e_tests/test_issue89.py
+++ b/tests/nested_pandas/e2e_tests/test_issue89.py
@@ -25,7 +25,7 @@ def test_issue89():
partitioning=None,
).set_index("ps1_objid")
- object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")
+ object_ndf = object_ndf.join_nested(source_ndf, "ztf_source")
nf = object_ndf
- nf.reduce(np.mean, "ztf_source.mjd")
+ nf.map_rows(np.mean, "ztf_source.mjd", row_container="args")
diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py
index 24b56103..c4e0195f 100644
--- a/tests/nested_pandas/nestedframe/test_io.py
+++ b/tests/nested_pandas/nestedframe/test_io.py
@@ -29,8 +29,8 @@ def test_read_parquet():
assert nf.nested_columns == ["nested", "lincc"]
# Check the nested columns
- assert nf.nested.nest.fields == ["t", "flux", "band"]
- assert nf.lincc.nest.fields == ["band", "frameworks"]
+ assert nf.nested.nest.columns == ["t", "flux", "band"]
+ assert nf.lincc.nest.columns == ["band", "frameworks"]
def test_read_parquet_list():
@@ -46,8 +46,8 @@ def test_read_parquet_list():
assert nf.nested_columns == ["nested", "lincc"]
# Check the nested columns
- assert nf.nested.nest.fields == ["t", "flux", "band"]
- assert nf.lincc.nest.fields == ["band", "frameworks"]
+ assert nf.nested.nest.columns == ["t", "flux", "band"]
+ assert nf.lincc.nest.columns == ["band", "frameworks"]
# Check loading list works correctly
assert len(nf) == 2 * len(single_file_nf)
@@ -65,8 +65,8 @@ def test_read_parquet_directory():
assert nf.nested_columns == ["nested", "lincc"]
# Check the nested columns
- assert nf.nested.nest.fields == ["t", "flux", "band"]
- assert nf.lincc.nest.fields == ["band", "frameworks"]
+ assert nf.nested.nest.columns == ["t", "flux", "band"]
+ assert nf.lincc.nest.columns == ["band", "frameworks"]
def test_read_parquet_directory_with_filesystem():
@@ -82,8 +82,8 @@ def test_read_parquet_directory_with_filesystem():
assert nf.nested_columns == ["nested", "lincc"]
# Check the nested columns
- assert nf.nested.nest.fields == ["t", "flux", "band"]
- assert nf.lincc.nest.fields == ["band", "frameworks"]
+ assert nf.nested.nest.columns == ["t", "flux", "band"]
+ assert nf.lincc.nest.columns == ["band", "frameworks"]
def test_file_object_read_parquet():
@@ -95,8 +95,8 @@ def test_file_object_read_parquet():
# Make sure nested columns were recognized
assert nf.nested_columns == ["nested", "lincc"]
# Check the nested columns
- assert nf.nested.nest.fields == ["t", "flux", "band"]
- assert nf.lincc.nest.fields == ["band", "frameworks"]
+ assert nf.nested.nest.columns == ["t", "flux", "band"]
+ assert nf.lincc.nest.columns == ["band", "frameworks"]
@pytest.mark.parametrize(
@@ -131,10 +131,10 @@ def test_read_parquet_column_selection(columns):
# Check nested columns
if columns == ["nested.flux", "nested.t"]:
- assert nf.nested.nest.fields == ["flux", "t"]
+ assert nf.nested.nest.columns == ["flux", "t"]
elif columns == ["nested.band", "lincc.band"]:
- assert nf.nested.nest.fields == ["band"]
- assert nf.lincc.nest.fields == ["band"]
+ assert nf.nested.nest.columns == ["band"]
+ assert nf.lincc.nest.columns == ["band"]
@pytest.mark.parametrize("reject", [["nested"], "nested"])
@@ -341,9 +341,9 @@ def test_read_parquet_list_autocast():
nf = read_parquet(tmpfile.name, autocast_list=True)
assert nf.nested_columns == ["c", "d"]
- assert nf["c"].nest.fields == ["c"]
+ assert nf["c"].nest.columns == ["c"]
assert len(nf["c"].nest.to_flat()) == 9
- assert nf["d"].nest.fields == ["d"]
+ assert nf["d"].nest.columns == ["d"]
assert len(nf["d"].nest.to_flat()) == 9
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
index e9068f2d..8092665f 100644
--- a/tests/nested_pandas/nestedframe/test_nestedframe.py
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -59,7 +59,7 @@ def test_html_repr():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Check nested repr
base._repr_html_()
@@ -85,7 +85,7 @@ def test_all_columns():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert list(base.all_columns.keys()) == ["base", "nested"]
assert list(base.all_columns["nested"]) == list(nested.columns)
@@ -101,7 +101,7 @@ def test_nested_columns():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert base.nested_columns == ["nested"]
@@ -116,7 +116,7 @@ def test_is_known_hierarchical_column():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert base._is_known_hierarchical_column("nested.c")
assert not base._is_known_hierarchical_column("nested.b")
@@ -135,7 +135,7 @@ def test_is_known_column():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert base._is_known_column("R. A.")
assert base._is_known_column("`R. A.`")
@@ -164,7 +164,7 @@ def test_series_methods_on_nest_in_query_eval():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Prepare to test isna, notna
base.loc[1, "nested"] = None
@@ -186,7 +186,7 @@ def test_get_nested_column():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
base_c = base["nested.c"]
@@ -204,26 +204,26 @@ def test_get_nested_columns():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
df = base[["a", "b", "nested.c"]]
assert np.all(df.columns == ["a", "b", "nested"])
- assert df.dtypes["nested"].field_names == ["c"]
+ assert list(df.dtypes["nested"].column_dtypes.keys()) == ["c"]
assert np.all(df["nested"].iloc[0].columns == ["c"])
df = base[["a", "b", "nested.c", "nested.d"]]
assert np.all(df.columns == ["a", "b", "nested"])
- assert df.dtypes["nested"].field_names == ["c", "d"]
+ assert list(df.dtypes["nested"].column_dtypes.keys()) == ["c", "d"]
assert np.all(df["nested"].iloc[0].columns == ["c", "d"])
df = base[["a", "b", "nested.d", "nested.c"]]
assert np.all(df.columns == ["a", "b", "nested"])
- assert df.dtypes["nested"].field_names == ["d", "c"]
+ assert list(df.dtypes["nested"].column_dtypes.keys()) == ["d", "c"]
assert np.all(df["nested"].iloc[0].columns == ["d", "c"])
df = base[["nested.c"]]
assert np.all(df.columns == ["nested"])
- assert df.dtypes["nested"].field_names == ["c"]
+ assert list(df.dtypes["nested"].column_dtypes.keys()) == ["c"]
assert np.all(df["nested"].iloc[0].columns == ["c"])
df = base[["a", "b"]]
@@ -231,7 +231,7 @@ def test_get_nested_columns():
df = base[["a", "b", "nested"]]
assert np.all(df.columns == ["a", "b", "nested"])
- assert df.dtypes["nested"].field_names == ["c", "d"]
+ assert list(df.dtypes["nested"].column_dtypes.keys()) == ["c", "d"]
assert np.all(df["nested"].iloc[0].columns == ["c", "d"])
@@ -244,7 +244,7 @@ def test_get_nested_columns_errors():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
with pytest.raises(KeyError):
base[["a", "c"]]
@@ -265,7 +265,7 @@ def test_getitem_empty_bool_array():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
bool_index = np.array([], dtype=bool)
@@ -288,7 +288,7 @@ def test_set_or_replace_nested_col():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# test direct replacement
base["nested.c"] = base["nested.c"] + 1
@@ -304,13 +304,13 @@ def test_set_or_replace_nested_col():
# test new column assignment
base["nested.e"] = base["nested.d"] * 2
- assert "e" in base.nested.nest.fields
+ assert "e" in base.nested.nest.columns
assert np.array_equal(base["nested.d"].values.to_numpy() * 2, base["nested.e"].values.to_numpy())
# test assignment a new column with list-repeated values
base["nested.a"] = base["a"]
- assert "a" in base.nested.nest.fields
+ assert "a" in base.nested.nest.columns
assert np.array_equal(np.unique(base["a"].to_numpy()), np.unique(base["nested.a"].to_numpy()))
# rest replacement with a list-repeated column
@@ -327,13 +327,13 @@ def test_set_new_nested_col():
data={"c": c, "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# assign column cd in new_nested from c+d in nested
base["new_nested.cd"] = base["nested.c"] + base["nested.d"]
assert "new_nested" in base.nested_columns
- assert "cd" in base["new_nested"].nest.fields
+ assert "cd" in base["new_nested"].nest.columns
assert np.array_equal(
base["new_nested.cd"].values.to_numpy(),
@@ -358,8 +358,8 @@ def test_set_item_combine_nested():
list_nf["nested"] = list_nf[["c", "d"]]
assert "nested" in list_nf.columns
- assert list_nf.nested.nest.fields == ["c", "d"]
- assert len(list_nf.nested.nest.to_flat()) == 9
+ assert list_nf.nested.nest.columns == ["c", "d"]
+ assert len(list_nf.nested.explode()) == 9
def test_set_list_struct_col():
@@ -372,10 +372,10 @@ def test_set_list_struct_col():
list_struct_series = pd.Series(list_struct_array, dtype=pd.ArrowDtype(list_struct_array.type))
nf["nested2"] = list_struct_series
- assert_frame_equal(nf.nested.nest.to_flat(), nf.nested2.nest.to_flat())
+ assert_frame_equal(nf.nested.explode(), nf.nested2.explode())
nf = nf.assign(nested3=list_struct_series)
- assert_frame_equal(nf.nested.nest.to_flat(), nf.nested3.nest.to_flat())
+ assert_frame_equal(nf.nested.explode(), nf.nested3.explode())
def test_get_dot_names():
@@ -404,8 +404,8 @@ def test_nesting_limit():
nf["`nested.d`"] = nf["`.b.`"]
-def test_add_nested_with_flat_df():
- """Test that add_nested correctly adds a nested column to the base df"""
+def test_join_nested_with_flat_df():
+ """Test that join_nested correctly adds a nested column to the base df"""
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -414,15 +414,15 @@ def test_add_nested_with_flat_df():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert "nested" in base.columns
# to_flat() gives pd.ArrowDtype, so we skip dtype check here
- assert_frame_equal(base.nested.nest.to_flat(), nested, check_dtype=False)
+ assert_frame_equal(base.nested.explode(), nested, check_dtype=False)
-def test_add_nested_with_flat_df_and_mismatched_index():
- """Test add_nested when index values of base are missing matches in nested"""
+def test_join_nested_with_flat_df_and_mismatched_index():
+ """Test join_nested when index values of base are missing matches in nested"""
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6], "new_index": [0, 1, 3]}, index=[0, 1, 2])
@@ -439,7 +439,7 @@ def test_add_nested_with_flat_df_and_mismatched_index():
# Add the nested frame in a "left" fashion, where the index of the "left"
# frame (our base layer) is preserved
- left_res = base.add_nested(nested, "nested", how="left")
+ left_res = base.join_nested(nested, "nested", how="left")
assert "nested" in left_res.columns
# Check that the index of the base layer is being used
assert (left_res.index == base.index).all()
@@ -451,23 +451,23 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert left_res.loc[idx]["nested"] is None
# Test that the default behavior is the same as how="left" by comparing the pandas dataframes
- default_res = base.add_nested(nested, "nested")
+ default_res = base.join_nested(nested, "nested")
assert_frame_equal(left_res, default_res)
# Test still adding the nested frame in a "left" fashion but on the "new_index" column
# We currently don't support a list of columns for the 'on' argument
with pytest.raises(ValueError):
- left_res_on = base.add_nested(nested, "nested", how="left", on=["new_index"])
+ left_res_on = base.join_nested(nested, "nested", how="left", on=["new_index"])
# Instead we should pass a single column name, "new_index" which exists in both frames.
- left_res_on = base.add_nested(nested, "nested", how="left", on="new_index")
+ left_res_on = base.join_nested(nested, "nested", how="left", on="new_index")
assert "nested" in left_res_on.columns
# Check that the index of the base layer is still being used
assert (left_res_on.index == base.index).all()
# Assert that the new_index column we joined on was dropped from the nested layer
# but is present in the base layer
assert "new_index" in left_res_on.columns
- assert "new_index" not in left_res_on["nested"].nest.to_flat().columns
+ assert "new_index" not in left_res_on["nested"].explode().columns
# For each index in the columns we joined on, check that values are aligned correctly
for i in range(len(left_res_on.new_index)):
@@ -477,15 +477,15 @@ def test_add_nested_with_flat_df_and_mismatched_index():
if join_idx in nested["new_index"].values:
assert left_res_on.iloc[i]["nested"] is not None
# Check that it is present in new the index we constructed for the nested layer
- assert join_idx in left_res_on["nested"].nest.to_flat().index
+ assert join_idx in left_res_on["nested"].explode().index
else:
# Use an iloc
assert left_res_on.iloc[i]["nested"] is None
- assert join_idx not in left_res_on["nested"].nest.to_flat().index
+ assert join_idx not in left_res_on["nested"].explode().index
# Test adding the nested frame in a "right" fashion, where the index of the "right"
# frame (our nested layer) is preserved
- right_res = base.add_nested(nested, "nested", how="right")
+ right_res = base.join_nested(nested, "nested", how="right")
assert "nested" in right_res.columns
# Check that the index of the nested layer is being used. Note that separate
# from a traditional join this will not be the same as our nested layer index
@@ -507,16 +507,16 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert not pd.isna(right_res.loc[idx][col])
# Test still adding the nested frame in a "right" fashion but on the "new_index" column
- right_res_on = base.add_nested(nested, "nested", how="right", on="new_index")
+ right_res_on = base.join_nested(nested, "nested", how="right", on="new_index")
assert "nested" in right_res_on.columns
# Check that rows were dropped if the base layer's "new_index" value is not present
# in the "right" nested layer
assert (right_res_on.new_index.values == np.unique(nested.new_index.values)).all()
# Check that the new_index column we joined on was dropped from the nested layer
- assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
+ assert "new_index" not in right_res_on["nested"].explode().columns
# Check that the flattend nested layer has the same index as the original column we joined on
- all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values)
+ all(right_res_on.nested.explode().index.values == nested.new_index.values)
# For each index check that the base layer is aligned correctly to the nested layer
for i in range(len(right_res_on)):
@@ -536,7 +536,7 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert not pd.isna(right_res_on.iloc[i][col])
# Test the "outer" behavior
- outer_res = base.add_nested(nested, "nested", how="outer")
+ outer_res = base.join_nested(nested, "nested", how="outer")
assert "nested" in outer_res.columns
# We expect the new index to be the union of the base and nested indices
assert set(outer_res.index) == set(base.index).union(set(nested.index))
@@ -556,18 +556,18 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert not pd.isna(outer_res.loc[idx][col])
# Test still adding the nested frame in an "outer" fashion but with on the "new_index" column
- outer_res_on = base.add_nested(nested, "nested", how="outer", on="new_index")
+ outer_res_on = base.join_nested(nested, "nested", how="outer", on="new_index")
assert "nested" in outer_res_on.columns
# We expect the result's new_index column to be the set union of the values of that column
# in the base and nested frames
assert set(outer_res_on.new_index) == set(base.new_index).union(set(nested.new_index))
# Check that the new_index column we joined on was dropped from the nested layer
- assert "new_index" not in outer_res_on["nested"].nest.to_flat().columns
+ assert "new_index" not in outer_res_on["nested"].explode().columns
# Check that the flattend nested layer has the same index as the original column we joined on
# Note that it does not have index values only present in the base layer since those empty rows
# are dropped when we flatten the nested frame.
- all(outer_res_on.nested.nest.to_flat().index.values == nested.new_index.values)
+ all(outer_res_on.nested.explode().index.values == nested.new_index.values)
for i in range(len(outer_res_on)):
# The actual "index" value we "joined" on.
@@ -588,7 +588,7 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert pd.isna(outer_res_on.iloc[i][col])
# Test the "inner" behavior
- inner_res = base.add_nested(nested, "nested", how="inner")
+ inner_res = base.join_nested(nested, "nested", how="inner")
assert "nested" in inner_res.columns
# We expect the new index to be the set intersection of the base and nested indices
assert set(inner_res.index) == set(base.index).intersection(set(nested.index))
@@ -601,21 +601,21 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert not pd.isna(inner_res.loc[idx][col])
# Test still adding the nested frame in a "inner" fashion but on the "new_index" column
- inner_res_on = base.add_nested(nested, "nested", how="inner", on="new_index")
+ inner_res_on = base.join_nested(nested, "nested", how="inner", on="new_index")
assert "nested" in inner_res_on.columns
# We expect the new index to be the set intersection of the base and nested column we used
# for the 'on' argument
assert set(inner_res_on.new_index) == set(base.new_index).intersection(set(nested.new_index))
# Check that the new_index column we joined on was dropped from the nested layer
+ assert "new_index" not in inner_res_on["nested"].explode().columns
+ assert "new_index" not in right_res_on["nested"].explode().columns
# Since we have confirmed that the "nex_index" column was the intersection that we expected
# we know that none of the joined values should be none
assert not inner_res_on.isnull().values.any()
-def test_add_nested_with_series():
- """Test that add_nested correctly adds a nested column to the base df"""
+def test_join_nested_with_series():
+ """Test that join_nested correctly adds a nested column to the base df"""
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -625,15 +625,15 @@ def test_add_nested_with_series():
name="c",
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert "nested" in base.columns
for i in range(3):
assert_frame_equal(base.iloc[i]["nested"], nested[i])
-def test_add_nested_with_series_and_mismatched_index():
- """Test add_nested when index values of base are missing matches in nested"""
+def test_join_nested_with_series_and_mismatched_index():
+ """Test join_nested when index values of base are missing matches in nested"""
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
nested = pd.Series(
@@ -642,23 +642,23 @@ def test_add_nested_with_series_and_mismatched_index():
name="c",
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert "nested" in base.columns
assert pd.isna(base.loc[1]["nested"])
-def test_add_nested_for_empty_df():
- """Test that .add_nested() works for empty frame and empty input"""
+def test_join_nested_for_empty_df():
+ """Test that .join_nested() works for empty frame and empty input"""
base = NestedFrame(data={"a": [], "b": []}, index=[])
nested = pd.DataFrame(data={"c": []}, index=[])
- new_base = base.add_nested(nested, "nested")
+ new_base = base.join_nested(nested, "nested")
# Check original frame is unchanged
assert_frame_equal(base, NestedFrame(data={"a": [], "b": []}, index=[]))
assert "nested" in new_base.columns
- assert_frame_equal(new_base.nested.nest.to_flat(), nested.astype(pd.ArrowDtype(pa.float64())))
+ assert_frame_equal(new_base.nested.explode(), nested.astype(pd.ArrowDtype(pa.float64())))
@pytest.mark.parametrize("pandas", [False, True])
@@ -681,15 +681,15 @@ def test_from_flat(on, pandas):
if on is None:
assert list(out_nf.columns) == ["a", "b", "new_nested"]
- assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
+ assert list(out_nf.new_nested.nest.columns) == ["c", "d"]
assert len(out_nf) == 2
elif on == "a":
assert list(out_nf.columns) == ["b", "new_nested"]
- assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
+ assert list(out_nf.new_nested.nest.columns) == ["c", "d"]
assert len(out_nf) == 2
elif on == "c": # not what a user likely wants, but should still work
assert list(out_nf.columns) == ["a", "b", "new_nested"]
- assert list(out_nf.new_nested.nest.fields) == ["d"]
+ assert list(out_nf.new_nested.nest.columns) == ["d"]
assert len(out_nf) == 5
@@ -697,7 +697,7 @@ def test_recover_from_flat():
"""test that going to_flat and then from_flat recovers the same df"""
nf = generate_data(5, 10, seed=1)
- flat = nf["nested"].nest.to_flat()
+ flat = nf["nested"].explode()
nf2 = NestedFrame.from_flat(nf[["a", "b"]].join(flat), base_columns=["a", "b"], name="nested")
@@ -714,12 +714,12 @@ def test_from_flat_omitting_columns():
# omit a base column
nf = NestedFrame.from_flat(flat, base_columns=["b"], nested_columns=["c", "d"])
assert list(nf.columns) == ["b", "nested"]
- assert list(nf.nested.nest.fields) == ["c", "d"]
+ assert list(nf.nested.nest.columns) == ["c", "d"]
# omit a nested column
nf = NestedFrame.from_flat(flat, base_columns=["a", "b"], nested_columns=["c"])
assert list(nf.columns) == ["a", "b", "nested"]
- assert list(nf.nested.nest.fields) == ["c"]
+ assert list(nf.nested.nest.columns) == ["c"]
def test_from_lists():
@@ -770,7 +770,7 @@ def test_from_lists():
res = NestedFrame.from_lists(nf2, list_columns=["e", "f"])
assert list(res.columns) == ["c", "d", "nested"]
assert list(res.nested_columns) == ["nested"]
- assert list(res.nested.nest.fields) == ["e", "f"]
+ assert list(res.nested.nest.columns) == ["e", "f"]
# Check for subsetting
res = NestedFrame.from_lists(nf, base_columns=["c"], list_columns=["e"])
@@ -797,7 +797,7 @@ def test_query():
)
# Test vanilla queries
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
assert len(base.query("a > 2")) == 1
# Check for the multi-layer error
@@ -811,10 +811,10 @@ def test_query():
# Test nested queries
nest_queried = base.query("nested.c > 1")
- assert len(nest_queried.nested.nest.to_flat()) == 7
+ assert len(nest_queried.nested.explode()) == 7
nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
- assert len(nest_queried.nested.nest.to_flat()) == 5
+ assert len(nest_queried.nested.explode()) == 5
# Check edge conditions
with pytest.raises(ValueError):
@@ -853,21 +853,21 @@ def test_query_on_non_identifier_columns():
data={"a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- nf = nf.add_nested(nested, "bad dog")
+ nf = nf.join_nested(nested, "bad dog")
nf2 = nf.query("`good dog` > 3")
assert nf.shape == (3, 3)
assert nf2.shape == (2, 3)
nf3 = nf.query("`bad dog`.a > 2")
assert nf3["bad dog"].nest["a"].size == 4
- # And also for fields within the nested columns.
+ # And also for subcolumns within the nested columns.
# Taken from GH#176
nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- nf = nf.add_nested(nested, "bad dog")
+ nf = nf.join_nested(nested, "bad dog")
nf4 = nf.query("`bad dog`.`n/a` > 2")
assert nf4["bad dog"].nest["n/a"].size == 4
@@ -882,27 +882,27 @@ def test_dropna():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test basic functionality
dn_base = base.dropna(subset=["b"])
assert len(dn_base) == 2
- assert len(dn_base["nested"].nest.to_flat() == 6)
+ assert len(dn_base["nested"].explode()) == 6
# Test on_nested kwarg
dn_on_nested = base.dropna(on_nested="nested")
assert len(dn_on_nested) == 3
- assert len(dn_on_nested["nested"].nest.to_flat() == 8)
+ assert len(dn_on_nested["nested"].explode()) == 8
# Test hierarchical column subset
dn_hierarchical = base.dropna(subset="nested.c")
assert len(dn_hierarchical) == 3
- assert len(dn_hierarchical["nested"].nest.to_flat() == 8)
+ assert len(dn_hierarchical["nested"].explode()) == 8
# Test hierarchical column subset and on_nested
dn_hierarchical = base.dropna(on_nested="nested", subset="nested.c")
assert len(dn_hierarchical) == 3
- assert len(dn_hierarchical["nested"].nest.to_flat() == 8)
+ assert len(dn_hierarchical["nested"].explode()) == 8
def test_dropna_layer_as_base_column():
@@ -924,7 +924,7 @@ def test_dropna_inplace_base():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test inplace=False with base layer
dn_base = base.dropna(subset=["b"], inplace=False)
@@ -945,11 +945,11 @@ def test_dropna_inplace_nested():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test inplace=False with nested layer
dn_base = base.dropna(on_nested="nested", inplace=False)
- assert not dn_base.nested.nest.to_flat().equals(base.nested.nest.to_flat())
+ assert not dn_base.nested.explode().equals(base.nested.explode())
# Test inplace=True with nested layer
base.dropna(on_nested="nested", inplace=True)
@@ -966,7 +966,7 @@ def test_dropna_errors():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test multi-target
with pytest.raises(ValueError):
@@ -995,7 +995,7 @@ def test_sort_values():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test basic functionality
sv_base = base.sort_values("b")
@@ -1024,7 +1024,7 @@ def test_sort_values_ascension():
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test ascending=False
sv_base = base.sort_values("nested.d", ascending=False)
@@ -1039,8 +1039,25 @@ def test_sort_values_ascension():
assert list(sv_base.iloc[0]["nested"]["d"]) == [7, 5, 4]
-def test_reduce():
- """Tests that we can call reduce on a NestedFrame with a custom function."""
+def test_get_subcolumns():
+ """Tests that we can get subcolumns from a NestedFrame"""
+ nf = generate_data(5, 10, seed=1)
+ nf["nested2"] = nf["nested"]
+
+ assert nf.get_subcolumns("nested") == ["nested.t", "nested.flux", "nested.band"]
+ assert nf.get_subcolumns(["nested", "nested2"]) == [
+ "nested.t",
+ "nested.flux",
+ "nested.band",
+ "nested2.t",
+ "nested2.flux",
+ "nested2.band",
+ ]
+ assert nf.get_subcolumns() == nf.get_subcolumns(nf.nested_columns)
+
+
+def test_map_rows():
+ """Tests that we can call map_rows on a NestedFrame with a custom function."""
nf = NestedFrame(
data={"a": [1, 2, 3], "b": [2, 4, 6]},
index=pd.Index([0, 1, 2], name="idx"),
@@ -1075,24 +1092,20 @@ def test_reduce():
)
# Add two nested layers to pack into our dataframe
- nf = nf.add_nested(to_pack, "packed").add_nested(to_pack2, "packed2")
+ nf = nf.join_nested(to_pack, "packed").join_nested(to_pack2, "packed2")
# Define a simple custom function to apply to the nested data
- def get_max(col1, col2):
- # returns the max value within each specified colun
- return pd.Series([col1.max(), col2.max()], index=["max_col1", "max_col2"])
+ def get_max(row):
+ # returns the max value within each specified column
+ return pd.Series([row["packed.c"].max(), row["packed.d"].max()], index=["max_col1", "max_col2"])
# The expected max values for of our nested columns
expected_max_c = [4, 10, 4]
expected_max_d = [7, 5, 9]
expected_max_e = [9, 23, 4]
- # Test that we raise an error when no arguments are provided
- with pytest.raises(ValueError):
- nf.reduce(get_max)
-
# Batch only on columns in the first packed layer
- result = nf.reduce(get_max, "packed.c", "packed.d")
+ result = nf.map_rows(get_max, columns=["packed.c", "packed.d"])
assert len(result) == len(nf)
assert isinstance(result, NestedFrame)
assert result.index.name == "idx"
@@ -1100,8 +1113,28 @@ def get_max(col1, col2):
assert result["max_col1"].values[i] == expected_max_c[i]
assert result["max_col2"].values[i] == expected_max_d[i]
+ # Test that columns=None gives the same result
+ result_none = nf.map_rows(get_max, columns=None)
+ assert result.equals(result_none)
+
+ # As does columns="packed"
+ result_packed = nf.map_rows(get_max, columns="packed")
+ assert result.equals(result_packed)
+
+ # Test the "args" input container
+ def get_max(col1, col2):
+ # returns the max value within each specified column
+ return pd.Series([col1.max(), col2.max()], index=["max_col1", "max_col2"])
+
+ result_args = nf.map_rows(get_max, columns=["packed.c", "packed.d"], row_container="args")
+ assert result.equals(result_args)
+
+ def get_max2(row):
+ # returns the max value within each specified column
+ return pd.Series([row["packed.c"].max(), row["packed2.e"].max()], index=["max_col1", "max_col2"])
+
# Batch on columns in the first and second packed layers
- result = nf.reduce(get_max, "packed.c", "packed2.e")
+ result = nf.map_rows(get_max2, columns=["packed.c", "packed2.e"])
assert len(result) == len(nf)
assert isinstance(result, NestedFrame)
assert result.index.name == "idx"
@@ -1109,7 +1142,7 @@ def get_max(col1, col2):
assert result["max_col1"].values[i] == expected_max_c[i]
assert result["max_col2"].values[i] == expected_max_e[i]
- # Test that we can pass a scalar from the base layer to the reduce function and that
+ # Test that we can pass a scalar from the base layer to the map_rows function and that
# the user can also provide non-column arguments (in this case, the list of column names)
def offset_avg(offset, col_to_avg, column_names):
# A simple function which adds a scalar 'offset' to a column which is then averaged.
@@ -1121,26 +1154,34 @@ def offset_avg(offset, col_to_avg, column_names):
sum([7, 10, 7]) / 3.0,
]
- result = nf.reduce(offset_avg, "b", "packed.c", column_names=["offset_avg"])
+ result = nf.map_rows(
+ offset_avg, columns=["b", "packed.c"], row_container="args", column_names=["offset_avg"]
+ )
assert len(result) == len(nf)
assert isinstance(result, NestedFrame)
assert result.index.name == "idx"
for i in range(len(result)):
assert result["offset_avg"].values[i] == expected_offset_avg[i]
- # Verify that we can understand a string argument to the reduce function,
+ # Verify that we can understand a string argument to the map_rows function,
# so long as it isn't a column name.
- def make_id(col1, prefix_str):
- return f"{prefix_str}{col1}"
+ def make_id(row, prefix_str):
+ return f"{prefix_str}{row['b']}"
- result = nf.reduce(make_id, "b", prefix_str="some_id_")
+ result = nf.map_rows(make_id, columns="b", prefix_str="some_id_")
assert result[0][1] == "some_id_4"
+ # Check for output_names length error
+ with pytest.raises(ValueError):
+ result = nf.map_rows(
+ get_max, columns=["packed.c", "packed.d"], output_names=["only_one_name"], row_container="args"
+ )
+
# Verify that append_columns=True works as expected.
# Ensure that even with non-unique indexes, the final result retains
# the original index (nested-pandas#301)
nf.index = pd.Index([0, 1, 1], name="non-unique")
- result = nf.reduce(get_max, "packed.c", "packed.d", append_columns=True)
+ result = nf.map_rows(get_max, columns=["packed.c", "packed.d"], append_columns=True, row_container="args")
assert len(result) == len(nf)
assert isinstance(result, NestedFrame)
result_c = list(result.columns)
@@ -1158,8 +1199,8 @@ def make_id(col1, prefix_str):
assert result["packed.d"].values[i] == to_pack["d"].values[i]
-def test_reduce_duplicated_cols():
- """Tests nf.reduce() to correctly handle duplicated column names."""
+def test_map_rows_duplicated_cols():
+ """Tests nf.map_rows() to correctly handle duplicated column names."""
nf = NestedFrame(
data={"a": [1, 2, 3], "b": [2, 4, 6]},
index=pd.Index([0, 1, 2], name="idx"),
@@ -1194,24 +1235,24 @@ def test_reduce_duplicated_cols():
)
# Add two nested layers to pack into our dataframe
- nf = nf.add_nested(to_pack, "packed").add_nested(to_pack2, "packed2")
+ nf = nf.join_nested(to_pack, "packed").join_nested(to_pack2, "packed2")
def cols_allclose(col1, col2):
return pd.Series([np.allclose(col1, col2)], index=["allclose"])
- result = nf.reduce(cols_allclose, "packed.time", "packed2.f")
+ result = nf.map_rows(cols_allclose, columns=["packed.time", "packed2.f"], row_container="args")
assert_frame_equal(
result, pd.DataFrame({"allclose": [False, False, False]}, index=pd.Index([0, 1, 2], name="idx"))
)
- result = nf.reduce(cols_allclose, "packed.c", "packed.c")
+ result = nf.map_rows(cols_allclose, columns=["packed.c", "packed.c"], row_container="args")
assert_frame_equal(
result, pd.DataFrame({"allclose": [True, True, True]}, index=pd.Index([0, 1, 2], name="idx"))
)
-def test_reduce_infer_nesting():
- """Test that nesting inference works in reduce"""
+def test_map_rows_infer_nesting():
+ """Test that nesting inference works in map_rows"""
ndf = generate_data(3, 20, seed=1)
@@ -1222,9 +1263,9 @@ def complex_output(flux):
"lc.flux_quantiles": np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]),
}
- result = ndf.reduce(complex_output, "nested.flux")
+ result = ndf.map_rows(complex_output, columns="nested.flux", row_container="args")
assert list(result.columns) == ["max_flux", "lc"]
- assert list(result.lc.nest.fields) == ["flux_quantiles"]
+ assert list(result.lc.nest.columns) == ["flux_quantiles"]
# Test multi-column nested output
def complex_output(flux):
@@ -1234,15 +1275,15 @@ def complex_output(flux):
"lc.labels": [0.1, 0.2, 0.3, 0.4, 0.5],
}
- result = ndf.reduce(complex_output, "nested.flux")
+ result = ndf.map_rows(complex_output, columns=["nested.flux"], row_container="args")
assert list(result.columns) == ["max_flux", "lc"]
- assert list(result.lc.nest.fields) == ["flux_quantiles", "labels"]
+ assert list(result.lc.nest.columns) == ["flux_quantiles", "labels"]
# Test integer names
def complex_output(flux):
return np.max(flux), np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]), [0.1, 0.2, 0.3, 0.4, 0.5]
- result = ndf.reduce(complex_output, "nested.flux")
+ result = ndf.map_rows(complex_output, columns="nested.flux", row_container="args")
assert list(result.columns) == [0, 1, 2]
# Test multiple nested structures output
@@ -1254,10 +1295,10 @@ def complex_output(flux):
"meta.colors": ["green", "red", "blue"],
}
- result = ndf.reduce(complex_output, "nested.flux")
+ result = ndf.map_rows(complex_output, columns="nested.flux", row_container="args")
assert list(result.columns) == ["max_flux", "lc", "meta"]
- assert list(result.lc.nest.fields) == ["flux_quantiles", "labels"]
- assert list(result.meta.nest.fields) == ["colors"]
+ assert list(result.lc.nest.columns) == ["flux_quantiles", "labels"]
+ assert list(result.meta.nest.columns) == ["colors"]
# Test only nested structure output
def complex_output(flux):
@@ -1266,13 +1307,13 @@ def complex_output(flux):
"lc.labels": [0.1, 0.2, 0.3, 0.4, 0.5],
}
- result = ndf.reduce(complex_output, "nested.flux")
+ result = ndf.map_rows(complex_output, columns="nested.flux", row_container="args")
assert list(result.columns) == ["lc"]
- assert list(result.lc.nest.fields) == ["flux_quantiles", "labels"]
+ assert list(result.lc.nest.columns) == ["flux_quantiles", "labels"]
-def test_reduce_arg_errors():
- """Test that reduce errors based on non-column args trigger as expected"""
+def test_map_rows_arg_errors():
+ """Test that map_rows errors based on non-column args trigger as expected"""
ndf = generate_data(10, 10, seed=1)
@@ -1283,13 +1324,13 @@ def func(a, flux, add):
return {"nested2.flux": flux + a}
with pytest.raises(TypeError):
- ndf.reduce(func, "a", "nested.flux", True)
+ ndf.map_rows(func, columns=["a", "nested.flux", True], row_container="args")
with pytest.raises(ValueError):
- ndf.reduce(func, "ab", "nested.flux", add=True)
+ ndf.map_rows(func, columns=["ab", "nested.flux"], add=True, row_container="args")
# this should work
- ndf.reduce(func, "a", "nested.flux", add=True)
+ ndf.map_rows(func, ["a", "nested.flux"], add=True, row_container="args")
def test_scientific_notation():
@@ -1313,7 +1354,7 @@ def test_drop():
data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested").add_nested(nested2, "nested2")
+ base = base.join_nested(nested, "nested").join_nested(nested2, "nested2")
# test axis=0 drop
dropped_base = base.drop(0, axis=0)
@@ -1327,7 +1368,7 @@ def test_drop():
# Test dropping a nested column
dropped_nested = base.drop("nested.c", axis=1)
assert len(dropped_nested.columns) == len(base.columns)
- assert "c" not in dropped_nested.nested.nest.fields
+ assert "c" not in dropped_nested.nested.nest.columns
# Test dropping a non-existent column
with pytest.raises(KeyError):
@@ -1337,32 +1378,32 @@ def test_drop():
dropped_multiple = base.drop(["a", "nested.c"], axis=1)
assert len(dropped_multiple.columns) == len(base.columns) - 1
assert "a" not in dropped_multiple.columns
- assert "c" not in dropped_multiple.nested.nest.fields
+ assert "c" not in dropped_multiple.nested.nest.columns
# Test multiple nested structures
dropped_multiple = base.drop(["nested.c", "nested2.f"], axis=1)
assert len(dropped_multiple.columns) == len(base.columns)
- assert "c" not in dropped_multiple.nested.nest.fields
- assert "f" not in dropped_multiple.nested2.nest.fields
+ assert "c" not in dropped_multiple.nested.nest.columns
+ assert "f" not in dropped_multiple.nested2.nest.columns
# Test inplace=True for both base and nested columns
base2 = base.copy()
base2.drop(["a", "nested.c"], axis=1, inplace=True)
assert "a" not in base2.columns
- assert "c" not in base2["nested"].nest.fields
+ assert "c" not in base2["nested"].nest.columns
assert "b" in base2.columns
- assert "d" in base2["nested"].nest.fields
+ assert "d" in base2["nested"].nest.columns
# Test inplace=False for both base and nested columns
base3 = base.copy()
dropped = base3.drop(["a", "nested.c"], axis=1, inplace=False)
assert "a" not in dropped.columns
- assert "c" not in dropped["nested"].nest.fields
+ assert "c" not in dropped["nested"].nest.columns
assert "b" in dropped.columns
- assert "d" in dropped["nested"].nest.fields
+ assert "d" in dropped["nested"].nest.columns
# Original is unchanged
assert "a" in base3.columns
- assert "c" in base3["nested"].nest.fields
+ assert "c" in base3["nested"].nest.columns
# Test error for missing columns in multi-drop
with pytest.raises(KeyError):
@@ -1392,7 +1433,7 @@ def test_min():
nested_clean = pd.DataFrame(
data={"g": [1, 0, 3, 4, 5, 6], "h": [1, 2, 3, 4, 5, 6]}, index=[0, 0, 1, 1, 2, 2]
)
- base_clean = base.add_nested(nested_clean, "nested_clean")
+ base_clean = base.join_nested(nested_clean, "nested_clean")
min_clean = base_clean.min()
expected_clean = pd.Series({"a": 1, "b": 2, "c": "x", "nested_clean.g": 0, "nested_clean.h": 1})
assert (min_clean == expected_clean).all()
@@ -1400,14 +1441,14 @@ def test_min():
nested_nan = pd.DataFrame(
data={"g": [1, np.nan, 3, 4, 5, 6], "h": [np.nan, np.nan, 3, 4, np.nan, 6]}, index=[0, 0, 1, 1, 2, 2]
)
- base_nan = base.add_nested(nested_nan, "nested_nan")
+ base_nan = base.join_nested(nested_nan, "nested_nan")
min_nan = base_nan.min()
assert isinstance(min_nan, pd.Series)
expected_nan = pd.Series({"a": 1, "b": 2, "c": "x", "nested_nan.g": 1, "nested_nan.h": 3})
assert (min_nan == expected_nan).all()
# 1 nested column
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
r2 = base.min(exclude_nest=True, numeric_only=True)
assert (r2 == pd.Series({"a": 1, "b": 2})).all()
r3 = base.min(exclude_nest=True)
@@ -1417,7 +1458,7 @@ def test_min():
assert (r4 == expected4).all()
# 2 nested columns
- base = base.add_nested(nested2, "nested2")
+ base = base.join_nested(nested2, "nested2")
r5 = base.min(exclude_nest=True, numeric_only=True)
assert (r5 == pd.Series({"a": 1, "b": 2})).all()
r6 = base.min(exclude_nest=True)
@@ -1439,7 +1480,7 @@ def test_min():
# only nested column
base2 = NestedFrame(data={"x": [0, 1, 2]}, index=[0, 1, 2])
nested3 = NestedFrame(data={"a": [1, 2, 3, 4, 5, 6], "b": [2, 4, 6, 8, 9, 0]}, index=[0, 0, 1, 1, 1, 2])
- base2 = base2.add_nested(nested3, "nested3")
+ base2 = base2.join_nested(nested3, "nested3")
base2 = base2.drop(["x"], axis=1)
r8 = base2.min(exclude_nest=True)
assert isinstance(r8, pd.Series)
@@ -1469,7 +1510,7 @@ def test_max():
nested_clean = pd.DataFrame(
data={"g": [1, 0, 3, 4, 5, 6], "h": [1, 2, 3, 4, 5, 6]}, index=[0, 0, 1, 1, 2, 2]
)
- base_clean = base.add_nested(nested_clean, "nested_clean")
+ base_clean = base.join_nested(nested_clean, "nested_clean")
max_clean = base_clean.max()
expected_clean = pd.Series({"a": 3, "b": 6, "c": "z", "nested_clean.g": 6, "nested_clean.h": 6})
assert (max_clean == expected_clean).all()
@@ -1478,14 +1519,14 @@ def test_max():
data={"g": [1, np.nan, 3, 4, np.nan, np.nan], "h": [np.nan, np.nan, 3, 4, 5, np.nan]},
index=[0, 0, 1, 1, 2, 2],
)
- base_nan = base.add_nested(nested_nan, "nested_nan")
+ base_nan = base.join_nested(nested_nan, "nested_nan")
max_nan = base_nan.max()
assert isinstance(max_nan, pd.Series)
expected_nan = pd.Series({"a": 3, "b": 6, "c": "z", "nested_nan.g": 4, "nested_nan.h": 5})
assert (max_nan == expected_nan).all()
# 1 nested column
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
r2 = base.max(exclude_nest=True, numeric_only=True)
assert (r2 == pd.Series({"a": 3, "b": 6})).all()
r3 = base.max(exclude_nest=True)
@@ -1495,7 +1536,7 @@ def test_max():
assert (r4 == expected4).all()
# 2 nested columns
- base = base.add_nested(nested2, "nested2")
+ base = base.join_nested(nested2, "nested2")
r5 = base.max(exclude_nest=True, numeric_only=True)
assert (r5 == pd.Series({"a": 3, "b": 6})).all()
r6 = base.max(exclude_nest=True)
@@ -1517,7 +1558,7 @@ def test_max():
# only nested column
base2 = NestedFrame(data={"x": [0, 1, 2]}, index=[0, 1, 2])
nested3 = NestedFrame(data={"a": [1, 2, 3, 4, 5, 6], "b": [2, 4, 6, 8, 9, 0]}, index=[0, 0, 1, 1, 1, 2])
- base2 = base2.add_nested(nested3, "nested3")
+ base2 = base2.join_nested(nested3, "nested3")
base2 = base2.drop(["x"], axis=1)
r8 = base2.max(exclude_nest=True)
assert isinstance(r8, pd.Series)
@@ -1592,7 +1633,7 @@ def test_describe():
base_num.describe(exclude=np.number)
# adding number nested columns
- base_mix = base_mix.add_nested(nested_num, "nested_num")
+ base_mix = base_mix.join_nested(nested_num, "nested_num")
r6 = base_mix.describe()
assert isinstance(r6, NestedFrame)
assert r6.shape[1] == 3
@@ -1620,12 +1661,12 @@ def test_describe():
assert "a" in r10.columns
assert "b" in r10.columns
- base_num = base_num.add_nested(nested_num, "nested_num")
+ base_num = base_num.join_nested(nested_num, "nested_num")
with pytest.raises(ValueError):
base_num.describe(exclude=np.number)
# adding mixed type nested columns
- base_mix = base_mix.add_nested(nested_mix, "nested_mix")
+ base_mix = base_mix.join_nested(nested_mix, "nested_mix")
r11 = base_mix.describe()
assert isinstance(r11, NestedFrame)
assert r11.shape[1] == 4
@@ -1664,7 +1705,7 @@ def test_describe():
# only nested column
base2 = NestedFrame(data={"x": [0, 1, 2]}, index=[0, 1, 2])
- base2 = base2.add_nested(nested_mix, "nested_mix").add_nested(nested_num, "nested_num")
+ base2 = base2.join_nested(nested_mix, "nested_mix").join_nested(nested_num, "nested_num")
base2 = base2.drop(["x"], axis=1)
r17 = base2.describe()
assert isinstance(r17, NestedFrame)
@@ -1676,7 +1717,7 @@ def test_describe():
base2.describe(include=object)
# edge case: object base with numeric nest
- base_obj = base_obj.add_nested(nested_mix, "nested_mix").add_nested(nested_num, "nested_num")
+ base_obj = base_obj.join_nested(nested_mix, "nested_mix").join_nested(nested_num, "nested_num")
r18 = base_obj.describe()
assert isinstance(r18, NestedFrame)
assert r18.shape[1] == 3
@@ -1702,7 +1743,7 @@ def test_explode_1():
data={"f": ["A", "B", "C", "D", "E", "A", "A", "B"], "g": [5, 4, 7, 5, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested_num, "nested_num").add_nested(nested_mix, "nested_mix")
+ base = base.join_nested(nested_num, "nested_num").join_nested(nested_mix, "nested_mix")
# explode on base columns
r1 = base.explode(column=["a"])
@@ -1748,8 +1789,8 @@ def test_explode_non_unique_index():
# Add a new nested column which has the same element length as the "nested"
nf["aligned_nested.aligned_t"] = nf["nested.t"]
# Add a new nested column which has different lengths
- nf["unaligned_nested"] = nf.reduce(
- lambda x: {"unaligned_nested.unaligned_t": x[:2]}, "nested.t"
+ nf["unaligned_nested"] = nf.map_rows(
+ lambda x: {"unaligned_nested.unaligned_t": x[:2]}, columns="nested.t", row_container="args"
).reset_index(drop=True)
# Add a list column which has the same lengths
nf["aligned_list_t"] = nf["nested"].nest.to_lists("t")["t"]
@@ -1852,7 +1893,7 @@ def test_fillna():
assert (r0["b"] == pd.Series([2, 0, 6])).all()
# 1 nested column
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
r1 = base.fillna(0)
expected1 = pd.Series([0, 4, 0, 5, 3, 1, 0, 3, 4], index=[0, 0, 0, 1, 1, 1, 2, 2, 2])
assert (r1["nested.d"] == expected1).all()
@@ -1866,7 +1907,7 @@ def test_fillna():
assert np.isnan(r3["a"][1])
# 2 nested columns
- base = base.add_nested(nested2, "nested2")
+ base = base.join_nested(nested2, "nested2")
r4 = base.fillna(0)
expected4 = pd.Series([0, 0, 0, 1, 4, 0, 4, 1], index=[0, 0, 0, 1, 1, 2, 2, 2])
assert (r4["nested2.e"] == expected4).all()
@@ -1899,7 +1940,7 @@ def test_eval():
index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"),
)
- nf = nf.add_nested(to_pack, "packed")
+ nf = nf.join_nested(to_pack, "packed")
p5 = nf.eval("packed.d > 5")
assert isinstance(p5, _SeriesFromNest)
assert p5.any()
@@ -1948,7 +1989,7 @@ def test_mixed_eval_funcs():
index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"),
)
# Reduction
- nf = nf.add_nested(to_pack, "packed")
+ nf = nf.join_nested(to_pack, "packed")
assert (nf.eval("a + packed.c.median()") == pd.Series([4, 5, 6])).all()
# Across the nest: each base column element applies to each of its indexes
@@ -1972,7 +2013,7 @@ def test_eval_assignment():
},
index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"),
)
- nf = nf.add_nested(to_pack, "packed")
+ nf = nf.join_nested(to_pack, "packed")
# Assigning to new base columns from old base columns
nf_b = nf.eval("c = a + 1")
assert len(nf_b.columns) == len(nf.columns) + 1
@@ -1980,17 +2021,17 @@ def test_eval_assignment():
# Assigning to new nested columns from old nested columns
nf_nc = nf.eval("packed.e = packed.c + 1")
- assert len(nf_nc.packed.nest.fields) == len(nf["packed"].nest.fields) + 1
+ assert len(nf_nc.packed.nest.columns) == len(nf["packed"].nest.columns) + 1
assert (nf_nc["packed.e"] == nf["packed.c"] + 1).all()
# Verify that overwriting a nested column works
nf_nc_2 = nf_nc.eval("packed.e = packed.c * 2")
- assert len(nf_nc_2.packed.nest.fields) == len(nf_nc["packed"].nest.fields)
+ assert len(nf_nc_2.packed.nest.columns) == len(nf_nc["packed"].nest.columns)
assert (nf_nc_2["packed.e"] == nf["packed.c"] * 2).all()
# Assigning to new nested columns from a combo of base and nested
nf_nx = nf.eval("packed.f = a + packed.c")
- assert len(nf_nx.packed.nest.fields) == len(nf["packed"].nest.fields) + 1
+ assert len(nf_nx.packed.nest.columns) == len(nf["packed"].nest.columns) + 1
assert (nf_nx["packed.f"] == nf["a"] + nf["packed.c"]).all()
assert (nf_nx["packed.f"] == pd.Series([1, 3, 5, 12, 6, 5, 4, 7, 4], index=to_pack.index)).all()
@@ -2006,15 +2047,15 @@ def test_eval_assignment():
# Create new nests via eval()
nf_n2 = nf.eval("p2.c2 = packed.c * 2")
- assert len(nf_n2.p2.nest.fields) == 1
+ assert len(nf_n2.p2.nest.columns) == 1
assert (nf_n2["p2.c2"] == nf["packed.c"] * 2).all()
assert (nf_n2["p2.c2"] == pd.Series([0, 4, 8, 20, 8, 6, 2, 8, 2], index=to_pack.index)).all()
assert len(nf_n2.columns) == len(nf.columns) + 1 # new packed column
- assert len(nf_n2.p2.nest.fields) == 1
+ assert len(nf_n2.p2.nest.columns) == 1
# Assigning to new columns across two different nests
nf_n3 = nf_n2.eval("p2.d = p2.c2 + packed.d * 2 + b")
- assert len(nf_n3.p2.nest.fields) == 2
+ assert len(nf_n3.p2.nest.columns) == 2
assert (nf_n3["p2.d"] == nf_n2["p2.c2"] + nf["packed.d"] * 2 + nf["b"]).all()
# Now test multiline and inplace=True
@@ -2031,8 +2072,8 @@ def test_eval_assignment():
inplace=True,
)
assert set(nf.nested_columns) == {"packed", "p2"}
- assert set(nf.packed.nest.fields) == {"c", "d", "e", "time"}
- assert set(nf.p2.nest.fields) == {"e", "f"}
+ assert set(nf.packed.nest.columns) == {"c", "d", "e", "time"}
+ assert set(nf.p2.nest.columns) == {"e", "f"}
assert (nf["p2.e"] == nf["packed.d"] * 2 + nf.c).all()
assert (nf["p2.f"] == nf["p2.e"] + nf.b).all()
@@ -2044,7 +2085,7 @@ def test_eval_assignment():
data={"n/a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "n/b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- nf = nf.add_nested(nested, "bad dog")
+ nf = nf.join_nested(nested, "bad dog")
nfx = nf.eval("`bad dog`.`n/c` = `bad dog`.`n/b` + 2.5")
# The number of columns at the top should not have changed
assert len(nfx.columns) == len(nf.columns)
@@ -2080,15 +2121,15 @@ def test_nest_lists():
empty_ndf = NestedFrame({"a": [], "b": [], "c": []})
empty_ndf = empty_ndf.nest_lists(columns=["b", "c"], name="nested")
assert len(empty_ndf) == 0
- assert empty_ndf.nested.nest.to_flat().shape == (0, 2)
- assert empty_ndf.nested.nest.fields == ["b", "c"]
+ assert empty_ndf.nested.explode().shape == (0, 2)
+ assert empty_ndf.nested.nest.columns == ["b", "c"]
assert set(empty_ndf.columns) == set(["a", "nested"])
# Test packing empty lists as columns.
empty_list_ndf = NestedFrame({"a": [1], "b": [[]], "c": [[]]})
empty_list_ndf = empty_list_ndf.nest_lists(columns=["b", "c"], name="nested")
- assert empty_list_ndf.nested.nest.to_flat().shape == (0, 2)
- assert empty_list_ndf.nested.nest.fields == ["b", "c"]
+ assert empty_list_ndf.nested.explode().shape == (0, 2)
+ assert empty_list_ndf.nested.nest.columns == ["b", "c"]
assert set(empty_list_ndf.columns) == {"a", "nested"}
# Test that we raise an error if the columns are not lists
@@ -2120,10 +2161,6 @@ def test_nest_lists():
with pytest.raises(ValueError):
ndf.nest_lists(columns=["c", "d"], name="nested")
- # Test nest_lists ordering deprecation warning
- with pytest.warns(DeprecationWarning):
- res = ndf.nest_lists("nested", ["c", "b"])
-
def test_nestlists_nonunique_index():
"""Test that nest_lists works with a non-unique index."""
@@ -2143,11 +2180,11 @@ def test_delitem_base_and_nested():
data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
- # Delete a nested field
+ # Delete a nested column
del base["nested.c"]
- assert "c" not in base["nested"].nest.fields
+ assert "c" not in base["nested"].nest.columns
# Delete a base column
del base["a"]
assert "a" not in base.columns
@@ -2174,7 +2211,7 @@ def test_auto_nest_on_dataframe_assignment():
assert "nested" in base.nested_columns
# The flat representation should match the original DataFrame (ignoring dtype)
- flat = base["nested"].nest.to_flat()
+ flat = base["nested"].explode()
assert (flat.values == nested.values).all()
assert list(flat.columns) == list(nested.columns)
assert list(flat.index) == list(nested.index)
@@ -2215,5 +2252,5 @@ def test_issue350():
"""https://github.com/lincc-frameworks/nested-pandas/issues/350"""
nf = generate_data(3, 2)
nf = nf.set_index(np.array([100, 100, 101]))
- result = nf.reduce(lambda flux: {"new.flux": flux}, "nested.flux")
+ result = nf.map_rows(lambda flux: {"new.flux": flux}, columns="nested.flux", row_container="args")
assert len(result) == 3
diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py
index 6caf213f..f15c9596 100644
--- a/tests/nested_pandas/series/test_accessor.py
+++ b/tests/nested_pandas/series/test_accessor.py
@@ -108,8 +108,8 @@ def test_to_lists_for_chunked_array():
assert_frame_equal(lists, desired)
-def test_to_lists_with_fields():
- """Test that the .nest.to_lists(fields=...) method works."""
+def test_to_lists_with_columns():
+ """Test that the .nest.to_lists(columns=...) method works."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), -np.array([1.0, 2.0, 1.0])]),
@@ -119,7 +119,7 @@ def test_to_lists_with_fields():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1])
- lists = series.nest.to_lists(fields=["a"])
+ lists = series.nest.to_lists(columns=["a"])
desired = pd.DataFrame(
data={
@@ -134,7 +134,7 @@ def test_to_lists_with_fields():
def test_to_lists_fails_for_empty_input():
- """Test that the .nest.to_lists([]) fails when no fields are provided."""
+ """Test that the .nest.to_lists([]) fails when no columns are provided."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([]), np.array([])]),
@@ -229,8 +229,8 @@ def test_to_flat_for_chunked_array():
assert_frame_equal(flat, desired)
-def test_to_flat_with_fields():
- """Test that the .nest.to_flat(fields=...) method works."""
+def test_to_flat_with_columns():
+ """Test that the .nest.to_flat(columns=...) method works."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -240,7 +240,7 @@ def test_to_flat_with_fields():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1])
- flat = series.nest.to_flat(fields=["a"])
+ flat = series.nest.to_flat(columns=["a"])
desired = pd.DataFrame(
data={
@@ -262,7 +262,7 @@ def test_to_flat_with_fields():
def test_to_flat_multiple_nesting():
- """Test that the .nest.to_flat() method works well with inner nested fields."""
+ """Test that the .nest.to_flat() method works well with inner nested columns."""
nf = generate_data(10, 2)
nf["a"] = nf["a"].astype(pd.ArrowDtype(pa.float64()))
nf["b"] = nf["b"].astype(pd.ArrowDtype(pa.float64()))
@@ -277,7 +277,7 @@ def test_to_flat_multiple_nesting():
def test_to_flat_fails_for_empty_input():
- """Test that the .nest.to_flat([]) fails when no fields are provided."""
+ """Test that the .nest.to_flat([]) fails when no columns are provided."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([]), np.array([])]),
@@ -291,8 +291,8 @@ def test_to_flat_fails_for_empty_input():
_ = series.nest.to_flat([])
-def test_fields():
- """Test that the .nest.fields attribute works."""
+def test_columns():
+ """Test that the .nest.columns attribute works."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -302,7 +302,7 @@ def test_fields():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1])
- assert_array_equal(series.nest.fields, ["a", "b"])
+ assert_array_equal(series.nest.columns, ["a", "b"])
def test_list_lengths():
@@ -332,8 +332,8 @@ def test_flat_length():
assert series.nest.flat_length == 6
-def test_with_flat_field():
- """Test that the .nest.set_flat_field() method works."""
+def test_set_flat_column():
+ """Test that the .nest.set_flat_column() method works."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -343,7 +343,7 @@ def test_with_flat_field():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1])
- new_series = series.nest.with_flat_field("a", np.array(["a", "b", "c", "d", "e", "f"]))
+ new_series = series.nest.set_flat_column("a", np.array(["a", "b", "c", "d", "e", "f"]))
assert_series_equal(
new_series.nest["a"],
@@ -356,8 +356,8 @@ def test_with_flat_field():
)
-def test_with_field():
- """Test that .nest.with_field is just an alias to .nest.with_flat_field."""
+def test_set_column():
+ """Test that .nest.set_column is just an alias to .nest.set_flat_column."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -367,13 +367,13 @@ def test_with_field():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1])
assert_series_equal(
- series.nest.with_field("a", np.array(["a", "b", "c", "d", "e", "f"])),
- series.nest.with_flat_field("a", np.array(["a", "b", "c", "d", "e", "f"])),
+ series.nest.set_column("a", np.array(["a", "b", "c", "d", "e", "f"])),
+ series.nest.set_flat_column("a", np.array(["a", "b", "c", "d", "e", "f"])),
)
-def test_with_list_field():
- """Test that the .nest.set_list_field() method works."""
+def test_set_list_column():
+ """Test that the .nest.set_list_column() method works."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -383,7 +383,7 @@ def test_with_list_field():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1])
- new_series = series.nest.with_list_field("c", [["a", "b", "c"], ["d", "e", "f"]])
+ new_series = series.nest.set_list_column("c", [["a", "b", "c"], ["d", "e", "f"]])
assert_series_equal(
new_series.nest["c"],
@@ -396,18 +396,18 @@ def test_with_list_field():
)
-def test_with_filled_field():
- """Test .nest.with_filled_field("field", value)"""
+def test_set_filled_column():
+ """Test .nest.set_filled_column("column", value)"""
series = pack_seq(
[
pd.DataFrame({"a": [1, 2, 3], "b": [1.0, 5.0, 6.0]}),
pd.DataFrame({"a": [1, 2], "b": [None, 0.0]}),
]
)
- new_series = series.nest.with_filled_field(
+ new_series = series.nest.set_filled_column(
"a",
[0, 100],
- ).nest.with_filled_field(
+ ).nest.set_filled_column(
"c",
["abc", "xyz"],
)
@@ -423,8 +423,8 @@ def test_with_filled_field():
assert_series_equal(new_series.nest["c"], desired.nest["c"])
-def test_without_field_single_field():
- """Test .nest.without_field("field")"""
+def test_drop_single_column():
+ """Test .nest.drop("column")"""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]),
@@ -434,7 +434,7 @@ def test_without_field_single_field():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
- new_series = series.nest.without_field("a")
+ new_series = series.nest.drop("a")
desired_struct_array = pa.StructArray.from_arrays(
arrays=[
@@ -447,8 +447,8 @@ def test_without_field_single_field():
assert_series_equal(new_series, desired)
-def test_without_field_multiple_fields():
- """Test .nest.without_field(["field1", "field2"])"""
+def test_drop_multiple_columns():
+ """Test .nest.drop(["col1", "col2"])"""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]),
@@ -459,7 +459,7 @@ def test_without_field_multiple_fields():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
- new_series = series.nest.without_field(["a", "b"])
+ new_series = series.nest.drop(["a", "b"])
desired_struct_array = pa.StructArray.from_arrays(
arrays=[
@@ -472,8 +472,8 @@ def test_without_field_multiple_fields():
assert_series_equal(new_series, desired)
-def test_without_field_raises_for_missing_field():
- """Test .nest.without_field("field") raises for missing field."""
+def test_drop_raises_for_missing_column():
+ """Test .nest.drop("column") raises for missing column."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]),
@@ -485,11 +485,11 @@ def test_without_field_raises_for_missing_field():
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
with pytest.raises(ValueError):
- _ = series.nest.without_field("d")
+ _ = series.nest.drop("d")
-def test_without_field_raises_for_missing_fields():
- """Test .nest.without_field(["field1", "field2"]) raises for missing fields."""
+def test_drop_raises_for_missing_columns():
+ """Test .nest.drop(["col1", "col2"]) raises for missing columns."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]),
@@ -501,11 +501,11 @@ def test_without_field_raises_for_missing_fields():
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
with pytest.raises(ValueError):
- _ = series.nest.without_field(["a", "d"])
+ _ = series.nest.drop(["a", "d"])
-def test_query_flat_1():
- """Test that the .nest.query_flat() method works."""
+def test_query_1():
+ """Test that the .nest.query() method works."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])]),
@@ -515,7 +515,7 @@ def test_query_flat_1():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
- filtered = series.nest.query_flat("a + b >= 7.0")
+ filtered = series.nest.query("a + b >= 7.0")
desired_struct_array = pa.StructArray.from_arrays(
arrays=[
@@ -530,8 +530,8 @@ def test_query_flat_1():
# Currently we remove empty rows from the output series
-def test_query_flat_empty_rows():
- """Test that the .nest.query_flat() method works as expected for empty rows."""
+def test_query_empty_rows():
+ """Test that the .nest.query() method works as expected for empty rows."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])]),
@@ -541,20 +541,20 @@ def test_query_flat_empty_rows():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
- filtered = series.nest.query_flat("a > 1000.0")
+ filtered = series.nest.query("a > 1000.0")
desired = NestedSeries([], dtype=series.dtype)
assert_series_equal(filtered, desired)
-def test_query_flat_with_empty_result():
+def test_query_with_empty_result():
"""Make sure the index is properly set for empty result cases"""
base = npd.NestedFrame({"a": []}, index=pd.Index([], dtype=np.float64))
nested = npd.NestedFrame({"b": []}, index=pd.Index([], dtype=np.float64))
- ndf = base.add_nested(nested, "nested")
+ ndf = base.join_nested(nested, "nested")
- res = ndf.nested.nest.query_flat("b > 2")
+ res = ndf.nested.nest.query("b > 2")
assert res.index.dtype == np.float64
@@ -572,10 +572,10 @@ def test_query_flat_with_empty_result():
),
],
)
-def test_get_flat_index(df):
- """Test .nest.get_flat_index() returns the index of the original flat df"""
+def test_flat_index(df):
+ """Test .nest.flat_index returns the index of the original flat df"""
series = pack_flat(df)
- assert_index_equal(series.nest.get_flat_index(), df.index.sort_values())
+ assert_index_equal(series.nest.flat_index, df.index.sort_values())
def test_get_list_series():
@@ -589,7 +589,7 @@ def test_get_list_series():
)
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
- lists = series.nest.get_list_series("a")
+ lists = series.nest.to_lists()["a"]
assert_series_equal(
lists,
@@ -615,7 +615,7 @@ def test_get_list_series_multiple_chunks():
series = pd.Series(chunked_array, dtype=NestedDtype(chunked_array.type), index=[5, 7, 9, 11, 13, 15])
assert series.array.num_chunks == 3
- lists = series.nest.get_list_series("a")
+ lists = series.nest.to_lists()["a"]
assert_series_equal(
lists,
@@ -642,8 +642,8 @@ def test_get():
assert series.nest.get("c", "default_value") == "default_value"
-def test___getitem___single_field():
- """Test that the .nest["field"] works for a single field."""
+def test___getitem___single_column():
+ """Test that the .nest["column"] works for a single column."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -673,8 +673,8 @@ def test___getitem___single_field():
)
-def test___getitem___nested_field():
- """Test that the .nest["field"] works for an inner nested field."""
+def test___getitem___nested_column():
+ """Test that the .nest["column"] works for an inner nested column."""
nf = generate_data(10, 2)
nf = nf.assign(id=np.repeat(np.r_[0:5], 2))
nf = nf.rename(columns={"nested": "inner"})
@@ -683,7 +683,7 @@ def test___getitem___nested_field():
assert_series_equal(nnf["outer"].nest["inner"], nf["inner"], check_index=False)
-def test___getitem___single_field_multiple_chunks():
+def test___getitem___single_column_multiple_chunks():
"""Reproduces issue 142
https://github.com/lincc-frameworks/nested-pandas/issues/142
@@ -710,8 +710,8 @@ def test___getitem___single_field_multiple_chunks():
)
-def test___getitem___multiple_fields():
- """Test that the .nest[["b", "a"]] works for multiple fields."""
+def test___getitem___multiple_columns():
+ """Test that the .nest[["b", "a"]] works for multiple columns."""
arrays = [
pa.array([np.array([1.0, 2.0, 3.0]), -np.array([1.0, 2.0, 1.0])]),
pa.array([np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
@@ -770,7 +770,7 @@ def test___getitem___series_masking():
def test___setitem__():
- """Test that the .nest["field"] = ... works for a single field."""
+ """Test that the .nest["column"] = ... works for a single column."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -794,7 +794,7 @@ def test___setitem__():
def test___setitem___with_series_with_index():
- """Test that the .nest["field"] = pd.Series(...) works for a single field."""
+ """Test that the .nest["column"] = pd.Series(...) works for a single column."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -818,7 +818,7 @@ def test___setitem___with_series_with_index():
flat_series.astype(pd.ArrowDtype(pa.float64())),
)
assert_series_equal(
- series.nest.get_list_series("a"),
+ series.nest.to_lists()["a"],
pd.Series(
data=[np.array([6, 5, 4]), np.array([3, 2, 1])],
dtype=pd.ArrowDtype(pa.list_(pa.float64())),
@@ -829,14 +829,14 @@ def test___setitem___with_series_with_index():
def test___setitem___empty_series():
- """Test that series.nest["field"] = [] does nothing for empty series."""
- empty_series = pd.Series([], dtype=NestedDtype.from_fields({"a": pa.float64()}))
+ """Test that series.nest["column"] = [] does nothing for empty series."""
+ empty_series = pd.Series([], dtype=NestedDtype.from_columns({"a": pa.float64()}))
empty_series.nest["a"] = []
assert len(empty_series) == 0
def test___setitem___with_single_value():
- """Test series.nest["field"] = const"""
+ """Test series.nest["column"] = const"""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0])]),
@@ -860,7 +860,7 @@ def test___setitem___with_single_value():
def test___setitem___raises_for_wrong_dtype():
- """Test that the .nest["field"] = ... raises for a wrong dtype."""
+ """Test that the .nest["column"] = ... raises for a wrong dtype."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -875,7 +875,7 @@ def test___setitem___raises_for_wrong_dtype():
def test___setitem___raises_for_wrong_length():
- """Test that the .nest["field"] = ... raises for a wrong length."""
+ """Test that the .nest["column"] = ... raises for a wrong length."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -890,7 +890,7 @@ def test___setitem___raises_for_wrong_length():
def test___setitem___raises_for_wrong_index():
- """Test that the .nest["field"] = ... raises for a wrong index."""
+ """Test that the .nest["column"] = ... raises for a wrong index."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -911,15 +911,15 @@ def test___setitem___raises_for_wrong_index():
series.nest["a"] = flat_series
-def test___setitem___raises_for_new_field():
- """Test that series.nest["field"] = ... raises for a new field."""
+def test___setitem___raises_for_new_column():
+ """Test that series.nest["column"] = ... raises for a new column."""
series = pack_seq([{"a": [1, 2, 3]}, {"a": [4, None]}])
with pytest.raises(ValueError):
series.nest["b"] = series.nest["a"] - 1
def test___delitem___raises():
- """Test that the `del .nest["field"]` is not implemented."""
+ """Test that the `del .nest["column"]` is not implemented."""
struct_array = pa.StructArray.from_arrays(
arrays=[
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
@@ -988,7 +988,7 @@ def test_to_flat_dropna():
def test___contains__():
- """Test that the `"field" in .nest` works.
+ """Test that the `"column" in .nest` works.
We haven't implemented it, but base class does
"""
@@ -1025,7 +1025,7 @@ def test___eq___false_for_different_types():
def test_clear_raises():
- """Test that .nest.clear() raises - we cannot handle nested series with no fields"""
+ """Test that .nest.clear() raises - we cannot handle nested series with no columns"""
series = pack_seq([pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}), None])
with pytest.raises(NotImplementedError):
series.nest.clear()
@@ -1150,8 +1150,8 @@ def test_to_flatten_inner_none_nested():
_actual = nnf["ztf"].nest.to_flatten_inner("lc")
-def test_to_flatten_inner_wrong_field():
- """Test an exception is raised when .nest.to_flatten_inner() called for a wrong field."""
+def test_to_flatten_inner_wrong_column():
+ """Test an exception is raised when .nest.to_flatten_inner() called for a wrong column."""
nf = generate_data(10, 2)
with pytest.raises(ValueError):
nf.nested.nest.to_flatten_inner("t")
@@ -1172,6 +1172,6 @@ def test_issue266():
empty_outer_flatten = empty_nnf["outer"].nest.to_flatten_inner("inner")
- assert empty_outer_flatten.dtype == NestedDtype.from_fields(
+ assert empty_outer_flatten.dtype == NestedDtype.from_columns(
{"a": pa.float64(), "b": pa.float64(), "t": pa.float64(), "flux": pa.float64(), "band": pa.string()}
)
diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py
index 2246cf87..df68e1ed 100644
--- a/tests/nested_pandas/series/test_dtype.py
+++ b/tests/nested_pandas/series/test_dtype.py
@@ -65,7 +65,7 @@ def test_from_pyarrow_dtype_raises(pyarrow_dtype):
def test_to_pandas_arrow_dtype():
"""Test that NestedDtype.to_pandas_arrow_dtype() returns the correct pyarrow struct type."""
- dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ dtype = NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
assert dtype.to_pandas_arrow_dtype() == pd.ArrowDtype(
pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))])
)
@@ -83,18 +83,35 @@ def test_from_pandas_arrow_dtype():
assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))])
+def test_init_from_pandas_arrow_dtype():
+ """Test that we can construct NestedDtype from pandas.ArrowDtype in __init__."""
+ dtype_from_struct = NestedDtype(pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])))
+ assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))])
+ dtype_from_list = NestedDtype(pd.ArrowDtype(pa.list_(pa.struct([pa.field("a", pa.int64())]))))
+ assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))])
+
+
def test_to_pandas_list_struct_arrow_dtype():
"""Test that NestedDtype.to_pandas_arrow_dtype(list_struct=True) returns the correct pyarrow type."""
- dtype = NestedDtype.from_fields({"a": pa.list_(pa.int64()), "b": pa.float64()})
+ dtype = NestedDtype.from_columns({"a": pa.list_(pa.int64()), "b": pa.float64()})
assert dtype.to_pandas_arrow_dtype(list_struct=True) == pd.ArrowDtype(
pa.list_(pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())]))
)
-def test_from_fields():
- """Test NestedDtype.from_fields()."""
- fields = {"a": pa.int64(), "b": pa.float64()}
- dtype = NestedDtype.from_fields(fields)
+def test_from_columns():
+ """Test NestedDtype.from_columns()."""
+ columns = {"a": pa.int64(), "b": pa.float64()}
+ dtype = NestedDtype.from_columns(columns)
+ assert dtype.pyarrow_dtype == pa.struct(
+ [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]
+ )
+
+
+def test_init_from_columns():
+ """Test NestedDtype.__init__ with columns dict."""
+ columns = {"a": pa.int64(), "b": pa.float64()}
+ dtype = NestedDtype(columns)
assert dtype.pyarrow_dtype == pa.struct(
[pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]
)
@@ -106,12 +123,12 @@ def test_na_value():
assert dtype.na_value is pd.NA
-def test_fields():
- """Test NestedDtype.fields property"""
+def test_column_dtypes():
+ """Test NestedDtype.column_dtypes property"""
dtype = NestedDtype(
pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))])
)
- assert dtype.fields == {"a": pa.int64(), "b": pa.float64()}
+ assert dtype.column_dtypes == {"a": pa.int64(), "b": pa.float64()}
def test_field_names():
@@ -119,11 +136,11 @@ def test_field_names():
dtype = NestedDtype(
pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))])
)
- assert dtype.field_names == ["a", "b"]
+ assert list(dtype.column_dtypes.keys()) == ["a", "b"]
@pytest.mark.parametrize(
- "fields",
+ "columns",
[
{"a": pa.int64(), "b": pa.float64()},
{"a": pa.int64(), "b": pa.float64(), "c": pa.int64()},
@@ -134,9 +151,9 @@ def test_field_names():
# {"a": pa.struct([pa.field("a", pa.int64())]), "b": pa.list_(pa.int64())},
],
)
-def test_name_vs_construct_from_string(fields):
+def test_name_vs_construct_from_string(columns):
"""Test that dtype.name is consistent with dtype.construct_from_string(dtype.name)."""
- dtype = NestedDtype.from_fields(fields)
+ dtype = NestedDtype.from_columns(columns)
assert dtype == NestedDtype.construct_from_string(dtype.name)
diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py
index 88afb890..ecb3891d 100644
--- a/tests/nested_pandas/series/test_ext_array.py
+++ b/tests/nested_pandas/series/test_ext_array.py
@@ -102,7 +102,7 @@ def test_from_sequence_with_list_of_dicts_with_dtype():
None,
]
actual = NestedExtensionArray.from_sequence(
- sequence, dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ sequence, dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
)
desired = NestedExtensionArray(
pa.array(
@@ -160,7 +160,7 @@ def test_from_sequence_with_ndarray_of_df_with_dtype():
sequence = np.empty(len(sequence_list), dtype=object)
sequence[:] = sequence_list
actual = NestedExtensionArray.from_sequence(
- sequence, dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ sequence, dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
)
desired = NestedExtensionArray(
pa.array(
@@ -259,7 +259,7 @@ def test_series_built_from_dict():
{"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]},
{"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]},
]
- dtype = NestedDtype.from_fields({"a": pa.uint8(), "b": pa.float64()})
+ dtype = NestedDtype.from_columns({"a": pa.uint8(), "b": pa.float64()})
series = pd.Series(data, dtype=dtype)
assert isinstance(series.array, NestedExtensionArray)
@@ -771,7 +771,7 @@ def test_series___getitem___with_slice():
item = {"a": [1.0, 2.0, 3.0], "b": [-4.0, -5.0, -6.0]}
series = pd.Series(
[item, None, item, item, None, None, item],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
sliced = series[-1:0:-2].reset_index(drop=True)
assert_series_equal(sliced, pd.Series([item, None, item], dtype=series.dtype))
@@ -782,7 +782,7 @@ def test_series___getitem___with_slice_object():
item = {"a": [1.0, 2.0, 3.0], "b": [-4.0, None, -6.0]}
series = pd.Series(
[item, None, item, item, None, None, item],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
sliced = series[slice(-1, None, -2)].reset_index(drop=True)
assert sliced.equals(pd.Series([item, None, item, item], dtype=series.dtype))
@@ -793,7 +793,7 @@ def test_series___getitem___with_list_of_integers():
item = {"a": [None, 2.0, 3.0], "b": [-4.0, -5.0, -6.0]}
series = pd.Series(
[item, None, item, item, None, None, item],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
sliced = series[[0, 2, 5]].reset_index(drop=True)
assert sliced.equals(pd.Series([item, item, None], dtype=series.dtype))
@@ -804,7 +804,7 @@ def test_series___getitem___with_integer_ndarray():
item = {"a": [1.0, 2.0, 3.0], "b": [-4.0, pd.NA, -6.0]}
series = pd.Series(
[item, None, item, item, None, None, item],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
sliced = series[np.array([6, 1, 0, 6])].reset_index(drop=True)
assert sliced.equals(pd.Series([item, None, item, item], dtype=series.dtype))
@@ -815,7 +815,7 @@ def test_series___getitem___with_boolean_ndarray():
item = {"a": [1.0, 2.0, 3.0], "b": [-4.0, -5.0, -6.0]}
series = pd.Series(
[item, None, item, item, None, None, item],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
sliced = series[np.array([True, False, False, False, False, True, True])].reset_index(drop=True)
assert_series_equal(sliced, pd.Series([item, None, item], dtype=series.dtype))
@@ -824,7 +824,7 @@ def test_series___getitem___with_boolean_ndarray():
def test_isna_when_all_na():
"""Tests isna() when all values are None."""
ext_array = NestedExtensionArray.from_sequence(
- [None, None, None], dtype=NestedDtype.from_fields({"a": pa.int64()})
+ [None, None, None], dtype=NestedDtype.from_columns({"a": pa.int64()})
)
assert_array_equal(ext_array.isna(), np.array([True, True, True]))
@@ -833,7 +833,7 @@ def test_isna_when_none_na():
"""Tests isna() when no values are None."""
ext_array = NestedExtensionArray.from_sequence(
[{"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, {"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
assert_array_equal(ext_array.isna(), np.array([False, False]))
@@ -842,7 +842,7 @@ def test_isna_when_some_na():
"""Tests isna() when some values are None."""
ext_array = NestedExtensionArray.from_sequence(
[None, {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, pd.NA, pa.scalar(None)],
- dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}),
+ dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}),
)
assert_array_equal(ext_array.isna(), np.array([True, False, True, True]))
@@ -854,7 +854,7 @@ def test_isna_when_some_na():
def test__hasna(data, desired):
"""Tests _hasna()."""
ext_array = NestedExtensionArray.from_sequence(
- data, dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ data, dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
)
assert ext_array._hasna == desired
@@ -919,7 +919,7 @@ def test_take(allow_fill, fill_value, desired_sequence):
def test_take_raises_for_empty_array_and_non_empty_index():
"""Tests that .take([i1, i2, i3]) raises for empty array"""
- ext_array = NestedExtensionArray.from_sequence([], dtype=NestedDtype.from_fields({"a": pa.int64()}))
+ ext_array = NestedExtensionArray.from_sequence([], dtype=NestedDtype.from_columns({"a": pa.int64()}))
with pytest.raises(IndexError):
_result = ext_array.take([0, 1, 2])
@@ -936,7 +936,7 @@ def test_take_raises_for_empty_array_and_non_empty_index():
def test_take_raises_for_out_of_bounds_index(indices):
"""Tests that .take([i1, i2, i3]) raises for out of bounds index."""
ext_array = NestedExtensionArray.from_sequence(
- [None, None], dtype=NestedDtype.from_fields({"a": pa.int64()})
+ [None, None], dtype=NestedDtype.from_columns({"a": pa.int64()})
)
with pytest.raises(IndexError):
ext_array.take(indices)
@@ -945,7 +945,7 @@ def test_take_raises_for_out_of_bounds_index(indices):
def test__formatter_unboxed():
"""Tests formatting of array values, when displayed alone."""
formatter = NestedExtensionArray.from_sequence(
- [], dtype=NestedDtype.from_fields({"a": pa.int64()})
+ [], dtype=NestedDtype.from_columns({"a": pa.int64()})
)._formatter(boxed=False)
df = pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]})
assert formatter(df) == repr(df)
@@ -954,7 +954,7 @@ def test__formatter_unboxed():
def test__formatter_boxed():
"""Tests formatting of array values, when displayed in a DataFrame or Series"""
formatter = NestedExtensionArray.from_sequence(
- [], dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ [], dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
)._formatter(boxed=True)
d = {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}
df = pd.DataFrame(d)
@@ -964,7 +964,7 @@ def test__formatter_boxed():
def test__formetter_boxed_na():
"""Tests formatting of NA array value, when displayed in a DataFrame or Series"""
formatter = NestedExtensionArray.from_sequence(
- [], dtype=NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ [], dtype=NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
)._formatter(boxed=True)
assert formatter(pd.NA) == str(pd.NA)
@@ -996,7 +996,7 @@ def test_pickability():
def test__concat_same_type():
"""Test concatenating of three NestedExtensionArrays with the same dtype."""
- dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ dtype = NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
array1 = NestedExtensionArray.from_sequence(
[{"a": [1, 2, None], "b": [-2.0, None, -4.0]}, {"a": [None], "b": [3.14]}], dtype=dtype
)
@@ -1022,7 +1022,7 @@ def test__concat_same_type():
def test_equals():
"""Test that two NestedExtensionArrays are equal."""
- dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ dtype = NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
array1 = NestedExtensionArray.from_sequence(
[{"a": [1, 2, None], "b": [-2.0, None, -4.0]}, {"a": [None], "b": [3.14]}, None], dtype=dtype
)
@@ -1047,7 +1047,7 @@ def test_equals_when_other_is_different_type():
def test_dropna():
"""Test .dropna()"""
- dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()})
+ dtype = NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()})
array = NestedExtensionArray.from_sequence(
[
{"a": [1, 2, None], "b": [-2.0, None, -4.0]},
@@ -1895,7 +1895,7 @@ def test_series_interpolate():
"""We do not support interpolate() on NestedExtensionArray."""
with pytest.raises(NotImplementedError):
_series = pd.Series(
- [pd.DataFrame({"a": [1, 2, 3]}), pd.NA], dtype=NestedDtype.from_fields({"a": pa.float64()})
+ [pd.DataFrame({"a": [1, 2, 3]}), pd.NA], dtype=NestedDtype.from_columns({"a": pa.float64()})
).interpolate()
diff --git a/tests/nested_pandas/series/test_nestedseries.py b/tests/nested_pandas/series/test_nestedseries.py
index 0c1bf3b6..607d0edf 100644
--- a/tests/nested_pandas/series/test_nestedseries.py
+++ b/tests/nested_pandas/series/test_nestedseries.py
@@ -32,7 +32,7 @@ def test_nestedonly_decorator():
series = NestedSeries([1, 2, 3, 4, 5])
# Check nested only properties for decorator functionality
- for prop in ["fields", "flat_length", "list_lengths"]:
+ for prop in ["columns", "flat_length", "list_lengths"]:
with pytest.raises(TypeError, match=f"'{prop}' can only be used with a NestedDtype"):
getattr(series, prop)
@@ -42,8 +42,8 @@ def test_nestedonly_decorator():
getattr(series, func)()
-def test_nestedseries_fields():
- """Test fields property of NestedSeries."""
+def test_nestedseries_columns():
+ """Test columns property of NestedSeries."""
series = NestedSeries(
data=[
(np.array([1, 2]), np.array([0, 1])),
@@ -53,7 +53,7 @@ def test_nestedseries_fields():
dtype=NestedDtype(pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.int64()))])),
)
- assert series.fields == ["a", "b"]
+ assert series.columns == ["a", "b"]
def test_nestedseries_flat_length():
@@ -84,8 +84,8 @@ def test_nestedseries_list_lengths():
assert list(series.list_lengths) == [2, 2]
-def test_nestedseries_getitem_single_field():
- """Test getitem for a single field in NestedSeries."""
+def test_nestedseries_getitem_single_column():
+ """Test getitem for a single column in NestedSeries."""
series = NestedSeries(
data=[
(np.array([1, 2]), np.array([0, 1])),
@@ -100,8 +100,8 @@ def test_nestedseries_getitem_single_field():
pd.testing.assert_series_equal(result, expected)
-def test_nestedseries_getitem_multiple_fields():
- """Test getitem for multiple fields in NestedSeries."""
+def test_nestedseries_getitem_multiple_columns():
+ """Test getitem for multiple columns in NestedSeries."""
series = NestedSeries(
data=[
(np.array([1, 2]), np.array([0, 1])),
@@ -172,8 +172,8 @@ def test_nestedseries_setitem_non_nested_dtype():
assert series[0] == 10
-def test_nestedseries_setitem_single_field():
- """Test setitem for a single field in NestedSeries."""
+def test_nestedseries_setitem_single_column():
+ """Test setitem for a single column in NestedSeries."""
series = NestedSeries(
data=[
(np.array([1, 2]), np.array([0, 1])),
@@ -192,8 +192,8 @@ def test_nestedseries_setitem_single_field():
pd.testing.assert_series_equal(series["a"], expected)
-def test_nestedseries_to_flat():
- """Test to_flat method of NestedSeries."""
+def test_nestedseries_explode():
+ """Test explode method of NestedSeries."""
series = NestedSeries(
data=[
(np.array([1, 2]), np.array([0, 1])),
@@ -203,7 +203,7 @@ def test_nestedseries_to_flat():
dtype=NestedDtype(pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.int64()))])),
)
- flat_df = series.to_flat()
+ flat_df = series.explode()
assert isinstance(flat_df, pd.DataFrame)
assert list(flat_df.columns) == ["a", "b"]
assert flat_df.shape == (4, 2)
diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py
index 0fd9707d..2451e2f6 100644
--- a/tests/nested_pandas/series/test_packer.py
+++ b/tests/nested_pandas/series/test_packer.py
@@ -38,7 +38,7 @@ def test_pack_with_flat_df():
(np.array([2, 4]), np.array([1, 1])),
],
index=pd.MultiIndex.from_arrays(([1, 1], [1, 2])),
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
@@ -62,7 +62,7 @@ def test_pack_with_flat_df_and_index():
(np.array([2, 4]), np.array([1, 1])),
],
index=[101, 102],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
@@ -91,7 +91,7 @@ def test_pack_with_flat_df_and_on():
# Since we packed on 'c', we expect to see the unique sorted
# values of 'c' as the index
index=[0, 1],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
# The index name should be the same as the column we packed on
@@ -122,7 +122,7 @@ def test_pack_with_flat_df_and_on_and_index():
],
# We still expect to see the overridden index despite packing on 'c'
index=new_index,
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
@@ -158,7 +158,7 @@ def test_pack_with_series_of_dfs():
],
index=[1, 2],
name="nested",
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(series)
assert_series_equal(series, desired)
@@ -183,7 +183,7 @@ def test_pack_flat():
(np.array([7, 8, 9]), np.array([0, 1, 0])),
],
index=[1, 2, 3, 4],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(actual)
assert_series_equal(actual, desired)
@@ -216,7 +216,7 @@ def test_pack_flat_with_on():
),
],
index=[0, 1],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
)
desired.index.name = "c"
offsets_reused(actual)
@@ -242,7 +242,7 @@ def test_pack_sorted_df_into_struct():
(np.array([7, 8, 9]), np.array([0, 1, 0])),
],
index=[1, 2, 3, 4],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(actual)
assert_series_equal(actual, desired)
@@ -285,7 +285,7 @@ def test_pack_lists():
offsets_reused(series)
for field_name in packed_df.columns:
- assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name])
+ assert_series_equal(series.nest.to_lists()[field_name], packed_df[field_name])
def test_pack_lists_with_chunked_arrays():
@@ -302,8 +302,8 @@ def test_pack_lists_with_chunked_arrays():
)
list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5])
series = packer.pack_lists(list_df)
- assert_series_equal(series.nest.get_list_series("a"), chunked_a)
- assert_series_equal(series.nest.get_list_series("b"), chunked_b)
+ assert_series_equal(series.nest.to_lists()["a"], chunked_a)
+ assert_series_equal(series.nest.to_lists()["b"], chunked_b)
def test_pack_lists_with_uneven_chunked_arrays():
@@ -320,8 +320,8 @@ def test_pack_lists_with_uneven_chunked_arrays():
)
list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5])
series = packer.pack_lists(list_df)
- assert_series_equal(series.nest.get_list_series("a"), chunked_a)
- assert_series_equal(series.nest.get_list_series("b"), chunked_b)
+ assert_series_equal(series.nest.to_lists()["a"], chunked_a)
+ assert_series_equal(series.nest.to_lists()["b"], chunked_b)
def test_pack_seq_with_dfs_and_index():
@@ -366,7 +366,7 @@ def test_pack_seq_with_dfs_and_index():
(np.array([7, 8, 9]), np.array([0, 1, 0])),
],
index=[100, 101, 102, 103],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(series)
assert_series_equal(series, desired)
@@ -395,7 +395,7 @@ def test_pack_seq_with_different_elements_and_index():
pd.NA,
],
index=[100, 101, 102, 103],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
)
offsets_reused(series)
assert_series_equal(series, desired)
@@ -436,7 +436,7 @@ def test_pack_seq_with_series_of_dfs():
(np.array([5, 6]), np.array([0, 1])),
],
index=[100, 101, 102],
- dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+ dtype=NestedDtype.from_columns(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
offsets_reused(series)
diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py
index 62e9eab8..1b9ad9da 100644
--- a/tests/nested_pandas/utils/test_utils.py
+++ b/tests/nested_pandas/utils/test_utils.py
@@ -21,7 +21,7 @@ def test_count_nested(join):
},
index=[100, 100, 100, 101, 101, 101, 102, 102, 102],
)
- base = base.add_nested(nested, "nested")
+ base = base.join_nested(nested, "nested")
# Test general count
total_counts = count_nested(base, "nested", join=join)
@@ -64,12 +64,12 @@ def test_check_expr_nesting():
},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- b1 = base.add_nested(nested, "nested")
+ b1 = base.join_nested(nested, "nested")
assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""}
- b2 = base.add_nested(nested.copy(), "n")
+ b2 = base.join_nested(nested.copy(), "n")
assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""}
abc = pd.DataFrame(
@@ -80,7 +80,7 @@ def test_check_expr_nesting():
},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
- b3 = base.add_nested(abc, "abc").add_nested(abc, "c")
+ b3 = base.join_nested(abc, "abc").join_nested(abc, "c")
assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"}
assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"}
@@ -90,6 +90,6 @@ def test_check_expr_nesting():
assert b1.extract_nest_names("a>3") == {""}
assert b1.extract_nest_names("a > 3") == {""}
- b4 = base.add_nested(nested, "test")
+ b4 = base.join_nested(nested, "test")
assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""}
assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""}