From 02b2ed88f83d938c61c47b45e0e7bd55c1c45d9e Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 3 Aug 2015 19:17:26 -0700 Subject: [PATCH 1/2] Clarify rules for copies vs views when indexing --- doc/indexing.rst | 230 ++++++++++++++++++------------------- doc/whats-new.rst | 3 +- xray/core/indexing.py | 10 +- xray/test/test_indexing.py | 10 +- 4 files changed, 131 insertions(+), 122 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index c25b63da88f..a87ddfc6bf9 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -53,10 +53,12 @@ DataArray: arr[0, 0] arr[:, [2, 1]] +Attributes are persisted in all indexing operations. + .. warning:: Positional indexing deviates from the NumPy when indexing with multiple - arrays like ``arr[[0, 1], [0, 1]]``, as described in :ref:`indexing details`. + arrays like ``arr[[0, 1], [0, 1]]``, as described in :ref:`orthogonal`. See :ref:`pointwise indexing` for how to achieve this functionality in xray. xray also supports label-based indexing, just like pandas. Because @@ -81,6 +83,7 @@ Setting values with label based indexing is also supported: arr.loc['2000-01-01', ['IL', 'IN']] = -10 arr + Indexing with labeled dimensions -------------------------------- @@ -204,39 +207,126 @@ index labels along a dimension dropped: ``drop`` is both a ``Dataset`` and ``DataArray`` method. -.. _indexing details: +.. _nearest neighbor lookups: -Indexing details ----------------- +Nearest neighbor lookups +------------------------ -Like pandas, whether array indexing returns a view or a copy of the underlying -data depends entirely on numpy: +The label based selection methods :py:meth:`~xray.Dataset.sel`, +:py:meth:`~xray.Dataset.reindex` and :py:meth:`~xray.Dataset.reindex_like` all +support a ``method`` keyword argument. The method parameter allows for +enabling nearest neighbor (inexact) lookups by use of the methods ``'pad'``, +``'backfill'`` or ``'nearest'``: -* Indexing with a single label or a slice returns a view. 
-* Indexing with a vector of array labels returns a copy. +.. ipython:: python + + data = xray.DataArray([1, 2, 3], dims='x') + data.sel(x=[1.1, 1.9], method='nearest') + data.sel(x=0.1, method='backfill') + data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad') -Attributes are persisted in array indexing: +Using ``method='nearest'`` or a scalar argument with ``.sel()`` requires pandas +version 0.16 or newer. + +The method parameter is not yet supported if any of the arguments +to ``.sel()`` is a ``slice`` object: + +.. ipython:: + :verbatim: + + In [1]: data.sel(x=slice(1, 3), method='nearest') + NotImplementedError + +However, you don't need to use ``method`` to do inexact slicing. Slicing +already returns all values inside the range (inclusive), as long as the index +labels are monotonic increasing: + +.. ipython:: python + + data.sel(x=slice(0.9, 3.1)) + +Indexing axes with monotonic decreasing labels also works, as long as the +``slice`` or ``.loc`` arguments are also decreasing: .. ipython:: python - arr2 = arr.copy() - arr2.attrs['units'] = 'meters' - arr2[0, 0].attrs + reversed_data = data[::-1] + reversed_data.loc[3.1:0.9] + +.. _masking with where: + +Masking with ``where`` +---------------------- + +Indexing methods on xray objects generally return a subset of the original data. +However, it is sometimes useful to select an object with the same shape as the +original data, but with some elements masked. To do this type of selection in +xray, use :py:meth:`~xray.DataArray.where`: + +.. ipython:: python + + arr = xray.DataArray(np.arange(16).reshape(4, 4), dims=['x', 'y']) + arr.where(arr.x + arr.y < 4) + +This is particularly useful for ragged indexing of multi-dimensional data, +e.g., to apply a 2D mask to an image. Note that ``where`` follows all the +usual xray broadcasting and alignment rules for binary operations (e.g., +``+``) between the object being indexed and the condition, as described in +:ref:`comput`: + +.. 
ipython:: python + + arr.where(arr.y < 2) + +Multi-dimensional indexing +-------------------------- + +Xray does not yet support efficient routines for generalized multi-dimensional +indexing or regridding. However, we are definitely interested in adding support +for this in the future (see :issue:`475` for the ongoing discussion). + +.. _copies vs views: + +Copies vs. views +---------------- + +Whether array indexing returns a view or a copy of the underlying +data depends on the nature of the labels. For positional (integer) +indexing, xray follows the same rules as NumPy: + +* Positional indexing with only integers and slices returns a view. +* Positional indexing with arrays or lists returns a copy. + +The rules for label based indexing are more complex: + +* Label-based indexing with only slices returns a view. +* Label-based indexing with arrays returns a copy. +* Label-based indexing with scalars returns a view or a copy, depending + upon if the corresponding positional indexer can be represented as an + integer or a slice object. + +.. _orthogonal: + +Orthogonal (outer) vs. vectorized indexing +------------------------------------------ Indexing with xray objects has one important difference from indexing numpy arrays: you can only use one-dimensional arrays to index xray objects, and each indexer is applied "orthogonally" along independent axes, instead of -using numpy's advanced broadcasting. This means you can do indexing like this, -which would require slightly more awkward syntax with numpy arrays: +using numpy's broadcasting rules to vectorize indexers. This means you can do +indexing like this, which would require slightly more awkward syntax with +numpy arrays: .. ipython:: python arr[arr['time.day'] > 1, arr['space'] != 'IL'] -This is a much simpler model than numpy's `advanced indexing`__, -and is basically the only model that works for labeled arrays. 
If you would -like to do array indexing, you can always index ``.values`` directly -instead: +This is a much simpler model than numpy's `advanced indexing`__. If you would +like to do advanced-style array indexing in xray, you have several options: + +* :ref:`pointwise indexing` +* :ref:`masking with where` +* Index the underlying NumPy array directly using ``.values``: __ http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html @@ -255,6 +345,10 @@ original values are subset to the index labels still found in the new labels, and values corresponding to new labels not found in the original object are in-filled with `NaN`. +Xray operations that combine multiple xray objects generally automatically +align their arguments. However, manual alignment can be useful for greater +control. + To reindex a particular dimension, use :py:meth:`~xray.DataArray.reindex`: .. ipython:: python @@ -302,103 +396,3 @@ Both ``reindex_like`` and ``align`` work interchangeably between other = xray.DataArray(['a', 'b', 'c'], dims='other') # this is a no-op, because there are no shared dimension names ds.reindex_like(other) - -.. _nearest neighbor lookups: - -Nearest neighbor lookups ------------------------- - -The label based selection methods :py:meth:`~xray.Dataset.sel`, -:py:meth:`~xray.Dataset.reindex` and :py:meth:`~xray.Dataset.reindex_like` all -support a ``method`` keyword argument. The method parameter allows for -enabling nearest neighbor (inexact) lookups by use of the methods ``'pad'``, -``'backfill'`` or ``'nearest'``: - -.. use verbatim because I can't seem to install pandas 0.16.1 on RTD :( - -.. .. 
ipython:: - :verbatim: - In [35]: data = xray.DataArray([1, 2, 3], dims='x') - In [36]: data.sel(x=[1.1, 1.9], method='nearest') - Out[36]: - - array([2, 3]) - Coordinates: - * x (x) int64 1 2 - In [37]: data.sel(x=0.1, method='backfill') - Out[37]: - - array(2) - Coordinates: - x int64 1 - In [38]: data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad') - Out[38]: - - array([1, 2, 2, 3, 3]) - Coordinates: - * x (x) float64 0.5 1.0 1.5 2.0 2.5 - -.. ipython:: python - - data = xray.DataArray([1, 2, 3], dims='x') - data.sel(x=[1.1, 1.9], method='nearest') - data.sel(x=0.1, method='backfill') - data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad') - -Using ``method='nearest'`` or a scalar argument with ``.sel()`` requires pandas -version 0.16 or newer. - -The method parameter is not yet supported if any of the arguments -to ``.sel()`` is a ``slice`` object: - -.. ipython:: - :verbatim: - - In [1]: data.sel(x=slice(1, 3), method='nearest') - NotImplementedError - -However, you don't need to use ``method`` to do inexact slicing. Slicing -already returns all values inside the range (inclusive), as long as the index -labels are monotonic increasing: - -.. ipython:: python - - data.sel(x=slice(0.9, 3.1)) - -Indexing axes with monotonic decreasing labels also works, as long as the -``slice`` or ``.loc`` arguments are also decreasing: - -.. ipython:: python - - reversed_data = data[::-1] - reversed_data.loc[3.1:0.9] - -Masking with ``where`` ----------------------- - -Indexing methods on xray objects generally return a subset of the original data. -However, it is sometimes useful to select an object with the same shape as the -original data, but with some elements masked. To do this type of selection in -xray, use :py:meth:`~xray.DataArray.where`: - -.. 
ipython:: python - - arr = xray.DataArray(np.arange(16).reshape(4, 4), dims=['x', 'y']) - arr.where(arr.x + arr.y < 4) - -This is particularly useful for ragged indexing of multi-dimensional data, -e.g., to apply a 2D mask to an image. Note that ``where`` follows all the -usual xray broadcasting and alignment rules for binary operations (e.g., -``+``) between the object being indexed and the condition, as described in -:ref:`comput`: - -.. ipython:: python - - arr.where(arr.y < 2) - -Multi-dimensional indexing --------------------------- - -Xray does not yet support efficient routines for generalized multi-dimensional -indexing or regridding. However, we are definitely interested in adding support -for this in the future (see :issue:`475` for the ongoing discussion). diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 935d5be3d26..a44112a33c5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,7 +14,8 @@ v0.5.3 (unreleased) - Variables in netCDF files with multiple missing values are now decoded as NaN after issuing a warning if open_dataset is called with mask_and_scale=True. - +- We clarified our rules for when the result from an xray operation is a copy + vs. a view (see :ref:`copies vs views` for more details). - Dataset variables are now written to netCDF files in order of appearance when using the netcdf4 backend (:issue:`479`). 
- Added :py:meth:`~xray.Dataset.isel_points` and :py:meth:`~xray.Dataset.sel_points` diff --git a/xray/core/indexing.py b/xray/core/indexing.py index 99ebe23eefd..e0fe1fc81c5 100644 --- a/xray/core/indexing.py +++ b/xray/core/indexing.py @@ -140,6 +140,12 @@ def convert_label_indexer(index, label, index_name='', method=None): indexer = index.slice_indexer(_try_get_item(label.start), _try_get_item(label.stop), _try_get_item(label.step)) + if not isinstance(indexer, slice): + # unlike pandas, in xray we never want to silently convert a slice + # indexer into an array indexer + raise KeyError('cannot represent labeled-based slice indexer for ' + 'dimension %r with a slice over integer positions; ' + 'the index is unsorted or non-unique') else: label = np.asarray(label) if label.ndim == 0: @@ -149,8 +155,8 @@ def convert_label_indexer(index, label, index_name='', method=None): else: indexer = index.get_indexer(label, method=method) if np.any(indexer < 0): - raise ValueError('not all values found in index %r' - % index_name) + raise KeyError('not all values found in index %r' + % index_name) return indexer diff --git a/xray/test/test_indexing.py b/xray/test/test_indexing.py index db7cd322b45..5562ddb5ce8 100644 --- a/xray/test/test_indexing.py +++ b/xray/test/test_indexing.py @@ -70,11 +70,19 @@ def test_orthogonal_indexer(self): def test_convert_label_indexer(self): # TODO: add tests that aren't just for edge cases index = pd.Index([1, 2, 3]) - with self.assertRaisesRegexp(ValueError, 'not all values found'): + with self.assertRaisesRegexp(KeyError, 'not all values found'): indexing.convert_label_indexer(index, [0]) with self.assertRaises(KeyError): indexing.convert_label_indexer(index, 0) + def test_convert_unsorted_datetime_index_raises(self): + index = pd.to_datetime(['2001', '2000', '2002']) + with self.assertRaises(KeyError): + # pandas will try to convert this into an array indexer. 
We should + # raise instead, so we can be sure the result of indexing with a + # slice is always a view. + indexing.convert_label_indexer(index, slice('2001', '2002')) + def test_remap_label_indexers(self): # TODO: fill in more tests! data = Dataset({'x': ('x', [1, 2, 3])}) From 5814a1265cb0975fe62d5e09c0c01ead6b6f6fb9 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 19 Aug 2015 11:24:07 -0700 Subject: [PATCH 2/2] revisions per review, and note on settingwithcopy warnings --- doc/indexing.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index a87ddfc6bf9..4c62f7f765f 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -303,7 +303,13 @@ The rules for label based indexing are more complex: * Label-based indexing with arrays returns a copy. * Label-based indexing with scalars returns a view or a copy, depending upon if the corresponding positional indexer can be represented as an - integer or a slice object. + integer or a slice object. The exact rules are determined by pandas. + +Whether data is a copy or a view is more predictable in xray than in pandas, so +unlike pandas, xray does not produce `SettingWithCopy warnings`_. However, you +should still avoid assignment with chained indexing. + +.. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy .. _orthogonal: @@ -345,9 +351,9 @@ original values are subset to the index labels still found in the new labels, and values corresponding to new labels not found in the original object are in-filled with `NaN`. -Xray operations that combine multiple xray objects generally automatically -align their arguments. However, manual alignment can be useful for greater -control. +Xray operations that combine multiple objects generally automatically align +their arguments to share the same indexes. However, manual alignment can be +useful for greater control and for increased performance. 
To reindex a particular dimension, use :py:meth:`~xray.DataArray.reindex`: