Merge pull request #481 from jhamman/feature/isel_points

shoyer · shoyer · commit 69f7386763c8 · 2015-07-26T22:04:46.000-07:00
Add pointwise indexing via isel_points method
diff --git a/doc/api.rst b/doc/api.rst
@@ -93,6 +93,7 @@ Indexing
    Dataset.loc
    Dataset.isel
    Dataset.sel
+   Dataset.isel_points
    Dataset.squeeze
    Dataset.reindex
    Dataset.reindex_like
@@ -202,6 +203,7 @@ Indexing
    DataArray.loc
    DataArray.isel
    DataArray.sel
+   DataArray.isel_points
    DataArray.squeeze
    DataArray.reindex
    DataArray.reindex_like
diff --git a/doc/indexing.rst b/doc/indexing.rst
@@ -57,6 +57,7 @@ DataArray:
 
     Positional indexing deviates from the NumPy when indexing with multiple
     arrays like ``arr[[0, 1], [0, 1]]``, as described in :ref:`indexing details`.
+    See :ref:`pointwise indexing` and :py:meth:`~xray.Dataset.isel_points` for more on this functionality.
 
 xray also supports label-based indexing, just like pandas. Because
 we use a :py:class:`pandas.Index` under the hood, label based indexing is very
@@ -122,7 +123,7 @@ __ http://legacy.python.org/dev/peps/pep-0472/
 
 .. warning::
 
-    Do not try to assign values when using ``isel`` or ``sel``::
+    Do not try to assign values when using ``isel``, ``isel_points`` or ``sel``::
 
         # DO NOT do this
         arr.isel(space=0) = 0
@@ -134,6 +135,23 @@ __ http://legacy.python.org/dev/peps/pep-0472/
         # this is safe
         arr[dict(space=0)] = 0
 
+.. _pointwise indexing:
+
+Pointwise indexing
+------------------
+
+xray pointwise indexing supports the indexing along multiple labeled dimensions
+using list-like objects. While :py:meth:`~xray.DataArray.isel` performs
+orthogonal indexing, the :py:meth:`~xray.DataArray.isel_points` method
+provides similar numpy indexing behavior as if you were using multiple lists to index an array (e.g. `arr[[0, 1], [0, 1]]` ):
+
+.. ipython:: python
+
+    # index by integer array indices
+    da = xray.DataArray(np.arange(56).reshape((7, 8)), dims=['x', 'y'])
+    da
+    da.isel_points(x=[0, 1, 6], y=[0, 1, 0])
+
 Dataset indexing
 ----------------
 
@@ -145,13 +163,16 @@ simultaneously, returning a new dataset:
     ds = arr.to_dataset()
     ds.isel(space=[0], time=[0])
     ds.sel(time='2000-01-01')
+    ds2 = da.to_dataset()
+    ds2.isel_points(x=[0, 1, 6], y=[0, 1, 0], dim='points')
 
 Positional indexing on a dataset is not supported because the ordering of
 dimensions in a dataset is somewhat ambiguous (it can vary between different
 arrays). However, you can do normal indexing with labeled dimensions:
 
 .. ipython:: python
 
+
     ds[dict(space=[0], time=[0])]
     ds.loc[dict(time='2000-01-01')]
 
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -17,6 +17,36 @@ v0.5.3 (unreleased)
 
 - Dataset variables are now written to netCDF files in order of appearance
   when using the netcdf4 backend (:issue:`479`).
+- Added :py:meth:`~xray.Dataset.isel_points` and :py:meth:`~xray.DataArray.isel_points` to support pointwise indexing of Datasets and DataArrays (:issue:`475`).
+
+  .. ipython::
+    :verbatim:
+
+    In [1]: da = xray.DataArray(np.arange(56).reshape((7, 8)),
+                                dims=['x', 'y'])
+
+    In [2]: da
+    Out[2]:
+    <xray.DataArray (x: 7, y: 8)>
+    array([[ 0,  1,  2,  3,  4,  5,  6,  7],
+           [ 8,  9, 10, 11, 12, 13, 14, 15],
+           [16, 17, 18, 19, 20, 21, 22, 23],
+           [24, 25, 26, 27, 28, 29, 30, 31],
+           [32, 33, 34, 35, 36, 37, 38, 39],
+           [40, 41, 42, 43, 44, 45, 46, 47],
+           [48, 49, 50, 51, 52, 53, 54, 55]])
+    Coordinates:
+      * x        (x) int64 0 1 2 3 4 5 6
+      * y        (y) int64 0 1 2 3 4 5 6 7
+
+    In [3]: da.isel_points(x=[0, 1, 6], y=[0, 1, 0], dim='points')
+    Out[3]:
+    <xray.DataArray (points: 3)>
+    array([ 0,  9, 48])
+    Coordinates:
+        x        (points) int64 0 1 6
+        y        (points) int64 0 1 0
+      * points   (points) int64 0 1 2
 
 
 Bug fixes
diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py
@@ -551,6 +551,17 @@ def sel(self, method=None, **indexers):
         return self.isel(**indexing.remap_label_indexers(self, indexers,
                                                          method=method))
 
+    def isel_points(self, dim='points', **indexers):
+        """Return a new DataArray whose dataset is given by pointwise integer
+        indexing along the specified dimension(s).
+
+        See Also
+        --------
+        Dataset.isel_points
+        """
+        ds = self._dataset.isel_points(dim=dim, **indexers)
+        return self._with_replaced_dataset(ds)
+
     def reindex_like(self, other, method=None, copy=True):
         """Conform this object onto the indexes of another object, filling
         in missing values with NaN.
diff --git a/xray/core/dataset.py b/xray/core/dataset.py
@@ -1,6 +1,6 @@
 import functools
 import warnings
-from collections import Mapping
+from collections import Mapping, Sequence
 from numbers import Number
 
 import numpy as np
@@ -21,6 +21,7 @@
 from .variable import as_variable, Variable, Coordinate, broadcast_variables
 from .pycompat import (iteritems, itervalues, basestring, OrderedDict,
                        dask_array_type)
+from .combine import concat
 
 
 # list of attributes of pd.DatetimeIndex that are ndarrays of time info
@@ -1028,6 +1029,93 @@ def sel(self, method=None, **indexers):
         return self.isel(**indexing.remap_label_indexers(self, indexers,
                                                          method=method))
 
+    def isel_points(self, dim='points', **indexers):
+        """Returns a new dataset with each array indexed pointwise along the
+        specified dimension(s).
+
+        This method selects pointwise values from each array and is akin to
+        the NumPy indexing behavior of `arr[[0, 1], [0, 1]]`, except this
+        method does not require knowing the order of each array's dimensions.
+
+        Parameters
+        ----------
+        dim : str or DataArray or pandas.Index or other list-like object, optional
+            Name of the dimension to concatenate along. If dim is provided as a
+            string, it must be a new dimension name, in which case it is added
+            along axis=0. If dim is provided as a DataArray or Index or
+            list-like object, its name, which must not be present in the
+            dataset, is used as the dimension to concatenate along and the
+            values are added as a coordinate.
+        **indexers : {dim: indexer, ...}
+            Keyword arguments with names matching dimensions and values given
+            by array-like objects. All indexers must be the same length and
+            1 dimensional.
+
+        Returns
+        -------
+        obj : Dataset
+            A new Dataset with the same contents as this dataset, except each
+            array and dimension is indexed by the appropriate indexers. With
+            pointwise indexing, the new Dataset will always be a copy of the
+            original.
+
+        See Also
+        --------
+        Dataset.sel
+        DataArray.isel
+        DataArray.sel
+        DataArray.isel_points
+        """
+        indexer_dims = set(indexers)
+
+        def relevant_keys(mapping):
+            return [k for k, v in mapping.items()
+                    if any(d in indexer_dims for d in v.dims)]
+
+        data_vars = relevant_keys(self.data_vars)
+        coords = relevant_keys(self.coords)
+
+        # all the indexers should be iterables
+        keys = indexers.keys()
+        indexers = [(k, np.asarray(v)) for k, v in iteritems(indexers)]
+        # Check that indexers are valid dims, integers, and 1D
+        for k, v in indexers:
+            if k not in self.dims:
+                raise ValueError("dimension %s does not exist" % k)
+            if v.dtype.kind != 'i':
+                raise TypeError('Indexers must be integers')
+            if v.ndim != 1:
+                raise ValueError('Indexers must be 1 dimensional')
+
+        # all the indexers should have the same length
+        lengths = set(len(v) for k, v in indexers)
+        if len(lengths) > 1:
+            raise ValueError('All indexers must be the same length')
+
+        # Existing dimensions are not valid choices for the dim argument
+        if isinstance(dim, basestring):
+            if dim in self.dims:
+                # dim is an invalid string
+                raise ValueError('Existing dimension names are not valid '
+                                 'choices for the dim argument in sel_points')
+        elif hasattr(dim, 'dims'):
+            # dim is a DataArray or Coordinate
+            if dim.name in self.dims:
+                # dim already exists
+                raise ValueError('Existing dimensions are not valid choices '
+                                 'for the dim argument in sel_points')
+        else:
+            # try to cast dim to DataArray with name = points
+            from .dataarray import DataArray
+            dim = DataArray(dim, dims='points', name='points')
+
+        # TODO: This would be sped up with vectorized indexing. This will
+        # require dask to support pointwise indexing as well.
+        return concat([self.isel(**d) for d in
+                       [dict(zip(keys, inds)) for inds in
+                        zip(*[v for k, v in indexers])]],
+                      dim=dim, coords=coords, data_vars=data_vars)
+
     def reindex_like(self, other, method=None, copy=True):
         """Conform this object onto the indexes of another object, filling
         in missing values with NaN.
diff --git a/xray/test/test_dataarray.py b/xray/test/test_dataarray.py
@@ -382,6 +382,66 @@ def test_sel_method(self):
         actual = data.sel(x=[0.9, 1.9], method='backfill')
         self.assertDataArrayIdentical(expected, actual)
 
+    def test_isel_points_method(self):
+        shape = (10, 5, 6)
+        np_array = np.random.random(shape)
+        da = DataArray(np_array, dims=['time', 'y', 'x'])
+        y = [1, 3]
+        x = [3, 0]
+
+        expected = da.values[:, y, x]
+
+        actual = da.isel_points(y=y, x=x, dim='test_coord')
+        assert 'test_coord' in actual.coords
+        assert actual.coords['test_coord'].shape == (len(y), )
+        assert all(x in actual for x in ['time', 'x', 'y', 'test_coord'])
+        assert actual.dims == ('test_coord', 'time')
+        actual = da.isel_points(y=y, x=x)
+        assert 'points' in actual.coords
+        # Note that because xray always concatenates along the first dimension,
+        # We must transpose the result to match the numpy style of
+        # concatentation.
+        np.testing.assert_equal(actual.T, expected)
+
+        # a few corner cases
+        da.isel_points(time=[1, 2], x=[2, 2], y=[3, 4])
+        np.testing.assert_allclose(
+            da.isel_points(time=[1], x=[2], y=[4]).values.squeeze(),
+            np_array[1, 4, 2].squeeze())
+        da.isel_points(time=[1, 2])
+        y = [-1, 0]
+        x = [-2, 2]
+        expected = da.values[:, y, x]
+        actual = da.isel_points(x=x, y=y).values
+        np.testing.assert_equal(actual.T, expected)
+
+        # test that the order of the indexers doesn't matter
+        self.assertDataArrayIdentical(
+            da.isel_points(y=y, x=x),
+            da.isel_points(x=x, y=y))
+
+        # make sure we're raising errors in the right places
+        with self.assertRaisesRegexp(ValueError,
+                                     'All indexers must be the same length'):
+            da.isel_points(y=[1, 2], x=[1, 2, 3])
+        with self.assertRaisesRegexp(ValueError,
+                                     'dimension bad_key does not exist'):
+            da.isel_points(bad_key=[1, 2])
+        with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'):
+            da.isel_points(y=[1.5, 2.2])
+        with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'):
+            da.isel_points(x=[1, 2, 3], y=slice(3))
+        with self.assertRaisesRegexp(ValueError,
+                                     'Indexers must be 1 dimensional'):
+            da.isel_points(y=1, x=2)
+        with self.assertRaisesRegexp(ValueError,
+                                     'Existing dimension names are not'):
+            da.isel_points(y=[1, 2], x=[1, 2], dim='x')
+
+        # using non string dims
+        actual = da.isel_points(y=[1, 2], x=[1, 2], dim=['A', 'B'])
+        assert 'points' in actual.coords
+
     def test_loc(self):
         self.ds['x'] = ('x', np.array(list('abcdefghij')))
         da = self.ds['foo']
diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py
@@ -661,6 +661,74 @@ def test_sel(self):
         self.assertDatasetEqual(data.isel(td=slice(1, 3)),
                                 data.sel(td=slice('1 days', '2 days')))
 
+    def test_isel_points(self):
+        data = create_test_data()
+
+        pdim1 = [1, 2, 3]
+        pdim2 = [4, 5, 1]
+        pdim3 = [1, 2, 3]
+
+        actual = data.isel_points(dim1=pdim1, dim2=pdim2, dim3=pdim3,
+                                  dim='test_coord')
+        assert 'test_coord' in actual.coords
+        assert actual.coords['test_coord'].shape == (len(pdim1), )
+
+        actual = data.isel_points(dim1=pdim1, dim2=pdim2)
+        assert 'points' in actual.coords
+        np.testing.assert_array_equal(pdim1, actual['dim1'])
+
+        # test that the order of the indexers doesn't matter
+        self.assertDatasetIdentical(data.isel_points(dim1=pdim1, dim2=pdim2),
+                                    data.isel_points(dim2=pdim2, dim1=pdim1))
+
+        # make sure we're raising errors in the right places
+        with self.assertRaisesRegexp(ValueError,
+                                     'All indexers must be the same length'):
+            data.isel_points(dim1=[1, 2], dim2=[1, 2, 3])
+        with self.assertRaisesRegexp(ValueError,
+                                     'dimension bad_key does not exist'):
+            data.isel_points(bad_key=[1, 2])
+        with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'):
+            data.isel_points(dim1=[1.5, 2.2])
+        with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'):
+            data.isel_points(dim1=[1, 2, 3], dim2=slice(3))
+        with self.assertRaisesRegexp(ValueError,
+                                     'Indexers must be 1 dimensional'):
+            data.isel_points(dim1=1, dim2=2)
+        with self.assertRaisesRegexp(ValueError,
+                                     'Existing dimension names are not valid'):
+            data.isel_points(dim1=[1, 2], dim2=[1, 2], dim='dim2')
+
+        # test to be sure we keep around variables that were not indexed
+        ds = Dataset({'x': [1, 2, 3, 4], 'y': 0})
+        actual = ds.isel_points(x=[0, 1, 2])
+        self.assertDataArrayIdentical(ds['y'], actual['y'])
+
+        # tests using index or DataArray as a dim
+        stations = Dataset()
+        stations['station'] = ('station', ['A', 'B', 'C'])
+        stations['dim1s'] = ('station', [1, 2, 3])
+        stations['dim2s'] = ('station', [4, 5, 1])
+
+        actual = data.isel_points(dim1=stations['dim1s'],
+                                  dim2=stations['dim2s'],
+                                  dim=stations['station'])
+        assert 'station' in actual.coords
+        assert 'station' in actual.dims
+        self.assertDataArrayIdentical(actual['station'].drop(['dim1', 'dim2']),
+                                      stations['station'])
+
+        # make sure we get the default points coordinate when a list is passed
+        actual = data.isel_points(dim1=stations['dim1s'],
+                                  dim2=stations['dim2s'],
+                                  dim=['A', 'B', 'C'])
+        assert 'points' in actual.coords
+
+        # can pass a numpy array
+        data.isel_points(dim1=stations['dim1s'],
+                         dim2=stations['dim2s'],
+                         dim=np.array([4, 5, 6]))
+
     def test_sel_method(self):
         data = create_test_data()