ecmwf-lab
diff --git a/‎ecml_tools/data.py
+123-18 b/‎ecml_tools/data.py
+123-18
diff --git a/‎ecml_tools/indexing.py
+129 b/‎ecml_tools/indexing.py
+129
@@ -20,12 +20,45 @@
 
 import ecml_tools
 
+from .indexing import (
+    apply_index_to_slices_changes,
+    index_to_slices,
+    length_to_slices,
+    update_tuple,
+)
+
 LOG = logging.getLogger(__name__)
 
 __all__ = ["open_dataset", "open_zarr", "debug_zarr_loading"]
 
 DEBUG_ZARR_LOADING = int(os.environ.get("DEBUG_ZARR_LOADING", "0"))
 
+DEPTH = 0
+
+
+def _debug_indexing(method):
+    """
+    Wrap an indexing method so that calls made with a tuple index are traced.
+
+    On entry the wrapper prints the receiver, the method name and the index;
+    on exit it prints the shape of the result. Both lines are indented by the
+    current nesting ``DEPTH``. Calls whose index is not a tuple are not
+    printed, but they still count towards the nesting depth.
+    """
+    import functools
+
+    @functools.wraps(method)
+    def wrapper(self, index):
+        global DEPTH
+        traced = isinstance(index, tuple)
+        if traced:
+            print("  " * DEPTH, "->", self, method.__name__, index)
+        DEPTH += 1
+        try:
+            result = method(self, index)
+        finally:
+            # Restore the depth even when the wrapped method raises, otherwise
+            # every later trace would stay indented one level too deep.
+            DEPTH -= 1
+        if traced:
+            print("  " * DEPTH, "<-", self, method.__name__, result.shape)
+        return result
+
+    return wrapper
+
+
+# Tracing switch: the condition is hard-coded, so ``debug_indexing`` is currently
+# a pass-through decorator. Flip the condition to install ``_debug_indexing``.
+if True:
+
+    def debug_indexing(x):
+        """Return ``x`` unchanged (tracing disabled)."""
+        return x
+
+else:
+    debug_indexing = _debug_indexing
+
 
 def debug_zarr_loading(on_off):
     global DEBUG_ZARR_LOADING
@@ -190,11 +223,18 @@ def metadata_specific(self, **kwargs):
     def __repr__(self):
         return self.__class__.__name__ + "()"
 
+    @debug_indexing
     def _get_tuple(self, n):
-        raise NotImplementedError(f"Tuple not supported: {n} (class {self.__class__.__name__})")
+        raise NotImplementedError(
+            f"Tuple not supported: {n} (class {self.__class__.__name__})"
+        )
 
 
 class Source:
+    """
+    Class used to follow the provenance of a data point.
+    """
+
     def __init__(self, dataset, index, source=None, info=None):
         self.dataset = dataset
         self.index = index
@@ -340,6 +380,7 @@ def __init__(self, path):
     def __len__(self):
         return self.data.shape[0]
 
+    @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple) and any(not isinstance(i, (int, slice)) for i in n):
             return self._getitem_extended(n)
@@ -352,8 +393,7 @@ def _getitem_extended(self, index):
         Zarr does not support indexing with lists/arrays directly, so we need to implement it ourselves.
         """
 
-        if not isinstance(index, tuple):
-            return self[index]
+        assert False, index
 
         shape = self.data.shape
 
@@ -377,7 +417,7 @@ def _unwind(self, index, rest, shape, axis, axes):
         if isinstance(index, (list, tuple)):
             axes.append(axis)  # Dimension of the concatenation
             for i in index:
-                yield from self._unwind(i, rest, shape, axis, axes)
+                yield from self._unwind((slice(i, i + 1),), rest, shape, axis, axes)
             return
 
         if len(rest) == 0:
@@ -635,6 +675,31 @@ class Concat(Combined):
     def __len__(self):
         return sum(len(i) for i in self.datasets)
 
+    @debug_indexing
+    def _get_tuple(self, index):
+        """
+        Resolve a tuple index by slicing the datasets along the first axis
+        and concatenating the pieces.
+        """
+        # Normalise the index to one slice per axis; `changes` is handed back to
+        # apply_index_to_slices_changes() once the pieces are assembled.
+        index, changes = index_to_slices(index, self.shape)
+        result = []
+
+        first, rest = index[0], index[1:]
+        start, stop, step = first.start, first.stop, first.step
+
+        for d in self.datasets:
+            length = d._len
+
+            # start/stop are relative to the beginning of the current dataset
+            result.append(d[(slice(start, stop, step),) + rest])
+
+            # Re-express the window relative to the beginning of the next dataset
+            start -= length
+            # Step forward until the start is no longer negative.
+            # NOTE(review): this loop would not terminate for a negative step;
+            # presumably only positive steps reach this point - confirm.
+            while start < 0:
+                start += step
+
+            stop -= length
+
+            # The window ends before its (shifted) start: stop visiting datasets
+            if start > stop:
+                break
+
+        return apply_index_to_slices_changes(np.concatenate(result, axis=0), changes)
+
+    @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):
             return self._get_tuple(n)
@@ -649,6 +714,7 @@ def __getitem__(self, n):
             k += 1
         return self.datasets[k][n]
 
+    @debug_indexing
     def _get_slice(self, s):
         result = []
 
@@ -716,9 +782,23 @@ def shape(self):
         assert False not in result, result
         return result
 
+    @debug_indexing
+    def _get_tuple(self, index):
+        """
+        Resolve a tuple index on datasets concatenated along ``self.axis``.
+
+        The slice requested on ``self.axis`` is split into one slice per
+        dataset (``None`` for datasets it does not touch); the indices of all
+        other axes are forwarded unchanged to every dataset that is read.
+        """
+        index, changes = index_to_slices(index, self.shape)
+        lengths = [d.shape[self.axis] for d in self.datasets]
+        slices = length_to_slices(index[self.axis], lengths)
+        before = index[: self.axis]
+        # Axes after the concatenation axis must be forwarded as well,
+        # otherwise they come back unselected from the underlying datasets.
+        after = index[self.axis + 1 :]
+        result = [
+            d[before + (i,) + after]
+            for (d, i) in zip(self.datasets, slices)
+            if i is not None
+        ]
+        result = np.concatenate(result, axis=self.axis)
+        return apply_index_to_slices_changes(result, changes)
+
+    @debug_indexing
     def _get_slice(self, s):
         return np.stack([self[i] for i in range(*s.indices(self._len))])
 
+    @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):
             return self._get_tuple(n)
@@ -769,9 +849,22 @@ def check_same_variables(self, d1, d2):
     def __len__(self):
         return len(self.datasets[0])
 
+    @debug_indexing
+    def _get_tuple(self, index):
+        """
+        Resolve a tuple index on datasets joined along axis 1.
+
+        Every dataset is read with its full axis 1; the selection requested on
+        that axis is applied once the pieces have been concatenated.
+        """
+        slices, changes = index_to_slices(index, self.shape)
+        widened, requested = update_tuple(slices, 1, slice(None))
+
+        # TODO: optimize if index does not access all datasets, so we don't load chunks we don't need
+        parts = []
+        for dataset in self.datasets:
+            parts.append(dataset[widened])
+
+        joined = np.concatenate(parts, axis=1)
+        selected = joined[:, requested]
+        return apply_index_to_slices_changes(selected, changes)
+
+    @debug_indexing
     def _get_slice(self, s):
         return np.stack([self[i] for i in range(*s.indices(self._len))])
 
+    @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):
             return self._get_tuple(n)
@@ -857,10 +950,14 @@ def __init__(self, dataset, indices):
 
         self.dataset = dataset
         self.indices = list(indices)
+        self.slice = _make_slice_or_index_from_list_or_tuple(self.indices)
+        assert isinstance(self.slice, slice)
+        print("SUBSET", self.slice)
 
         # Forward other properties to the super dataset
         super().__init__(dataset)
 
+    @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):
             return self._get_tuple(n)
@@ -871,25 +968,22 @@ def __getitem__(self, n):
         n = self.indices[n]
         return self.dataset[n]
 
+    @debug_indexing
     def _get_slice(self, s):
         # TODO: check if the indices can be simplified to a slice
         # the time checking maybe be longer than the time saved
         # using a slice
         indices = [self.indices[i] for i in range(*s.indices(self._len))]
         return np.stack([self.dataset[i] for i in indices])
 
+    @debug_indexing
     def _get_tuple(self, n):
-        first, rest = n[0], n[1:]
-
-        if isinstance(first, int):
-            return self.dataset[(self.indices[first],) + rest]
-
-        if isinstance(first, slice):
-            indices = tuple(self.indices[i] for i in range(*first.indices(self._len)))
-            indices = _make_slice_or_index_from_list_or_tuple(indices)
-            return self.dataset[(indices,) + rest]
-
-        raise NotImplementedError(f"Only int and slice supported not {type(first)}")
+        index, changes = index_to_slices(n, self.shape)
+        index, previous = update_tuple(index, 0, self.slice)
+        result = self.dataset[index]
+        result = result[previous]
+        result = apply_index_to_slices_changes(result, changes)
+        return result
 
     def __len__(self):
         return len(self.indices)
@@ -929,12 +1023,23 @@ def __init__(self, dataset, indices):
         # Forward other properties to the main dataset
         super().__init__(dataset)
 
+    @debug_indexing
+    def _get_tuple(self, index):
+        """
+        Resolve a tuple index on a dataset restricted to ``self.indices`` along axis 1.
+
+        The wrapped dataset is read with its full axis 1, then narrowed to
+        ``self.indices``, and only then is the requested axis-1 selection applied.
+        """
+        slices, changes = index_to_slices(index, self.shape)
+        widened, requested = update_tuple(slices, 1, slice(None))
+        selected = self.dataset[widened][:, self.indices]
+        return apply_index_to_slices_changes(selected[:, requested], changes)
+
+    @debug_indexing
     def __getitem__(self, n):
-        # if isinstance(n, tuple):
-        #     return self._get_tuple(n)
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
 
         row = self.dataset[n]
-        if isinstance(n, (slice, tuple)):
+        if isinstance(n, slice):
             return row[:, self.indices]
 
         return row[self.indices]
 
@@ -0,0 +1,129 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+
+import numpy as np
+
+
+def _tuple_with_slices(t, shape):
+    """
+    Replace all integers in a tuple with slices, so we preserve the dimensionality.
+
+    Returns a pair ``(slices, changes)``: ``slices`` holds one slice per entry
+    of ``t`` with explicit start, stop and step for the matching length in
+    ``shape``; ``changes`` lists the positions that held an integer.
+
+    Negative integers count from the end of the axis. An integer outside the
+    axis raises ``IndexError``, as does a tuple longer than ``shape``.
+    """
+
+    result = []
+    changes = []
+    for axis, item in enumerate(t):
+        length = shape[axis]
+        if isinstance(item, int):
+            # Normalise first: slice(-1, 0) would otherwise select nothing.
+            position = item + length if item < 0 else item
+            if not 0 <= position < length:
+                raise IndexError(
+                    f"index {item} is out of bounds for axis {axis} with size {length}"
+                )
+            item = slice(position, position + 1)
+            changes.append(axis)
+        result.append(slice(*item.indices(length)))
+
+    return tuple(result), tuple(changes)
+
+
+def _extend_shape(index, shape):
+    """
+    Pad a tuple index so that it has one entry per dimension of ``shape``.
+
+    A single ``Ellipsis`` is replaced by as many ``slice(None)`` as needed
+    (possibly none); any remaining missing entries are appended as
+    ``slice(None)``. Raises ``IndexError`` if there is more than one
+    ``Ellipsis`` or more entries than dimensions.
+    """
+    ndim = len(shape)
+
+    # Identity test: `in` / `count` would compare element-wise against arrays.
+    positions = [k for k, item in enumerate(index) if item is Ellipsis]
+    if len(positions) > 1:
+        raise IndexError("Only one Ellipsis is allowed")
+
+    if positions:
+        where = positions[0]
+        # The Ellipsis stands for every axis not named explicitly, maybe zero.
+        missing = max(ndim - (len(index) - 1), 0)
+        index = index[:where] + (slice(None),) * missing + index[where + 1 :]
+
+    if len(index) > ndim:
+        raise IndexError(
+            f"too many indices: {len(index)} given for {ndim} dimension(s)"
+        )
+
+    return index + (slice(None),) * (ndim - len(index))
+
+
+def _index_to_tuple(index, shape):
+    """
+    Turn an int, slice, tuple or Ellipsis index into a tuple padded to the
+    dimensionality of ``shape``. Any other kind of index raises ``ValueError``.
+    """
+    if isinstance(index, tuple):
+        as_tuple = index
+    elif isinstance(index, (int, slice)) or index is Ellipsis:
+        as_tuple = (index,)
+    else:
+        raise ValueError(f"Invalid index: {index}")
+    return _extend_shape(as_tuple, shape)
+
+
+def index_to_slices(index, shape):
+    """
+    Convert an index to a tuple of slices, with the same dimensionality as the shape.
+
+    Returns the slices together with the positions that held an integer in
+    the original index.
+    """
+    full_index = _index_to_tuple(index, shape)
+    return _tuple_with_slices(full_index, shape)
+
+
+def apply_index_to_slices_changes(result, changes):
+    """
+    Squeeze the axes listed in ``changes`` out of ``result``.
+
+    Each listed axis must have length 1. ``result`` is returned untouched
+    when ``changes`` is empty.
+    """
+    if not changes:
+        return result
+
+    shape = result.shape
+    for axis in changes:
+        assert shape[axis] == 1, (axis, changes, shape)
+    return np.squeeze(result, axis=changes)
+
+
+def update_tuple(t, index, value):
+    """
+    Replace the element of a tuple at the given index with a new value.
+
+    Returns the new tuple and the element that was replaced.
+    """
+    items = list(t)
+    replaced, items[index] = items[index], value
+    return tuple(items), replaced
+
+
+def length_to_slices(index, lengths):
+    """
+    Split a slice over consecutive segments of the given lengths.
+
+    The result has one entry per segment: a slice relative to the start of
+    that segment, or ``None`` when the segment contributes nothing.
+    """
+    start, stop, step = index.indices(sum(lengths))
+
+    slices = []
+    offset = 0
+    for length in lengths:
+        upper = offset + length
+
+        lo = max(offset, start)
+        hi = min(upper, stop)
+
+        piece = None
+        if lo <= hi:
+            # Move the lower bound onto the stride that begins at `start`
+            misalignment = (lo - start) % step
+            if misalignment != 0:
+                lo = lo + step - misalignment
+
+            # Make both bounds relative to the current segment
+            lo -= offset
+            hi -= offset
+
+            if 0 <= lo < hi:
+                piece = slice(lo, hi, step)
+
+        slices.append(piece)
+        offset = upper
+
+    return slices
+
+
+class IndexTester:
+    """
+    Helper exposing ``index_to_slices`` through the subscript syntax for a fixed shape.
+    """
+
+    def __init__(self, shape):
+        self.shape = shape
+
+    def __getitem__(self, index):
+        slices_and_changes = index_to_slices(index, self.shape)
+        return slices_and_changes
+
+
+if __name__ == "__main__":
+    # Manual check: print the slices and integer positions computed for an
+    # all-integer index on a 4-dimensional shape.
+    t = IndexTester((1000, 8, 10, 20000))
+    i = t[0, 1, 2, 3]
+    print(i)
+
+    # print(t[0])
+    # print(t[0, 1, 2, 3])
+    # print(t[0:10])
+    # print(t[...])
+    # print(t[:-1])