Skip to content

Commit 76a52b7

Browse files
authored
Add support for eland.Series.unique()
1 parent 15a3007 commit 76a52b7

File tree

6 files changed

+85
-4
lines changed

6 files changed

+85
-4
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
eland.Series.unique
2+
====================
3+
4+
.. currentmodule:: eland
5+
6+
.. automethod:: Series.unique

docs/sphinx/reference/series.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Computations / Descriptive Stats
7878
Series.std
7979
Series.var
8080
Series.nunique
81+
Series.unique
8182
Series.value_counts
8283
Series.mode
8384
Series.quantile

eland/operations.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -800,6 +800,33 @@ def quantile(
800800
else:
801801
return df if is_dataframe else df.transpose().iloc[0]
802802

803+
def unique(self, query_compiler: "QueryCompiler") -> pd.Series:
    """Collect the distinct values of the single field backing an eland.Series.

    Runs a paginated Elasticsearch composite ``terms`` aggregation over the
    series' field and gathers every bucket key into a numpy array.

    NOTE(review): the annotation says ``pd.Series`` but the value built here
    is a ``np.ndarray`` — confirm intended return type with callers.
    """
    query_params, _ = self._resolve_tasks(query_compiler)
    body = Query(query_params.query)

    source_fields = query_compiler._mappings.all_source_fields()
    # Unique is only supported for eland.Series, i.e. exactly one field.
    assert len(source_fields) == 1
    series_field = source_fields[0]
    bucket_key = f"unique_{series_field.column}"

    body.composite_agg_bucket_terms(
        name=bucket_key,
        field=series_field.aggregatable_es_field_name,
    )

    # Start the composite aggregation; bucket_generator pages through it.
    body.composite_agg_start(size=DEFAULT_PAGINATION_SIZE, name="unique_buckets")

    # Flatten every page of buckets into one list.
    collected: List[Any] = []
    for page in self.bucket_generator(  # type: ignore
        query_compiler, body, agg_name="unique_buckets"
    ):
        collected.extend(page)

    return np.array(
        [bucket["key"][bucket_key] for bucket in collected],
        dtype=series_field.pd_dtype,
    )
829+
803830
def aggs_groupby(
804831
self,
805832
query_compiler: "QueryCompiler",
@@ -920,7 +947,9 @@ def aggs_groupby(
920947
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
921948
)
922949

923-
for buckets in self.bucket_generator(query_compiler, body):
950+
for buckets in self.bucket_generator(
951+
query_compiler, body, agg_name="groupby_buckets"
952+
):
924953
# We receive response row-wise
925954
for bucket in buckets:
926955
# groupby columns are added to result same way they are returned
@@ -984,7 +1013,7 @@ def aggs_groupby(
9841013

9851014
@staticmethod
9861015
def bucket_generator(
987-
query_compiler: "QueryCompiler", body: "Query"
1016+
query_compiler: "QueryCompiler", body: "Query", agg_name: str
9881017
) -> Generator[Sequence[Dict[str, Any]], None, Sequence[Dict[str, Any]]]:
9891018
"""
9901019
This can be used for all groupby operations.
@@ -1015,7 +1044,7 @@ def bucket_generator(
10151044
)
10161045

10171046
# Pagination Logic
1018-
composite_buckets: Dict[str, Any] = res["aggregations"]["groupby_buckets"]
1047+
composite_buckets: Dict[str, Any] = res["aggregations"][agg_name]
10191048

10201049
after_key: Optional[Dict[str, Any]] = composite_buckets.get(
10211050
"after_key", None
@@ -1028,7 +1057,7 @@ def bucket_generator(
10281057
yield buckets
10291058

10301059
body.composite_agg_after_key(
1031-
name="groupby_buckets",
1060+
name=agg_name,
10321061
after_key=after_key,
10331062
)
10341063
else:

eland/query_compiler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,9 @@ def nunique(self) -> pd.Series:
621621
self, ["nunique"], numeric_only=False
622622
)
623623

624+
def unique(self) -> pd.Series:
    """Return the distinct values of this series, delegated to Operations.unique()."""
    return self._operations.unique(self)
626+
624627
def mode(
625628
self,
626629
es_size: int,

eland/series.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1560,6 +1560,24 @@ def nunique(self) -> pd.Series:
15601560
results = super().nunique()
15611561
return results.squeeze()
15621562

1563+
def unique(self) -> pd.Series:
    """
    Return the unique values of the Series.

    Behavior differs slightly from pandas: pandas yields unique values in
    order of first appearance, while Eland returns them in sorted
    (ascending) order.

    Returns
    -------
    pd.Series
        The unique values contained in the Series.

    See Also
    --------
    :pandas_api_docs:`pandas.Series.unique`
    """
    return self._query_compiler.unique()
1580+
15631581
def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
15641582
"""
15651583
Return variance for a Series

tests/series/test_metrics_pytest.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,30 @@ def test_flights_quantile(self, column, quantile_list):
156156
else:
157157
assert pd_quantile * 0.9 <= ed_quantile <= pd_quantile * 1.1
158158

159+
@pytest.mark.parametrize("column", ["FlightDelayMin", "dayOfWeek"])
def test_flights_unique_numeric(self, column):
    """Numeric columns: eland unique values match pandas (after sorting)."""
    pd_series = self.pd_flights()[column]
    ed_series = self.ed_flights()[column]

    # pandas orders unique values by first appearance, whereas ES returns
    # them in ascending order — sort the pandas result before comparing.
    expected = np.sort(pd_series.unique())
    actual = ed_series.unique()

    np.testing.assert_allclose(expected, actual)
170+
171+
@pytest.mark.parametrize("column", ["Cancelled", "DestCountry"])
def test_flights_unique_strings(self, column):
    """Non-numeric columns: eland unique values match pandas (after sorting)."""
    pd_flights = self.pd_flights()[column]
    ed_flights = self.ed_flights()[column]

    # Pandas returns unique values in order of their appearance;
    # ES returns results in ascending order, hence sort the pandas
    # array to check equality.
    pd_unique = np.sort(pd_flights.unique())
    ed_unique = ed_flights.unique()

    # BUG FIX: the original called np.equal(pd_unique, ed_unique), which
    # computes an elementwise comparison and discards the result — the
    # test could never fail. Assert element-wise equality instead.
    np.testing.assert_array_equal(pd_unique, ed_unique)
182+
159183
@pytest.mark.parametrize("quantiles_list", [[np.array([1, 2])], ["1", 2]])
160184
def test_quantile_non_numeric_values(self, quantiles_list):
161185
ed_flights = self.ed_flights()["dayOfWeek"]

0 commit comments

Comments
 (0)