Skip to content

Commit

Permalink
match nodes containing int/float/nan/inf metric values (#349)
Browse files Browse the repository at this point in the history
fix query matcher code v1.3.0 to handle matching nodes:
- with metric values containing a combination of ints, floats, NaN, and inf
  values (e.g., match nodes with time > 5, where time values may be an int,
  float, NaN, or inf)
- with metric value of NaN (e.g., match nodes with time == NaN)
- with metric value of inf (e.g., match nodes with time == inf)

add tests for using the query language and standard filter functions to
match nodes with NaN and inf metric values
  • Loading branch information
slabasan authored Mar 31, 2021
1 parent 971a413 commit 85c66ca
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 7 deletions.
19 changes: 13 additions & 6 deletions hatchet/graphframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,11 @@ def rewire(node, new_parent, visited):
agg_dict = {}
for col in df.columns.tolist():
if col in self.exc_metrics + self.inc_metrics:
agg_dict[col] = np.sum
# use min_count=1 (default is 0) here, so that the sum of an all-NA
# series is NaN, not 0:
#   when min_count=1, sum([NaN, NaN]) = NaN
#   when min_count=0, sum([NaN, NaN]) = 0
agg_dict[col] = lambda x: x.sum(min_count=1)
else:
agg_dict[col] = lambda x: x.iloc[0]

Expand All @@ -357,7 +361,9 @@ def _init_sum_columns(self, columns, out_columns):

return out_columns

def subtree_sum(self, columns, out_columns=None, function=np.sum):
def subtree_sum(
self, columns, out_columns=None, function=lambda x: x.sum(min_count=1)
):
"""Compute sum of elements in subtrees. Valid only for trees.
For each row in the graph, ``out_columns`` will contain the
Expand All @@ -374,8 +380,7 @@ def subtree_sum(self, columns, out_columns=None, function=np.sum):
out_columns (list of str): names of columns to store results
(default: in place)
function (callable): associative operator used to sum
elements (default: sum)
elements, sum of an all-NA series is NaN (default: sum(min_count=1))
"""
out_columns = self._init_sum_columns(columns, out_columns)

Expand All @@ -387,7 +392,9 @@ def subtree_sum(self, columns, out_columns=None, function=np.sum):
self.dataframe.loc[[node] + node.children, col]
)

def subgraph_sum(self, columns, out_columns=None, function=np.sum):
def subgraph_sum(
self, columns, out_columns=None, function=lambda x: x.sum(min_count=1)
):
"""Compute sum of elements in subgraphs.
For each row in the graph, ``out_columns`` will contain the
Expand All @@ -404,7 +411,7 @@ def subgraph_sum(self, columns, out_columns=None, function=np.sum):
out_columns (list of str): names of columns to store results
(default: in place)
function (callable): associative operator used to sum
elements (default: sum)
elements, sum of an all-NA series is NaN (default: sum(min_count=1))
"""
if self.graph.is_tree():
self.subtree_sum(columns, out_columns, function)
Expand Down
35 changes: 34 additions & 1 deletion hatchet/query_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
from pandas import DataFrame
from pandas.core.indexes.multi import MultiIndex

# Tell Flake8 to ignore this import: it does not recognize that
# eval("np.nan") needs the numpy package
import numpy as np # noqa: F401

from .node import Node, traversal_order


Expand Down Expand Up @@ -88,7 +92,36 @@ def filter_series(df_row):
matches = matches and False
elif isinstance(df_row[k], Real):
if isinstance(v, str) and v.lower().startswith(compops):
matches = matches and eval("{} {}".format(df_row[k], v))
# compare nan metric value to numeric query
# (e.g. np.nan > 5)
if pd.isnull(df_row[k]):
nan_str = "np.nan"
# compare nan metric value to nan query
# (e.g., np.nan == np.nan)
if nan_str in v:
matches = matches and eval(
"pd.isnull({}) == True".format(nan_str)
)
else:
matches = matches and eval(
"{} {}".format(nan_str, v)
)
elif np.isinf(df_row[k]):
inf_str = "np.inf"
# compare inf metric value to inf query
# (e.g., np.inf == np.inf)
if inf_str in v:
matches = matches and eval(
"np.isinf({}) == True".format(inf_str)
)
else:
matches = matches and eval(
"{} {}".format(inf_str, v)
)
else:
matches = matches and eval(
"{} {}".format(df_row[k], v)
)
elif isinstance(v, Real):
matches = matches and (df_row[k] == v)
else:
Expand Down
83 changes: 83 additions & 0 deletions hatchet/tests/graphframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -981,3 +981,86 @@ def test_output_with_cycle_graphs():
assert treeout.count("d") == 2
assert treeout.count("e") == 1
assert treeout.count("f") == 1


def test_filter_squash_query_nan_and_inf_metric(small_mock1, small_mock2):
    """Use call path query language to match nodes whose "time" metric is
    NaN, and then nodes whose "time" metric is inf, squashing each result."""
    numerator_gf = GraphFrame.from_literal(small_mock1)
    denominator_gf = GraphFrame.from_literal(small_mock2)

    # the quotient graphframe is the one expected to carry NaN/inf metrics
    quotient_gf = numerator_gf / denominator_gf

    # --- query: time == NaN ---
    nan_gf = quotient_gf.filter([{"time": "== np.nan"}], squash=True)

    assert len(nan_gf.graph.roots) == 2
    assert all(map(pd.isnull, nan_gf.dataframe["time (inc)"]))
    assert all(map(pd.isnull, nan_gf.dataframe["time"]))
    assert nan_gf.dataframe.shape[0] == 2
    assert sorted(nan_gf.dataframe["name"].values) == ["D", "G"]

    # --- query: time == inf ---
    inf_gf = quotient_gf.filter([{"time": "== np.inf"}], squash=True)

    assert len(inf_gf.graph.roots) == 1
    assert all(map(np.isinf, inf_gf.dataframe["time (inc)"]))
    assert all(map(np.isinf, inf_gf.dataframe["time"]))
    assert inf_gf.dataframe.shape[0] == 1
    assert inf_gf.dataframe["name"].values[0] == "B"


def test_filter_squash_query_metric_with_nan_and_inf(small_mock1, small_mock2):
    """Use call path query language with a numeric comparison (time >= 1)
    on a graphframe built by dividing two mock graphframes, then squash."""
    numerator_gf = GraphFrame.from_literal(small_mock1)
    denominator_gf = GraphFrame.from_literal(small_mock2)

    quotient_gf = numerator_gf / denominator_gf

    matched_gf = quotient_gf.filter([{"time": ">= 1"}], squash=True)

    assert len(matched_gf.graph.roots) == 3
    # the sums are expected to be inf, i.e. an inf-valued node is among
    # the matches
    assert matched_gf.dataframe["time"].sum() == np.inf
    assert matched_gf.dataframe["time (inc)"].sum() == np.inf
    assert matched_gf.dataframe.shape[0] == 5


def test_filter_nan_and_inf(small_mock1, small_mock2):
    """Use lambda filter functions to match nodes whose "time" metric is
    NaN, and then nodes whose "time" metric is inf, squashing each result."""
    gf1 = GraphFrame.from_literal(small_mock1)
    gf2 = GraphFrame.from_literal(small_mock2)

    gf3 = gf1 / gf2

    # keep only rows whose exclusive time is NaN
    filt_nan_gf3 = gf3.filter(lambda x: pd.isnull(x["time"]), squash=True)

    assert len(filt_nan_gf3.graph.roots) == 2
    assert all(pd.isnull(inc_time) for inc_time in filt_nan_gf3.dataframe["time (inc)"])
    assert all(pd.isnull(exc_time) for exc_time in filt_nan_gf3.dataframe["time"])
    assert filt_nan_gf3.dataframe.shape[0] == 2
    assert sorted(filt_nan_gf3.dataframe["name"].values) == ["D", "G"]

    # keep only rows whose exclusive time is inf
    filt_inf_gf3 = gf3.filter(lambda x: np.isinf(x["time"]), squash=True)

    assert len(filt_inf_gf3.graph.roots) == 1
    assert all(np.isinf(inc_time) for inc_time in filt_inf_gf3.dataframe["time (inc)"])
    assert all(np.isinf(exc_time) for exc_time in filt_inf_gf3.dataframe["time"])
    assert filt_inf_gf3.dataframe.shape[0] == 1
    # compare the single element, not the whole array: asserting on
    # ``ndarray == "B"`` relies on the truth value of an array, which raises
    # ValueError as soon as the array holds more than one element
    assert filt_inf_gf3.dataframe["name"].values[0] == "B"


def test_filter_with_nan_and_inf(small_mock1, small_mock2):
    """Use a lambda filter with a numeric comparison (time > 5) on a
    graphframe built by dividing two mock graphframes, then squash."""
    numerator_gf = GraphFrame.from_literal(small_mock1)
    denominator_gf = GraphFrame.from_literal(small_mock2)

    quotient_gf = numerator_gf / denominator_gf

    matched_gf = quotient_gf.filter(lambda row: row["time"] > 5, squash=True)

    assert len(matched_gf.graph.roots) == 2
    # the sums are expected to be inf, i.e. an inf-valued node is among
    # the matches
    assert matched_gf.dataframe["time"].sum() == np.inf
    assert matched_gf.dataframe["time (inc)"].sum() == np.inf
    assert matched_gf.dataframe.shape[0] == 2
    assert sorted(matched_gf.dataframe["name"].values) == ["B", "H"]

0 comments on commit 85c66ca

Please sign in to comment.