match nodes containing int/float/nan/inf metric values (#349)

Fix query matcher code v1.3.0 to handle matching nodes:
- with metric values containing a combination of ints, floats, NaN, and inf values (e.g., match nodes with time > 5, where time values may be an int, float, NaN, or inf)
- with a metric value of NaN (e.g., match nodes with time == NaN)
- with a metric value of inf (e.g., match nodes with time == inf)

Add tests for using the query language and standard filter functions to match nodes with NaN and inf metric values.
LLNL · Mar 31, 2021 · 85c66ca · 85c66ca
1 parent 971a413
commit 85c66ca
Show file tree

Hide file tree

Showing 3 changed files with 130 additions and 7 deletions.
diff --git a/hatchet/graphframe.py b/hatchet/graphframe.py
@@ -330,7 +330,11 @@ def rewire(node, new_parent, visited):
         agg_dict = {}
         for col in df.columns.tolist():
             if col in self.exc_metrics + self.inc_metrics:
-                agg_dict[col] = np.sum
+                # use min_count=1 (default is 0) here, so sum of an all-NA
+                # series is NaN, not 0
+                # when min_count=1, sum([NaN, NaN]) = NaN
+                # when min_count=0, sum([NaN, NaN]) = 0
+                agg_dict[col] = lambda x: x.sum(min_count=1)
             else:
                 agg_dict[col] = lambda x: x.iloc[0]
 
@@ -357,7 +361,9 @@ def _init_sum_columns(self, columns, out_columns):
 
         return out_columns
 
-    def subtree_sum(self, columns, out_columns=None, function=np.sum):
+    def subtree_sum(
+        self, columns, out_columns=None, function=lambda x: x.sum(min_count=1)
+    ):
         """Compute sum of elements in subtrees.  Valid only for trees.
 
         For each row in the graph, ``out_columns`` will contain the
@@ -374,8 +380,7 @@ def subtree_sum(self, columns, out_columns=None, function=np.sum):
             out_columns (list of str): names of columns to store results
                 (default: in place)
             function (callable): associative operator used to sum
-                elements (default: sum)
-
+                elements, sum of an all-NA series is NaN (default: sum(min_count=1))
         """
         out_columns = self._init_sum_columns(columns, out_columns)
 
@@ -387,7 +392,9 @@ def subtree_sum(self, columns, out_columns=None, function=np.sum):
                         self.dataframe.loc[[node] + node.children, col]
                     )
 
-    def subgraph_sum(self, columns, out_columns=None, function=np.sum):
+    def subgraph_sum(
+        self, columns, out_columns=None, function=lambda x: x.sum(min_count=1)
+    ):
         """Compute sum of elements in subgraphs.
 
         For each row in the graph, ``out_columns`` will contain the
@@ -404,7 +411,7 @@ def subgraph_sum(self, columns, out_columns=None, function=np.sum):
             out_columns (list of str): names of columns to store results
                 (default: in place)
             function (callable): associative operator used to sum
-                elements (default: sum)
+                elements, sum of an all-NA series is NaN (default: sum(min_count=1))
         """
         if self.graph.is_tree():
             self.subtree_sum(columns, out_columns, function)

diff --git a/hatchet/query_matcher.py b/hatchet/query_matcher.py
@@ -10,6 +10,10 @@
 from pandas import DataFrame
 from pandas.core.indexes.multi import MultiIndex
 
+# Tell Flake8 to ignore this import: it does not recognize that eval("np.nan")
+# needs the numpy package.
+import numpy as np  # noqa: F401
+
 from .node import Node, traversal_order
 
 
@@ -88,7 +92,36 @@ def filter_series(df_row):
                                 matches = matches and False
                         elif isinstance(df_row[k], Real):
                             if isinstance(v, str) and v.lower().startswith(compops):
-                                matches = matches and eval("{} {}".format(df_row[k], v))
+                                # compare nan metric value to numeric query
+                                # (e.g. np.nan > 5)
+                                if pd.isnull(df_row[k]):
+                                    nan_str = "np.nan"
+                                    # compare nan metric value to nan query
+                                    # (e.g., np.nan == np.nan)
+                                    if nan_str in v:
+                                        matches = matches and eval(
+                                            "pd.isnull({}) == True".format(nan_str)
+                                        )
+                                    else:
+                                        matches = matches and eval(
+                                            "{} {}".format(nan_str, v)
+                                        )
+                                elif np.isinf(df_row[k]):
+                                    inf_str = "np.inf"
+                                    # compare inf metric value to inf query
+                                    # (e.g., np.inf == np.inf)
+                                    if inf_str in v:
+                                        matches = matches and eval(
+                                            "np.isinf({}) == True".format(inf_str)
+                                        )
+                                    else:
+                                        matches = matches and eval(
+                                            "{} {}".format(inf_str, v)
+                                        )
+                                else:
+                                    matches = matches and eval(
+                                        "{} {}".format(df_row[k], v)
+                                    )
                             elif isinstance(v, Real):
                                 matches = matches and (df_row[k] == v)
                             else:

diff --git a/hatchet/tests/graphframe.py b/hatchet/tests/graphframe.py
@@ -981,3 +981,86 @@ def test_output_with_cycle_graphs():
     assert treeout.count("d") == 2
     assert treeout.count("e") == 1
     assert treeout.count("f") == 1
+
+
+def test_filter_squash_query_nan_and_inf_metric(small_mock1, small_mock2):
+    """Use call path query language to match nodes whose metric value is NaN
+    or inf."""
+    gf1 = GraphFrame.from_literal(small_mock1)
+    gf2 = GraphFrame.from_literal(small_mock2)
+
+    gf3 = gf1 / gf2
+
+    query_nan = [{"time": "== np.nan"}]
+    filt_nan_gf3 = gf3.filter(query_nan, squash=True)
+
+    assert len(filt_nan_gf3.graph.roots) == 2
+    assert all(pd.isnull(time) for time in filt_nan_gf3.dataframe["time (inc)"])
+    assert all(pd.isnull(time) for time in filt_nan_gf3.dataframe["time"])
+    assert filt_nan_gf3.dataframe.shape[0] == 2
+    assert sorted(filt_nan_gf3.dataframe["name"].values) == ["D", "G"]
+
+    query_inf = [{"time": "== np.inf"}]
+    filt_inf_gf3 = gf3.filter(query_inf, squash=True)
+
+    assert len(filt_inf_gf3.graph.roots) == 1
+    assert all(np.isinf(inc_time) for inc_time in filt_inf_gf3.dataframe["time (inc)"])
+    assert all(np.isinf(exc_time) for exc_time in filt_inf_gf3.dataframe["time"])
+    assert filt_inf_gf3.dataframe.shape[0] == 1
+    assert filt_inf_gf3.dataframe["name"].values[0] == "B"
+
+
+def test_filter_squash_query_metric_with_nan_and_inf(small_mock1, small_mock2):
+    """Use call path query language with a numeric comparison on a metric
+    column containing int/float, NaN, and inf values."""
+    gf1 = GraphFrame.from_literal(small_mock1)
+    gf2 = GraphFrame.from_literal(small_mock2)
+
+    gf3 = gf1 / gf2
+
+    query = [{"time": ">= 1"}]
+    filter_gf3 = gf3.filter(query, squash=True)
+
+    assert len(filter_gf3.graph.roots) == 3
+    assert filter_gf3.dataframe["time"].sum() == np.inf
+    assert filter_gf3.dataframe["time (inc)"].sum() == np.inf
+    assert filter_gf3.dataframe.shape[0] == 5
+
+
+def test_filter_nan_and_inf(small_mock1, small_mock2):
+    """Use lambda to filter for nodes with NaN and inf values."""
+    gf1 = GraphFrame.from_literal(small_mock1)
+    gf2 = GraphFrame.from_literal(small_mock2)
+
+    gf3 = gf1 / gf2
+
+    filt_nan_gf3 = gf3.filter(lambda x: pd.isnull(x["time"]), squash=True)
+
+    assert len(filt_nan_gf3.graph.roots) == 2
+    assert all(pd.isnull(inc_time) for inc_time in filt_nan_gf3.dataframe["time (inc)"])
+    assert all(pd.isnull(exc_time) for exc_time in filt_nan_gf3.dataframe["time"])
+    assert filt_nan_gf3.dataframe.shape[0] == 2
+    assert sorted(filt_nan_gf3.dataframe["name"].values) == ["D", "G"]
+
+    filt_inf_gf3 = gf3.filter(lambda x: np.isinf(x["time"]), squash=True)
+
+    assert len(filt_inf_gf3.graph.roots) == 1
+    assert all(np.isinf(inc_time) for inc_time in filt_inf_gf3.dataframe["time (inc)"])
+    assert all(np.isinf(exc_time) for exc_time in filt_inf_gf3.dataframe["time"])
+    assert filt_inf_gf3.dataframe.shape[0] == 1
+    assert filt_inf_gf3.dataframe["name"].values == "B"
+
+
+def test_filter_with_nan_and_inf(small_mock1, small_mock2):
+    """Use lambda to filter for metric containing int/float, NaN, and inf values."""
+    gf1 = GraphFrame.from_literal(small_mock1)
+    gf2 = GraphFrame.from_literal(small_mock2)
+
+    gf3 = gf1 / gf2
+
+    filter_gf3 = gf3.filter(lambda x: x["time"] > 5, squash=True)
+
+    assert len(filter_gf3.graph.roots) == 2
+    assert filter_gf3.dataframe["time"].sum() == np.inf
+    assert filter_gf3.dataframe["time (inc)"].sum() == np.inf
+    assert filter_gf3.dataframe.shape[0] == 2
+    assert sorted(filter_gf3.dataframe["name"].values) == ["B", "H"]