Add support for grouping by different properties by label in Gremlin (#115)

* Add support for grouping by different properties by label in Gremlin
* Update starter notebooks with details on usage

Co-authored-by: Michael Chin <[email protected]>
aws · Apr 29, 2021 · a36e698 · a36e698
1 parent eef18ff
commit a36e698
Show file tree

Hide file tree

Showing 5 changed files with 477 additions and 50 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -3,6 +3,10 @@
 Starting with v1.31.6, this file will contain a record of major features and updates made in each release of graph-notebook.
 
 ## Upcoming
+- Add support for notebook variables in Sparql/Gremlin magic queries ([Link to PR](https://github.com/aws/graph-notebook/pull/113))
+- Add support for grouping by different properties per label in Gremlin ([Link to PR](https://github.com/aws/graph-notebook/pull/115))
+- Fix missing Boto3 dependency in setup.py ([Link to PR](https://github.com/aws/graph-notebook/pull/118))
+
 
 ## Release 2.1.1 (April 22, 2021)
 

diff --git a/src/graph_notebook/magics/graph_magic.py b/src/graph_notebook/magics/graph_magic.py
@@ -343,7 +343,7 @@ def gremlin(self, line, cell, local_ns: dict = None):
         parser.add_argument('query_mode', nargs='?', default='query',
                             help='query mode (default=query) [query|explain|profile]')
         parser.add_argument('-p', '--path-pattern', default='', help='path pattern')
-        parser.add_argument('-g', '--group-by', default='T.label',
+        parser.add_argument('-g', '--group-by', type=str, default='T.label',
                             help='Property used to group nodes (e.g. code, T.region) default is T.label')
         parser.add_argument('--store-to', type=str, default='', help='store query result to this variable')
         parser.add_argument('--ignore-groups', action='store_true', default=False, help="Ignore all grouping options")

diff --git a/src/graph_notebook/network/gremlin/GremlinNetwork.py b/src/graph_notebook/network/gremlin/GremlinNetwork.py
@@ -54,6 +54,10 @@ def parse_pattern_list_str(pattern_str: str) -> list:
 
 
 def generate_id_from_dict(data: dict) -> str:
+    # Handle cases where user requests '~label' in valueMap step, since json can't serialize non-string keys
+    if T.label in data.keys():
+        data['label'] = data[T.label]
+        del data[T.label]
     data_str = json.dumps(data, default=str)
     hashed = hashlib.md5(data_str.encode())
     generate_id = hashed.hexdigest()
@@ -93,8 +97,11 @@ def __init__(self, graph: MultiDiGraph = None, callbacks=None, label_max_length=
         if graph is None:
             graph = MultiDiGraph()
         self.label_max_length = label_max_length
-        self.group_by_property = group_by_property
-        self.ignore_groups=ignore_groups
+        try:
+            self.group_by_property = json.loads(group_by_property)
+        except ValueError:
+            self.group_by_property = group_by_property
+        self.ignore_groups = ignore_groups
         super().__init__(graph, callbacks)
 
     def add_results_with_pattern(self, results, pattern_list: list):
@@ -271,35 +278,59 @@ def add_vertex(self, v):
         if type(v) is Vertex:
             node_id = v.id
             title = v.label
-            if self.group_by_property in [T_LABEL, 'label']:
-                # This sets the group key to the label if either "label" is passed in or
-                # T.label is set in order to handle the default case of grouping by label
-                # when no explicit key is specified
-                group = v.label
-            elif self.group_by_property == 'id':
-                group = v.id
-            else:
-                group = ''
+            vertex_dict = v.__dict__
+            if not isinstance(self.group_by_property, dict):  # Handle string format group_by
+                if self.group_by_property in [T_LABEL, 'label']:  # this handles if it's just a string
+                    # This sets the group key to the label if either "label" is passed in or
+                    # T.label is set in order to handle the default case of grouping by label
+                    # when no explicit key is specified
+                    group = v.label
+                elif self.group_by_property == 'id':
+                    group = v.id
+                else:
+                    group = ''
+            else:  # handle dict format group_by
+                try:
+                    if str(v.label) in self.group_by_property:
+                        if self.group_by_property[str(v.label)]['groupby'] in [T_LABEL, 'label']:
+                            group = v.label
+                        else:
+                            group = vertex_dict[self.group_by_property[str(v.label)]['groupby']]
+                    elif str(v.id) in self.group_by_property:
+                        group = vertex_dict[self.group_by_property[str(v.id)]['groupby']]
+                    else:
+                        group = ''
+                except KeyError:
+                    group = ''
+
             label = title if len(title) <= self.label_max_length else title[:self.label_max_length - 3] + '...'
             data = {'label': label, 'title': title, 'group': group, 'properties': {'id': node_id, 'label': title}}
         elif type(v) is dict:
             properties = {}
-
             title = ''
             label = ''
             group = ''
+            # Before looping through the properties, we first search for T.label in the vertex dict and set the title
+            # from it. Otherwise, we would hit a KeyError, because the title is needed to look up the vertex label's
+            # desired grouping behavior in group_by_property.
+            if T.label in v.keys():
+                title = str(v[T.label])
+                label = title if len(title) <= self.label_max_length else title[:self.label_max_length - 3] + '...'
             for k in v:
-                if str(k) == T_LABEL:
-                    title = str(v[k])
-                    label = title if len(title) <= self.label_max_length else title[:self.label_max_length - 3] + '...'
-                elif str(k) == T_ID:
+                if str(k) == T_ID:
                     node_id = str(v[k])
                 properties[k] = v[k]
-                if str(k) == self.group_by_property:
+                if isinstance(self.group_by_property, dict):
+                    try:
+                        if str(k) == self.group_by_property[title]['groupby']:
+                            group = str(v[k])
+                    except KeyError:
+                        continue
+                elif str(k) == self.group_by_property:
                     group = str(v[k])
 
             # handle when there is no id in a node. In this case, we will generate one which
-            # is consistently regenerated so that duplicate dicts will be dedubed to the same vertex.
+            # is consistently regenerated so that duplicate dicts will be reduced to the same vertex.
             if node_id == '':
                 node_id = f'{generate_id_from_dict(v)}'