Skip to content

Commit

Permalink
fixes issue-511 and incorporates code from @ptth222 from PR #553
Browse files Browse the repository at this point in the history
  • Loading branch information
proccaserra committed Jul 9, 2024
1 parent db3c4a9 commit 84942ce
Show file tree
Hide file tree
Showing 13 changed files with 452 additions and 65 deletions.
4 changes: 4 additions & 0 deletions isatools/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@

QUALIFIER_LABELS = [
'Protocol REF',
'Performer',
'Date',
'Material Type',
'Term Source REF',
'Term Accession Number',
Expand All @@ -80,6 +82,8 @@

ALL_LABELS.append('Protocol REF')
ALL_LABELS.append('Label')
ALL_LABELS.append('Performer')
ALL_LABELS.append('Date')

_LABELS_ASSAY_NODES = [
'Assay Name',
Expand Down
137 changes: 80 additions & 57 deletions isatools/isatab/dump/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from isatools.isatab.defaults import log
from isatools.isatab.graph import _all_end_to_end_paths, _longest_path_and_attrs
from isatools.model.utils import _build_paths_and_indexes
from isatools.isatab.utils import (
get_comment_column,
get_pv_columns,
Expand Down Expand Up @@ -260,24 +261,24 @@ def flatten(current_list):

columns = []

# start_nodes, end_nodes = _get_start_end_nodes(a_graph)
paths = _all_end_to_end_paths(
a_graph, [x for x in a_graph.nodes()
if isinstance(a_graph.indexes[x], Sample)])
paths, indexes = _build_paths_and_indexes(assay_obj.process_sequence)

if len(paths) == 0:
log.info("No paths found, skipping writing assay file")
continue
if _longest_path_and_attrs(paths, a_graph.indexes) is None:
if _longest_path_and_attrs(paths, indexes) is None:
raise IOError(
"Could not find any valid end-to-end paths in assay graph")

protocol_in_path_count = 0
for node_index in _longest_path_and_attrs(paths, a_graph.indexes):
node = a_graph.indexes[node_index]
output_label_in_path_counts = {}
name_label_in_path_counts = {}
header_count: dict[str, int] = {}

for node_index in _longest_path_and_attrs(paths, indexes):
node = indexes[node_index]
if isinstance(node, Sample):
olabel = "Sample Name"
# olabel = "Sample Name.{}".format(sample_in_path_count)
# sample_in_path_count += 1
columns.append(olabel)
columns += flatten(
map(lambda x: get_comment_column(olabel, x),
Expand Down Expand Up @@ -305,30 +306,25 @@ def flatten(current_list):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:

if protocol_type in protocol_types_dict and protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
columns.append(oname_label)

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.append("Array Design REF")


if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
header_count[oname_label] = 0
new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])

columns.append(new_oname_label)
name_label_in_path_counts[oname_label] += 1

if protocol_type in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.extend(["Array Design REF"])

columns += flatten(
map(lambda x: get_comment_column(olabel, x),
node.comments))

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
if output.label not in columns:
columns.append(output.label)
columns += flatten(
map(lambda x: get_comment_column(output.label, x),
output.comments))
print(columns)
elif isinstance(node, Material):
olabel = node.type
columns.append(olabel)
Expand All @@ -340,7 +336,17 @@ def flatten(current_list):
node.comments))

elif isinstance(node, DataFile):
pass # handled in process
# pass # handled in process
output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])

columns.append(new_output_label)
output_label_in_path_counts[output_label] += 1
columns += flatten(
map(lambda x: get_comment_column(new_output_label, x),
node.comments))

omap = get_object_column_map(columns, columns)

Expand All @@ -355,8 +361,10 @@ def pbar(x):
df_dict[k].extend([""])

protocol_in_path_count = 0
output_label_in_path_counts = {}
name_label_in_path_counts = {}
for node_index in path_:
node = a_graph.indexes[node_index]
node = indexes[node_index]
if isinstance(node, Process):
olabel = "Protocol REF.{}".format(protocol_in_path_count)
protocol_in_path_count += 1
Expand All @@ -366,20 +374,20 @@ def pbar(x):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:

if protocol_type in protocol_types_dict and protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
df_dict[oname_label][-1] = node.name

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0

new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
df_dict[new_oname_label][-1] = node.name
name_label_in_path_counts[oname_label] += 1

if protocol_type in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
df_dict["Array Design REF"][-1] = node.array_design_ref

if node.date is not None:
df_dict[olabel + ".Date"][-1] = node.date
if node.performer is not None:
Expand All @@ -391,18 +399,18 @@ def pbar(x):
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
output_by_type = []
delim = ";"
olabel = output.label
if output.label not in columns:
columns.append(output.label)
output_by_type.append(output.filename)
df_dict[olabel][-1] = delim.join(map(str, output_by_type))

for co in output.comments:
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value
# for output in [x for x in node.outputs if isinstance(x, DataFile)]:
# output_by_type = []
# delim = ";"
# olabel = output.label
# if output.label not in columns:
# columns.append(output.label)
# output_by_type.append(output.filename)
# df_dict[olabel][-1] = delim.join(map(str, output_by_type))
#
# for co in output.comments:
# colabel = "{0}.Comment[{1}]".format(olabel, co.name)
# df_dict[colabel][-1] = co.value

elif isinstance(node, Sample):
olabel = "Sample Name"
Expand Down Expand Up @@ -434,7 +442,19 @@ def pbar(x):
df_dict[colabel][-1] = co.value

elif isinstance(node, DataFile):
pass # handled in process
# pass # handled in process

output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
df_dict[new_output_label][-1] = node.filename
output_label_in_path_counts[output_label] += 1

for co in node.comments:
colabel = "{0}.Comment[{1}]".format(
new_output_label, co.name)
df_dict[colabel][-1] = co.value

DF = DataFrame(columns=columns)
DF = DF.from_dict(data=df_dict)
Expand Down Expand Up @@ -482,6 +502,11 @@ def pbar(x):
columns[i] = "Protocol REF"
elif "." in col:
columns[i] = col[:col.rindex(".")]
else:
for output_label in output_label_in_path_counts:
if output_label in col:
columns[i] = output_label
break

log.debug("Rendered {} paths".format(len(DF.index)))
if len(DF.index) > 1:
Expand Down Expand Up @@ -521,8 +546,6 @@ def write_value_columns(df_dict, label, x):
elif x.unit.term_source.name:
df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name

# df_dict[label + ".Unit.Term Source REF"][-1] = \
# x.unit.term_source.name if x.unit.term_source else ""
df_dict[label + ".Unit.Term Accession Number"][-1] = \
x.unit.term_accession
else:
Expand Down
6 changes: 3 additions & 3 deletions isatools/isatab/load/ProcessSequenceFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def create_from_df(self, DF):
except KeyError:
pass

for data_col in [x for x in DF.columns if x.endswith(" File")]:
for data_col in [x for x in DF.columns if x in _LABELS_DATA_NODES]:
filenames = [x for x in DF[data_col].drop_duplicates() if x != '']
data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames)))

Expand All @@ -167,7 +167,7 @@ def get_node_by_label_and_key(labl, this_key):
n = samples[lk]
elif labl in ('Extract Name', 'Labeled Extract Name'):
n = other_material[lk]
elif labl.endswith(' File'):
elif labl in _LABELS_DATA_NODES:
n = data[lk]
return n

Expand Down Expand Up @@ -410,7 +410,7 @@ def get_node_by_label_and_key(labl, this_key):
process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF)
process_key_sequence.append(process_key)

if object_label.endswith(' File'):
if object_label in _LABELS_DATA_NODES:
data_node = None
try:
data_node = get_node_by_label_and_key(object_label, str(object_series[object_label]))
Expand Down
2 changes: 1 addition & 1 deletion isatools/isatab/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def get_object_column_map(isatab_header, df_columns):
"""
labels = _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES
if set(isatab_header) == set(df_columns):
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x]
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x or ' File' in x]
else:
object_index = [i for i, x in enumerate(isatab_header) if x in labels + ['Protocol REF']]

Expand Down
2 changes: 2 additions & 0 deletions isatools/isatab/validate/rules/rules_40xx.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,8 @@ def load_table_checks(df, filename):
'Extract Name',
'Labeled Extract Name',
'Protocol REF',
'Performer',
'Date',
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
Expand Down
Loading

0 comments on commit 84942ce

Please sign in to comment.