Merge branch 'issue-511' into fix-data-file-name-bug2
proccaserra authored May 29, 2024
2 parents bc33a2f + a2446fa commit ed25daa
Showing 42 changed files with 2,862 additions and 1,114 deletions.
1,647 changes: 1,549 additions & 98 deletions isa-cookbook/content/notebooks/isa-api-programmatic-BH2023-multiomics-isa.ipynb

Large diffs are not rendered by default.

36 changes: 20 additions & 16 deletions isatools/constants.py
@@ -1,4 +1,5 @@
SYNONYMS = 'synonyms'
HEADER = 'header'

MATERIAL_LABELS = [
'Source Name',
@@ -19,6 +20,9 @@
DATA_FILE_LABELS = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -27,16 +31,16 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]

_LABELS_DATA_NODES = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -45,9 +49,6 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]
@@ -65,16 +66,6 @@
'Data Transformation Name'
]

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]

QUALIFIER_LABELS = [
'Protocol REF',
'Material Type',
@@ -83,6 +74,19 @@
'Unit'
]

ALLOWED_NODES = NODE_LABELS.append('Protocol REF')

ALL_LABELS = NODE_LABELS + ASSAY_LABELS + QUALIFIER_LABELS

ALL_LABELS.append('Protocol REF')
ALL_LABELS.append('Label')

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]
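One wrinkle in the block above is worth flagging: list.append mutates its receiver and returns None, so ALLOWED_NODES = NODE_LABELS.append('Protocol REF') binds ALLOWED_NODES to None even though NODE_LABELS itself gains the extra entry. A minimal, illustrative sketch of the behaviour (not part of the commit):

labels = ['Source Name', 'Sample Name']

allowed = labels.append('Protocol REF')
print(allowed)   # None: append() mutates in place and returns None
print(labels)    # ['Source Name', 'Sample Name', 'Protocol REF']

# Concatenation returns a new list, which is usually the intent here:
allowed = ['Source Name', 'Sample Name'] + ['Protocol REF']
print(allowed)   # ['Source Name', 'Sample Name', 'Protocol REF']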
2 changes: 1 addition & 1 deletion isatools/create/model.py
@@ -3283,4 +3283,4 @@ def compute_single_arm_design_multi_element_cell(treatments, sample_assay_plan,
elements=[follow_up_map[0]]), follow_up_map[1]])
arm = StudyArm('ARM_00', group_size=group_size, arm_map=OrderedDict(arm_map))
design.add_study_arm(arm)
return design
return design
10 changes: 6 additions & 4 deletions isatools/isajson/validate.py
@@ -810,14 +810,16 @@ def check_study_groups(study_or_assay):
def validate(
fp,
config_dir=default_config_dir,
log_level=None,
log_level=logging.INFO,
base_schemas_dir="isa_model_version_1_0_schemas"
):
if config_dir is None:
config_dir = default_config_dir
if log_level in (
logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
logging.ERROR, logging.CRITICAL):
if log_level is None: #(
# logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
# logging.ERROR, logging.CRITICAL):
log.disabled = True
else:
log.setLevel(log_level)
log.info("ISA JSON Validator from ISA tools API v0.12.")
stream = StringIO()
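With this change, validate() logs at INFO by default and only disables the logger when log_level=None is passed explicitly. A minimal usage sketch against the signature shown above (the file name is hypothetical, and the errors/warnings shape of the returned report is assumed to match the ISA-Tab validator):

import logging
from isatools import isajson

with open('BII-S-3.json', encoding='utf-8') as fp:   # hypothetical file
    report = isajson.validate(fp, log_level=logging.WARNING)

print(report['errors'])   # assumed report structure, as noted above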
25 changes: 13 additions & 12 deletions isatools/isatab/dump/core.py
@@ -41,7 +41,7 @@ def dump(isa_obj, output_path,
raise NameError('Investigation file must match pattern i_*.txt, got {}'.format(i_file_name))

if path.exists(output_path):
fp = open(path.join(output_path, i_file_name), 'w', encoding='utf-8')
fp = open(path.join(output_path, i_file_name), 'wb')
else:
log.debug('output_path=', i_file_name)
raise FileNotFoundError("Can't find " + output_path)
@@ -55,7 +55,7 @@ def dump(isa_obj, output_path,

# Write ONTOLOGY SOURCE REFERENCE section
ontology_source_references_df = _build_ontology_reference_section(investigation.ontology_source_references)
fp.write('ONTOLOGY SOURCE REFERENCE\n')
fp.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))
# Need to set index_label as top left cell
ontology_source_references_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Term Source Name')
@@ -80,7 +80,7 @@ def dump(isa_obj, output_path,
inv_df_rows.append(comment.value)
investigation_df.loc[0] = inv_df_rows
investigation_df = investigation_df.set_index('Investigation Identifier').T
fp.write('INVESTIGATION\n')
fp.write(bytearray('INVESTIGATION\n', 'utf-8'))
investigation_df.to_csv(
path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Identifier')
@@ -90,14 +90,15 @@ def dump(isa_obj, output_path,
prefix='Investigation',
publications=investigation.publications
)
fp.write('INVESTIGATION PUBLICATIONS\n')
fp.write(bytearray('INVESTIGATION PUBLICATIONS\n', 'utf-8'))
investigation_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation PubMed ID')

# Write INVESTIGATION CONTACTS section
investigation_contacts_df = _build_contacts_section_df(
contacts=investigation.contacts)
fp.write('INVESTIGATION CONTACTS\n')
fp.write(bytearray('INVESTIGATION CONTACTS\n', 'utf-8'))

investigation_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Person Last Name')

@@ -127,40 +128,40 @@ def dump(isa_obj, output_path,
study_df_row.append(comment.value)
study_df.loc[0] = study_df_row
study_df = study_df.set_index('Study Identifier').T
fp.write('STUDY\n')
fp.write(bytearray('STUDY\n', 'utf-8'))
study_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8', index_label='Study Identifier')
study_design_descriptors_df = _build_design_descriptors_section(design_descriptors=study.design_descriptors)
fp.write('STUDY DESIGN DESCRIPTORS\n')
fp.write(bytearray('STUDY DESIGN DESCRIPTORS\n', 'utf-8'))
study_design_descriptors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Design Type')

# Write STUDY PUBLICATIONS section
study_publications_df = _build_publications_section_df(prefix='Study', publications=study.publications)
fp.write('STUDY PUBLICATIONS\n')
fp.write(bytearray('STUDY PUBLICATIONS\n', 'utf-8'))
study_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study PubMed ID')

# Write STUDY FACTORS section
study_factors_df = _build_factors_section_df(factors=study.factors)
fp.write('STUDY FACTORS\n')
fp.write(bytearray('STUDY FACTORS\n', 'utf-8'))
study_factors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Factor Name')

study_assays_df = _build_assays_section_df(assays=study.assays)
fp.write('STUDY ASSAYS\n')
fp.write(bytearray('STUDY ASSAYS\n', 'utf-8'))
study_assays_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Assay File Name')

# Write STUDY PROTOCOLS section
study_protocols_df = _build_protocols_section_df(protocols=study.protocols)
fp.write('STUDY PROTOCOLS\n')
fp.write(bytearray('STUDY PROTOCOLS\n', 'utf-8'))
study_protocols_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Protocol Name')

# Write STUDY CONTACTS section
study_contacts_df = _build_contacts_section_df(
prefix='Study', contacts=study.contacts)
fp.write('STUDY CONTACTS\n')
fp.write(bytearray('STUDY CONTACTS\n', 'utf-8'))
study_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Person Last Name')

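The pattern in the hunks above is consistent: the investigation handle is opened in binary mode, each section title is written as UTF-8 bytes, and pandas appends the tab-separated section through the same handle with an explicit encoding, so the whole file is encoded in one place. A standalone sketch of that pattern with illustrative values (pandas >= 1.2 is assumed for to_csv on a binary buffer):

from io import BytesIO
import pandas as pd

buf = BytesIO()   # stands in for the 'wb' file handle used above
buf.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))

section = pd.DataFrame({
    'Term Source Name': ['OBI'],
    'Term Source Description': ['Ontology for Biomedical Investigations'],
})
section.to_csv(path_or_buf=buf, mode='a', sep='\t', encoding='utf-8', index=False)

print(buf.getvalue().decode('utf-8'))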
49 changes: 34 additions & 15 deletions isatools/isatab/dump/write.py
@@ -3,7 +3,7 @@
from pandas import DataFrame
from numpy import nan

from isatools.constants import SYNONYMS
from isatools.constants import SYNONYMS, HEADER
from isatools.model import (
OntologyAnnotation,
Investigation,
@@ -22,8 +22,7 @@
get_pv_columns,
get_fv_columns,
get_characteristic_columns,
get_object_column_map,
get_column_header
get_object_column_map
)


@@ -60,7 +59,6 @@ def flatten(current_list):
paths = _all_end_to_end_paths(
s_graph,
[x for x in s_graph.nodes() if isinstance(s_graph.indexes[x], Source)])
log.warning(s_graph.nodes())

sample_in_path_count = 0
protocol_in_path_count = 0
@@ -221,7 +219,7 @@ def flatten(current_list):
DF = DF.replace('', nan)
DF = DF.dropna(axis=1, how='all')

with open(path.join(output_dir, study_obj.filename), 'w') as out_fp:
with open(path.join(output_dir, study_obj.filename), 'wb') as out_fp:
DF.to_csv(
path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8')

@@ -243,7 +241,13 @@ def write_assay_table_files(inv_obj, output_dir, write_factor_values=False):

if not isinstance(inv_obj, Investigation):
raise NotImplementedError
protocol_types_dict = load_protocol_types_info()
yaml_dict = load_protocol_types_info()
protocol_types_dict = {}
for protocol, attributes in yaml_dict.items():
protocol_types_dict[protocol] = attributes
for synonym in attributes[SYNONYMS]:
protocol_types_dict[synonym] = attributes

for study_obj in inv_obj.studies:
for assay_obj in study_obj.assays:
a_graph = assay_obj.graph
@@ -295,10 +299,17 @@ def flatten(current_list):
columns += flatten(map(lambda x: get_pv_columns(olabel, x),
node.parameter_values))
if node.executes_protocol.protocol_type:
oname_label = get_column_header(
node.executes_protocol.protocol_type.term,
protocol_types_dict
)
if isinstance(node.executes_protocol.protocol_type, OntologyAnnotation):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
@@ -361,10 +372,17 @@ def pbar(x):
protocol_in_path_count += 1
df_dict[olabel][-1] = node.executes_protocol.name
if node.executes_protocol.protocol_type:
oname_label = get_column_header(
node.executes_protocol.protocol_type.term,
protocol_types_dict
)
if isinstance(node.executes_protocol.protocol_type, OntologyAnnotation):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
Expand All @@ -378,6 +396,7 @@ def pbar(x):
node.name
df_dict["Array Design REF"][-1] = \
node.array_design_ref

if node.date is not None:
df_dict[olabel + ".Date"][-1] = node.date
if node.performer is not None:
@@ -496,7 +515,7 @@ def pbar(x):
DF = DF.dropna(axis=1, how='all')

with open(path.join(
output_dir, assay_obj.filename), 'w') as out_fp:
output_dir, assay_obj.filename), 'wb') as out_fp:
DF.to_csv(path_or_buf=out_fp, index=False, sep='\t',
encoding='utf-8')

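The two parallel hunks above replace the get_column_header helper with a direct lookup: the YAML-derived dict is flattened so that every synonym points at the same attribute record, and a protocol type resolves to a column header only if it is known and carries a non-empty 'header' entry. A self-contained sketch of that lookup (the YAML content shown is illustrative, not the packaged protocol-types file):

SYNONYMS, HEADER = 'synonyms', 'header'

# Illustrative stand-in for load_protocol_types_info()'s YAML payload.
yaml_dict = {
    'nucleic acid hybridization': {
        HEADER: 'Hybridization Assay Name',
        SYNONYMS: ['hybridization'],
    },
}

protocol_types_dict = {}
for protocol, attributes in yaml_dict.items():
    protocol_types_dict[protocol] = attributes
    for synonym in attributes[SYNONYMS]:   # synonyms share the record
        protocol_types_dict[synonym] = attributes

def column_header_for(protocol_type):
    # Same guard as in write.py: known type with a non-empty header.
    key = protocol_type.lower()
    if key in protocol_types_dict and protocol_types_dict[key][HEADER]:
        return protocol_types_dict[key][HEADER]
    return None

print(column_header_for('Hybridization'))   # Hybridization Assay Name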
3 changes: 1 addition & 2 deletions isatools/isatab/graph.py
@@ -17,7 +17,7 @@ def _all_end_to_end_paths(G, start_nodes):
num_start_nodes = len(start_nodes)
message = 'Calculating for paths for {} start nodes: '.format(
num_start_nodes)
log.info(start_nodes)
# log.info(start_nodes)
start_node = G.indexes[start_nodes[0]]
if isinstance(start_node, Source):
message = 'Calculating for paths for {} sources: '.format(
@@ -61,7 +61,6 @@ def _longest_path_and_attrs(paths, indexes):
:return: The longest path and attributes
"""
longest = (0, None)
log.info(paths)
for path in paths:
length = len(path)
for node in path:
6 changes: 6 additions & 0 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -378,6 +378,12 @@ def get_node_by_label_and_key(labl, this_key):
if comment_key not in [x.name for x in process.comments]:
process.comments.append(Comment(name=comment_key, value=str(object_series[comment_column])))

for performer in [c for c in column_group if c == 'Performer']:
process.performer = str(object_series[performer])

for date in [c for c in column_group if c == 'Date']:
process.date = str(object_series[date])

for _, object_series in DF.iterrows(): # don't drop duplicates
process_key_sequence = list()
source_node_context = None
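The two loops added above scan the row's column group for 'Performer' and 'Date' qualifier columns and copy their values onto the process being built. A reduced sketch of the same scan over a hypothetical row (the Process class here is a stand-in for isatools.model.Process):

import pandas as pd

class Process:   # stand-in for isatools.model.Process
    performer = None
    date = None

column_group = ['Protocol REF', 'Performer', 'Date']
object_series = pd.Series({
    'Protocol REF': 'sample collection',
    'Performer': 'A. Smith',   # hypothetical values
    'Date': '2024-05-29',
})

process = Process()
for performer in [c for c in column_group if c == 'Performer']:
    process.performer = str(object_series[performer])
for date in [c for c in column_group if c == 'Date']:
    process.date = str(object_series[date])

print(process.performer, process.date)   # A. Smith 2024-05-29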
9 changes: 7 additions & 2 deletions isatools/isatab/load/__init__.py
@@ -1,3 +1,8 @@
from isatools.isatab.load.read import read_investigation_file, read_tfile
from isatools.isatab.load.ProcessSequenceFactory import ProcessSequenceFactory, preprocess
from isatools.isatab.load.core import load, merge_study_with_assay_tables, load_table
from isatools.isatab.load.core import (
load,
merge_study_with_assay_tables,
load_table,
read_investigation_file,
read_tfile
)
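With the widened re-export above, read_investigation_file and read_tfile are importable from the loader package alongside load itself. A short usage sketch (the file path is hypothetical):

from isatools.isatab.load import load

with open('i_investigation.txt', encoding='utf-8') as fp:   # hypothetical path
    investigation = load(fp)
print(investigation.identifier)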