From a5ca2f10ecac34fdfef20ad7f8c00ffef586f797 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 19 Mar 2024 13:19:47 -0400 Subject: [PATCH 1/3] First attempt. The problem with trying to do this is that some of the protocol types in the yaml file have "Sample Name" and "Extract Name" as the headers and this causes those headers to be added multiple times. It might be better to leave those empty. Also might be good to change headers to a list to get rid of the special case for "nucleic acid hybridization". --- isatools/constants.py | 1 + isatools/isatab/dump/write.py | 53 +++++++++++-------- isatools/isatab/utils.py | 48 ++++++++--------- isatools/model/protocol.py | 13 ++++- .../resources/config/yaml/protocol-types.yml | 6 ++- 5 files changed, 73 insertions(+), 48 deletions(-) diff --git a/isatools/constants.py b/isatools/constants.py index 7a4ed53b..b8606d50 100644 --- a/isatools/constants.py +++ b/isatools/constants.py @@ -1,4 +1,5 @@ SYNONYMS = 'synonyms' +HEADER = 'header' MATERIAL_LABELS = [ 'Source Name', diff --git a/isatools/isatab/dump/write.py b/isatools/isatab/dump/write.py index 02ed6ebf..22b6f142 100644 --- a/isatools/isatab/dump/write.py +++ b/isatools/isatab/dump/write.py @@ -3,7 +3,7 @@ from pandas import DataFrame from numpy import nan -from isatools.constants import SYNONYMS +from isatools.constants import SYNONYMS, HEADER from isatools.model import ( OntologyAnnotation, Investigation, @@ -21,8 +21,7 @@ get_pv_columns, get_fv_columns, get_characteristic_columns, - get_object_column_map, - get_column_header + get_object_column_map ) @@ -296,17 +295,23 @@ def flatten(current_list): columns += flatten(map(lambda x: get_pv_columns(olabel, x), node.parameter_values)) if node.executes_protocol.protocol_type: - oname_label = get_column_header( - node.executes_protocol.protocol_type.term, - protocol_types_dict - ) + if isinstance(node.executes_protocol.protocol_type, OntologyAnnotation): + protocol_type = node.executes_protocol.protocol_type.term.lower() + else: + protocol_type = node.executes_protocol.protocol_type.lower() + + if protocol_type in protocol_types_dict: + oname_label = protocol_types_dict[protocol_type][HEADER] + else: + oname_label = None + if oname_label is not None: columns.append(oname_label) - elif node.executes_protocol.protocol_type.term.lower() \ - in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]: - columns.extend( - ["Hybridization Assay Name", - "Array Design REF"]) + + if node.executes_protocol.protocol_type.term.lower() in \ + protocol_types_dict["nucleic acid hybridization"][SYNONYMS]: + columns.append("Array Design REF") + columns += flatten( map(lambda x: get_comment_column(olabel, x), node.comments)) @@ -350,19 +355,23 @@ def pbar(x): protocol_in_path_count += 1 df_dict[olabel][-1] = node.executes_protocol.name if node.executes_protocol.protocol_type: - oname_label = get_column_header( - node.executes_protocol.protocol_type.term, - protocol_types_dict - ) + if isinstance(node.executes_protocol.protocol_type, OntologyAnnotation): + protocol_type = node.executes_protocol.protocol_type.term.lower() + else: + protocol_type = node.executes_protocol.protocol_type.lower() + + if protocol_type in protocol_types_dict: + oname_label = protocol_types_dict[protocol_type][HEADER] + else: + oname_label = None + if oname_label is not None: df_dict[oname_label][-1] = node.name - elif node.executes_protocol.protocol_type.term.lower() in \ - protocol_types_dict["nucleic acid hybridization"][SYNONYMS]: - df_dict["Hybridization Assay Name"][-1] = \ - node.name - df_dict["Array Design REF"][-1] = \ - node.array_design_ref + if node.executes_protocol.protocol_type.term.lower() in \ + protocol_types_dict["nucleic acid hybridization"][SYNONYMS]: + df_dict["Array Design REF"][-1] = node.array_design_ref + if node.date is not None: df_dict[olabel + ".Date"][-1] = node.date if node.performer is not None: diff --git a/isatools/isatab/utils.py b/isatools/isatab/utils.py index de36c5e4..e769925b 100644 --- a/isatools/isatab/utils.py +++ b/isatools/isatab/utils.py @@ -515,30 +515,30 @@ def get_object_column_map(isatab_header, df_columns): return object_column_map -def get_column_header(protocol_type_term, protocol_types_dict): - column_header = None - if protocol_type_term.lower() in \ - protocol_types_dict["nucleic acid sequencing"][SYNONYMS] \ - + protocol_types_dict["phenotyping"][SYNONYMS] \ - + protocol_types_dict["data acquisition"][SYNONYMS]: - column_header = "Assay Name" - elif protocol_type_term.lower() in protocol_types_dict["data collection"][SYNONYMS]: - column_header = "Scan Name" - elif protocol_type_term.lower() in protocol_types_dict["mass spectrometry"][SYNONYMS]: - column_header = "MS Assay Name" - elif protocol_type_term.lower() in protocol_types_dict["nmr spectroscopy"][SYNONYMS]: - column_header = "NMR Assay Name" - elif protocol_type_term.lower() in \ - protocol_types_dict["data transformation"][SYNONYMS] \ - + protocol_types_dict["sequence analysis data transformation"][SYNONYMS] \ - + protocol_types_dict["metabolite identification"][SYNONYMS] \ - + protocol_types_dict["protein identification"][SYNONYMS]: - column_header = "Data Transformation Name" - elif protocol_type_term.lower() in protocol_types_dict["normalization"][SYNONYMS]: - column_header = "Normalization Name" - if protocol_type_term.lower() == "unknown protocol": - column_header = "Unknown Protocol Name" - return column_header +# def get_column_header(protocol_type_term, protocol_types_dict): +# column_header = None +# if protocol_type_term.lower() in \ +# protocol_types_dict["nucleic acid sequencing"][SYNONYMS] \ +# + protocol_types_dict["phenotyping"][SYNONYMS] \ +# + protocol_types_dict["data acquisition"][SYNONYMS]: +# column_header = "Assay Name" +# elif protocol_type_term.lower() in protocol_types_dict["data collection"][SYNONYMS]: +# column_header = "Scan Name" +# elif protocol_type_term.lower() in protocol_types_dict["mass spectrometry"][SYNONYMS]: +# column_header = "MS Assay Name" +# elif protocol_type_term.lower() in protocol_types_dict["nmr spectroscopy"][SYNONYMS]: +# column_header = "NMR Assay Name" +# elif protocol_type_term.lower() in \ +# protocol_types_dict["data transformation"][SYNONYMS] \ +# + protocol_types_dict["sequence analysis data transformation"][SYNONYMS] \ +# + protocol_types_dict["metabolite identification"][SYNONYMS] \ +# + protocol_types_dict["protein identification"][SYNONYMS]: +# column_header = "Data Transformation Name" +# elif protocol_type_term.lower() in protocol_types_dict["normalization"][SYNONYMS]: +# column_header = "Normalization Name" +# if protocol_type_term.lower() == "unknown protocol": +# column_header = "Unknown Protocol Name" +# return column_header def get_value_columns(label, x): diff --git a/isatools/model/protocol.py b/isatools/model/protocol.py index 4240e1b3..a96cf97d 100644 --- a/isatools/model/protocol.py +++ b/isatools/model/protocol.py @@ -2,6 +2,7 @@ from collections.abc import Iterable from pprint import pprint from yaml import load, FullLoader +from isatools.constants import SYNONYMS from isatools.model.comments import Commentable from isatools.model.ontology_annotation import OntologyAnnotation from isatools.model.protocol_parameter import ProtocolParameter @@ -282,4 +283,14 @@ def load_protocol_types_info() -> dict: """ filepath = os.path.join(os.path.dirname(__file__), '..', 'resources', 'config', 'yaml', 'protocol-types.yml') with open(filepath) as yaml_file: - return load(yaml_file, Loader=FullLoader) + yaml_dict = load(yaml_file, Loader=FullLoader) + + protocol_types_dict = {} + for protocol, attributes in yaml_dict.items(): + protocol_types_dict[protocol] = attributes + for synonym in attributes[SYNONYMS]: + protocol_types_dict[synonym] = attributes + + return protocol_types_dict + + diff --git a/isatools/resources/config/yaml/protocol-types.yml b/isatools/resources/config/yaml/protocol-types.yml index 120b54fc..92d755b6 100644 --- a/isatools/resources/config/yaml/protocol-types.yml +++ b/isatools/resources/config/yaml/protocol-types.yml @@ -83,4 +83,8 @@ metabolite identification: protein identification: header: Data Transformation Name synonyms: - - protein identification \ No newline at end of file + - protein identification +unknown protocol: + header: Unknown Protocol Name + synonyms: + - unknown protocol \ No newline at end of file From e78f621377a14b2c7c3f15513a54d876561e936b Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Wed, 20 Mar 2024 00:59:47 -0400 Subject: [PATCH 2/3] Changes finalized and tested --- isatools/isatab/dump/write.py | 14 ++++++++--- isatools/model/process.py | 24 +++++++++---------- isatools/model/protocol.py | 11 ++------- .../resources/config/yaml/protocol-types.yml | 6 ++--- tests/model/test_protocol.py | 2 +- 5 files changed, 29 insertions(+), 28 deletions(-) diff --git a/isatools/isatab/dump/write.py b/isatools/isatab/dump/write.py index 22b6f142..786bf56a 100644 --- a/isatools/isatab/dump/write.py +++ b/isatools/isatab/dump/write.py @@ -240,7 +240,13 @@ def write_assay_table_files(inv_obj, output_dir, write_factor_values=False): if not isinstance(inv_obj, Investigation): raise NotImplementedError - protocol_types_dict = load_protocol_types_info() + yaml_dict = load_protocol_types_info() + protocol_types_dict = {} + for protocol, attributes in yaml_dict.items(): + protocol_types_dict[protocol] = attributes + for synonym in attributes[SYNONYMS]: + protocol_types_dict[synonym] = attributes + for study_obj in inv_obj.studies: for assay_obj in study_obj.assays: a_graph = assay_obj.graph @@ -300,7 +306,8 @@ def flatten(current_list): else: protocol_type = node.executes_protocol.protocol_type.lower() - if protocol_type in protocol_types_dict: + if protocol_type in protocol_types_dict and\ + protocol_types_dict[protocol_type][HEADER]: oname_label = protocol_types_dict[protocol_type][HEADER] else: oname_label = None @@ -360,7 +367,8 @@ def pbar(x): else: protocol_type = node.executes_protocol.protocol_type.lower() - if protocol_type in protocol_types_dict: + if protocol_type in protocol_types_dict and\ + protocol_types_dict[protocol_type][HEADER]: oname_label = protocol_types_dict[protocol_type][HEADER] else: oname_label = None diff --git a/isatools/model/process.py b/isatools/model/process.py index 1cac921d..cc779dfd 100644 --- a/isatools/model/process.py +++ b/isatools/model/process.py @@ -307,18 +307,18 @@ def from_assay_dict(self, process, technology_type): self.name = process.get('name', '') self.executes_protocol = indexes.get_protocol(process['executesProtocol']['@id']) self.load_comments(process.get('comments', [])) - allowed_protocol_type_terms = [ - "nucleic acid sequencing", - "nmr spectroscopy", - "mass spectrometry", - "nucleic acid hybridization", - "data transformation", - "data normalization" - ] - if self.executes_protocol.protocol_type.term in allowed_protocol_type_terms or ( - self.executes_protocol.protocol_type.term == 'data collection' - and technology_type.term == 'DNA microarray'): - self.name = process['name'] + # allowed_protocol_type_terms = [ + # "nucleic acid sequencing", + # "nmr spectroscopy", + # "mass spectrometry", + # "nucleic acid hybridization", + # "data transformation", + # "data normalization" + # ] + # if self.executes_protocol.protocol_type.term in allowed_protocol_type_terms or ( + # self.executes_protocol.protocol_type.term == 'data collection' + # and technology_type.term == 'DNA microarray'): + # self.name = process['name'] # Inputs / Outputs for io_data_target in ['inputs', 'outputs']: diff --git a/isatools/model/protocol.py b/isatools/model/protocol.py index a96cf97d..c9dc5ef8 100644 --- a/isatools/model/protocol.py +++ b/isatools/model/protocol.py @@ -283,14 +283,7 @@ def load_protocol_types_info() -> dict: """ filepath = os.path.join(os.path.dirname(__file__), '..', 'resources', 'config', 'yaml', 'protocol-types.yml') with open(filepath) as yaml_file: - yaml_dict = load(yaml_file, Loader=FullLoader) - - protocol_types_dict = {} - for protocol, attributes in yaml_dict.items(): - protocol_types_dict[protocol] = attributes - for synonym in attributes[SYNONYMS]: - protocol_types_dict[synonym] = attributes - - return protocol_types_dict + return load(yaml_file, Loader=FullLoader) + diff --git a/isatools/resources/config/yaml/protocol-types.yml b/isatools/resources/config/yaml/protocol-types.yml index 92d755b6..714331dc 100644 --- a/isatools/resources/config/yaml/protocol-types.yml +++ b/isatools/resources/config/yaml/protocol-types.yml @@ -1,12 +1,12 @@ sample collection: - header: Sample Name + header: iri: http://purl.obolibrary.org/obo/OBI_0000659 synonyms: - sample collection - sampling - aliquoting extraction: - header: Extract Name + header: iri: http://purl.obolibrary.org/obo/OBI_0302884 synonyms: - extraction @@ -14,7 +14,7 @@ extraction: - intracellular metabolite extraction - extracelluar metabolite extraction labeling: - header: Labeled Extract Name + header: iri: http://purl.obolibrary.org/obo/OBI_0600038 synonyms: - labeling diff --git a/tests/model/test_protocol.py b/tests/model/test_protocol.py index 707ee18b..809c0485 100644 --- a/tests/model/test_protocol.py +++ b/tests/model/test_protocol.py @@ -246,4 +246,4 @@ class TestFunctions(TestCase): def test_load_protocol_types_info(self): yaml_config = load_protocol_types_info() self.assertTrue(isinstance(yaml_config, dict)) - self.assertTrue(len(yaml_config.keys()) == 15) + self.assertEqual(len(yaml_config.keys()), 16) From 945241acc1def22043b1bfba33da2599688ccbea Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Wed, 20 Mar 2024 01:04:24 -0400 Subject: [PATCH 3/3] Deleted previously commented out code --- isatools/isatab/utils.py | 26 -------------------------- isatools/model/process.py | 12 ------------ 2 files changed, 38 deletions(-) diff --git a/isatools/isatab/utils.py b/isatools/isatab/utils.py index e769925b..ed06f6af 100644 --- a/isatools/isatab/utils.py +++ b/isatools/isatab/utils.py @@ -515,32 +515,6 @@ def get_object_column_map(isatab_header, df_columns): return object_column_map -# def get_column_header(protocol_type_term, protocol_types_dict): -# column_header = None -# if protocol_type_term.lower() in \ -# protocol_types_dict["nucleic acid sequencing"][SYNONYMS] \ -# + protocol_types_dict["phenotyping"][SYNONYMS] \ -# + protocol_types_dict["data acquisition"][SYNONYMS]: -# column_header = "Assay Name" -# elif protocol_type_term.lower() in protocol_types_dict["data collection"][SYNONYMS]: -# column_header = "Scan Name" -# elif protocol_type_term.lower() in protocol_types_dict["mass spectrometry"][SYNONYMS]: -# column_header = "MS Assay Name" -# elif protocol_type_term.lower() in protocol_types_dict["nmr spectroscopy"][SYNONYMS]: -# column_header = "NMR Assay Name" -# elif protocol_type_term.lower() in \ -# protocol_types_dict["data transformation"][SYNONYMS] \ -# + protocol_types_dict["sequence analysis data transformation"][SYNONYMS] \ -# + protocol_types_dict["metabolite identification"][SYNONYMS] \ -# + protocol_types_dict["protein identification"][SYNONYMS]: -# column_header = "Data Transformation Name" -# elif protocol_type_term.lower() in protocol_types_dict["normalization"][SYNONYMS]: -# column_header = "Normalization Name" -# if protocol_type_term.lower() == "unknown protocol": -# column_header = "Unknown Protocol Name" -# return column_header - - def get_value_columns(label, x): """ Generates the appropriate columns based on the value of the object. For example, if the object's .value value is an OntologyAnnotation, diff --git a/isatools/model/process.py b/isatools/model/process.py index cc779dfd..1f6bff1f 100644 --- a/isatools/model/process.py +++ b/isatools/model/process.py @@ -307,18 +307,6 @@ def from_assay_dict(self, process, technology_type): self.name = process.get('name', '') self.executes_protocol = indexes.get_protocol(process['executesProtocol']['@id']) self.load_comments(process.get('comments', [])) - # allowed_protocol_type_terms = [ - # "nucleic acid sequencing", - # "nmr spectroscopy", - # "mass spectrometry", - # "nucleic acid hybridization", - # "data transformation", - # "data normalization" - # ] - # if self.executes_protocol.protocol_type.term in allowed_protocol_type_terms or ( - # self.executes_protocol.protocol_type.term == 'data collection' - # and technology_type.term == 'DNA microarray'): - # self.name = process['name'] # Inputs / Outputs for io_data_target in ['inputs', 'outputs']: