Merge branch 'issue-511' into fix-data-file-name-bug2
proccaserra authored May 29, 2024
2 parents bc33a2f + a2446fa commit ed25daa
Showing 42 changed files with 2,862 additions and 1,114 deletions.
1,647 changes: 1,549 additions & 98 deletions isa-cookbook/content/notebooks/isa-api-programmatic-BH2023-multiomics-isa.ipynb

Large diffs are not rendered by default.

36 changes: 20 additions & 16 deletions isatools/constants.py
@@ -1,4 +1,5 @@
SYNONYMS = 'synonyms'
HEADER = 'header'

MATERIAL_LABELS = [
'Source Name',
@@ -19,6 +20,9 @@
DATA_FILE_LABELS = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -27,16 +31,16 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]

_LABELS_DATA_NODES = [
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Derived Spectral Data File',
'Derived Array Data File',
'Derived Array Data Matrix File',
@@ -45,9 +49,6 @@
'Peptide Assignment File',
'Post Translational Modification Assignment File',
'Acquisition Parameter Data File',
'Free Induction Decay Data File',
'Image File',
'Derived Data File',
'Metabolite Assignment File',
'Metabolite Identification File'
]
@@ -65,16 +66,6 @@
'Data Transformation Name'
]

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]

QUALIFIER_LABELS = [
'Protocol REF',
'Material Type',
@@ -83,6 +74,19 @@
'Unit'
]

ALLOWED_NODES = NODE_LABELS.append('Protocol REF')

ALL_LABELS = NODE_LABELS + ASSAY_LABELS + QUALIFIER_LABELS

ALL_LABELS.append('Protocol REF')
ALL_LABELS.append('Label')

_LABELS_ASSAY_NODES = [
'Assay Name',
'MS Assay Name',
'NMR Assay Name',
'Hybridization Assay Name',
'Scan Name',
'Normalization Name',
'Data Transformation Name'
]
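One wrinkle in the block above is worth flagging: list.append mutates its receiver and returns None, so ALLOWED_NODES = NODE_LABELS.append('Protocol REF') binds ALLOWED_NODES to None even though NODE_LABELS itself gains the extra entry. A minimal, illustrative sketch of the behaviour (not part of the commit):

labels = ['Source Name', 'Sample Name']

allowed = labels.append('Protocol REF')
print(allowed)   # None: append() mutates in place and returns None
print(labels)    # ['Source Name', 'Sample Name', 'Protocol REF']

# Concatenation returns a new list, which is usually the intent here:
allowed = ['Source Name', 'Sample Name'] + ['Protocol REF']
print(allowed)   # ['Source Name', 'Sample Name', 'Protocol REF']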
2 changes: 1 addition & 1 deletion isatools/create/model.py
@@ -3283,4 +3283,4 @@ def compute_single_arm_design_multi_element_cell(treatments, sample_assay_plan,
elements=[follow_up_map[0]]), follow_up_map[1]])
arm = StudyArm('ARM_00', group_size=group_size, arm_map=OrderedDict(arm_map))
design.add_study_arm(arm)
return design
return design
10 changes: 6 additions & 4 deletions isatools/isajson/validate.py
@@ -810,14 +810,16 @@ def check_study_groups(study_or_assay):
def validate(
fp,
config_dir=default_config_dir,
log_level=None,
log_level=logging.INFO,
base_schemas_dir="isa_model_version_1_0_schemas"
):
if config_dir is None:
config_dir = default_config_dir
if log_level in (
logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
logging.ERROR, logging.CRITICAL):
if log_level is None: #(
# logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING,
# logging.ERROR, logging.CRITICAL):
log.disabled = True
else:
log.setLevel(log_level)
log.info("ISA JSON Validator from ISA tools API v0.12.")
stream = StringIO()
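With this change, validate() logs at INFO by default and only disables the logger when log_level=None is passed explicitly. A minimal usage sketch against the signature shown above (the file name is hypothetical, and the errors/warnings shape of the returned report is assumed to match the ISA-Tab validator):

import logging
from isatools import isajson

with open('BII-S-3.json', encoding='utf-8') as fp:   # hypothetical file
    report = isajson.validate(fp, log_level=logging.WARNING)

print(report['errors'])   # assumed report structure, as noted above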
25 changes: 13 additions & 12 deletions isatools/isatab/dump/core.py
@@ -41,7 +41,7 @@ def dump(isa_obj, output_path,
raise NameError('Investigation file must match pattern i_*.txt, got {}'.format(i_file_name))

if path.exists(output_path):
fp = open(path.join(output_path, i_file_name), 'w', encoding='utf-8')
fp = open(path.join(output_path, i_file_name), 'wb')
else:
log.debug('output_path=', i_file_name)
raise FileNotFoundError("Can't find " + output_path)
@@ -55,7 +55,7 @@ def dump(isa_obj, output_path,

# Write ONTOLOGY SOURCE REFERENCE section
ontology_source_references_df = _build_ontology_reference_section(investigation.ontology_source_references)
fp.write('ONTOLOGY SOURCE REFERENCE\n')
fp.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))
# Need to set index_label as top left cell
ontology_source_references_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Term Source Name')
@@ -80,7 +80,7 @@ def dump(isa_obj, output_path,
inv_df_rows.append(comment.value)
investigation_df.loc[0] = inv_df_rows
investigation_df = investigation_df.set_index('Investigation Identifier').T
fp.write('INVESTIGATION\n')
fp.write(bytearray('INVESTIGATION\n', 'utf-8'))
investigation_df.to_csv(
path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Identifier')
@@ -90,14 +90,15 @@ def dump(isa_obj, output_path,
prefix='Investigation',
publications=investigation.publications
)
fp.write('INVESTIGATION PUBLICATIONS\n')
fp.write(bytearray('INVESTIGATION PUBLICATIONS\n', 'utf-8'))
investigation_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation PubMed ID')

# Write INVESTIGATION CONTACTS section
investigation_contacts_df = _build_contacts_section_df(
contacts=investigation.contacts)
fp.write('INVESTIGATION CONTACTS\n')
fp.write(bytearray('INVESTIGATION CONTACTS\n', 'utf-8'))

investigation_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Investigation Person Last Name')

@@ -127,40 +128,40 @@ def dump(isa_obj, output_path,
study_df_row.append(comment.value)
study_df.loc[0] = study_df_row
study_df = study_df.set_index('Study Identifier').T
fp.write('STUDY\n')
fp.write(bytearray('STUDY\n', 'utf-8'))
study_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8', index_label='Study Identifier')
study_design_descriptors_df = _build_design_descriptors_section(design_descriptors=study.design_descriptors)
fp.write('STUDY DESIGN DESCRIPTORS\n')
fp.write(bytearray('STUDY DESIGN DESCRIPTORS\n', 'utf-8'))
study_design_descriptors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Design Type')

# Write STUDY PUBLICATIONS section
study_publications_df = _build_publications_section_df(prefix='Study', publications=study.publications)
fp.write('STUDY PUBLICATIONS\n')
fp.write(bytearray('STUDY PUBLICATIONS\n', 'utf-8'))
study_publications_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study PubMed ID')

# Write STUDY FACTORS section
study_factors_df = _build_factors_section_df(factors=study.factors)
fp.write('STUDY FACTORS\n')
fp.write(bytearray('STUDY FACTORS\n', 'utf-8'))
study_factors_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Factor Name')

study_assays_df = _build_assays_section_df(assays=study.assays)
fp.write('STUDY ASSAYS\n')
fp.write(bytearray('STUDY ASSAYS\n', 'utf-8'))
study_assays_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Assay File Name')

# Write STUDY PROTOCOLS section
study_protocols_df = _build_protocols_section_df(protocols=study.protocols)
fp.write('STUDY PROTOCOLS\n')
fp.write(bytearray('STUDY PROTOCOLS\n', 'utf-8'))
study_protocols_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Protocol Name')

# Write STUDY CONTACTS section
study_contacts_df = _build_contacts_section_df(
prefix='Study', contacts=study.contacts)
fp.write('STUDY CONTACTS\n')
fp.write(bytearray('STUDY CONTACTS\n', 'utf-8'))
study_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8',
index_label='Study Person Last Name')

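The pattern in the hunks above is consistent: the investigation handle is opened in binary mode, each section title is written as UTF-8 bytes, and pandas appends the tab-separated section through the same handle with an explicit encoding, so the whole file is encoded in one place. A standalone sketch of that pattern with illustrative values (pandas >= 1.2 is assumed for to_csv on a binary buffer):

from io import BytesIO
import pandas as pd

buf = BytesIO()   # stands in for the 'wb' file handle used above
buf.write(bytearray('ONTOLOGY SOURCE REFERENCE\n', 'utf-8'))

section = pd.DataFrame({
    'Term Source Name': ['OBI'],
    'Term Source Description': ['Ontology for Biomedical Investigations'],
})
section.to_csv(path_or_buf=buf, mode='a', sep='\t', encoding='utf-8', index=False)

print(buf.getvalue().decode('utf-8'))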
49 changes: 34 additions & 15 deletions isatools/isatab/dump/write.py
@@ -3,7 +3,7 @@
from pandas import DataFrame
from numpy import nan

from isatools.constants import SYNONYMS
from isatools.constants import SYNONYMS, HEADER
from isatools.model import (
OntologyAnnotation,
Investigation,
@@ -22,8 +22,7 @@
get_pv_columns,
get_fv_columns,
get_characteristic_columns,
get_object_column_map,
get_column_header
get_object_column_map
)


@@ -60,7 +59,6 @@ def flatten(current_list):
paths = _all_end_to_end_paths(
s_graph,
[x for x in s_graph.nodes() if isinstance(s_graph.indexes[x], Source)])
log.warning(s_graph.nodes())

sample_in_path_count = 0
protocol_in_path_count = 0
@@ -221,7 +219,7 @@ def flatten(current_list):
DF = DF.replace('', nan)
DF = DF.dropna(axis=1, how='all')

with open(path.join(output_dir, study_obj.filename), 'w') as out_fp:
with open(path.join(output_dir, study_obj.filename), 'wb') as out_fp:
DF.to_csv(
path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8')

@@ -243,7 +241,13 @@ def write_assay_table_files(inv_obj, output_dir, write_factor_values=False):

if not isinstance(inv_obj, Investigation):
raise NotImplementedError
protocol_types_dict = load_protocol_types_info()
yaml_dict = load_protocol_types_info()
protocol_types_dict = {}
for protocol, attributes in yaml_dict.items():
protocol_types_dict[protocol] = attributes
for synonym in attributes[SYNONYMS]:
protocol_types_dict[synonym] = attributes

for study_obj in inv_obj.studies:
for assay_obj in study_obj.assays:
a_graph = assay_obj.graph
@@ -295,10 +299,17 @@ def flatten(current_list):
columns += flatten(map(lambda x: get_pv_columns(olabel, x),
node.parameter_values))
if node.executes_protocol.protocol_type:
oname_label = get_column_header(
node.executes_protocol.protocol_type.term,
protocol_types_dict
)
if isinstance(node.executes_protocol.protocol_type, OntologyAnnotation):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
@@ -361,10 +372,17 @@ def pbar(x):
protocol_in_path_count += 1
df_dict[olabel][-1] = node.executes_protocol.name
if node.executes_protocol.protocol_type:
oname_label = get_column_header(
node.executes_protocol.protocol_type.term,
protocol_types_dict
)
if isinstance(node.executes_protocol.protocol_type, OntologyAnnotation):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
Expand All @@ -378,6 +396,7 @@ def pbar(x):
node.name
df_dict["Array Design REF"][-1] = \
node.array_design_ref

if node.date is not None:
df_dict[olabel + ".Date"][-1] = node.date
if node.performer is not None:
@@ -496,7 +515,7 @@ def pbar(x):
DF = DF.dropna(axis=1, how='all')

with open(path.join(
output_dir, assay_obj.filename), 'w') as out_fp:
output_dir, assay_obj.filename), 'wb') as out_fp:
DF.to_csv(path_or_buf=out_fp, index=False, sep='\t',
encoding='utf-8')

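The two parallel hunks above replace the get_column_header helper with a direct lookup: the YAML-derived dict is flattened so that every synonym points at the same attribute record, and a protocol type resolves to a column header only if it is known and carries a non-empty 'header' entry. A self-contained sketch of that lookup (the YAML content shown is illustrative, not the packaged protocol-types file):

SYNONYMS, HEADER = 'synonyms', 'header'

# Illustrative stand-in for load_protocol_types_info()'s YAML payload.
yaml_dict = {
    'nucleic acid hybridization': {
        HEADER: 'Hybridization Assay Name',
        SYNONYMS: ['hybridization'],
    },
}

protocol_types_dict = {}
for protocol, attributes in yaml_dict.items():
    protocol_types_dict[protocol] = attributes
    for synonym in attributes[SYNONYMS]:   # synonyms share the record
        protocol_types_dict[synonym] = attributes

def column_header_for(protocol_type):
    # Same guard as in write.py: known type with a non-empty header.
    key = protocol_type.lower()
    if key in protocol_types_dict and protocol_types_dict[key][HEADER]:
        return protocol_types_dict[key][HEADER]
    return None

print(column_header_for('Hybridization'))   # Hybridization Assay Name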
3 changes: 1 addition & 2 deletions isatools/isatab/graph.py
@@ -17,7 +17,7 @@ def _all_end_to_end_paths(G, start_nodes):
num_start_nodes = len(start_nodes)
message = 'Calculating for paths for {} start nodes: '.format(
num_start_nodes)
log.info(start_nodes)
# log.info(start_nodes)
start_node = G.indexes[start_nodes[0]]
if isinstance(start_node, Source):
message = 'Calculating for paths for {} sources: '.format(
@@ -61,7 +61,6 @@ def _longest_path_and_attrs(paths, indexes):
:return: The longest path and attributes
"""
longest = (0, None)
log.info(paths)
for path in paths:
length = len(path)
for node in path:
6 changes: 6 additions & 0 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -378,6 +378,12 @@ def get_node_by_label_and_key(labl, this_key):
if comment_key not in [x.name for x in process.comments]:
process.comments.append(Comment(name=comment_key, value=str(object_series[comment_column])))

for performer in [c for c in column_group if c == 'Performer']:
process.performer = str(object_series[performer])

for date in [c for c in column_group if c == 'Date']:
process.date = str(object_series[date])

for _, object_series in DF.iterrows(): # don't drop duplicates
process_key_sequence = list()
source_node_context = None
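The two loops added above scan the row's column group for 'Performer' and 'Date' qualifier columns and copy their values onto the process being built. A reduced sketch of the same scan over a hypothetical row (the Process class here is a stand-in for isatools.model.Process):

import pandas as pd

class Process:   # stand-in for isatools.model.Process
    performer = None
    date = None

column_group = ['Protocol REF', 'Performer', 'Date']
object_series = pd.Series({
    'Protocol REF': 'sample collection',
    'Performer': 'A. Smith',   # hypothetical values
    'Date': '2024-05-29',
})

process = Process()
for performer in [c for c in column_group if c == 'Performer']:
    process.performer = str(object_series[performer])
for date in [c for c in column_group if c == 'Date']:
    process.date = str(object_series[date])

print(process.performer, process.date)   # A. Smith 2024-05-29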
9 changes: 7 additions & 2 deletions isatools/isatab/load/__init__.py
@@ -1,3 +1,8 @@
from isatools.isatab.load.read import read_investigation_file, read_tfile
from isatools.isatab.load.ProcessSequenceFactory import ProcessSequenceFactory, preprocess
from isatools.isatab.load.core import load, merge_study_with_assay_tables, load_table
from isatools.isatab.load.core import (
load,
merge_study_with_assay_tables,
load_table,
read_investigation_file,
read_tfile
)
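With the widened re-export above, read_investigation_file and read_tfile are importable from the loader package alongside load itself. A short usage sketch (the file path is hypothetical):

from isatools.isatab.load import load

with open('i_investigation.txt', encoding='utf-8') as fp:   # hypothetical path
    investigation = load(fp)
print(investigation.identifier)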