Skip to content

Commit

Permalink
fixes issue-511 and incorporates code from @ptth222 from PR #553
Browse files Browse the repository at this point in the history
  • Loading branch information
proccaserra committed Jul 9, 2024
1 parent db3c4a9 commit 84942ce
Show file tree
Hide file tree
Showing 13 changed files with 452 additions and 65 deletions.
4 changes: 4 additions & 0 deletions isatools/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@

QUALIFIER_LABELS = [
'Protocol REF',
'Performer',
'Date',
'Material Type',
'Term Source REF',
'Term Accession Number',
Expand All @@ -80,6 +82,8 @@

ALL_LABELS.append('Protocol REF')
ALL_LABELS.append('Label')
ALL_LABELS.append('Performer')
ALL_LABELS.append('Date')

_LABELS_ASSAY_NODES = [
'Assay Name',
Expand Down
137 changes: 80 additions & 57 deletions isatools/isatab/dump/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from isatools.isatab.defaults import log
from isatools.isatab.graph import _all_end_to_end_paths, _longest_path_and_attrs
from isatools.model.utils import _build_paths_and_indexes
from isatools.isatab.utils import (
get_comment_column,
get_pv_columns,
Expand Down Expand Up @@ -260,24 +261,24 @@ def flatten(current_list):

columns = []

# start_nodes, end_nodes = _get_start_end_nodes(a_graph)
paths = _all_end_to_end_paths(
a_graph, [x for x in a_graph.nodes()
if isinstance(a_graph.indexes[x], Sample)])
paths, indexes = _build_paths_and_indexes(assay_obj.process_sequence)

if len(paths) == 0:
log.info("No paths found, skipping writing assay file")
continue
if _longest_path_and_attrs(paths, a_graph.indexes) is None:
if _longest_path_and_attrs(paths, indexes) is None:
raise IOError(
"Could not find any valid end-to-end paths in assay graph")

protocol_in_path_count = 0
for node_index in _longest_path_and_attrs(paths, a_graph.indexes):
node = a_graph.indexes[node_index]
output_label_in_path_counts = {}
name_label_in_path_counts = {}
header_count: dict[str, int] = {}

for node_index in _longest_path_and_attrs(paths, indexes):
node = indexes[node_index]
if isinstance(node, Sample):
olabel = "Sample Name"
# olabel = "Sample Name.{}".format(sample_in_path_count)
# sample_in_path_count += 1
columns.append(olabel)
columns += flatten(
map(lambda x: get_comment_column(olabel, x),
Expand Down Expand Up @@ -305,30 +306,25 @@ def flatten(current_list):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:

if protocol_type in protocol_types_dict and protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
columns.append(oname_label)

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.append("Array Design REF")


if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
header_count[oname_label] = 0
new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])

columns.append(new_oname_label)
name_label_in_path_counts[oname_label] += 1

if protocol_type in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.extend(["Array Design REF"])

columns += flatten(
map(lambda x: get_comment_column(olabel, x),
node.comments))

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
if output.label not in columns:
columns.append(output.label)
columns += flatten(
map(lambda x: get_comment_column(output.label, x),
output.comments))
print(columns)
elif isinstance(node, Material):
olabel = node.type
columns.append(olabel)
Expand All @@ -340,7 +336,17 @@ def flatten(current_list):
node.comments))

elif isinstance(node, DataFile):
pass # handled in process
# pass # handled in process
output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])

columns.append(new_output_label)
output_label_in_path_counts[output_label] += 1
columns += flatten(
map(lambda x: get_comment_column(new_output_label, x),
node.comments))

omap = get_object_column_map(columns, columns)

Expand All @@ -355,8 +361,10 @@ def pbar(x):
df_dict[k].extend([""])

protocol_in_path_count = 0
output_label_in_path_counts = {}
name_label_in_path_counts = {}
for node_index in path_:
node = a_graph.indexes[node_index]
node = indexes[node_index]
if isinstance(node, Process):
olabel = "Protocol REF.{}".format(protocol_in_path_count)
protocol_in_path_count += 1
Expand All @@ -366,20 +374,20 @@ def pbar(x):
protocol_type = node.executes_protocol.protocol_type.term.lower()
else:
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:

if protocol_type in protocol_types_dict and protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
df_dict[oname_label][-1] = node.name

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0

new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
df_dict[new_oname_label][-1] = node.name
name_label_in_path_counts[oname_label] += 1

if protocol_type in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
df_dict["Array Design REF"][-1] = node.array_design_ref

if node.date is not None:
df_dict[olabel + ".Date"][-1] = node.date
if node.performer is not None:
Expand All @@ -391,18 +399,18 @@ def pbar(x):
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
output_by_type = []
delim = ";"
olabel = output.label
if output.label not in columns:
columns.append(output.label)
output_by_type.append(output.filename)
df_dict[olabel][-1] = delim.join(map(str, output_by_type))

for co in output.comments:
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value
# for output in [x for x in node.outputs if isinstance(x, DataFile)]:
# output_by_type = []
# delim = ";"
# olabel = output.label
# if output.label not in columns:
# columns.append(output.label)
# output_by_type.append(output.filename)
# df_dict[olabel][-1] = delim.join(map(str, output_by_type))
#
# for co in output.comments:
# colabel = "{0}.Comment[{1}]".format(olabel, co.name)
# df_dict[colabel][-1] = co.value

elif isinstance(node, Sample):
olabel = "Sample Name"
Expand Down Expand Up @@ -434,7 +442,19 @@ def pbar(x):
df_dict[colabel][-1] = co.value

elif isinstance(node, DataFile):
pass # handled in process
# pass # handled in process

output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
df_dict[new_output_label][-1] = node.filename
output_label_in_path_counts[output_label] += 1

for co in node.comments:
colabel = "{0}.Comment[{1}]".format(
new_output_label, co.name)
df_dict[colabel][-1] = co.value

DF = DataFrame(columns=columns)
DF = DF.from_dict(data=df_dict)
Expand Down Expand Up @@ -482,6 +502,11 @@ def pbar(x):
columns[i] = "Protocol REF"
elif "." in col:
columns[i] = col[:col.rindex(".")]
else:
for output_label in output_label_in_path_counts:
if output_label in col:
columns[i] = output_label
break

log.debug("Rendered {} paths".format(len(DF.index)))
if len(DF.index) > 1:
Expand Down Expand Up @@ -521,8 +546,6 @@ def write_value_columns(df_dict, label, x):
elif x.unit.term_source.name:
df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name

# df_dict[label + ".Unit.Term Source REF"][-1] = \
# x.unit.term_source.name if x.unit.term_source else ""
df_dict[label + ".Unit.Term Accession Number"][-1] = \
x.unit.term_accession
else:
Expand Down
6 changes: 3 additions & 3 deletions isatools/isatab/load/ProcessSequenceFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def create_from_df(self, DF):
except KeyError:
pass

for data_col in [x for x in DF.columns if x.endswith(" File")]:
for data_col in [x for x in DF.columns if x in _LABELS_DATA_NODES]:
filenames = [x for x in DF[data_col].drop_duplicates() if x != '']
data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames)))

Expand All @@ -167,7 +167,7 @@ def get_node_by_label_and_key(labl, this_key):
n = samples[lk]
elif labl in ('Extract Name', 'Labeled Extract Name'):
n = other_material[lk]
elif labl.endswith(' File'):
elif labl in _LABELS_DATA_NODES:
n = data[lk]
return n

Expand Down Expand Up @@ -410,7 +410,7 @@ def get_node_by_label_and_key(labl, this_key):
process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF)
process_key_sequence.append(process_key)

if object_label.endswith(' File'):
if object_label in _LABELS_DATA_NODES:
data_node = None
try:
data_node = get_node_by_label_and_key(object_label, str(object_series[object_label]))
Expand Down
2 changes: 1 addition & 1 deletion isatools/isatab/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def get_object_column_map(isatab_header, df_columns):
"""
labels = _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES
if set(isatab_header) == set(df_columns):
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x]
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x or ' File' in x]
else:
object_index = [i for i, x in enumerate(isatab_header) if x in labels + ['Protocol REF']]

Expand Down
2 changes: 2 additions & 0 deletions isatools/isatab/validate/rules/rules_40xx.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,8 @@ def load_table_checks(df, filename):
'Extract Name',
'Labeled Extract Name',
'Protocol REF',
'Performer',
'Date',
'Raw Data File',
'Raw Spectral Data File',
'Free Induction Decay Data File',
Expand Down
Loading

0 comments on commit 84942ce

Please sign in to comment.