Add tracking to data file type column names 2. #553

Closed · wants to merge 8 commits
Changes from 5 commits
109 changes: 63 additions & 46 deletions isatools/isatab/dump/write.py
@@ -16,6 +16,7 @@
)
from isatools.isatab.defaults import log
from isatools.isatab.graph import _all_end_to_end_paths, _longest_path_and_attrs
from isatools.model.utils import _build_paths_and_indexes
from isatools.isatab.utils import (
get_comment_column,
get_pv_columns,
@@ -260,24 +261,21 @@ def flatten(current_list):

columns = []

# start_nodes, end_nodes = _get_start_end_nodes(a_graph)
paths = _all_end_to_end_paths(
a_graph, [x for x in a_graph.nodes()
if isinstance(a_graph.indexes[x], Sample)])
paths, indexes = _build_paths_and_indexes(assay_obj.process_sequence)
if len(paths) == 0:
log.info("No paths found, skipping writing assay file")
continue
if _longest_path_and_attrs(paths, a_graph.indexes) is None:
if _longest_path_and_attrs(paths, indexes) is None:
raise IOError(
"Could not find any valid end-to-end paths in assay graph")

protocol_in_path_count = 0
for node_index in _longest_path_and_attrs(paths, a_graph.indexes):
node = a_graph.indexes[node_index]
output_label_in_path_counts = {}
name_label_in_path_counts = {}
for node_index in _longest_path_and_attrs(paths, indexes):
node = indexes[node_index]
if isinstance(node, Sample):
olabel = "Sample Name"
# olabel = "Sample Name.{}".format(sample_in_path_count)
# sample_in_path_count += 1
columns.append(olabel)
columns += flatten(
map(lambda x: get_comment_column(olabel, x),
@@ -313,22 +311,21 @@ def flatten(current_list):
oname_label = None

if oname_label is not None:
columns.append(oname_label)

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.append("Array Design REF")

if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])

columns.append(new_oname_label)
name_label_in_path_counts[oname_label] += 1
elif node.executes_protocol.protocol_type.term.lower() \
in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.extend(
["Hybridization Assay Name",
"Array Design REF"])
Member:
@ptth222: doing the code review and trying to merge caused 2 tests to fail.
There are several issues we need to discuss, and the PR cannot be merged as is:

  • we never enter this elif at line 320
  • "Hybridization Assay Name" and "Array Design REF" are appended with ".0" even when there is only one occurrence. This prevents df_dict from retrieving the right key, raising a KeyError. We suggest a first pass that counts the number of headers and only appends the process number when there is more than one; see the sketch below.
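
A minimal sketch of that suggested two-pass counting, assuming the headers are available as a plain list of labels; `disambiguate_labels` is a hypothetical helper for illustration, not part of the isatools codebase:

```python
from collections import Counter

def disambiguate_labels(labels):
    """Append an index suffix only to labels that occur more than once.

    Two passes: pass 1 counts every label, pass 2 numbers only the
    repeated ones, so a label that appears a single time keeps its
    plain header name. Illustrative helper, not isatools code.
    """
    totals = Counter(labels)   # pass 1: occurrences per label
    seen = Counter()           # pass 2: running index per repeated label
    result = []
    for label in labels:
        if totals[label] > 1:
            result.append("{}.{}".format(label, seen[label]))
            seen[label] += 1
        else:
            result.append(label)  # single occurrence: leave untouched
    return result

# Only the repeated label gets numbered:
print(disambiguate_labels(
    ["Hybridization Assay Name", "Raw Data File", "Raw Data File"]))
# -> ['Hybridization Assay Name', 'Raw Data File.0', 'Raw Data File.1']
```

With this approach a lone "Hybridization Assay Name" or "Array Design REF" keeps its plain header, so df_dict lookups by that header would still succeed.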

Author:
I'm not sure why I made that an "elif"; it's been too long and I can't remember. I found a dataset that uses "nucleic acid hybridization" and used it to test with, so it should work now. I'm not sure what you mean about the KeyError; if you have a specific dataset that illustrates it, that would be helpful.

columns += flatten(
map(lambda x: get_comment_column(olabel, x),
node.comments))

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
if output.label not in columns:
columns.append(output.label)
columns += flatten(
map(lambda x: get_comment_column(output.label, x),
output.comments))
elif isinstance(node, Material):
olabel = node.type
columns.append(olabel)
@@ -340,7 +337,18 @@
node.comments))

elif isinstance(node, DataFile):
pass # handled in process
# pass # handled in process

output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])

columns.append(new_output_label)
output_label_in_path_counts[output_label] += 1
columns += flatten(
map(lambda x: get_comment_column(new_output_label, x),
node.comments))

omap = get_object_column_map(columns, columns)

@@ -355,8 +363,10 @@ def pbar(x):
df_dict[k].extend([""])

protocol_in_path_count = 0
output_label_in_path_counts = {}
name_label_in_path_counts = {}
for node_index in path_:
node = a_graph.indexes[node_index]
node = indexes[node_index]
if isinstance(node, Process):
olabel = "Protocol REF.{}".format(protocol_in_path_count)
protocol_in_path_count += 1
@@ -374,12 +384,19 @@
oname_label = None

if oname_label is not None:
df_dict[oname_label][-1] = node.name
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])

df_dict[new_oname_label][-1] = node.name
name_label_in_path_counts[oname_label] += 1
elif node.executes_protocol.protocol_type.term.lower() in \
Member:
See the comment above; the same logic applies here.

protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
df_dict["Hybridization Assay Name"][-1] = \
node.name
df_dict["Array Design REF"][-1] = \
node.array_design_ref

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
df_dict["Array Design REF"][-1] = node.array_design_ref

if node.date is not None:
df_dict[olabel + ".Date"][-1] = node.date
if node.performer is not None:
@@ -391,23 +408,8 @@ def pbar(x):
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
output_by_type = []
delim = ";"
olabel = output.label
if output.label not in columns:
columns.append(output.label)
output_by_type.append(output.filename)
df_dict[olabel][-1] = delim.join(map(str, output_by_type))

for co in output.comments:
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value

elif isinstance(node, Sample):
olabel = "Sample Name"
# olabel = "Sample Name.{}".format(sample_in_path_count)
# sample_in_path_count += 1
df_dict[olabel][-1] = node.name
for co in node.comments:
colabel = "{0}.Comment[{1}]".format(
@@ -434,7 +436,19 @@ def pbar(x):
df_dict[colabel][-1] = co.value

elif isinstance(node, DataFile):
pass # handled in process
# pass # handled in process

output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
df_dict[new_output_label][-1] = node.filename
output_label_in_path_counts[output_label] += 1

for co in node.comments:
colabel = "{0}.Comment[{1}]".format(
new_output_label, co.name)
df_dict[colabel][-1] = co.value

DF = DataFrame(columns=columns)
DF = DF.from_dict(data=df_dict)
@@ -482,6 +496,11 @@ def pbar(x):
columns[i] = "Protocol REF"
elif "." in col:
columns[i] = col[:col.rindex(".")]
else:
for output_label in output_label_in_path_counts:
if output_label in col:
columns[i] = output_label
break

log.debug("Rendered {} paths".format(len(DF.index)))
if len(DF.index) > 1:
@@ -521,8 +540,6 @@ def write_value_columns(df_dict, label, x):
elif x.unit.term_source.name:
df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name

# df_dict[label + ".Unit.Term Source REF"][-1] = \
# x.unit.term_source.name if x.unit.term_source else ""
df_dict[label + ".Unit.Term Accession Number"][-1] = \
x.unit.term_accession
else:
8 changes: 5 additions & 3 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -1,3 +1,5 @@
import re

from isatools.isatab.utils import process_keygen, find_lt, find_gt, pairwise, get_object_column_map, get_value
from isatools.isatab.defaults import (
log,
@@ -146,7 +148,7 @@ def create_from_df(self, DF):
except KeyError:
pass

for data_col in [x for x in DF.columns if x.endswith(" File")]:
for data_col in [x for x in DF.columns if x in _LABELS_DATA_NODES]:
filenames = [x for x in DF[data_col].drop_duplicates() if x != '']
data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames)))

@@ -167,7 +169,7 @@ def get_node_by_label_and_key(labl, this_key):
n = samples[lk]
elif labl in ('Extract Name', 'Labeled Extract Name'):
n = other_material[lk]
elif labl.endswith(' File'):
elif labl in _LABELS_DATA_NODES:
n = data[lk]
return n

@@ -410,7 +412,7 @@ def get_node_by_label_and_key(labl, this_key):
process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF)
process_key_sequence.append(process_key)

if object_label.endswith(' File'):
if object_label in _LABELS_DATA_NODES:
data_node = None
try:
data_node = get_node_by_label_and_key(object_label, str(object_series[object_label]))
2 changes: 1 addition & 1 deletion isatools/isatab/utils.py
@@ -496,7 +496,7 @@ def get_object_column_map(isatab_header, df_columns):
"""
labels = _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES
if set(isatab_header) == set(df_columns):
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x]
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x or ' File' in x]
else:
object_index = [i for i, x in enumerate(isatab_header) if x in labels + ['Protocol REF']]

Expand Down