-
Notifications
You must be signed in to change notification settings - Fork 40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add tracking to data file type column names 2. #553
Changes from 3 commits
2cb095b
ca12fe4
e50e65e
bc33a2f
ed25daa
ad22cae
4a8b735
770db3c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
import re | ||
|
||
from isatools.isatab.utils import process_keygen, find_lt, find_gt, pairwise, get_object_column_map, get_value | ||
from isatools.isatab.defaults import ( | ||
log, | ||
|
@@ -146,11 +148,12 @@ def create_from_df(self, DF): | |
except KeyError: | ||
pass | ||
|
||
for data_col in [x for x in DF.columns if x.endswith(" File")]: | ||
for data_col in [x for x in DF.columns if " File" in x]: | ||
label = re.match(r'(.* File)', data_col).group(0) | ||
filenames = [x for x in DF[data_col].drop_duplicates() if x != ''] | ||
data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames))) | ||
data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=label)), filenames))) | ||
|
||
node_cols = [i for i, c in enumerate(DF.columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES] | ||
node_cols = [i for i, c in enumerate(DF.columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES or ' File' in c] | ||
proc_cols = [i for i, c in enumerate(DF.columns) if c.startswith("Protocol REF")] | ||
|
||
try: | ||
|
@@ -167,7 +170,7 @@ def get_node_by_label_and_key(labl, this_key): | |
n = samples[lk] | ||
elif labl in ('Extract Name', 'Labeled Extract Name'): | ||
n = other_material[lk] | ||
elif labl.endswith(' File'): | ||
elif ' File' in labl: | ||
n = data[lk] | ||
return n | ||
|
||
|
@@ -260,7 +263,7 @@ def get_node_by_label_and_key(labl, this_key): | |
fv_set.add(fv) | ||
material.factor_values = list(fv_set) | ||
|
||
elif object_label in _LABELS_DATA_NODES: | ||
elif object_label in _LABELS_DATA_NODES or ' File' in object_label: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. object_label="foo File" would cause issue There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After discussing in meeting. I will remove this and make the code always just look in _LABELS_DATA_NODES. |
||
for _, object_series in DF[column_group].drop_duplicates().iterrows(): | ||
try: | ||
data_file = get_node_by_label_and_key(object_label, str(object_series[object_label])) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
import itertools | ||
|
||
import networkx as nx | ||
from hashlib import md5, sha1, sha256, blake2b | ||
import os | ||
|
@@ -18,6 +20,154 @@ def find(predictor, iterable): | |
return None, it | ||
|
||
|
||
def _build_paths_and_indexes(process_sequence=None): | ||
"""Returns the paths from source/sample to end points and a mapping of sequence_identifier to object.""" | ||
|
||
def _compute_combinations(identifier_list, identifiers_to_objects): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
io_types = {} | ||
for identifier in identifier_list: | ||
io_object = identifiers_to_objects[identifier] | ||
if isinstance(io_object, DataFile): | ||
label = io_object.label | ||
if label not in io_types: | ||
io_types[label] = [identifier] | ||
else: | ||
io_types[label].append(identifier) | ||
else: | ||
if "Material" not in io_types: | ||
io_types["Material"] = [identifier] | ||
else: | ||
io_types["Material"].append(identifier) | ||
combinations = [item for item in list(itertools.product(*[values for values in io_types.values()])) if item] | ||
return combinations | ||
|
||
## Determining paths depends on processes having next and prev sequence, so add them if they aren't there | ||
## based on inputs and outputs. | ||
inputs_to_process = {id(p_input):{"process":process, "input":p_input} for process in process_sequence for p_input in process.inputs} | ||
outputs_to_process = {id(output):{"process":process, "output":output} for process in process_sequence for output in process.outputs} | ||
for output, output_dict in outputs_to_process.items(): | ||
if output in inputs_to_process: | ||
if not inputs_to_process[output]["process"].prev_process: | ||
inputs_to_process[output]["process"].prev_process = output_dict["process"] | ||
if not output_dict["process"].next_process: | ||
output_dict["process"].next_process = inputs_to_process[output]["process"] | ||
|
||
paths = [] | ||
identifiers_to_objects = {} | ||
all_inputs = set() | ||
all_outputs = set() | ||
for process in process_sequence: | ||
|
||
identifiers_to_objects[process.sequence_identifier] = process | ||
for output in process.outputs: | ||
identifiers_to_objects[output.sequence_identifier] = output | ||
all_outputs.add(output.sequence_identifier) | ||
for input_ in process.inputs: | ||
identifiers_to_objects[input_.sequence_identifier] = input_ | ||
all_inputs.add(input_.sequence_identifier) | ||
|
||
|
||
original_process = process | ||
|
||
right_processes = [] | ||
while next_process := process.next_process: | ||
right_processes.append(next_process.sequence_identifier) | ||
process = next_process | ||
|
||
left_processes = [] | ||
process = original_process | ||
while prev_process := process.prev_process: | ||
left_processes.append(prev_process.sequence_identifier) | ||
process = prev_process | ||
left_processes = list(reversed(left_processes)) | ||
|
||
paths.append(left_processes + [original_process.sequence_identifier] + right_processes) | ||
|
||
|
||
unique_paths = [list(x) for x in set(tuple(x) for x in paths)] | ||
paths = unique_paths | ||
dead_end_outputs = all_outputs - all_inputs | ||
|
||
## Add paths based on inputs and outputs. | ||
str_path_to_path = {} | ||
was_path_modified = {} | ||
paths_seen = [] | ||
paths_seen_twice = [] | ||
while True: | ||
new_paths = [] | ||
paths_seen_changed = False | ||
for path in paths: | ||
str_path = str(path) | ||
str_path_to_path[str_path] = path | ||
if path not in paths_seen: | ||
paths_seen.append(path) | ||
paths_seen_changed = True | ||
else: | ||
paths_seen_twice.append(path) | ||
continue | ||
path_len = len(path) | ||
path_modified = False | ||
for i, identifier in enumerate(path): | ||
node = identifiers_to_objects[identifier] | ||
|
||
if i == 0 and isinstance(node, Process): | ||
identifier_list = [input_.sequence_identifier for input_ in node.inputs] | ||
combinations = _compute_combinations(identifier_list, identifiers_to_objects) | ||
for combo in combinations: | ||
new_path = list(combo) + path | ||
path_modified = True | ||
if new_path not in new_paths: | ||
new_paths.append(new_path) | ||
continue | ||
|
||
if i == path_len - 1 and isinstance(node, Process): | ||
identifier_list = [output.sequence_identifier for output in node.outputs] | ||
combinations = _compute_combinations(identifier_list, identifiers_to_objects) | ||
for combo in combinations: | ||
new_path = path + list(combo) | ||
path_modified = True | ||
if new_path not in new_paths: | ||
new_paths.append(new_path) | ||
continue | ||
|
||
if i + 1 < path_len and isinstance(identifiers_to_objects[path[i+1]], Process) and i > 0 and isinstance(node, Process): | ||
output_sequence_identifiers = {output.sequence_identifier for output in node.outputs} | ||
input_sequence_identifiers = {input_.sequence_identifier for input_ in identifiers_to_objects[path[i+1]].inputs} | ||
identifier_intersection = output_sequence_identifiers.intersection(input_sequence_identifiers) | ||
|
||
combinations = _compute_combinations(identifier_intersection, identifiers_to_objects) | ||
for combo in combinations: | ||
new_path = path[0:i+1] + list(combo) + path[i+1:] | ||
path_modified = True | ||
if new_path not in new_paths: | ||
new_paths.append(new_path) | ||
|
||
## Add outputs that aren't later used as inputs. | ||
for output in output_sequence_identifiers.intersection(dead_end_outputs): | ||
new_path = path[:i+1] + [output] | ||
path_modified = True | ||
if new_path not in new_paths: | ||
new_paths.append(new_path) | ||
continue | ||
## This is supposed to catch different length paths. | ||
if not path_modified and path not in new_paths: | ||
new_paths.append(path) | ||
|
||
if str_path in was_path_modified: | ||
if path_modified: | ||
was_path_modified[str_path] = path_modified | ||
else: | ||
was_path_modified[str_path] = path_modified | ||
if not paths_seen_changed: | ||
break | ||
paths = new_paths | ||
|
||
|
||
paths = [str_path_to_path[path] for path, was_modified in was_path_modified.items() if not was_modified] | ||
|
||
return paths, identifiers_to_objects | ||
|
||
|
||
def _build_assay_graph(process_sequence=None): | ||
""":obj:`networkx.DiGraph` Returns a directed graph object based on a | ||
given ISA process sequence.""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
see comment above, same logic