From 75b746c5f65a0b5f21316b2796431b32f696ada4 Mon Sep 17 00:00:00 2001 From: Justin Hiemstra Date: Tue, 10 Oct 2023 19:49:05 +0000 Subject: [PATCH] Move container framework to algorith run call and fix hash lengths This commit moves the container framework up a level so that each algorithm is called with the configured framework instead of determining the framework internally. The logic behind this is that doing so makes testing much easier; instead of messing with global configs in the tests, just call the PRM's run function with a different framework specified. This also moves prepare_volume to `containers.py` to fix the issue with importing hash lengths in the function. Arguably, I should have moved prepare_volume into containers.py in the first place. --- Snakefile | 1 + config/config.yaml | 7 ++--- src/allpairs.py | 6 ++--- src/analysis/cytoscape.py | 3 +-- src/config.py | 10 ++++++-- src/containers.py | 40 +++++++++++++++++++++++++++++ src/domino.py | 8 +++--- src/meo.py | 7 ++--- src/mincostflow.py | 6 ++--- src/omicsintegrator1.py | 8 ++---- src/omicsintegrator2.py | 7 +++-- src/pathlinker.py | 6 ++--- src/util.py | 41 ------------------------------ test/AllPairs/test_ap.py | 6 ++--- test/DOMINO/test_domino.py | 6 ++--- test/MEO/test_meo.py | 5 ++-- test/MinCostFlow/test_mcf.py | 5 ++-- test/OmicsIntegrator1/test_oi1.py | 5 ++-- test/OmicsIntegrator2/test_oi2.py | 5 ++-- test/PathLinker/test_pathlinker.py | 4 +-- test/test_util.py | 10 ++------ 21 files changed, 86 insertions(+), 110 deletions(-) diff --git a/Snakefile b/Snakefile index 5fa1ab60..d9a8288b 100644 --- a/Snakefile +++ b/Snakefile @@ -212,6 +212,7 @@ rule reconstruct: params.pop('spras_placeholder') # TODO consider the best way to pass global configuration information to the run functions # This approach requires that all run functions support a singularity option + params['container_framework'] = FRAMEWORK runner.run(wildcards.algorithm, params) # Original pathway reconstruction output to universal output diff --git a/config/config.yaml b/config/config.yaml index 986c7670..cf0b449f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -2,9 +2,10 @@ # The length of the hash used to identify a parameter combination hash_length: 7 -# If true, use Singularity instead of Docker -# Singularity support is only available on Unix -singularity: false + +# Specify the container framework. Current supported versions include 'docker' and +# 'singularity'. If container_framework is not specified, SPRAS will default to docker. +container_framework: docker # Allow the user to configure which container registry containers should be pulled from # Note that this assumes container names are consistent across registries, and that the diff --git a/src/allpairs.py b/src/allpairs.py index eb751381..1a0c30fc 100644 --- a/src/allpairs.py +++ b/src/allpairs.py @@ -4,9 +4,8 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.prm import PRM -from src.util import prepare_volume __all__ = ['AllPairs'] @@ -50,7 +49,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(nodetypes=None, network=None, output_file=None): + def run(nodetypes=None, network=None, output_file=None, container_framework="docker"): """ Run All Pairs Shortest Paths with Docker @param nodetypes: input node types with sources and targets (required) @@ -85,7 +84,6 @@ def run(nodetypes=None, network=None, output_file=None): print('Running All Pairs Shortest Paths with arguments: {}'.format(' '.join(command)), flush=True) - container_framework = config.config.container_framework container_suffix = "allpairs" out = run_container( diff --git a/src/analysis/cytoscape.py b/src/analysis/cytoscape.py index a6c9681c..7ab18622 100644 --- a/src/analysis/cytoscape.py +++ b/src/analysis/cytoscape.py @@ -3,8 +3,7 @@ from typing import List, Union import src.config as config -from src.containers import run_container -from src.util import prepare_volume +from src.containers import prepare_volume, run_container def run_cytoscape(pathways: List[Union[str, PurePath]], output_file: str) -> None: diff --git a/src/config.py b/src/config.py index 28e3d201..16ff9756 100644 --- a/src/config.py +++ b/src/config.py @@ -64,8 +64,14 @@ def process_config(self, raw_config): # Set up a few top-level config variables self.out_dir = raw_config["reconstruction_settings"]["locations"]["reconstruction_dir"] - if "singularity" in raw_config and raw_config["singularity"]: - self.container_framework = "singularity" + # We allow the container framework not to be defined in the config. In the case it isn't, default to docker. + # However, if we get a bad value, we raise an exception. + if "container_framework" in raw_config: + container_framework = raw_config["container_framework"].lower() + if container_framework not in ("docker", "singularity"): + msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker' or 'singularity" + raise ValueError(msg) + self.container_framework = container_framework else: self.container_framework = "docker" diff --git a/src/containers.py b/src/containers.py index 8effaafc..e20d8285 100644 --- a/src/containers.py +++ b/src/containers.py @@ -7,6 +7,7 @@ import docker import src.config as config +from src.util import hash_filename def prepare_path_docker(orig_path: PurePath) -> str: @@ -194,3 +195,42 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ command, options=singularity_options, bind=bind_paths) + +# Because this is called independently for each file, the same local path can be mounted to multiple volumes +def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: + """ + Makes a file on the local file system accessible within a container by mapping the local (source) path to a new + container (destination) path and renaming the file to be relative to the destination path. + The destination path will be a new path relative to the volume_base that includes a hash identifier derived from the + original filename. + An example mapped filename looks like '/spras/MG4YPNK/oi1-edges.txt'. + @param filename: The file on the local file system to map + @param volume_base: The base directory in the container, which must be an absolute directory + @return: first returned object is a tuple (source path, destination path) and the second returned object is the + updated filename relative to the destination path + """ + base_path = PurePosixPath(volume_base) + if not base_path.is_absolute(): + raise ValueError(f'Volume base must be an absolute path: {volume_base}') + + if isinstance(filename, PurePath): + filename = str(filename) + + # There's no clear way to get DEFAULT_HASH_LENGTH from config without a circular import... + # For now, hardcoding the value to 7, since it appeared the value wasn't updated by + # config.yaml before anyway. + filename_hash = hash_filename(filename, config.config.hash_length) + dest = PurePosixPath(base_path, filename_hash) + + abs_filename = Path(filename).resolve() + container_filename = str(PurePosixPath(dest, abs_filename.name)) + if abs_filename.is_dir(): + dest = PurePosixPath(dest, abs_filename.name) + src = abs_filename + else: + parent = abs_filename.parent + if parent.as_posix() == '.': + parent = Path.cwd() + src = parent + + return (src, dest), container_filename diff --git a/src/domino.py b/src/domino.py index 08788516..9418380f 100644 --- a/src/domino.py +++ b/src/domino.py @@ -4,9 +4,8 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.prm import PRM -from src.util import prepare_volume __all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform'] @@ -55,7 +54,7 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(network=None, active_genes=None, output_file=None, slice_threshold=None, module_threshold=None): + def run(network=None, active_genes=None, output_file=None, slice_threshold=None, module_threshold=None, container_framework="docker"): """ Run DOMINO with Docker. Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. @@ -98,7 +97,6 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, print('Running slicer with arguments: {}'.format(' '.join(slicer_command)), flush=True) - container_framework = config.config.container_framework container_suffix = "domino" slicer_out = run_container(container_framework, container_suffix, @@ -127,7 +125,7 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, print('Running DOMINO with arguments: {}'.format(' '.join(domino_command)), flush=True) domino_out = run_container(container_framework, - 'reedcompbio/domino', + container_suffix, domino_command, volumes, work_dir) diff --git a/src/meo.py b/src/meo.py index 05370091..6139b59d 100644 --- a/src/meo.py +++ b/src/meo.py @@ -3,9 +3,8 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.prm import PRM -from src.util import prepare_volume __all__ = ['MEO', 'write_properties'] @@ -87,7 +86,7 @@ def generate_inputs(data, filename_map): # TODO document required arguments @staticmethod def run(edges=None, sources=None, targets=None, output_file=None, max_path_length=None, local_search=None, - rand_restarts=None): + rand_restarts=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -139,8 +138,6 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt print('Running Maximum Edge Orientation with arguments: {}'.format(' '.join(command)), flush=True) - # TODO consider making this a string in the config file instead of a Boolean - container_framework = config.config.container_framework container_suffix = "meo" out = run_container(container_framework, container_suffix, diff --git a/src/mincostflow.py b/src/mincostflow.py index 68860e2c..74a82c88 100644 --- a/src/mincostflow.py +++ b/src/mincostflow.py @@ -3,9 +3,8 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.prm import PRM -from src.util import prepare_volume __all__ = ['MinCostFlow'] @@ -43,7 +42,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['edges'], sep='\t', index=False, columns=["Interactor1","Interactor2","Weight"], header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, flow=None, capacity=None, singularity=False): + def run(sources=None, targets=None, edges=None, output_file=None, flow=None, capacity=None, container_framework="docker"): """ Run min cost flow with Docker (or singularity) @param sources: input sources (required) @@ -96,7 +95,6 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap command.extend(['--capacity', str(capacity)]) # choosing to run in docker or singularity container - container_framework = config.config.container_framework container_suffix = "mincostflow" # constructs a docker run call diff --git a/src/omicsintegrator1.py b/src/omicsintegrator1.py index 82856273..024f4682 100644 --- a/src/omicsintegrator1.py +++ b/src/omicsintegrator1.py @@ -3,9 +3,8 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.prm import PRM -from src.util import prepare_volume __all__ = ['OmicsIntegrator1', 'write_conf'] @@ -79,7 +78,7 @@ def generate_inputs(data, filename_map): @staticmethod def run(edges=None, prizes=None, dummy_mode=None, mu_squared=None, exclude_terms=None, output_file=None, noisy_edges=None, shuffled_prizes=None, random_terminals=None, - seed=None, w=None, b=None, d=None, mu=None, noise=None, g=None, r=None): + seed=None, w=None, b=None, d=None, mu=None, noise=None, g=None, r=None, container_framework="docker"): """ Run Omics Integrator 1 in the Docker image with the provided parameters. Does not support the garnet, cyto30, knockout, cv, or cv-reps arguments. @@ -145,10 +144,7 @@ def run(edges=None, prizes=None, dummy_mode=None, mu_squared=None, exclude_terms print('Running Omics Integrator 1 with arguments: {}'.format(' '.join(command)), flush=True) - # TODO consider making this a string in the config file instead of a Boolean - container_framework = config.config.container_framework container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default - out = run_container(container_framework, container_suffix, # no-conda version is the default command, diff --git a/src/omicsintegrator2.py b/src/omicsintegrator2.py index 108b7b9d..aec1f4dc 100644 --- a/src/omicsintegrator2.py +++ b/src/omicsintegrator2.py @@ -3,10 +3,10 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.dataset import Dataset from src.prm import PRM -from src.util import add_rank_column, prepare_volume +from src.util import add_rank_column __all__ = ['OmicsIntegrator2'] @@ -52,7 +52,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO document required arguments @staticmethod def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise=None, noisy_edges=None, - random_terminals=None, dummy_mode=None, seed=None): + random_terminals=None, dummy_mode=None, seed=None, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. @@ -103,7 +103,6 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise print('Running Omics Integrator 2 with arguments: {}'.format(' '.join(command)), flush=True) - container_framework = config.config.container_framework container_suffix = "omics-integrator-2:v2" out = run_container(container_framework, container_suffix, diff --git a/src/pathlinker.py b/src/pathlinker.py index a724d03e..f2fa55b9 100644 --- a/src/pathlinker.py +++ b/src/pathlinker.py @@ -4,9 +4,8 @@ import pandas as pd import src.config as config -from src.containers import run_container +from src.containers import prepare_volume, run_container from src.prm import PRM -from src.util import prepare_volume __all__ = ['PathLinker'] @@ -49,7 +48,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(nodetypes=None, network=None, output_file=None, k=None): + def run(nodetypes=None, network=None, output_file=None, k=None, container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) @@ -97,7 +96,6 @@ def run(nodetypes=None, network=None, output_file=None, k=None): print('Running PathLinker with arguments: {}'.format(' '.join(command)), flush=True) - container_framework = config.config.container_framework container_suffix = "pathlinker" out = run_container(container_framework, container_suffix, diff --git a/src/util.py b/src/util.py index 24d2bd84..505a7b58 100644 --- a/src/util.py +++ b/src/util.py @@ -42,47 +42,6 @@ def hash_filename(filename: str, length: Optional[int] = None) -> str: """ return hash_params_sha1_base32({'filename': filename}, length) - -# Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: - """ - Makes a file on the local file system accessible within a container by mapping the local (source) path to a new - container (destination) path and renaming the file to be relative to the destination path. - The destination path will be a new path relative to the volume_base that includes a hash identifier derived from the - original filename. - An example mapped filename looks like '/spras/MG4YPNK/oi1-edges.txt'. - @param filename: The file on the local file system to map - @param volume_base: The base directory in the container, which must be an absolute directory - @return: first returned object is a tuple (source path, destination path) and the second returned object is the - updated filename relative to the destination path - """ - base_path = PurePosixPath(volume_base) - if not base_path.is_absolute(): - raise ValueError(f'Volume base must be an absolute path: {volume_base}') - - if isinstance(filename, PurePath): - filename = str(filename) - - # There's no clear way to get DEFAULT_HASH_LENGTH from config without a circular import... - # For now, hardcoding the value to 7, since it appeared the value wasn't updated by - # config.yaml before anyway. - from src.config import DEFAULT_HASH_LENGTH - filename_hash = hash_filename(filename, DEFAULT_HASH_LENGTH) - dest = PurePosixPath(base_path, filename_hash) - - abs_filename = Path(filename).resolve() - container_filename = str(PurePosixPath(dest, abs_filename.name)) - if abs_filename.is_dir(): - dest = PurePosixPath(dest, abs_filename.name) - src = abs_filename - else: - parent = abs_filename.parent - if parent.as_posix() == '.': - parent = Path.cwd() - src = parent - - return (src, dest), container_filename - def make_required_dirs(path: str): """ Create the directory and parent directories required before an output file can be written to the specified path. diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index dc0af7ec..ea044634 100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -44,13 +44,11 @@ def test_allpairs_singularity(self): out_path = Path(OUT_DIR+'sample-out.txt') out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - config.config.container_framework = "singularity" AllPairs.run( nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', network=TEST_DIR+'input/sample-in-net.txt', - output_file=str(out_path) - ) - config.config.container_framework = "docker" + output_file=str(out_path), + container_framework="singularity") assert out_path.exists() def test_allpairs_correctness(self): diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 06ef96f2..0f56090b 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -85,13 +85,13 @@ def test_domino_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - config.config.container_framework = "singularity" DOMINO.run( network=TEST_DIR+'input/domino-network.txt', active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + output_file=OUT_FILE_DEFAULT, + container_framework="singularity") assert out_path.exists() - config.config.container_framework = "docker" + def test_pre_id_transform(self): """ Test the node ID transformation run before DOMINO executes diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index 1940eccb..1beeb06e 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -62,10 +62,9 @@ def test_meo_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - config.config.container_framework = "singularity" MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', sources=TEST_DIR + 'input/meo-sources.txt', targets=TEST_DIR + 'input/meo-targets.txt', - output_file=OUT_FILE) - config.config.container_framework = "docker" + output_file=OUT_FILE, + container_framework="singularity") assert out_path.exists() diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index a14649b8..507bca90 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -106,13 +106,12 @@ def test_mincostflow_singularity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - config.config.container_framework = "singularity" MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', targets=TEST_DIR + 'input/' + graph + '/targets.txt', edges=TEST_DIR + 'input/' + graph + '/edges.txt', output_file=OUT_FILE, flow=1, - capacity=1) - config.config.container_framework = "docker" + capacity=1, + container_framework="singularity") assert out_path.exists() diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index 1bfd1342..806f580f 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -88,12 +88,11 @@ def test_oi1_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - config.config.container_framework = "singularity" OmicsIntegrator1.run(edges=TEST_DIR + 'input/oi1-edges.txt', prizes=TEST_DIR + 'input/oi1-prizes.txt', output_file=OUT_FILE, w=5, b=1, - d=10) - config.config.container_framework = "docker" + d=10, + container_framework="singularity") assert out_path.exists() diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 46429233..b5dc1191 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -64,9 +64,8 @@ def test_oi2_missing(self): def test_oi2_singularity(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - config.config.container_framework = "singularity" OmicsIntegrator2.run(edges=EDGE_FILE, prizes=PRIZE_FILE, - output_file=OUT_FILE) - config.config.container_framework = "docker" + output_file=OUT_FILE, + container_framework="singularity") assert OUT_FILE.exists() diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index bf441646..bbb76a4a 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -56,11 +56,9 @@ def test_pathlinker_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - config.config.container_framework = "singularity" PathLinker.run( nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', network=TEST_DIR+'input/sample-in-net.txt', output_file=OUT_FILE_DEFAULT, - ) - config.config.container_framework = "docker" + container_framework="singularity") assert out_path.exists() diff --git a/test/test_util.py b/test/test_util.py index 15f7187e..08dd1e42 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -2,14 +2,8 @@ import pytest -from src.containers import ( - convert_docker_path, - prepare_path_docker, -) -from src.util import ( - hash_params_sha1_base32, - prepare_volume, -) +from src.containers import convert_docker_path, prepare_path_docker, prepare_volume +from src.util import hash_params_sha1_base32 class TestUtil: