Module execution: clone repository, checkout commit, execute run.sh

Daniel Incicau · Daniel Incicau · commit bd1ba6f9d6fe · 2024-06-27T14:00:28.000+02:00
diff --git a/Makefile b/Makefile
@@ -13,4 +13,4 @@ graph:
 	snakemake --dag | dot -Tpng > workflow_dag.png
 clean:
 	rm -f benchmark.pkl Snakefile
-	rm -rf ./in ./out ./log ./data/D1 ./data/D2 workflow_dag.png output_dag.png
+	rm -rf ./in ./out ./log ./data/D1 ./data/D2 workflow_dag.png output_dag.png ./.snakemake
diff --git a/Pipfile b/Pipfile
@@ -11,6 +11,7 @@ linkml-runtime = ">=1.7.0"
 rdflib = "==7.0.0"
 pyyaml = "~=6.0.1"
 omni-schema = {git = "https://github.com/omnibenchmark/omni-schema.git", editable = true, ref = "version/0.0.2"}
+gitpython = "*"
 
 [dev-packages]
 
diff --git a/data/Benchmark_001.yaml b/data/Benchmark_001.yaml
@@ -55,8 +55,8 @@ stages:
       - id: P2
         software_environment: "R"
         parameters:
-          - values: ["-a 0", "-c 0"]
-          - values: ["-a 1", "-c 0.1"]
+          - values: ["-a 0", "-b 0"]
+          - values: ["-a 1", "-b 0.1"]
         repository:
           url: https://github.com/omnibenchmark-example/process.git
           commit: 24579a8
@@ -107,17 +107,17 @@ stages:
       - id: m1
         software_environment: "python"
         repository:
-          url: git@github.com:omnibenchmark-example/metric.git
+          url: https://github.com/omnibenchmark-example/metric.git
           commit: ba781d7
       - id: m2
         software_environment: "python"
         repository:
-          url: git@github.com:omnibenchmark-example/metric.git
+          url: https://github.com/omnibenchmark-example/metric.git
           commit: ba781d7
       - id: m3
         software_environment: "python"
         repository:
-          url: git@github.com:omnibenchmark-example/metric.git
+          url: https://github.com/omnibenchmark-example/metric.git
           commit: ba781d7
     inputs:
       - entries: [
diff --git a/main.py b/main.py
@@ -63,7 +63,7 @@ def main(benchmark_file):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Test OmniWorkflow converter.')
-    parser.add_argument('--benchmark_file', default='data/Benchmark_002.yaml',
+    parser.add_argument('--benchmark_file', default='data/Benchmark_001.yaml',
                         type=str, help='Location of the benchmark file')
 
     args = parser.parse_args()
diff --git a/src/converter/linkml_converter.py b/src/converter/linkml_converter.py
@@ -7,6 +7,9 @@ def __init__(self, benchmark_file):
         super().__init__(benchmark_file)
         self.benchmark = load_yaml(benchmark_file)
 
+    def get_benchmark_name(self):
+        return self.benchmark.name or self.benchmark.id  # the name when set (truthy), otherwise the id
+
     def get_benchmark_definition(self):
         return self.benchmark
 
diff --git a/src/model/benchmark.py b/src/model/benchmark.py
@@ -10,6 +10,9 @@ def __init__(self, converter, output_folder='out'):
 
         self.execution_paths = None
 
+    def get_benchmark_name(self):
+        return self.converter.get_benchmark_name()  # delegate: the converter resolves the name
+
     def get_definition(self):
         return self.converter.get_benchmark_definition()
 
diff --git a/src/model/dag_operations.py b/src/model/dag_operations.py
@@ -26,8 +26,8 @@ def expend_stage_nodes(converter, stage, output_folder):
             for inputs in inputs_for_stage:
                 required_input_stages = set(converter.get_inputs_stage(inputs).values()) if inputs else None
                 most_recent_input_stage = sorted(list(required_input_stages), key=converter.stage_order)[-1] if inputs else None
-                inputs = converter.get_stage_explicit_inputs(inputs).values() if inputs else None
-                inputs = [x.replace('{input}', '{pre}') for x in inputs] if inputs else None
+                inputs = converter.get_stage_explicit_inputs(inputs) if inputs else None
+                inputs = {k: v.replace('{input}', '{pre}') for k, v in inputs.items()} if inputs else None
                 node = BenchmarkNode(converter, stage, module, param, inputs, outputs, param_id,
                                      after=most_recent_input_stage)
                 nodes.append(node)
diff --git a/src/model/node.py b/src/model/node.py
@@ -1,5 +1,7 @@
 import os.path
 
+from omni_schema.datamodel.omni_schema import Repository
+
 
 class BenchmarkNode:
     def __init__(self, converter,
@@ -22,14 +24,27 @@ def __init__(self, converter,
     def get_id(self):
         return BenchmarkNode.to_id(self.stage_id, self.module_id, self.param_id, self.after)
 
+    def get_benchmark_name(self):
+        return self.converter.get_benchmark_name()  # delegate: the converter resolves the name
+
     def get_definition(self):
         return self.converter.get_benchmark_definition()
 
     def get_definition_file(self):
         return self.converter.benchmark_file
 
     def get_inputs(self):
-        return self.inputs if self.inputs else []
+        """Return the input values as a list; empty when the node has no inputs."""
+        return list(self.inputs.values()) if self.inputs else []
+
+    def get_inputs_dict(self):
+        """Return the node's inputs mapping unchanged, or an empty dict when it has none."""
+        return self.inputs if self.inputs else {}
+
+    def get_explicit_inputs(self):
+        """Return the converter's explicit inputs for each implicit-input entry of this node's stage."""
+        implicit_inputs = self.converter.get_stage_implicit_inputs(self.stage)
+        return [self.converter.get_stage_explicit_inputs(entry) for entry in implicit_inputs]
 
     def get_input_paths(self):
         input_paths = []
@@ -53,6 +68,9 @@ def get_output_paths(self):
     def get_parameters(self):
         return self.parameters
 
+    def get_repository(self):
+        return self.converter.get_module_repository(module=self.module)  # callers guard for a falsy result
+
     def is_initial(self):
         return self.converter.is_initial(self.stage)
 
diff --git a/src/workflow/snakemake/format/formatter.py b/src/workflow/snakemake/format/formatter.py
@@ -1,7 +1,7 @@
 import re
 from itertools import takewhile
 from pathlib import Path
-from typing import List, Set, Tuple, Union, NamedTuple
+from typing import List, Set, Tuple, Union, NamedTuple, Dict, Any
 
 from src.model import BenchmarkNode, Benchmark
 
@@ -25,7 +25,7 @@ def format_output_templates_to_be_expanded(node: BenchmarkNode) -> List[str]:
     return outputs
 
 
-def format_input_templates_to_be_expanded(benchmark: Benchmark, wildcards: Wildcards) -> List[str]:
+def format_input_templates_to_be_expanded(benchmark: Benchmark, wildcards: Wildcards, return_as_dict=False) -> dict[str, str] | list[str]:
     """Formats benchmark inputs that will be expanded according to Snakemake's engine"""
 
     pre = wildcards.pre
@@ -43,15 +43,17 @@ def format_input_templates_to_be_expanded(benchmark: Benchmark, wildcards: Wildc
     node_hash = hash(BenchmarkNode.to_id(stage_id, module_id, param_id, after_stage_id))
     matching_node = next((node for node in nodes if hash(node) == node_hash), None)
     if matching_node:
-        node_inputs = matching_node.get_inputs()
+        node_inputs = matching_node.get_inputs_dict()
 
         inputs = _match_inputs(node_inputs, pre_stages, pre, dataset)
 
         # print(f'Inputs: {stage_id} {module_id} {param_id}: {inputs}')
-        return inputs
-
+        if return_as_dict:
+            return inputs
+        else:
+            return inputs.values()
     else:
-        return []
+        return {} if return_as_dict else []
 
 
 def _extract_stages_from_path(path: str, known_stage_ids: Set[str]) -> List[Union[str, tuple]]:
@@ -113,17 +115,17 @@ def _match_input_prefix(input: str, pre: str) -> str:
     return formatted_input
 
 
-def _match_inputs(inputs: List[str], stages: List[Tuple[str]], pre: str, dataset: str) -> List[str]:
+def _match_inputs(inputs: dict[str, str], stages: List[Tuple[str]], pre: str, dataset: str) -> dict[str, str]:
     all_matched = True
 
-    formatted_inputs = []
-    for input in inputs:
+    formatted_inputs = {}  # keeps the keys of `inputs`; values are the formatted templates
+    for key, input in inputs.items():
         formatted_input = _match_input_module(input, stages, dataset)
         if not formatted_input:
             all_matched = False
             break
         else:
             formatted_input = _match_input_prefix(formatted_input, pre)
-            formatted_inputs.append(formatted_input)
+            formatted_inputs[key] = formatted_input  # stored under the original input key
 
-    return formatted_inputs if all_matched else []
+    return formatted_inputs if all_matched else {}  # all-or-nothing: one unmatched input discards every match
diff --git a/src/workflow/snakemake/rules/rule_node.smk b/src/workflow/snakemake/rules/rule_node.smk
@@ -19,6 +19,10 @@ def _create_initial_node(node):
     module_id = node.module_id
     param_id = node.param_id
 
+    repository = node.get_repository()
+    repository_url = repository.url if repository else None
+    commit_hash = repository.commit if repository else None
+
     rule:
         name: f"{{stage}}_{{module}}_{{param}}".format(stage=stage_id,module=module_id,param=param_id)
         wildcard_constraints:
@@ -29,6 +33,8 @@ def _create_initial_node(node):
         output:
             formatter.format_output_templates_to_be_expanded(node),
         params:
+            repository_url = repository_url,
+            commit_hash = commit_hash,
             parameters = node.get_parameters(),
         script: os.path.join(os.path.dirname(os.path.realpath(scripts.__file__)), 'run_module.py')
 
@@ -44,6 +50,12 @@ def _create_intermediate_node(benchmark, node):
     if any(['{params}' in o for o in outputs]):
         post += '/' + param_id
 
+    repository = node.get_repository()
+    repository_url = repository.url if repository else None
+    commit_hash = repository.commit if repository else None
+
+    inputs_map = lambda wildcards: formatter.format_input_templates_to_be_expanded(benchmark, wildcards, return_as_dict=True)
+
     rule:
         name: f"{{stage}}_{{module}}_{{param}}".format(stage=stage_id,module=module_id,param=param_id)
         wildcard_constraints:
@@ -55,6 +67,9 @@ def _create_intermediate_node(benchmark, node):
         output:
             formatter.format_output_templates_to_be_expanded(node)
         params:
+            inputs_map = inputs_map,
+            repository_url = repository_url,
+            commit_hash = commit_hash,
             parameters = node.get_parameters()
         script: os.path.join(os.path.dirname(os.path.realpath(scripts.__file__)), 'run_module.py')
 
diff --git a/src/workflow/snakemake/scripts/run_module.py b/src/workflow/snakemake/scripts/run_module.py
@@ -4,11 +4,12 @@
 ##
 ## Started 22 Feb 2024
 ## Izaskun Mallona
-
-import sys
+import hashlib
+import subprocess
 import os
 from typing import List
 
+from git import Repo
 from snakemake.script import Snakemake
 
 
@@ -22,7 +23,61 @@ def mock_execution(inputs: List[str], output: str, snakemake: Snakemake):
     print('  params are', snakemake.params)
 
 
-def dump_parameters_to_file(output_dir: str, parameters: str):
+def execution(module_dir: str, module_name: str, output_dir: str, dataset: str,
+              inputs_map: dict[str, str], parameters: List[str]):
+    """Run the module's script as `run.sh <output_dir> <dataset> [--<key> <path>]... [parameters...]`.
+
+    Returns the script's captured stdout. Raises RuntimeError when `run.sh` is missing from
+    `module_dir` or exits with a non-zero status; the latter message carries the exit code and
+    the captured stderr, which would otherwise be lost because the output is captured.
+    """
+    run_sh = os.path.join(module_dir, 'run.sh')
+    if not os.path.exists(run_sh):
+        raise RuntimeError(f'{module_name} run.sh script does not exist')
+
+    command = [run_sh, output_dir, dataset]
+    # Each named input becomes a `--<key> <path>` pair; a missing or empty map adds nothing.
+    for k, v in (inputs_map or {}).items():
+        command.extend([f"--{k}", v])
+    if parameters:
+        command.extend(parameters)
+
+    try:
+        result = subprocess.run(command, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        details = (e.stderr or '').strip()
+        raise RuntimeError(f'Error executing {run_sh} (exit code {e.returncode}): {details}') from e
+
+    return result.stdout
+
+
+def generate_unique_repo_folder_name(repo_url, commit_hash):
+    """Return a folder name derived from the repository URL and commit hash.
+
+    The name is the hexadecimal MD5 digest of the string ``<repo_url>@<commit_hash>``.
+    """
+    return hashlib.md5(f"{repo_url}@{commit_hash}".encode()).hexdigest()
+
+
+def clone_module(output_dir: str, repository_url: str, commit_hash: str):
+    """Clone the module at `commit_hash` into `output_dir` (once) and return the checkout directory."""
+    module_dir = os.path.join(output_dir, generate_unique_repo_folder_name(repository_url, commit_hash))
+    if not os.path.exists(module_dir):
+        repo = Repo.clone_from(repository_url, module_dir)
+        repo.git.checkout(commit_hash)
+    else:
+        repo = Repo(module_dir)
+
+    # Raise unless HEAD is the requested commit; prefix match, as the hash may be abbreviated to any length.
+    head_sha, wanted = repo.head.commit.hexsha, str(commit_hash)
+    if not wanted or not head_sha.startswith(wanted):
+        raise RuntimeError(f'WARNING: {commit_hash} does not match {head_sha[:max(len(wanted), 7)]}')
+    return module_dir
+
+
+def dump_parameters_to_file(output_dir: str, parameters: List[str]):
+    os.makedirs(output_dir, exist_ok=True)
+
     if parameters is not None:
         params_file = os.path.join(output_dir, 'parameters.txt')
         with open(params_file, 'w') as params_file:
@@ -35,17 +90,31 @@ def dump_parameters_to_file(output_dir: str, parameters: str):
 
 try:
     snakemake: Snakemake = snakemake
-    parameters = dict(snakemake.params)['parameters']
-    output_dir = os.path.dirname(snakemake.output[0])
-    os.makedirs(output_dir, exist_ok=True)
+    params = dict(snakemake.params)
+    parameters = params['parameters']
+    repository_url = params['repository_url']
+    commit_hash = params['commit_hash']
+    inputs_map = params.get('inputs_map')  # optional: None when the rule defines no inputs_map
 
+    # Write the parameters file into the directory of the first declared output
+    output_dir = os.path.dirname(snakemake.output[0])
     dump_parameters_to_file(output_dir, parameters)
 
-    for out in snakemake.output:
-        with open(out, 'w') as sys.stdout:
-            mock_execution(inputs=snakemake.input,
-                           output=out,
-                           snakemake=snakemake)
+    # Clone the module repository (or reuse an existing clone) under .snakemake/repos
+    repositories_dir = os.path.join(".snakemake", "repos")
+    module_dir = clone_module(repositories_dir, repository_url, commit_hash)
+
+    # Execute module code; the rule name only labels the "run.sh script does not exist" error
+    module_name = snakemake.rule
+
+    # TODO Fix logic of inferring output dirname
+    output_dir = os.path.commonpath(snakemake.output)
+    if os.path.splitext(output_dir)[1] != '':  # common path has an extension, i.e. it is a file
+        output_dir = os.path.dirname(output_dir)
+
+    dataset = snakemake.wildcards.dataset
+    execution(module_dir, module_name=module_name, output_dir=output_dir, dataset=dataset,
+              inputs_map=inputs_map, parameters=parameters)
 
 except NameError:
     raise RuntimeError("This script must be run from within a Snakemake workflow.")