From 69a25876fe26cccbe0d583a779e1f1b80c7108dd Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 11 Feb 2025 18:47:26 +0100 Subject: [PATCH 01/21] add celltypist --- src/atlas_service/config.vsh.yaml | 54 ++++++++++++++- src/atlas_service/main.nf | 108 ++++++++++++++++++------------ src/atlas_service/test.sh | 6 +- 3 files changed, 123 insertions(+), 45 deletions(-) diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index c02e9ff..0100105 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -33,6 +33,12 @@ argument_groups: required: false description: | The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. - name: Reference data arguments description: Dataset to be used as a reference for label transfer and to train annotation algorithms on. @@ -74,7 +80,7 @@ argument_groups: type: string multiple: true default: scgpt_annotation - choices: [harmony_knn, scgpt_annotation] + choices: [celltypist, harmony_knn, scgpt_annotation] example: harmony_knn - name: "Pre-processing options: RNA filtering" @@ -263,6 +269,45 @@ argument_groups: description: | Seed for random number generation used for binning. If not set, no seed is used. + - name: CellTypist reference model + description: The CellTypist reference model to use for annotation. If not provided, the reference dataset will be used for model training. + arguments: + - name: "--celltypist_model" + type: file + description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided." + required: false + example: pretrained_model.pkl + + - name: CellTypist annotation options + description: Specifications for CellTypist annotation. 
+ arguments: + - name: "--celltypist_feature_selection" + type: boolean + description: "Whether to perform feature selection." + default: false + - name: "--celltypist_majority_voting" + type: boolean + description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering." + default: false + - name: "--celltypist_C" + type: double + description: "Inverse of regularization strength in logistic regression." + default: 1.0 + - name: "--celltypist_max_iter" + type: integer + description: "Maximum number of iterations before reaching the minimum of the cost function." + default: 1000 + - name: "--celltypist_use_SGD" + type: boolean_true + description: "Whether to use the stochastic gradient descent algorithm." + - name: "--celltypist_min_prop" + type: double + description: | + "For the dominant cell type within a subcluster, the minimum proportion of cells required to + support naming of the subcluster by this cell type. Ignored if majority_voting is set to False. + Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'." + default: 0 + - name: Outputs description: The output file to write the annotated dataset to. 
arguments: @@ -285,12 +330,19 @@ dependencies: - name: workflows/annotation/scgpt_annotation alias: scgpt_annotation_workflow repository: op + - name: annotate/celltypist + repository: op repositories: - name: op type: github repo: openpipelines-bio/openpipeline tag: 2.0.0 + - name: op-harmony + type: github + repo: openpipelines-bio/openpipeline + tag: harmony_knn_annoation_workflow_build + resources: - type: nextflow_script diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index 2079645..f6af038 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -4,14 +4,10 @@ workflow run_wf { main: output_ch = input_ch - | map {id, state -> - def new_state = state + ["query_processed": state.output] + | map { id, state -> + def new_state = state + [ "query_processed": state.output, "_meta": ["join_id": id] ] [id, new_state] - } - // | map{ id, state -> - // def new_state = state + ["_meta": ["join_id": id]] - // [id, new_state] - // } + } | process_samples_workflow.run( fromState: {id, state -> def newState = [ @@ -32,46 +28,76 @@ workflow run_wf { "mitochondrial_gene_regex": state.mitochondrial_gene_regex, "var_qc_metrics": state.var_qc_metrics, "top_n_vars": state.top_n_vars, - ] + ] }, args: [ "pca_overwrite": "true", "add_id_obs_output": "sample_id" ], toState: ["query_processed": "output"], - ) - | view {"After processing query: $it"} - | scgpt_annotation_workflow.run( - runIf: { id, state -> state.annotation_methods.contains("scgpt_annotation") }, - fromState: { id, state -> - [ - "id": id, - "input": state.query_processed, - "modality": state.modality, - "input_layer": state.input_layer, - "input_var_gene_names": state.input_var_gene_names, - "model": state.scgpt_model, - "model_config": state.scgpt_model_config, - "model_vocab": state.scgpt_model_vocab, - "finetuned_checkpoints_key": state.scgpt_finetuned_checkpoints_key, - "label_mapper_key": state.scgpt_label_mapper_key, - "pad_token": state.scgpt_pad_token, - "pad_value": 
state.scgpt_pad_value, - "n_hvg": state.scgpt_n_hvg, - "dsbn": state.scgpt_dsbn, - "batch_size": state.scgpt_batch_size, - "n_input_bins": state.scgpt_n_input_bins, - "seed": state.scgpt_seed - ] - }, - args: [ - "input_obs_batch_label": "sample_id", - "output_obs_predictions": "scgpt_pred", - "output_obs_probability": "scgpt_proba" - ], - toState: [ "query_processed": "output" ] - ) - | setState(["output": "query_processed", "_meta": "_meta"]) + ) + + | view {"After processing query: $it"} + + | scgpt_annotation_workflow.run( + runIf: { id, state -> state.annotation_methods.contains("scgpt_annotation") }, + fromState: { id, state -> + [ + "id": id, + "input": state.query_processed, + "modality": state.modality, + "input_layer": state.input_layer, + "input_var_gene_names": state.input_var_gene_names, + "model": state.scgpt_model, + "model_config": state.scgpt_model_config, + "model_vocab": state.scgpt_model_vocab, + "finetuned_checkpoints_key": state.scgpt_finetuned_checkpoints_key, + "label_mapper_key": state.scgpt_label_mapper_key, + "pad_token": state.scgpt_pad_token, + "pad_value": state.scgpt_pad_value, + "n_hvg": state.scgpt_n_hvg, + "dsbn": state.scgpt_dsbn, + "batch_size": state.scgpt_batch_size, + "n_input_bins": state.scgpt_n_input_bins, + "seed": state.scgpt_seed + ] + }, + args: [ + "input_obs_batch_label": "sample_id", + "output_obs_predictions": "scgpt_pred", + "output_obs_probability": "scgpt_proba" + ], + toState: [ "query_processed": "output" ] + ) + + | view {"After scgpt: $it"} + + // | celltypist.run( + // runIf: { id, state -> state.annotation_methods.contains("celltypist") && state.celltypist_model }, + // fromState: [ + // "input": "query_processed", + // "modality": "modality", + // "input_layer": "input_layer", + // "input_var_gene_names": "input_var_gene_names", + // "input_reference_gene_overlap": "input_reference_gene_overlap", + // "model": "celltypist_model", + // "majority_voting": "celltypist_majority_voting" + // ], + // args: [ + 
// "output_obs_predictions": "celltypist_pred", + // "output_obs_probability": "celltypist_proba" + // ], + // toState: [ "query_processed": "output" ] + // ) + + // | view {"After celltypist: $it"} + | map {id, state -> + def new_state = state + ["output": state.query_processed] + [id, new_state] + } + | view {"After mapping: $it"} + | setState(["output", "_meta"]) + | view {"After setstate: $it"} emit: output_ch diff --git a/src/atlas_service/test.sh b/src/atlas_service/test.sh index 924be40..0f16f83 100755 --- a/src/atlas_service/test.sh +++ b/src/atlas_service/test.sh @@ -3,17 +3,17 @@ id: run input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol +celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl annotation_methods: scgpt_annotation scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json -publish_dir: output HERE nextflow run . 
\ -main-script target/nextflow/atlas_service/main.nf \ -params-file params.yaml \ -resume \ --profile docker \ +-profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config \ No newline at end of file +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config From 58f9e27a92ae9a3ea00ce7a503b9a65e501825ad Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 12 Feb 2025 11:29:47 +0100 Subject: [PATCH 02/21] update package name --- _viash.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 8c6dcd5..6a7d20e 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -3,7 +3,7 @@ viash_version: 0.9.1 source: src target: target -name: incubator +name: openpipeline_incubator organization: openpipelines-bio links: From 83cb586ce4466bd0942b2e071eb60cf3afe18d33 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 18 Feb 2025 19:02:49 +0100 Subject: [PATCH 03/21] add harmony --- src/atlas_service/config.vsh.yaml | 30 +++++-- src/atlas_service/main.nf | 125 ++++++++++++++++++++++++------ src/atlas_service/test.sh | 56 ++++++++++++- src/atlas_service/test_params.sh | 53 +++++++++++++ 4 files changed, 233 insertions(+), 31 deletions(-) create mode 100755 src/atlas_service/test_params.sh diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index 0100105..447dfad 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -39,6 +39,9 @@ argument_groups: min: 1 description: | The minimum number of genes present in both the reference and query datasets. + - name: "--overwrite_existing_key" + type: boolean_true + description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process. 
- name: Reference data arguments description: Dataset to be used as a reference for label transfer and to train annotation algorithms on. @@ -46,17 +49,16 @@ argument_groups: - name: "--reference" type: file required: false + example: reference.h5mu description: | - The reference dataset to be used as a reference mapper and to train annotation algorithms on. - # example: https://zenodo.org/records/7587774/files/TS_Lung_filtered.h5ad + The reference dataset in .h5mu format to be used as a reference mapper and to train annotation algorithms on. - name: "--reference_layer_raw_counts" type: string description: "The layer in the reference dataset containing the raw counts, if .X is not to be used." required: false - name: "--reference_layer_lognormalized_counts" type: string - default: log_normalized - description: "The layer in the reference dataset containing the log-normalized counts." + description: "The layer in the reference dataset containing the log-normalized counts, if .X is not to be used." - name: "--reference_var_gene_names" type: string required: false @@ -72,6 +74,11 @@ argument_groups: example: cell_type required: false description: The `.obs` key of the target labels to tranfer. + - name: "--reference_var_input" + type: string + required: false + description: | + .var column containing highly variable genes. By default, do not subset genes. - name: Annotation methods description: The available annotation methods to annotate the query dataset(s) with. 
@@ -328,21 +335,28 @@ dependencies: alias: process_samples_workflow repository: op - name: workflows/annotation/scgpt_annotation - alias: scgpt_annotation_workflow repository: op - name: annotate/celltypist - repository: op + repository: op-celltypist + alias: celltypist_annotation + - name: workflows/annotation/harmony_knn + repository: op-harmony + alias: harmony_knn_annotation + repositories: - name: op type: github repo: openpipelines-bio/openpipeline tag: 2.0.0 + - name: op-celltypist + type: github + repo: openpipelines-bio/openpipeline + tag: celltypist-layer-passing_build - name: op-harmony type: github repo: openpipelines-bio/openpipeline - tag: harmony_knn_annoation_workflow_build - + tag: harmony-knn-annoation-workflow_build resources: - type: nextflow_script diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index f6af038..c0f29a3 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -8,6 +8,28 @@ workflow run_wf { def new_state = state + [ "query_processed": state.output, "_meta": ["join_id": id] ] [id, new_state] } + // Enforce annotation method-specific required arguments + | map { id, state -> + def new_state = [:] + // Check scGPT arguments + if (state.annotation_methods.contains("scgpt_annotation") && + (!state.scgpt_model || !state.scgpt_model_config || !state.scgp_model_vocab)) { + throw new RuntimeException("Using scgpt_annotation requires --scgpt_model, --scgpt_model_config and --scgp_model_vocab parameters.") + } + // Check CellTypist arguments + if (state.annotation_methods.contains("celltypist") && + (!state.celltypist_model && !state.reference)) { + throw new RuntimeException("Celltypist was selected as an annotation method. Either --celltypist_model or --reference must be provided.") + } + if (state.annotation_methods.contains("celltypist") && state.celltypist_model && state.reference ) { + System.err.println( + "Warning: --celltypist_model is set and a --reference was provided. 
\ + The pre-trained Celltypist model will be used for annotation, the reference will be ignored." + ) + } + + [id, state + new_state] + } | process_samples_workflow.run( fromState: {id, state -> def newState = [ @@ -37,9 +59,7 @@ workflow run_wf { toState: ["query_processed": "output"], ) - | view {"After processing query: $it"} - - | scgpt_annotation_workflow.run( + | scgpt_annotation.run( runIf: { id, state -> state.annotation_methods.contains("scgpt_annotation") }, fromState: { id, state -> [ @@ -70,27 +90,88 @@ workflow run_wf { toState: [ "query_processed": "output" ] ) - | view {"After scgpt: $it"} + | celltypist_annotation.run( + runIf: { id, state -> state.annotation_methods.contains("celltypist") && state.celltypist_model }, + fromState: [ + "input": "query_processed", + "modality": "modality", + "input_var_gene_names": "input_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "model": "celltypist_model", + "majority_voting": "celltypist_majority_voting" + ], + args: [ + // log normalized counts are expected for celltypist + "input_layer": "log_normalized", + "output_obs_predictions": "celltypist_pred", + "output_obs_probability": "celltypist_proba" + ], + toState: [ "query_processed": "output" ] + ) + + | celltypist.run( + runIf: { id, state -> state.annotation_methods.contains("celltypist") && !state.celltypist_model }, + fromState: [ + "input": "query_processed", + "modality": "modality", + "input_var_gene_names": "input_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "reference": "reference", + "reference_layer": "reference_layer_lognormalized_counts", + "reference_obs_target": "reference_obs_label", + "reference_var_gene_names": "reference_var_gene_names", + "reference_obs_batch": "reference_obs_batch", + "reference_var_input": "reference_var_input", + "feature_selection": "celltypist_feature_selection", + "C": "celltypist_C", + "max_iter": "celltypist_max_iter", + "use_SGD": 
"celltypist_use_SGD", + "min_prop": "celltypist_min_prop", + "majority_voting": "celltypist_majority_voting" + ], + args: [ + // log normalized counts are expected for celltypist + "input_layer": "log_normalized", + "check_expression": "true", + "output_obs_predictions": "celltypist_pred", + "output_obs_probability": "celltypist_proba" + ], + toState: [ "query_processed": "output" ] + ) - // | celltypist.run( - // runIf: { id, state -> state.annotation_methods.contains("celltypist") && state.celltypist_model }, - // fromState: [ - // "input": "query_processed", - // "modality": "modality", - // "input_layer": "input_layer", - // "input_var_gene_names": "input_var_gene_names", - // "input_reference_gene_overlap": "input_reference_gene_overlap", - // "model": "celltypist_model", - // "majority_voting": "celltypist_majority_voting" - // ], - // args: [ - // "output_obs_predictions": "celltypist_pred", - // "output_obs_probability": "celltypist_proba" - // ], - // toState: [ "query_processed": "output" ] - // ) + | harmony_knn_annotation.run( + runIf: { id, state -> state.annotation_methods.contains("harmony_knn") }, + fromState: { id, state -> + [ + "id": id, + "input": state.query_processed, + "modality": state.modality, + "input_layer": state.input_layer, + "input_var_gene_names": state.input_var_gene_names, + "input_reference_gene_overlap": state.input_reference_gene_overlap, + "overwrite_existing_key": state.overwrite_existing_key, + "reference": state.reference, + "reference_layer": state.reference_layer_raw_counts, + "reference_obs_target": state.reference_obs_label, + "reference_var_gene_names": state.reference_var_gene_names, + "reference_obs_batch_label": state.reference_obs_batch, + "harmony_theta": state.harmony_theta, + // disable arguments for pca/leiden/knn for now + // "pca_num_components": state.pca_num_components, + // "leiden_resolution": state.leiden_resolution, + // "knn_weights": state.knn_weights, + // "knn_n_neighbors": state.knn_n_neighbors + ] 
+ }, + args: [ + "input_obs_batch_label": "sample_id", + "output_obs_predictions": "harmony_knn_pred", + "output_obs_probability": "harmony_knn_proba", + "output_obsm_integrated": "X_integrated_harmony", + ], + toState: [ "query_processed": "output" ] + ) - // | view {"After celltypist: $it"} | map {id, state -> def new_state = state + ["output": state.query_processed] [id, new_state] diff --git a/src/atlas_service/test.sh b/src/atlas_service/test.sh index 0f16f83..1e66345 100755 --- a/src/atlas_service/test.sh +++ b/src/atlas_service/test.sh @@ -4,7 +4,7 @@ input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/p modality: rna input_var_gene_names: gene_symbol celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl -annotation_methods: scgpt_annotation +annotation_methods: scgpt_annotation;celltypist scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json @@ -17,3 +17,57 @@ nextflow run . \ -profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config + +# Test required arguments scGPT +cat > params.yaml << HERE +id: run +input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +annotation_methods: scgpt_annotation;celltypist +scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json +scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json +HERE + +nextflow run . 
\ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config + +# Test required arguments CellTypist +cat > params.yaml << HERE +id: run +input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +annotation_methods: scgpt_annotation;celltypist +celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl +HERE + +nextflow run . \ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config + +cat > params.yaml << HERE +id: run +input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +annotation_methods: scgpt_annotation;celltypist +HERE + +nextflow run . 
\ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config \ No newline at end of file diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh new file mode 100755 index 0000000..531b868 --- /dev/null +++ b/src/atlas_service/test_params.sh @@ -0,0 +1,53 @@ +# Test required arguments scGPT +cat > params.yaml << HERE +id: scgpt_no_params +input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +annotation_methods: scgpt_annotation +scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json +scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json +HERE + +nextflow run . \ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config + +# Test required arguments CellTypist +cat > params.yaml << HERE +id: celltypist_overlapping_params +input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +annotation_methods: celltypist +HERE + +nextflow run . 
\ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config + +cat > params.yaml << HERE +id: celltypist_no_params +input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +annotation_methods: scgpt,celltypist +celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl +HERE + +nextflow run . \ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config From 56102b88912afe1e84789c74b2cb5f966f78e446 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 18 Feb 2025 20:31:57 +0100 Subject: [PATCH 04/21] wip --- src/atlas_service/config.vsh.yaml | 1 - src/atlas_service/main.nf | 9 +++- .../{test.sh => test_execution.sh} | 46 ++++++------------- src/atlas_service/test_params.sh | 13 +++--- 4 files changed, 27 insertions(+), 42 deletions(-) rename src/atlas_service/{test.sh => test_execution.sh} (62%) diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index 447dfad..0570fc5 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -343,7 +343,6 @@ dependencies: repository: op-harmony alias: harmony_knn_annotation - repositories: - name: op type: github diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index c0f29a3..6778924 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -9,11 +9,12 @@ workflow run_wf { [id, 
new_state] } // Enforce annotation method-specific required arguments + | niceView() | map { id, state -> def new_state = [:] // Check scGPT arguments if (state.annotation_methods.contains("scgpt_annotation") && - (!state.scgpt_model || !state.scgpt_model_config || !state.scgp_model_vocab)) { + (!state.scgpt_model || !state.scgpt_model_config || !state.scgpt_model_vocab)) { throw new RuntimeException("Using scgpt_annotation requires --scgpt_model, --scgpt_model_config and --scgp_model_vocab parameters.") } // Check CellTypist arguments @@ -27,6 +28,10 @@ workflow run_wf { The pre-trained Celltypist model will be used for annotation, the reference will be ignored." ) } + // Check Harmony KNN arguments + if (state.annotation_methods.contains("harmony_knn") && !state.reference ) { + throw new RuntimeException("Harmony KNN was selected as an annotation method. A --reference dataset must be provided.") + } [id, state + new_state] } @@ -109,7 +114,7 @@ workflow run_wf { toState: [ "query_processed": "output" ] ) - | celltypist.run( + | celltypist_annotation.run( runIf: { id, state -> state.annotation_methods.contains("celltypist") && !state.celltypist_model }, fromState: [ "input": "query_processed", diff --git a/src/atlas_service/test.sh b/src/atlas_service/test_execution.sh similarity index 62% rename from src/atlas_service/test.sh rename to src/atlas_service/test_execution.sh index 1e66345..7a0620f 100755 --- a/src/atlas_service/test.sh +++ b/src/atlas_service/test_execution.sh @@ -1,13 +1,13 @@ cat > params.yaml << HERE -id: run +id: harmony input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna -input_var_gene_names: gene_symbol +reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +reference_var_gene_names: ensemblid +reference_obs_batch: donor_assay +reference_obs_label: cell_type celltypist_model: 
/Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl -annotation_methods: scgpt_annotation;celltypist -scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt -scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json -scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json +annotation_methods: harmony_knn HERE nextflow run . \ @@ -18,15 +18,13 @@ nextflow run . \ -c target/nextflow/atlas_service/nextflow.config \ -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config -# Test required arguments scGPT cat > params.yaml << HERE -id: run +id: celltypist input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol -annotation_methods: scgpt_annotation;celltypist -scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json -scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json +annotation_methods: celltypist +publish_dir: output HERE nextflow run . \ @@ -37,15 +35,15 @@ nextflow run . 
\ -c target/nextflow/atlas_service/nextflow.config \ -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config -# Test required arguments CellTypist cat > params.yaml << HERE -id: run +id: scgpt input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna +annotation_methods: scgpt_annotation input_var_gene_names: gene_symbol -reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu -annotation_methods: scgpt_annotation;celltypist -celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl +scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt +scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json +scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json HERE nextflow run . \ @@ -55,19 +53,3 @@ nextflow run . \ -profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config - -cat > params.yaml << HERE -id: run -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: scgpt_annotation;celltypist -HERE - -nextflow run . 
\ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config \ No newline at end of file diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh index 531b868..4bbcc7f 100755 --- a/src/atlas_service/test_params.sh +++ b/src/atlas_service/test_params.sh @@ -1,6 +1,6 @@ # Test required arguments scGPT cat > params.yaml << HERE -id: scgpt_no_params +id: scgpt input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol @@ -19,11 +19,11 @@ nextflow run . \ # Test required arguments CellTypist cat > params.yaml << HERE -id: celltypist_overlapping_params +id: celltypist input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol -annotation_methods: celltypist +annotation_methods: scgpt,celltypist HERE nextflow run . \ @@ -34,14 +34,13 @@ nextflow run . \ -c target/nextflow/atlas_service/nextflow.config \ -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +# Test required arguments CellTypist cat > params.yaml << HERE -id: celltypist_no_params +id: celltypist input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol -reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu -annotation_methods: scgpt,celltypist -celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl +annotation_methods: harmony_knn HERE nextflow run . 
\ From f855d33627774cd693734e97383770d5a7f60599 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Fri, 21 Feb 2025 17:06:45 +0000 Subject: [PATCH 05/21] add scvi-knn annotation workflow --- src/atlas_service/config.vsh.yaml | 86 +++++++++++++++++++---------- src/atlas_service/main.nf | 53 ++++++++++++++---- src/atlas_service/test_execution.sh | 75 ++++++++++++------------- 3 files changed, 136 insertions(+), 78 deletions(-) diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index 0570fc5..5de5bdc 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -39,10 +39,7 @@ argument_groups: min: 1 description: | The minimum number of genes present in both the reference and query datasets. - - name: "--overwrite_existing_key" - type: boolean_true - description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process. - + - name: Reference data arguments description: Dataset to be used as a reference for label transfer and to train annotation algorithms on. arguments: @@ -86,8 +83,8 @@ argument_groups: - name: "--annotation_methods" type: string multiple: true - default: scgpt_annotation - choices: [celltypist, harmony_knn, scgpt_annotation] + required: true + choices: [celltypist, harmony_knn, scgpt_annotation, scvi_knn] example: harmony_knn - name: "Pre-processing options: RNA filtering" @@ -125,30 +122,13 @@ argument_groups: - name: "Pre-processing options: Highly variable features detection" description: Pre-processing options for detecting highly variable features arguments: - - name: "--top_n_vars" + - name: "--n_hvg" type: integer description: | - Number of top vars to be used to calculate cumulative proportions. - If not specified, proportions are not calculated. `--top_n_vars 20,50` finds - cumulative proportion to the 20th and 50th most expressed vars. 
- multiple: true - multiple_sep: ',' - required: false - default: [50, 100, 200, 500] - - name: "--highly_variable_features_var_output" - alternatives: ["--filter_with_hvg_var_output"] - required: false - type: string - default: "filter_with_hvg" - description: In which .var slot to store a boolean array corresponding to the highly variable genes. - - name: "--highly_variable_features_obs_batch_key" - alternatives: ["--filter_with_hvg_obs_batch_key"] - type: string - default: "sample_id" - required: false - description: | - If specified, highly-variable genes are selected within each batch separately and merged. This simple - process avoids the selection of batch-specific genes and acts as a lightweight batch correction method. + Number of highly-variable features to keep. + Only relevant if HVG need to be calculated across query and reference datasets (e.g. for --annotation_methods scvi_knn and harmony_knn). + For reference mapping-based methods, the HVG's specified in --reference_var_input will be used. + default: 2000 - name: "Pre-processing options: Mitochondrial Gene Detection" description: Pre-processing options for detecting mitochondrial genes @@ -189,7 +169,7 @@ argument_groups: example: "ercc,highly_variable" - name: Harmony integration options - description: Specifications for harmony integration. Only relevant for annotation method 'harmony_knn'. + description: Specifications for harmony integration. arguments: - name: "--harmony_theta" type: double @@ -201,6 +181,47 @@ argument_groups: example: [0, 1, 2] multiple: true + - name: SCVI integration options + description: Specifications for SCVI integration. + arguments: + - name: "--scvi_early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." 
+ - name: "--scvi_early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--scvi_early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--scvi_early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement." + - name: "--scvi_max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--scvi_reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--scvi_lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--scvi_lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." 
+ type: double + default: 30 + min: 0 + - name: scGPT reference model description: scGPT model input, required for scGPT annotation methods arguments: @@ -342,6 +363,9 @@ dependencies: - name: workflows/annotation/harmony_knn repository: op-harmony alias: harmony_knn_annotation + - name: workflows/annotation/scvi_knn + repository: op-scvi + alias: scvi_knn_annotation repositories: - name: op @@ -356,6 +380,10 @@ repositories: type: github repo: openpipelines-bio/openpipeline tag: harmony-knn-annoation-workflow_build + - name: op-scvi + type: github + repo: openpipelines-bio/openpipeline + tag: scvi-knn-annotation_build resources: - type: nextflow_script diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index 6778924..270d80c 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -9,7 +9,6 @@ workflow run_wf { [id, new_state] } // Enforce annotation method-specific required arguments - | niceView() | map { id, state -> def new_state = [:] // Check scGPT arguments @@ -48,13 +47,10 @@ workflow run_wf { "rna_min_cells_per_gene": state.rna_min_cells_per_gene, "rna_min_fraction_mito": state.rna_min_fraction_mito, "rna_max_fraction_mito": state.rna_max_fraction_mito, - "highly_variable_features_var_output": state.highly_variable_features_var_output, - "highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key, "var_name_mitochondrial_genes": state.var_name_mitochondrial_genes, "var_gene_names": state.input_var_gene_names, "mitochondrial_gene_regex": state.mitochondrial_gene_regex, - "var_qc_metrics": state.var_qc_metrics, - "top_n_vars": state.top_n_vars, + "var_qc_metrics": state.var_qc_metrics ] }, args: [ @@ -145,6 +141,35 @@ workflow run_wf { ) | harmony_knn_annotation.run( + runIf: { id, state -> state.annotation_methods.contains("harmony_knn") }, + fromState: { id, state -> + [ + "id": id, + "input": state.query_processed, + "modality": state.modality, + "input_var_gene_names": 
state.input_var_gene_names, + "input_reference_gene_overlap": state.input_reference_gene_overlap, + "reference": state.reference, + "reference_layer": state.reference_layer_lognormalized_counts, + "reference_obs_target": state.reference_obs_label, + "reference_var_gene_names": state.reference_var_gene_names, + "reference_obs_batch_label": state.reference_obs_batch, + "n_hvg": state.n_hvg, + "harmony_theta": state.harmony_theta, + ] + }, + args: [ + "input_layer": "log_normalized", + "input_obs_batch_label": "sample_id", + "output_obs_predictions": "harmony_knn_pred", + "output_obs_probability": "harmony_knn_proba", + "output_obsm_integrated": "X_integrated_harmony", + "overwrite_existing_key": "true" + ], + toState: [ "query_processed": "output" ] + ) + + | scvi_knn_annotation.run( runIf: { id, state -> state.annotation_methods.contains("harmony_knn") }, fromState: { id, state -> [ @@ -154,25 +179,29 @@ workflow run_wf { "input_layer": state.input_layer, "input_var_gene_names": state.input_var_gene_names, "input_reference_gene_overlap": state.input_reference_gene_overlap, - "overwrite_existing_key": state.overwrite_existing_key, "reference": state.reference, "reference_layer": state.reference_layer_raw_counts, + "reference_layer_lognormalized": state.reference_layer_lognormalized_counts, "reference_obs_target": state.reference_obs_label, "reference_var_gene_names": state.reference_var_gene_names, "reference_obs_batch_label": state.reference_obs_batch, - "harmony_theta": state.harmony_theta, - // disable arguments for pca/leiden/knn for now - // "pca_num_components": state.pca_num_components, - // "leiden_resolution": state.leiden_resolution, - // "knn_weights": state.knn_weights, - // "knn_n_neighbors": state.knn_n_neighbors + "n_hvg": state.n_hvg, + "scvi_early_stopping": state.scvi_early_stopping, + "scvi_early_stopping_patience": state.scvi_early_stopping_patience, + "scvi_early_stopping_min_delta": state.scvi_early_stopping_min_delta, + "scvi_max_epochs": 
state.scvi_max_epochs, + "scvi_reduce_lr_on_plateau": state.scvi_reduce_lr_on_plateau, + "scvi_lr_factor": state.scvi_lr_factor, + "scvi_lr_patience": state.scvi_lr_patience ] }, args: [ + "input_layer_lognormalized": "log_normalized", "input_obs_batch_label": "sample_id", "output_obs_predictions": "harmony_knn_pred", "output_obs_probability": "harmony_knn_proba", "output_obsm_integrated": "X_integrated_harmony", + "overwrite_existing_key": "true" ], toState: [ "query_processed": "output" ] ) diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh index 7a0620f..45e3a06 100755 --- a/src/atlas_service/test_execution.sh +++ b/src/atlas_service/test_execution.sh @@ -1,13 +1,14 @@ cat > params.yaml << HERE id: harmony -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna -reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +reference_layer_lognormalized_counts: log_normalized reference_var_gene_names: ensemblid reference_obs_batch: donor_assay reference_obs_label: cell_type -celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl -annotation_methods: harmony_knn +celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl +annotation_methods: harmony_knn;scvi_knn HERE nextflow run . \ @@ -16,40 +17,40 @@ nextflow run . 
\ -resume \ -profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config -cat > params.yaml << HERE -id: celltypist -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: celltypist -publish_dir: output -HERE +# cat > params.yaml << HERE +# id: celltypist +# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +# modality: rna +# input_var_gene_names: gene_symbol +# annotation_methods: celltypist +# publish_dir: output +# HERE -nextflow run . \ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +# nextflow run . 
\ +# -main-script target/nextflow/atlas_service/main.nf \ +# -params-file params.yaml \ +# -resume \ +# -profile docker,no_publish \ +# -c target/nextflow/atlas_service/nextflow.config \ +# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config -cat > params.yaml << HERE -id: scgpt -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -annotation_methods: scgpt_annotation -input_var_gene_names: gene_symbol -scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt -scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json -scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json -HERE +# cat > params.yaml << HERE +# id: scgpt +# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +# modality: rna +# annotation_methods: scgpt_annotation +# input_var_gene_names: gene_symbol +# scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt +# scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json +# scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json +# HERE -nextflow run . \ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +# nextflow run . 
\ +# -main-script target/nextflow/atlas_service/main.nf \ +# -params-file params.yaml \ +# -resume \ +# -profile docker,no_publish \ +# -c target/nextflow/atlas_service/nextflow.config \ +# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config From 90db0253eda30e5fe98b4b732c13bc11a61d02d1 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 26 Feb 2025 10:19:17 +0000 Subject: [PATCH 06/21] update repos --- src/atlas_service/config.vsh.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index 5de5bdc..f18b5ec 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -358,10 +358,10 @@ dependencies: - name: workflows/annotation/scgpt_annotation repository: op - name: annotate/celltypist - repository: op-celltypist + repository: op-main alias: celltypist_annotation - name: workflows/annotation/harmony_knn - repository: op-harmony + repository: op-main alias: harmony_knn_annotation - name: workflows/annotation/scvi_knn repository: op-scvi @@ -372,14 +372,10 @@ repositories: type: github repo: openpipelines-bio/openpipeline tag: 2.0.0 - - name: op-celltypist + - name: op-main type: github repo: openpipelines-bio/openpipeline - tag: celltypist-layer-passing_build - - name: op-harmony - type: github - repo: openpipelines-bio/openpipeline - tag: harmony-knn-annoation-workflow_build + tag: main_build - name: op-scvi type: github repo: openpipelines-bio/openpipeline From a49cd41eb0c81e28bc56e56dba02745e3f9de47e Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 26 Feb 2025 15:47:47 +0000 Subject: [PATCH 07/21] update celltypist params --- src/atlas_service/main.nf | 5 +-- src/atlas_service/test_execution.sh | 7 ++-- src/atlas_service/test_params.sh | 60 +++++++++++++++++++++++------ 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index 
270d80c..5671714 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -28,8 +28,8 @@ workflow run_wf { ) } // Check Harmony KNN arguments - if (state.annotation_methods.contains("harmony_knn") && !state.reference ) { - throw new RuntimeException("Harmony KNN was selected as an annotation method. A --reference dataset must be provided.") + if ((state.annotation_methods.contains("harmony_knn") || state.annotation_methods.contains("scvi_knn")) && !state.reference ) { + throw new RuntimeException("When `harmony_knn` or `scvi_knn` are selected as an annotation method, a --reference dataset must be provided.") } [id, state + new_state] @@ -133,7 +133,6 @@ workflow run_wf { args: [ // log normalized counts are expected for celltypist "input_layer": "log_normalized", - "check_expression": "true", "output_obs_predictions": "celltypist_pred", "output_obs_probability": "celltypist_proba" ], diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh index 45e3a06..96bb15f 100755 --- a/src/atlas_service/test_execution.sh +++ b/src/atlas_service/test_execution.sh @@ -7,21 +7,22 @@ reference_layer_lognormalized_counts: log_normalized reference_var_gene_names: ensemblid reference_obs_batch: donor_assay reference_obs_label: cell_type -celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl -annotation_methods: harmony_knn;scvi_knn +annotation_methods: harmony_knn;scvi_knn;celltypist +publish_dir: aaas_test HERE nextflow run . 
\ -main-script target/nextflow/atlas_service/main.nf \ -params-file params.yaml \ -resume \ --profile docker,no_publish \ +-profile docker \ -c target/nextflow/atlas_service/nextflow.config \ -c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config # cat > params.yaml << HERE # id: celltypist # input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +# celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl # modality: rna # input_var_gene_names: gene_symbol # annotation_methods: celltypist diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh index 4bbcc7f..53986e8 100755 --- a/src/atlas_service/test_params.sh +++ b/src/atlas_service/test_params.sh @@ -1,12 +1,12 @@ # Test required arguments scGPT cat > params.yaml << HERE id: scgpt -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol annotation_methods: scgpt_annotation -scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json -scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json +scgpt_model_config: /home/dorienroosen/openpipeline/resources_test/scgpt/source/args.json +scgpt_model_vocab: /home/dorienroosen/openpipeline/resources_test/scgpt/source/vocab.json HERE nextflow run . \ @@ -15,15 +15,15 @@ nextflow run . 
\ -resume \ -profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config # Test required arguments CellTypist cat > params.yaml << HERE -id: celltypist -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +id: celltypist_1 +input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol -annotation_methods: scgpt,celltypist +annotation_methods: celltypist HERE nextflow run . \ @@ -32,12 +32,31 @@ nextflow run . \ -resume \ -profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config -# Test required arguments CellTypist cat > params.yaml << HERE -id: celltypist -input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +id: celltypist_2 +input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl +reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +modality: rna +input_var_gene_names: gene_symbol +annotation_methods: celltypist +HERE + +nextflow run . 
\ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config + + +# Test required arguments Harmony +cat > params.yaml << HERE +id: harmony +input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna input_var_gene_names: gene_symbol annotation_methods: harmony_knn @@ -49,4 +68,21 @@ nextflow run . \ -resume \ -profile docker,no_publish \ -c target/nextflow/atlas_service/nextflow.config \ --c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config +-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config + +# Test required arguments SCVI +cat > params.yaml << HERE +id: scvi +input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +modality: rna +input_var_gene_names: gene_symbol +annotation_methods: scvi_knn +HERE + +nextflow run . \ +-main-script target/nextflow/atlas_service/main.nf \ +-params-file params.yaml \ +-resume \ +-profile docker,no_publish \ +-c target/nextflow/atlas_service/nextflow.config \ +-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config \ No newline at end of file From 37cd27a66131dfc3a9aa89f3ae8b764f1cddc581 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Mon, 3 Mar 2025 11:28:53 +0100 Subject: [PATCH 08/21] update celltypist params --- src/atlas_service/main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index 6778924..c4fdcd8 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -29,9 +29,8 @@ workflow run_wf { ) } // Check Harmony KNN arguments - if (state.annotation_methods.contains("harmony_knn") && !state.reference ) { - throw new RuntimeException("Harmony KNN was selected as an annotation method. 
A --reference dataset must be provided.") - } + if ((state.annotation_methods.contains("harmony_knn") || state.annotation_methods.contains("scvi_knn")) && !state.reference ) { + throw new RuntimeException("When `harmony_knn` or `scvi_knn` are selected as an annotation method, a --reference dataset must be provided.") [id, state + new_state] } @@ -137,7 +136,6 @@ workflow run_wf { args: [ // log normalized counts are expected for celltypist "input_layer": "log_normalized", - "check_expression": "true", "output_obs_predictions": "celltypist_pred", "output_obs_probability": "celltypist_proba" ], From 028e0d1735aedfc8acdcfbd2827a31538622f530 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 4 Mar 2025 15:05:26 +0000 Subject: [PATCH 09/21] tests wip --- src/atlas_service/config.vsh.yaml | 8 +++++ src/atlas_service/integration_test.sh | 17 ++++++++++ src/atlas_service/nextflow.config | 4 +-- src/atlas_service/test.nf | 48 +++++++++++++++++++++++++++ src/atlas_service/test_execution.sh | 6 ++-- 5 files changed, 78 insertions(+), 5 deletions(-) create mode 100755 src/atlas_service/integration_test.sh create mode 100644 src/atlas_service/test.nf diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index f18b5ec..47c6de9 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -386,5 +386,13 @@ resources: path: main.nf entrypoint: run_wf +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + - path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl + runners: - type: nextflow \ No newline at end of file diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh new file mode 100755 index 0000000..ff15861 --- /dev/null +++ b/src/atlas_service/integration_test.sh @@ -0,0 +1,17 @@ 
+#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/atlas_service/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/utils/labels_ci.config \ + -c src/utils/integration_tests.config diff --git a/src/atlas_service/nextflow.config b/src/atlas_service/nextflow.config index 8108bc2..71b491a 100644 --- a/src/atlas_service/nextflow.config +++ b/src/atlas_service/nextflow.config @@ -3,8 +3,8 @@ manifest { } params { - rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() + rootDir = java.nio.file.Paths.get("$projectDir/../../").toAbsolutePath().normalize().toString() } // include common settings -includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file +includeConfig("${params.rootDir}/src/labels.config") \ No newline at end of file diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf new file mode 100644 index 0000000..a97bbee --- /dev/null +++ b/src/atlas_service/test.nf @@ -0,0 +1,48 @@ +nextflow.enable.dsl=2 + +include { atlas_service } from params.rootDir + "/target/nextflow/atlas_service/main.nf" +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + reference_layer_lognormalized_counts: "log_normalized", + reference_obs_batch: "donor_assay", + reference_obs_label: "cell_type", + annotation_methods: "celltypist" + ] + ]) + | view {"State at start: $it"} + | map{ 
state -> [state.id, state] } + | atlas_service + | view {"After AaaS: $it"} + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } + } \ No newline at end of file diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh index 96bb15f..2463e37 100755 --- a/src/atlas_service/test_execution.sh +++ b/src/atlas_service/test_execution.sh @@ -1,8 +1,8 @@ cat > params.yaml << HERE id: harmony -input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +input: resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu modality: rna -reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu +reference: resources_test/annotation_test_data/TS_Blood_filtered.h5mu reference_layer_lognormalized_counts: log_normalized reference_var_gene_names: ensemblid reference_obs_batch: donor_assay @@ -17,7 +17,7 @@ nextflow run . 
\ -resume \ -profile docker \ -c target/nextflow/atlas_service/nextflow.config \ --c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config +-c src/utils/labels_ci.config # cat > params.yaml << HERE # id: celltypist From dcaaaa14926ef751cc5ce04921a0162b67b4af6e Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 4 Mar 2025 15:26:58 +0000 Subject: [PATCH 10/21] tests wip --- src/atlas_service/main.nf | 1 + src/utils/integration_tests.config | 36 ++++++++++++++++++++++++++++++ src/utils/labels_ci.config | 36 ++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 src/utils/integration_tests.config create mode 100644 src/utils/labels_ci.config diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index 6b89c57..06c9760 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -30,6 +30,7 @@ workflow run_wf { // Check Harmony KNN arguments if ((state.annotation_methods.contains("harmony_knn") || state.annotation_methods.contains("scvi_knn")) && !state.reference ) { throw new RuntimeException("When `harmony_knn` or `scvi_knn` are selected as an annotation method, a --reference dataset must be provided.") + } [id, state + new_state] } diff --git a/src/utils/integration_tests.config b/src/utils/integration_tests.config new file mode 100644 index 0000000..59d5b09 --- /dev/null +++ b/src/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + 
charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/src/utils/labels_ci.config b/src/utils/labels_ci.config new file mode 100644 index 0000000..dd2e23e --- /dev/null +++ b/src/utils/labels_ci.config @@ -0,0 +1,36 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + // Nextflow apparently can't handle empty directives, i.e. + // withLabel: lowdisk {} + // so for that reason we have to add a dummy directive + withLabel: lowdisk { + dummyDirective = "dummyValue" + } + withLabel: middisk { + dummyDirective = "dummyValue" + } + withLabel: highdisk { + dummyDirective = "dummyValue" + } + withLabel: veryhighdisk { + dummyDirective = "dummyValue" + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 From 7f357550529c59adb0f76ccc6025c68490a140ff Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 4 Mar 2025 17:57:31 +0000 Subject: [PATCH 11/21] tests wip --- src/atlas_service/test.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf index a97bbee..b96a612 100644 --- a/src/atlas_service/test.nf +++ b/src/atlas_service/test.nf @@ -29,7 +29,7 @@ workflow test_wf { // check id def id = output[0] - assert id.endsWith("_test") : "Output ID should be same as input ID" + assert id == "merged" : "Output ID should be `merged`" // check output def state = output[1] From 61763f0ccba35aa2d762d97c94788e163c02981f Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 5 Mar 2025 16:06:32 +0000 Subject: [PATCH 12/21] tests wip --- src/atlas_service/integration_test.sh | 11 ++++ src/atlas_service/test.nf | 79 +++++++++++++++++++++++++-- src/atlas_service/test.yaml | 63 +++++++++++++++++++++ 3 
files changed, 147 insertions(+), 6 deletions(-) create mode 100644 src/atlas_service/test.yaml diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh index ff15861..546a62b 100755 --- a/src/atlas_service/integration_test.sh +++ b/src/atlas_service/integration_test.sh @@ -12,6 +12,17 @@ nextflow \ run . \ -main-script src/atlas_service/test.nf \ -entry test_wf \ + -resume \ -profile docker,no_publish \ -c src/utils/labels_ci.config \ -c src/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/atlas_service/test.nf \ + -profile docker, no_publish \ + -resume \ + -entry test_wf_2 \ + -c src/utils/labels_ci.config \ + -c src/utils/integration_tests.config \ + --publish_dir test_2 \ No newline at end of file diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf index b96a612..6c01251 100644 --- a/src/atlas_service/test.nf +++ b/src/atlas_service/test.nf @@ -39,10 +39,77 @@ workflow test_wf { assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" "Output: $output" + } +} + +workflow test_wf_2 { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "pbmc", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + var_name_mitochondrial_genes: 'mitochondrial', + rna_min_counts: 2, + prot_min_counts: 3, + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + reference_layer_lognormalized_counts: "log_normalized", + reference_obs_batch: "donor_assay", + reference_obs_label: "cell_type", + annotation_methods: "celltypist" + ], + [ + id: "pbmc_with_more_params", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + rna_min_counts: 2, + rna_max_counts: 1000000, + rna_min_genes_per_cell: 1, + rna_max_genes_per_cell: 1000000, + rna_min_cells_per_gene: 1, + rna_min_fraction_mito: 0.0, + rna_max_fraction_mito: 1.0, + prot_min_counts: 3, + prot_max_counts: 1000000, + prot_min_proteins_per_cell: 1, + prot_max_proteins_per_cell: 1000000, + prot_min_cells_per_protein: 1, + var_name_mitochondrial_genes: 'mitochondrial', + obs_name_mitochondrial_fraction: 'fraction_mitochondrial', + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + reference_layer_lognormalized_counts: "log_normalized", + reference_obs_batch: "donor_assay", + reference_obs_label: "cell_type", + annotation_methods: "celltypist" + ] + ]) + | view {"State at start: $it"} + | map { state -> [state.id, state] } + | atlas_service + | view {"After AaaS: $it"} + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, 
state]" + + // check id + def id = output[0] + assert id == "merged" : "Output ID should be `merged`" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" } - | toSortedList({a, b -> a[0] <=> b[0]}) - | map { output_list -> - assert output_list.size() == 2 : "output channel should contain 2 events" - assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] - } - } \ No newline at end of file + } diff --git a/src/atlas_service/test.yaml b/src/atlas_service/test.yaml new file mode 100644 index 0000000..2314410 --- /dev/null +++ b/src/atlas_service/test.yaml @@ -0,0 +1,63 @@ +output: $id.$key.output.html +var_gene_names: gene_symbol +var_name_mitochondrial_genes: mitochondrial +var_name_ribosomal_genes: ribosomal +publish_dir: s3://itx-del-data-pipelines/jmajerci/qc_ingestion_reports +param_list: + - id: various_cart_sample_1 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_1.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_2 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_2.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_3 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_3.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_4 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_4.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_5 + input: >- + 
s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_5.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_6 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_6.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_7 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_7.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_8 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_8.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_9 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_9.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_10 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_10.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_11 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_11.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_12 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_12.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_13 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_13.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_14 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_14.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_15 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_15.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_16 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_16.from_cellranger_multi_to_h5mu.output_0.h5mu + 
- id: various_cart_sample_17 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_17.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_18 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_18.from_cellranger_multi_to_h5mu.output_0.h5mu + - id: various_cart_sample_19 + input: >- + s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_19.from_cellranger_multi_to_h5mu.output_0.h5mu \ No newline at end of file From 6a7ccd93c7b0b783918f0dc488b883c8b77b5a94 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 5 Mar 2025 16:30:54 +0000 Subject: [PATCH 13/21] update gitignore and test resources pointer --- .gitignore | 3 +++ _viash.yaml | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/.gitignore b/.gitignore index 2198d08..3ad2825 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,6 @@ Thumbs.db work .nextflow* target + +# viash related +resources_test diff --git a/_viash.yaml b/_viash.yaml index 6a7d20e..17094d7 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -15,3 +15,9 @@ repositories: repo: openpipelines-bio/openpipeline type: github tag: main_build + +info: + test_resources: + - type: s3 + path: s3://openpipelines-data + dest: resources_test \ No newline at end of file From a22d1ea49bb96ab9b255e7dfc188ab28dd7d9600 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 5 Mar 2025 16:59:10 +0000 Subject: [PATCH 14/21] update annotation columns --- src/atlas_service/integration_test.sh | 2 +- src/atlas_service/main.nf | 6 +- src/atlas_service/test.nf | 2 +- src/atlas_service/test.yaml | 63 ------------------- src/atlas_service/test_execution.sh | 57 ----------------- src/atlas_service/test_params.sh | 88 --------------------------- 6 files changed, 5 insertions(+), 213 deletions(-) delete mode 100644 src/atlas_service/test.yaml delete mode 100755 src/atlas_service/test_execution.sh delete mode 100755 src/atlas_service/test_params.sh diff 
--git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh index 546a62b..066f6c5 100755 --- a/src/atlas_service/integration_test.sh +++ b/src/atlas_service/integration_test.sh @@ -20,7 +20,7 @@ nextflow \ nextflow \ run . \ -main-script src/atlas_service/test.nf \ - -profile docker, no_publish \ + -profile docker,no_publish \ -resume \ -entry test_wf_2 \ -c src/utils/labels_ci.config \ diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index 06c9760..b85c3d4 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -197,9 +197,9 @@ workflow run_wf { args: [ "input_layer_lognormalized": "log_normalized", "input_obs_batch_label": "sample_id", - "output_obs_predictions": "harmony_knn_pred", - "output_obs_probability": "harmony_knn_proba", - "output_obsm_integrated": "X_integrated_harmony", + "output_obs_predictions": "scvi_knn_pred", + "output_obs_probability": "scvi_knn_proba", + "output_obsm_integrated": "X_integrated_scvi", "overwrite_existing_key": "true" ], toState: [ "query_processed": "output" ] diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf index 6c01251..e549bd3 100644 --- a/src/atlas_service/test.nf +++ b/src/atlas_service/test.nf @@ -17,7 +17,7 @@ workflow test_wf { reference_layer_lognormalized_counts: "log_normalized", reference_obs_batch: "donor_assay", reference_obs_label: "cell_type", - annotation_methods: "celltypist" + annotation_methods: "celltypist;scvi_knn;harmony_knn" ] ]) | view {"State at start: $it"} diff --git a/src/atlas_service/test.yaml b/src/atlas_service/test.yaml deleted file mode 100644 index 2314410..0000000 --- a/src/atlas_service/test.yaml +++ /dev/null @@ -1,63 +0,0 @@ -output: $id.$key.output.html -var_gene_names: gene_symbol -var_name_mitochondrial_genes: mitochondrial -var_name_ribosomal_genes: ribosomal -publish_dir: s3://itx-del-data-pipelines/jmajerci/qc_ingestion_reports -param_list: - - id: various_cart_sample_1 - input: >- - 
s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_1.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_2 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_2.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_3 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_3.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_4 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_4.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_5 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_5.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_6 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_6.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_7 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_7.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_8 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_8.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_9 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_9.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_10 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_10.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_11 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_11.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_12 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_12.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: 
various_cart_sample_13 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_13.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_14 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_14.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_15 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_15.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_16 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_16.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_17 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_17.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_18 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_18.from_cellranger_multi_to_h5mu.output_0.h5mu - - id: various_cart_sample_19 - input: >- - s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_19.from_cellranger_multi_to_h5mu.output_0.h5mu \ No newline at end of file diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh deleted file mode 100755 index 2463e37..0000000 --- a/src/atlas_service/test_execution.sh +++ /dev/null @@ -1,57 +0,0 @@ -cat > params.yaml << HERE -id: harmony -input: resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -reference: resources_test/annotation_test_data/TS_Blood_filtered.h5mu -reference_layer_lognormalized_counts: log_normalized -reference_var_gene_names: ensemblid -reference_obs_batch: donor_assay -reference_obs_label: cell_type -annotation_methods: harmony_knn;scvi_knn;celltypist -publish_dir: aaas_test -HERE - -nextflow run . 
\ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker \ --c target/nextflow/atlas_service/nextflow.config \ --c src/utils/labels_ci.config - -# cat > params.yaml << HERE -# id: celltypist -# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -# celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl -# modality: rna -# input_var_gene_names: gene_symbol -# annotation_methods: celltypist -# publish_dir: output -# HERE - -# nextflow run . \ -# -main-script target/nextflow/atlas_service/main.nf \ -# -params-file params.yaml \ -# -resume \ -# -profile docker,no_publish \ -# -c target/nextflow/atlas_service/nextflow.config \ -# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config - -# cat > params.yaml << HERE -# id: scgpt -# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -# modality: rna -# annotation_methods: scgpt_annotation -# input_var_gene_names: gene_symbol -# scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt -# scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json -# scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json -# HERE - -# nextflow run . 
\ -# -main-script target/nextflow/atlas_service/main.nf \ -# -params-file params.yaml \ -# -resume \ -# -profile docker,no_publish \ -# -c target/nextflow/atlas_service/nextflow.config \ -# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh deleted file mode 100755 index 53986e8..0000000 --- a/src/atlas_service/test_params.sh +++ /dev/null @@ -1,88 +0,0 @@ -# Test required arguments scGPT -cat > params.yaml << HERE -id: scgpt -input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: scgpt_annotation -scgpt_model_config: /home/dorienroosen/openpipeline/resources_test/scgpt/source/args.json -scgpt_model_vocab: /home/dorienroosen/openpipeline/resources_test/scgpt/source/vocab.json -HERE - -nextflow run . \ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config - -# Test required arguments CellTypist -cat > params.yaml << HERE -id: celltypist_1 -input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: celltypist -HERE - -nextflow run . 
\ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config - -cat > params.yaml << HERE -id: celltypist_2 -input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl -reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: celltypist -HERE - -nextflow run . \ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config - - -# Test required arguments Harmony -cat > params.yaml << HERE -id: harmony -input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: harmony_knn -HERE - -nextflow run . \ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config - -# Test required arguments SCVI -cat > params.yaml << HERE -id: scvi -input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu -modality: rna -input_var_gene_names: gene_symbol -annotation_methods: scvi_knn -HERE - -nextflow run . 
\ --main-script target/nextflow/atlas_service/main.nf \ --params-file params.yaml \ --resume \ --profile docker,no_publish \ --c target/nextflow/atlas_service/nextflow.config \ --c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config \ No newline at end of file From fcd1ead28673b706c0130c5a2ec9b474d108bd88 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 6 Mar 2025 18:25:18 +0000 Subject: [PATCH 15/21] finalize tests --- src/atlas_service/integration_test.sh | 21 +++++++- src/atlas_service/test.nf | 76 +++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh index 066f6c5..827f55d 100755 --- a/src/atlas_service/integration_test.sh +++ b/src/atlas_service/integration_test.sh @@ -24,5 +24,22 @@ nextflow \ -resume \ -entry test_wf_2 \ -c src/utils/labels_ci.config \ - -c src/utils/integration_tests.config \ - --publish_dir test_2 \ No newline at end of file + -c src/utils/integration_tests.config + + nextflow \ + run . \ + -main-script src/atlas_service/test.nf \ + -profile docker,no_publish \ + -resume \ + -entry test_wf_3 \ + -c src/utils/labels_ci.config \ + -c src/utils/integration_tests.config + + nextflow \ + run . 
\ + -main-script src/atlas_service/test.nf \ + -profile docker,no_publish \ + -resume \ + -entry test_wf_4 \ + -c src/utils/labels_ci.config \ + -c src/utils/integration_tests.config \ No newline at end of file diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf index e549bd3..ae151a4 100644 --- a/src/atlas_service/test.nf +++ b/src/atlas_service/test.nf @@ -43,6 +43,7 @@ workflow test_wf { } workflow test_wf_2 { + // allow changing the resources_test dir resources_test = file(params.resources_test) @@ -113,3 +114,78 @@ workflow test_wf_2 { "Output: $output" } } + +workflow test_wf_3 { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "celltypist_model", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + celltypist_model: resources_test.resolve("annotation_test_data/celltypist_model_Immune_All_Low.pkl"), + annotation_methods: "celltypist", + input_var_gene_names: "gene_symbol" + ] + ]) + | view {"State at start: $it"} + | map{ state -> [state.id, state] } + | atlas_service + | view {"After AaaS: $it"} + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id == "merged" : "Output ID should be `merged`" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" + + "Output: $output" + } +} + +workflow test_wf_4 { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "scgpt", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + annotation_methods: "scgpt_annotation", + input_var_gene_names: "gene_symbol", + scgpt_model: resources_test.resolve("scgpt/finetuned_model/best_model.pt"), + scgpt_model_config: resources_test.resolve("scgpt/source/args.json"), + scgpt_model_vocab: resources_test.resolve("scgpt/source/vocab.json"), + annotation_methods: "scgpt_annotation" + ] + ]) + | view {"State at start: $it"} + | map{ state -> [state.id, state] } + | atlas_service + | view {"After AaaS: $it"} + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id == "merged" : "Output ID should be `merged`" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" + + "Output: $output" + } +} From 57f851a712b6ebf02a885d2598b29770993a07c9 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 11 Mar 2025 17:51:22 +0000 Subject: [PATCH 16/21] update scgpt annotation --- src/atlas_service/config.vsh.yaml | 2 +- src/atlas_service/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index 47c6de9..e361553 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -356,7 +356,7 @@ dependencies: alias: process_samples_workflow repository: op - name: workflows/annotation/scgpt_annotation - repository: op + repository: op-main - name: annotate/celltypist repository: op-main alias: celltypist_annotation diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf index b85c3d4..9289ee7 100644 --- a/src/atlas_service/main.nf +++ b/src/atlas_service/main.nf @@ -67,7 +67,6 @@ workflow run_wf { "id": id, "input": state.query_processed, "modality": state.modality, - "input_layer": state.input_layer, "input_var_gene_names": state.input_var_gene_names, "model": state.scgpt_model, "model_config": state.scgpt_model_config, @@ -84,6 +83,7 @@ workflow run_wf { ] }, args: [ + "input_layer": "log_normalized", "input_obs_batch_label": "sample_id", "output_obs_predictions": "scgpt_pred", "output_obs_probability": "scgpt_proba" From f3e3ad6b87fb8af29a8a80b5840cfbff24e2680d Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 20 Mar 2025 09:40:57 +0100 Subject: [PATCH 17/21] add viash config mods --- _viash.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/_viash.yaml b/_viash.yaml index 56e5e0c..927260c 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -21,3 +21,8 @@ info: - type: s3 path: s3://openpipelines-bio/openpipeline_incubator/resources_test dest: resources_test + +config_mods: | + .resources += {path: '/src/utils/labels.config', dest: 'nextflow_labels.config'} + .runners[.type == 
'nextflow'].directives.tag := '$id' + .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")' From 3e47eddf9b2978780455f7dc1820bea62ed8c21b Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 20 Mar 2025 09:41:13 +0100 Subject: [PATCH 18/21] add viash config mods --- src/labels.config | 68 ----------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 src/labels.config diff --git a/src/labels.config b/src/labels.config deleted file mode 100644 index 541aaad..0000000 --- a/src/labels.config +++ /dev/null @@ -1,68 +0,0 @@ -process { - // Default resources for components that hardly do any processing - memory = { 2.GB * task.attempt } - cpus = 1 - - // Retry for exit codes that have something to do with memory issues - errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } - maxRetries = 3 - maxMemory = null - - // CPU resources - withLabel: singlecpu { cpus = 1 } - withLabel: lowcpu { cpus = 4 } - withLabel: midcpu { cpus = 10 } - withLabel: highcpu { cpus = 20 } - - // Memory resources - withLabel: lowmem { memory = { get_memory( 50.GB * task.attempt ) } } - withLabel: midmem { memory = { get_memory( 50.GB * task.attempt ) } } - withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } } - withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } } - - // Disk space - // Nextflow apparently can't handle empty directives, i.e. - // withLabel: lowdisk {} - // so for that reason we have to add a dummy directive - withLabel: lowdisk { - dummyDirective = "dummyValue" - } - withLabel: middisk { - dummyDirective = "dummyValue" - } - withLabel: highdisk { - dummyDirective = "dummyValue" - } - withLabel: veryhighdisk { - dummyDirective = "dummyValue" - } - // NOTE: The above labels intentionally do not have an effect by default. 
- // The user should set the disk space requirements by adding the following - // to the compute environment: - // - // withLabel: lowdisk { disk = { 20.GB * task.attempt } } - // withLabel: middisk { disk = { 100.GB * task.attempt } } - // withLabel: highdisk { disk = { 200.GB * task.attempt } } - // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } -} - -def get_memory(to_compare) { - if (!process.containsKey("maxMemory") || !process.maxMemory) { - return to_compare - } - - try { - if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { - return process.maxMemory - } - else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { - return max_memory as nextflow.util.MemoryUnit - } - else { - return to_compare - } - } catch (all) { - println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" - System.exit(1) - } -} From 210c3918321536a85bc32b960cc78b219016ce4f Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 20 Mar 2025 09:41:58 +0100 Subject: [PATCH 19/21] add viash config mods --- src/atlas_service/nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/atlas_service/nextflow.config b/src/atlas_service/nextflow.config index 71b491a..41a4570 100644 --- a/src/atlas_service/nextflow.config +++ b/src/atlas_service/nextflow.config @@ -7,4 +7,4 @@ params { } // include common settings -includeConfig("${params.rootDir}/src/labels.config") \ No newline at end of file +includeConfig("${params.rootDir}/src/utils/labels.config") \ No newline at end of file From a887eb7c956ba72a99a1fecda63cf7713d400da0 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 20 Mar 2025 09:46:06 +0100 Subject: [PATCH 20/21] update labels --- src/utils/labels.config | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 
src/utils/labels.config diff --git a/src/utils/labels.config b/src/utils/labels.config new file mode 100644 index 0000000..541aaad --- /dev/null +++ b/src/utils/labels.config @@ -0,0 +1,68 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = null + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { get_memory( 50.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 50.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } } + withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } } + + // Disk space + // Nextflow apparently can't handle empty directives, i.e. + // withLabel: lowdisk {} + // so for that reason we have to add a dummy directive + withLabel: lowdisk { + dummyDirective = "dummyValue" + } + withLabel: middisk { + dummyDirective = "dummyValue" + } + withLabel: highdisk { + dummyDirective = "dummyValue" + } + withLabel: veryhighdisk { + dummyDirective = "dummyValue" + } + // NOTE: The above labels intentionally do not have an effect by default. 
+ // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} + +def get_memory(to_compare) { // Caps a requested memory amount at process.maxMemory; returns the request unchanged when no maxMemory is set + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) { + return process.maxMemory as nextflow.util.MemoryUnit // request exceeds the ceiling: clamp to process.maxMemory + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } +} From cfa3a45191107157648e9e9e1b0d4c688dbca281 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 20 Mar 2025 09:59:20 +0100 Subject: [PATCH 21/21] update _viash and dependencies --- _viash.yaml | 2 +- src/atlas_service/config.vsh.yaml | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 927260c..51ace0d 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -14,7 +14,7 @@ repositories: - name: openpipeline repo: openpipelines-bio/openpipeline type: github - tag: main_build + tag: 2.0.0 info: test_resources: diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml index e361553..70137dd 100644 --- a/src/atlas_service/config.vsh.yaml +++ b/src/atlas_service/config.vsh.yaml @@ -348,13 +348,9 @@ argument_groups: example: output.h5mu dependencies: - - name: metadata/add_id - repository: op - - name: dataflow/split_h5mu - repository: op - name: workflows/multiomics/process_samples alias: 
process_samples_workflow - repository: op + repository: openpipeline - name: workflows/annotation/scgpt_annotation repository: op-main - name: annotate/celltypist @@ -368,10 +364,6 @@ dependencies: alias: scvi_knn_annotation repositories: - - name: op - type: github - repo: openpipelines-bio/openpipeline - tag: 2.0.0 - name: op-main type: github repo: openpipelines-bio/openpipeline