From 70d83d2389d116f75d9771b9f561435a7ae6d689 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 16 Sep 2025 19:48:46 +0200 Subject: [PATCH 1/4] remove scgpt components and workflows --- src/scgpt/binning/config.vsh.yaml | 93 ----- src/scgpt/binning/script.py | 129 ------- src/scgpt/binning/test.py | 59 --- .../cell_type_annotation/config.vsh.yaml | 164 -------- src/scgpt/cell_type_annotation/script.py | 255 ------------- src/scgpt/cell_type_annotation/test.py | 215 ----------- src/scgpt/cross_check_genes/config.vsh.yaml | 99 ----- src/scgpt/cross_check_genes/script.py | 70 ---- src/scgpt/cross_check_genes/test.py | 96 ----- src/scgpt/embedding/config.vsh.yaml | 147 -------- src/scgpt/embedding/script.py | 187 ---------- src/scgpt/embedding/test.py | 350 ------------------ src/scgpt/pad_tokenize/config.vsh.yaml | 125 ------- src/scgpt/pad_tokenize/script.py | 111 ------ src/scgpt/pad_tokenize/test.py | 113 ------ .../scgpt_annotation/config.vsh.yaml | 211 ----------- .../scgpt_annotation/integration_test.sh | 14 - .../annotation/scgpt_annotation/main.nf | 112 ------ .../scgpt_annotation/nextflow.config | 10 - .../annotation/scgpt_annotation/test.nf | 58 --- .../integration/scgpt_leiden/config.vsh.yaml | 185 --------- .../scgpt_leiden/integration_test.sh | 23 -- .../integration/scgpt_leiden/main.nf | 138 ------- .../integration/scgpt_leiden/nextflow.config | 10 - .../integration/scgpt_leiden/test.nf | 100 ----- 25 files changed, 3074 deletions(-) delete mode 100644 src/scgpt/binning/config.vsh.yaml delete mode 100644 src/scgpt/binning/script.py delete mode 100644 src/scgpt/binning/test.py delete mode 100644 src/scgpt/cell_type_annotation/config.vsh.yaml delete mode 100644 src/scgpt/cell_type_annotation/script.py delete mode 100644 src/scgpt/cell_type_annotation/test.py delete mode 100644 src/scgpt/cross_check_genes/config.vsh.yaml delete mode 100644 src/scgpt/cross_check_genes/script.py delete mode 100644 src/scgpt/cross_check_genes/test.py delete mode 100644 src/scgpt/embedding/config.vsh.yaml delete mode 100644 src/scgpt/embedding/script.py delete mode 100644 src/scgpt/embedding/test.py delete mode 100644 src/scgpt/pad_tokenize/config.vsh.yaml delete mode 100644 src/scgpt/pad_tokenize/script.py delete mode 100644 src/scgpt/pad_tokenize/test.py delete mode 100644 src/workflows/annotation/scgpt_annotation/config.vsh.yaml delete mode 100755 src/workflows/annotation/scgpt_annotation/integration_test.sh delete mode 100644 src/workflows/annotation/scgpt_annotation/main.nf delete mode 100644 src/workflows/annotation/scgpt_annotation/nextflow.config delete mode 100644 src/workflows/annotation/scgpt_annotation/test.nf delete mode 100644 src/workflows/integration/scgpt_leiden/config.vsh.yaml delete mode 100755 src/workflows/integration/scgpt_leiden/integration_test.sh delete mode 100644 src/workflows/integration/scgpt_leiden/main.nf delete mode 100644 src/workflows/integration/scgpt_leiden/nextflow.config delete mode 100644 src/workflows/integration/scgpt_leiden/test.nf diff --git a/src/scgpt/binning/config.vsh.yaml b/src/scgpt/binning/config.vsh.yaml deleted file mode 100644 index 2d3fcdb0486..00000000000 --- a/src/scgpt/binning/config.vsh.yaml +++ /dev/null @@ -1,93 +0,0 @@ -name: binning -namespace: "scgpt" -scope: "public" -description: | - Conversion of (pre-processed) expression count data into relative values (bins) to address scale differences across sequencing batches. 
-authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - Input h5mu file. - - name: "--modality" - description: - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--input_layer" - type: string - required: False - description: | - Mudata layer (key from .layers) to use as input data for binning. If not specified, .X is used. - - name: "--var_input" - type: string - default: "id_in_vocab" - description: | - The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes. - - name: "--n_input_bins" - type: integer - default: 51 - required: False - min: 1 - description: | - The number of bins to discretize the data into. When no value is provided, data won't be binned. - - - name: Outputs - arguments: - - name: "--output" - direction: output - type: file - example: output.h5mu - required: true - description: | - The output h5mu file containing the binned data. - - name: "--output_obsm_binned_counts" - type: string - default: "binned_counts" - description: | - The name of the adata layer to write the binned data to. - - name: "--seed" - type: integer - description: | - Seed for random number generation. - __merge__: [., /src/base/h5_compression_argument.yaml] - - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py - - path: /src/utils/subset_vars.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu - -engines: - - type: docker - image: python:3.11-slim - setup: - - type: apt - packages: - - procps - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] 
- __merge__: [ /src/base/requirements/python_test_setup.yaml ] -runners: - - type: executable - - type: nextflow - directives: - label: [ midcpu, midmem ] diff --git a/src/scgpt/binning/script.py b/src/scgpt/binning/script.py deleted file mode 100644 index cac9312b576..00000000000 --- a/src/scgpt/binning/script.py +++ /dev/null @@ -1,129 +0,0 @@ -import sys -import mudata as mu -import numpy as np -from scipy.sparse import csr_matrix -import warnings - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_genes_cross_checked.h5mu", - "output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu", - "modality": "rna", - "input_layer": None, - "output_obsm_binned_counts": "binned_counts", - "n_input_bins": 51, - "output_compression": None, - "var_input": "id_in_vocab", - "seed": 0, -} -meta = {"resources_dir": "src/utils"} -## VIASH END - -if par["seed"]: - np.random.seed(par["seed"]) - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger -from subset_vars import subset_vars - -logger = setup_logger() - -logger.info("Reading in data") -# Read in data -mdata = mu.read(par["input"]) -input_adata = mdata.mod[par["modality"]] -adata = input_adata.copy() - -logger.info("Subsetting data based on highly variable gene and/or cross-checked genes") -adata = subset_vars(adata, par["var_input"]) - -logger.info("Converting the input layer into a CSR matrix") -if not par["input_layer"] or par["input_layer"] == "X": - layer_data = adata.X -else: - layer_data = adata.layers[par["input_layer"]] -layer_data = csr_matrix(layer_data) - -if layer_data.min() < 0: - raise ValueError( - f"Assuming non-negative data, but got min value {layer_data.min()}." - ) - -n_bins = par["n_input_bins"] # NOTE: the first bin is always a spectial for zero -logger.info(f"Binning data into {par['n_input_bins']} bins.") - - -def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: - assert x.ndim == 1 and bins.ndim == 1 - - left_digits = np.digitize(x, bins) - right_difits = np.digitize(x, bins, right=True) - - rands = np.random.rand(len(x)) # uniform random numbers - - digits = rands * (right_difits - left_digits) + left_digits - digits = np.ceil(digits) - smallest_dtype = np.min_scalar_type( - digits.max().astype(np.uint) - ) # Already checked for non-negative values - digits = digits.astype(smallest_dtype) - - return digits - - -with warnings.catch_warnings(): - # Make sure warnings are displayed once. - warnings.simplefilter("once") - # layer_data.indptr.size is the number of rows in the sparse matrix - binned_rows = [] - bin_edges = [] - logger.info( - "Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix" - ) - for row_number in range(layer_data.indptr.size - 1): - row_start_index, row_end_index = ( - layer_data.indptr[row_number], - layer_data.indptr[row_number + 1], - ) - # These are all non-zero counts in the row - non_zero_row = layer_data.data[row_start_index:row_end_index] - if len(non_zero_row) == 0: - logger.warning( - "The input data contains all zero rows. Please make sure " - "this is expected. You can use the `filter_cell_by_counts` " - "arg to filter out all zero rows." 
- ) - - # Add binned_rows and bin_edges as all 0 - # np.stack will upcast the dtype later - binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8)) - bin_edges.append(np.array([0] * n_bins)) - continue - - # Binning of non-zero values - bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1)) - non_zero_digits = _digitize(non_zero_row, bins) - assert non_zero_digits.min() >= 1 - assert non_zero_digits.max() <= n_bins - 1 - binned_rows.append(non_zero_digits) - - bin_edges.append(np.concatenate([[0], bins])) - -# Create new CSR matrix -logger.info("Creating a new CSR matrix of the binned count values") -binned_counts = csr_matrix( - ( - np.concatenate(binned_rows, casting="same_kind"), - layer_data.indices, - layer_data.indptr, - ), - shape=layer_data.shape, -) - -# Set binned values and bin edges layers to adata object -input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts -input_adata.obsm["bin_edges"] = np.stack(bin_edges) - -# Write mudata output -logger.info("Writing output data") -mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/binning/test.py b/src/scgpt/binning/test.py deleted file mode 100644 index 4cfad752966..00000000000 --- a/src/scgpt/binning/test.py +++ /dev/null @@ -1,59 +0,0 @@ -import pytest -import sys -import mudata as mu -from scipy.sparse import issparse - -## VIASH START -meta = { - "resources_dir": "resources_test", - "executable": "./target/docker/scgpt/binning/binning", - "temp_dir": "tmp", - "config": "./target/docker/scgpt/binning/.config.vsh.yaml", -} -## VIASH END - - -def test_binning(run_component, tmp_path): - input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_preprocessed.h5mu" - output_file_path = tmp_path / "Kim2020_Lung_subset_binned.h5mu" - - run_component( - [ - "--input", - input_file_path, - "--modality", - "rna", - "--output_obsm_binned_counts", - "binned_counts", - "--n_input_bins", - "51", - "--var_input", - "filter_with_hvg", - "--output", - output_file_path, - ] - ) - - # Read output file - output_mdata = mu.read(output_file_path) - output_adata = output_mdata.mod["rna"] - - # Check presence of binning layers - assert {"bin_edges", "binned_counts"}.issubset(output_adata.obsm.keys()), ( - "Binning obsm fields were not added." - ) - - # Check bin edges - bin_edges = output_adata.obsm["bin_edges"] - assert all(bin_edges[:, 0] == 0) - assert bin_edges.shape[1] == 51 - assert all(all(i >= 0) for i in bin_edges) - - # Check binned values - binned_values = output_adata.obsm["binned_counts"] - assert issparse(binned_values) - assert (binned_values.data <= 51).all(axis=None) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cell_type_annotation/config.vsh.yaml b/src/scgpt/cell_type_annotation/config.vsh.yaml deleted file mode 100644 index 165cb3b74d8..00000000000 --- a/src/scgpt/cell_type_annotation/config.vsh.yaml +++ /dev/null @@ -1,164 +0,0 @@ -name: cell_type_annotation -namespace: "scgpt" -scope: "public" -description: | - Annotate gene expression data with cell type classes through the scGPT model. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/jakub_majercik.yaml - roles: [ author ] - -argument_groups: - - name: Model input - arguments: - - name: "--model" - type: file - required: true - example: best_model.pt - description: | - The model file containing checkpoints and cell type label mapper. 
- - name: "--model_config" - type: file - required: true - example: args.json - description: | - The model configuration file. - - name: "--model_vocab" - type: file - required: true - example: vocab.json - description: | - Model vocabulary file directory. - - name: "--finetuned_checkpoints_key" - type: string - default: model_state_dict - description: | - Key in the model file containing the pretrained checkpoints. - - name: "--label_mapper_key" - type: string - default: id_to_class - description: | - Key in the model file containing the cell type class to label mapper dictionary. - - - name: Query input - arguments: - - name: "--input" - type: file - direction: input - required: true - example: scgpt_preprocess_ouput.h5mu - description: | - The input h5mu file containing of data that have been pre-processed (normalized, binned, genes cross-checked and tokenized). - - name: "--modality" - description: - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--obs_batch_label" - type: string - required: false - description: | - The name of the adata.obs column containing the batch labels. Required if dsbn is set to true. - - name: "--obsm_gene_tokens" - type: string - default: "gene_id_tokens" - description: | - The key of the .obsm array containing the gene token ids - - name: "--obsm_tokenized_values" - type: string - default: values_tokenized - description: | - The key of the .obsm array containing the count values of the tokenized genes - - - name: Outputs - arguments: - - name: "--output" - type: file - direction: output - required: true - example: output.h5mu - description: | - The output mudata file. - - name: "--output_obs_predictions" - type: string - default: "scgpt_pred" - required: false - description: | - The name of the adata.obs column to write predicted cell type labels to. - - name: "--output_obs_probability" - type: string - default: "scgpt_probability" - required: false - description: | - The name of the adata.obs column to write the probabilities of the predicted cell type labels to. - __merge__: [., /src/base/h5_compression_argument.yaml] - - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - The padding token used in the model. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding. - - name: "--n_input_bins" - type: integer - default: 51 - required: false - description: | - The number of input bins. - - name: "--batch_size" - type: integer - default: 64 - required: false - description: | - The batch size. - - name: "--dsbn" - type: boolean - default: true - required: false - description: | - Whether to use domain-specific batch normalization. - - name: "--seed" - type: integer - description: | - Seed for random number generation. If not specified, no seed is used. 
- -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu - - path: /resources_test/scgpt/source/args.json - - path: /resources_test/scgpt/source/vocab.json - - path: /resources_test/scgpt/finetuned_model/best_model.pt - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] - - type: python - packages: - - scgpt==0.2.1 - test_setup: - - type: python - __merge__: [/src/base/requirements/scanpy.yaml] - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] -runners: - - type: executable - - type: nextflow - directives: - label: [ highmem, highcpu, gpu ] diff --git a/src/scgpt/cell_type_annotation/script.py b/src/scgpt/cell_type_annotation/script.py deleted file mode 100644 index c3487712512..00000000000 --- a/src/scgpt/cell_type_annotation/script.py +++ /dev/null @@ -1,255 +0,0 @@ -import sys -import json -from multiprocessing import freeze_support -import os -import mudata as mu -from typing import Dict -import warnings -import torch -import numpy as np -from torch.nn import functional -from torch.utils.data import Dataset, DataLoader -from scgpt.model import TransformerModel -from scgpt.tokenizer.gene_tokenizer import GeneVocab -from scgpt.utils import set_seed -from tqdm import tqdm - -## VIASH START -par = { - "input": r"resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "modality": r"rna", - "model": r"resources_test/scgpt/finetuned_model/best_model.pt", - "model_config": r"resources_test/scgpt/source/args.json", - "model_vocab": r"resources_test/scgpt/source/vocab.json", - "obs_batch_label": r"sample", - "obsm_gene_tokens": r"gene_id_tokens", - "obsm_tokenized_values": r"values_tokenized", - "output": r"output.h5mu", - "output_compression": None, - "output_obs_predictions": r"predictions", - "output_obs_probability": r"probabilities", - "dsbn": True, - "seed": 0, - "pad_token": "", - "pad_value": -2, - "n_input_bins": 51, - "batch_size": 64, - "finetuned_checkpoints_key": "model_state_dict", - "label_mapper_key": "id_to_class", -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - - -class SeqDataset(Dataset): - def __init__(self, data: Dict[str, torch.Tensor]): - self.data = data - - def __len__(self): - return self.data["gene_ids"].shape[0] - - def __getitem__(self, idx): - return {k: v[idx] for k, v in self.data.items()} - - -def main(): - # Setting seed - if par["seed"]: - set_seed(par["seed"]) - - # Setting device - logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Read in data - logger.info("Reading in data") - mdata = mu.read(par["input"]) - input_adata = mdata.mod[par["modality"]] - adata = input_adata.copy() - - # Fetch batch ids for domain-specific batch normalization - if par["dsbn"] and not par["obs_batch_label"]: - raise ValueError( - "When dsbn is set to True, you are required to provide batch labels (obs_batch_labels)." 
- ) - elif par["dsbn"] and par["obs_batch_label"]: - logger.info("Fetching batch id's for domain-specific batch normalization") - batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") - batch_id_labels = batch_id_cats.cat.codes.values - batch_ids = batch_id_labels.tolist() - batch_ids = np.array(batch_ids) - num_batch_types = len(set(batch_ids)) - elif not par["dsbn"]: - # forward pass requires a tensor as input - batch_ids = np.zeros(adata.shape[0]) - - # Vocabulary configuration - logger.info("Loading model vocabulary") - special_tokens = [par["pad_token"], "", ""] - logger.info(f"Loading model vocab from {par['model_vocab']}") - vocab_file = par["model_vocab"] - vocab = GeneVocab.from_file(vocab_file) - [vocab.append_token(s) for s in special_tokens if s not in vocab] - vocab.set_default_index(vocab[par["pad_token"]]) - ntokens = len(vocab) - - # Model configuration - logger.info("Loading model and configurations") - model_config_file = par["model_config"] - with open(model_config_file, "r") as f: - model_configs = json.load(f) - embsize = model_configs["embsize"] - nhead = model_configs["nheads"] - d_hid = model_configs["d_hid"] - nlayers = model_configs["nlayers"] - - # Ensure the provided model has the correct architecture - logger.info("Loading model") - model_file = par["model"] - model_dict = torch.load(model_file, map_location=device) - for k, v in { - "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], - "--label_mapper_key": par["label_mapper_key"], - }.items(): - if v not in model_dict.keys(): - raise KeyError( - f"The key '{v}' provided for '{k}' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." - ) - pretrained_dict = model_dict[par["finetuned_checkpoints_key"]] - - # Label mapper configuration - logger.info("Loading label mapper") - label_mapper = model_dict[par["label_mapper_key"]] - cell_type_mapper = {int(k): v for k, v in label_mapper.items()} - n_cls = len(cell_type_mapper) - - # Model instatiation - logger.info("Instantiating model") - model = TransformerModel( - ntokens, - d_model=embsize, # self.encoder (GenEncoder), self.value_encoder (ContinuousValueEncoder), self.transformerencoder(TransformerEncoderLayer) - nhead=nhead, # self.transformer_encoder(TransformerEncoderLayer) - d_hid=d_hid, # self.transformer_encoder(TransformerEncoderLayer) - nlayers=nlayers, # self.transformer_encoder(TransformerEncoderLayer), self.cls_decoder - nlayers_cls=3, # self.cls_decoder - n_cls=n_cls, # self.cls_decoder - vocab=vocab, - dropout=0.2, # self.transformer_encoder - pad_token=par["pad_token"], - pad_value=par["pad_value"], - do_mvc=False, - do_dab=False, - use_batch_labels=par["dsbn"], - num_batch_labels=num_batch_types if par["dsbn"] else None, - domain_spec_batchnorm=par["dsbn"], - input_emb_style="continuous", - n_input_bins=par["n_input_bins"], - cell_emb_style="cls", # required for cell-type annotation - use_fast_transformer=False, # TODO: parametrize when GPU is available - fast_transformer_backend="flash", # TODO: parametrize when GPU is available - pre_norm=False, # TODO: parametrize when GPU is available - ) - - # Load model params - logger.info(f"Loading model params from {model_file}") - try: - model.load_state_dict(pretrained_dict) - except RuntimeError: - logger.info("only load params that are in the model and match the size") - model_dict = model.state_dict() - pretrained_dict = { - k: v - for k, v in pretrained_dict.items() - 
if k in model_dict and v.shape == model_dict[k].shape - } - for k, v in pretrained_dict.items(): - logger.info(f"Loading params {k} with shape {v.shape}") - model_dict.update(pretrained_dict) - model.load_state_dict(model_dict) - - model.to(device) - - # Load tokenized gene data - logger.info("Loading data for inference") - for k, v in { - "--obsm_gene_tokens": par["obsm_gene_tokens"], - "--obsm_tokenized_values": par["obsm_tokenized_values"], - }.items(): - if v not in adata.obsm.keys(): - raise KeyError( - f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" - ) - - input_gene_ids = adata.obsm[par["obsm_gene_tokens"]] - input_values = adata.obsm[par["obsm_tokenized_values"]] - - data_pt = { - "gene_ids": input_gene_ids, - "values": input_values, - "batch_labels": torch.from_numpy(batch_ids).long(), - } - - data_loader = DataLoader( - dataset=SeqDataset(data_pt), - batch_size=par["batch_size"], - num_workers=min(os.cpu_count(), par["batch_size"] // 2), - pin_memory=True, - ) - - # Inference - logger.info("Predicting cell type classes") - model.eval() - predictions = [] - probabilities = [] - with torch.no_grad(): - for batch_data in tqdm(data_loader): - input_gene_ids = batch_data["gene_ids"].to(device) - input_values = batch_data["values"].to(device) - batch_labels = batch_data["batch_labels"].to(device) - - src_key_padding_mask = input_gene_ids.eq(vocab[par["pad_token"]]) - with torch.cuda.amp.autocast(enabled=False): - output_dict = model( - input_gene_ids, - input_values, - src_key_padding_mask=src_key_padding_mask, - batch_labels=batch_labels if par["dsbn"] else None, - CLS=True, # Return celltype classification objective output - CCE=False, - MVC=False, - ECS=False, - ) - output_values = output_dict["cls_output"] - - preds = output_values.argmax(1).cpu().numpy() - predictions.append(preds) - - probs = functional.softmax(output_values, dim=1).max(1)[0] - probabilities.append(probs.cpu().numpy()) - - predictions = np.concatenate(predictions, axis=0) - probabilities = np.concatenate(probabilities, axis=0) - - # Assign cell type labels to predicted classes - logger.info("Assigning cell type predictions and probabilities") - adata.obs["scgpt_class_pred"] = predictions - adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map( - lambda x: cell_type_mapper[x] - ) - adata.obs[par["output_obs_probability"]] = probabilities - - # Write output - logger.info("Writing output data") - mdata.mod[par["modality"]] = adata - mdata.write(par["output"], compression=par["output_compression"]) - - -if __name__ == "__main__": - freeze_support() - warnings.filterwarnings("ignore") - main() diff --git a/src/scgpt/cell_type_annotation/test.py b/src/scgpt/cell_type_annotation/test.py deleted file mode 100644 index 2d2eb9bd9ca..00000000000 --- a/src/scgpt/cell_type_annotation/test.py +++ /dev/null @@ -1,215 +0,0 @@ -import pytest -from mudata import read_h5mu -import sys -import subprocess -import re - - -input_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" -ft_model = f"{meta['resources_dir']}/best_model.pt" -model_config = f"{meta['resources_dir']}/args.json" -model_vocab = f"{meta['resources_dir']}/vocab.json" - - -def test_cell_type_inference(run_component, tmp_path): - output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu" - - args = [ - "--input", - input_path, - "--output", - output_annotation_file, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - 
"--model", - ft_model, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--obs_batch_label", - "sample", - "--dsbn", - "True", - ] - run_component(args) - - output_mudata = read_h5mu(output_annotation_file) - output_adata = output_mudata.mod["rna"] - assert "scgpt_pred" in output_adata.obs.keys(), ( - "scgpt_pred is not present in anndata obs keys" - ) - assert "scgpt_probability" in output_adata.obs.keys(), ( - "scgpt_probability is not present in anndata obs keys" - ) - - # run withou dsbn - output_annotation_file_without_dsbn = ( - tmp_path / "Kim2020_Lung_subset_annotated_no_dsbn.h5mu" - ) - args = [ - "--input", - input_path, - "--output", - output_annotation_file_without_dsbn, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--obs_batch_label", - "sample", - "--dsbn", - "False", - ] - run_component(args) - # Read output file - output_mdata_no_dsbn = read_h5mu(output_annotation_file_without_dsbn) - output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] - - # Assert that embeddings without dsbn are different - assert not ( - output_adata.obs["scgpt_pred"].astype(str) - == output_adata_no_dsbn.obs["scgpt_pred"].astype(str) - ).all(), "Cell type predictions with and without dsbn are the same" - - -def test_annotation_dsbn_without_batch_labels(run_component, tmp_path): - output_annotation_labels_without_dsbn = ( - tmp_path / "Kim2020_Lung_subset_annotated_labels_without_dsbn.h5mu" - ) - - args = [ - "--input", - input_path, - "--output", - output_annotation_labels_without_dsbn, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--dsbn", - "True", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: When dsbn is set to True, you are required to provide batch labels \(obs_batch_labels\)\.", - err.value.stdout.decode("utf-8"), - ) - - -def test_annotation_non_existing_keys(run_component, tmp_path): - output_annotation_dummy_values = ( - tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" - ) - - # Test for non-existing tokenized values key - args = [ - "--input", - input_path, - "--output", - output_annotation_dummy_values, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "dummy_values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--obs_batch_label", - "sample", - "--dsbn", - "True", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', - err.value.stdout.decode("utf-8"), - ) - - -def test_checkpoint_architecture(run_component, tmp_path): - output_dummy_model_key = tmp_path / 
"Kim2020_Lung_subset_annotated_dummy_key.h5mu" - - # Test for non-existing model file keys - args = [ - "--input", - input_path, - "--output", - output_dummy_model_key, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "dummy_checkpoints_key", - "--label_mapper_key", - "id_to_class", - "--obs_batch_label", - "sample", - "--dsbn", - "True", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r'KeyError: "The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper."', - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cross_check_genes/config.vsh.yaml b/src/scgpt/cross_check_genes/config.vsh.yaml deleted file mode 100644 index c51bd606b6f..00000000000 --- a/src/scgpt/cross_check_genes/config.vsh.yaml +++ /dev/null @@ -1,99 +0,0 @@ -name: cross_check_genes -namespace: "scgpt" -scope: "public" -description: | - Cross-check genes with pre-trained scGPT model. -authors: - - __merge__: /src/authors/jakub_majercik.yaml - roles: [ author ] - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - The input h5mu file containing of pre-processed data. - - name: "--modality" - type: string - default: "rna" - required: false - description: | - The modality key of the MuData object containing the RNA AnnData object. - - name: "--vocab_file" - type: file - direction: input - required: true - example: resources_test/scgpt/vocab.json - description: | - Model vocabulary file path. - - name: "--input_var_gene_names" - type: string - example: "gene_name" - required: false - description: | - The name of the adata.var column containing gene names. By default the .var index will be used. - - name: "--var_input" - type: string - required: false - description: ".var column containing highly variable genes. If provided, will only cross-check HVG filtered genes with model vocabulary." - - name: Outputs - arguments: - - name: "--output" - type: file - direction: output - required: true - example: output.h5mu - description: | - The output cross-checked anndata file. - - name: "--output_var_filter" - type: string - default: "id_in_vocab" - description: In which .var slot to store a boolean array corresponding to which observations should be filtered out based on HVG and model vocabulary. - __merge__: [., /src/base/h5_compression_argument.yaml] - - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - The padding token used in the model. 
-resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu - - path: /resources_test/scgpt/source/vocab.json - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] - - type: python - packages: - - scgpt==0.2.1 - test_setup: - - type: python - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] -runners: - - type: executable - - type: nextflow - directives: - label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/scgpt/cross_check_genes/script.py b/src/scgpt/cross_check_genes/script.py deleted file mode 100644 index c181d81534e..00000000000 --- a/src/scgpt/cross_check_genes/script.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -import mudata as mu -from scgpt.tokenizer.gene_tokenizer import GeneVocab - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu", - "output": "output.h5mu", - "modality": "rna", - "input_var_gene_names": None, - "output_var_filter": "id_in_vocab", - "pad_token": "", - "var_input": "filter_with_hvg", - "vocab_file": "resources_test/scgpt/source/vocab.json", - "output_compression": None, -} - -meta = {"resources_dir": "src/utils"} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - -# Read in data -logger.info(f"Reading {par['input']}") -mudata = mu.read_h5mu(par["input"]) -adata = mudata.mod[par["modality"]].copy() - -pad_token = par["pad_token"] -special_tokens = [pad_token, "", ""] - -# Fetching gene names -if not par["input_var_gene_names"]: - genes = adata.var.index.astype(str).tolist() -elif par["input_var_gene_names"] not in adata.var.columns: - raise ValueError( - f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs." 
- ) -else: - genes = adata.var[par["input_var_gene_names"]].astype(str).tolist() - -# Cross-check genes with pre-trained model -logger.info(f"Loading model vocab from {par['vocab_file']}") -vocab_file = par["vocab_file"] -vocab = GeneVocab.from_file(vocab_file) -[vocab.append_token(s) for s in special_tokens if s not in vocab] - -if par["var_input"]: - logger.info("Filtering genes based on model vocab and HVG") - filter_with_hvg = adata.var[par["var_input"]].tolist() - gene_filter_mask = [ - 1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg) - ] - logger.info( - f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}" - ) -else: - logger.info("Filtering genes based on model vocab") - gene_filter_mask = [1 if gene in vocab else 0 for gene in genes] - logger.info( - f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}" - ) - -logger.info(f"Writing to {par['output']}") -adata.var[par["output_var_filter"]] = gene_filter_mask -adata.var[par["output_var_filter"]] = adata.var[par["output_var_filter"]].astype("bool") -mudata.mod[par["modality"]] = adata -mudata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/cross_check_genes/test.py b/src/scgpt/cross_check_genes/test.py deleted file mode 100644 index bb8c53a1349..00000000000 --- a/src/scgpt/cross_check_genes/test.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -import subprocess -from mudata import read_h5mu -import re -import sys - -## VIASH START -meta = { - "executable": "./target/docker/scgpt/cross_check/cross_check", - "resources_dir": "./resources_test/scgpt/", - "config": "./src/scgpt/cross_check/config.vsh.yaml", -} -## VIASH END - -input_path = meta["resources_dir"] + "/Kim2020_Lung_subset_preprocessed.h5mu" -vocab_path = meta["resources_dir"] + "/vocab.json" - - -def test_cross_check(run_component, random_path): - output_path = random_path(extension="h5mu") - args = [ - "--input", - input_path, - "--output", - output_path, - "--modality", - "rna", - "--vocab_file", - vocab_path, - "--output_compression", - "gzip", - ] - run_component(args) - - output_mudata = read_h5mu(output_path) - - # Check added columns - assert {"gene_name", "id_in_vocab"}.issubset( - set(output_mudata.mod["rna"].var.columns) - ), "Gene columns were not added." - # Check if genes were filtered - assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( - output_mudata.mod["rna"].var["id_in_vocab"] - ), "Genes were not filtered." - - output_hvg_path = random_path(extension="h5mu") - args_hvg = [ - "--input", - input_path, - "--output", - output_hvg_path, - "--modality", - "rna", - "--var_input", - "filter_with_hvg", - "--vocab_file", - vocab_path, - "--output_compression", - "gzip", - ] - - run_component(args_hvg) - - output_mudata_hvg = read_h5mu(output_hvg_path) - # Check if genes were filtered based on HVG - assert sum(output_mudata_hvg.mod["rna"].var["id_in_vocab"]) != len( - output_mudata_hvg.mod["rna"].var["id_in_vocab"] - ), "Genes were not filtered." - assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( - output_mudata_hvg.mod["rna"].var["id_in_vocab"] - ), "Genes were not filtered based on HVG." 
- - -def test_cross_check_invalid_gene_layer_raises(run_component, random_path): - output_path = random_path(extension="h5mu") - args = [ - "--input", - input_path, - "--output", - output_path, - "--vocab_file", - vocab_path, - "--input_var_gene_names", - "dummy_var", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: Gene name column 'dummy_var' not found in .mod\['rna'\]\.obs\.", - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/embedding/config.vsh.yaml b/src/scgpt/embedding/config.vsh.yaml deleted file mode 100644 index fd238696c1e..00000000000 --- a/src/scgpt/embedding/config.vsh.yaml +++ /dev/null @@ -1,147 +0,0 @@ -name: embedding -namespace: scgpt -scope: "public" -description: | - Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - The input h5mu file containing tokenized gene and count data. - - name: "--modality" - description: | - Which modality from the input MuData file to process. - - type: string - default: "rna" - required: false - - name: "--model" - type: file - direction: input - required: true - example: best_model.pt - description: | - Path to scGPT model file. - - name: "--model_vocab" - type: file - direction: input - required: true - example: vocab.json - description: | - Path to scGPT model vocabulary file. - - name: "--model_config" - type: file - direction: input - required: true - example: args.json - description: | - Path to scGPT model config file. - - name: "--obsm_gene_tokens" - type: string - default: "gene_id_tokens" - description: | - The key of the .obsm array containing the gene token ids - example: values.pt - - name: "--obsm_tokenized_values" - type: string - default: values_tokenized - description: | - The key of the .obsm array containing the count values of the tokenized genes - - name: "--obsm_padding_mask" - type: string - default: padding_mask - description: | - The key of the .obsm array containing the padding mask. - - name: "--var_gene_names" - type: string - description: | - The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used. - - name: "--obs_batch_label" - type: string - description: | - The name of the adata.obs column containing the batch labels. Must be provided when 'dsbn' is set to True. - - name: "--finetuned_checkpoints_key" - type: string - required: false - example: model_state_dict - description: | - Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models. - - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - description: | - Path to output anndata file containing pre-processed data as well as scGPT embeddings. - direction: output - example: output.h5mu - - name: "--obsm_embeddings" - type: string - default: "X_scGPT" - description: | - The name of the adata.obsm array to which scGPT embeddings will be written. 
- __merge__: [., /src/base/h5_compression_argument.yaml] - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - description: | - The token to be used for padding. - - name: "--pad_value" - type: integer - default: -2 - description: | - The value of the padding token. - - name: "--dsbn" - type: boolean - default: true - description: | - Whether to apply domain-specific batch normalization for generating embeddings. When set to True, 'obs_batch_labels' must be set as well. - - name: "--batch_size" - type: integer - default: 64 - description: | - The batch size to be used for inference - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/source - - path: /resources_test/scgpt/finetuned_model - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] - - type: python - packages: - - scgpt==0.2.1 - test_setup: - - type: python - __merge__: [ /src/base/requirements/viashpy.yaml ] -runners: - - type: executable - - type: nextflow - directives: - label: [ highmem, highcpu, gpu ] diff --git a/src/scgpt/embedding/script.py b/src/scgpt/embedding/script.py deleted file mode 100644 index b78d42c3f61..00000000000 --- a/src/scgpt/embedding/script.py +++ /dev/null @@ -1,187 +0,0 @@ -import sys -import numpy as np -import mudata as mu -import json -from scgpt.tokenizer.gene_tokenizer import GeneVocab -from scgpt.model import TransformerModel -from scgpt.utils.util import load_pretrained -import torch - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask", - "model": "resources_test/scgpt/source/best_model.pt", - "model_config": "resources_test/scgpt/source/args.json", - "model_vocab": "resources_test/scgpt/source/vocab.json", - "output": "Kim2020_Lung_embedded.h5ad", - "var_gene_names": "gene_name", - "obs_batch_label": "sample", - "obsm_embeddings": "X_scGPT", - "pad_token": "", - "pad_value": -2, - "batch_size": 64, - "modality": "rna", - "dsbn": True, - "n_input_bins": 51, -} -meta = { - "resources_dir": "src/utils", -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - -logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -logger.info("Reading in data") - -# Read in data -mdata = mu.read(par["input"]) -input_adata = mdata.mod[par["modality"]] -adata = input_adata.copy() - -for k, v in { - "--obsm_gene_tokens": par["obsm_gene_tokens"], - "--obsm_tokenized_values": par["obsm_tokenized_values"], - "--obsm_padding_mask": par["obsm_padding_mask"], -}.items(): - if v not in adata.obsm.keys(): - raise KeyError( - f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" - ) - -all_gene_ids = adata.obsm[par["obsm_gene_tokens"]] -all_values = adata.obsm[par["obsm_tokenized_values"]] -padding_mask = adata.obsm[par["obsm_padding_mask"]] - -# Fetch batch ids for domain-specific batch normalization -if par["dsbn"] and not par["obs_batch_label"]: - raise ValueError( - "When 
dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels)." - ) -elif par["dsbn"] and par["obs_batch_label"]: - logger.info("Fetching batch id's for domain-specific batch normalization") - batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") - batch_id_labels = batch_id_cats.cat.codes.values - batch_ids = batch_id_labels.tolist() - batch_ids = np.array(batch_ids) - num_batch_types = len(set(batch_ids)) -elif not par["dsbn"] and par["obs_batch_label"]: - logger.info( - "Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed." - ) - -# Set padding specs -logger.info("Setting padding specs") -pad_token = par["pad_token"] -pad_value = par["pad_value"] -special_tokens = [pad_token, "", ""] - -# Fetching gene names -logger.info("Fetching gene names") -if not par["var_gene_names"]: - genes = adata.var.index.astype(str).tolist() -else: - genes = adata.var[par["var_gene_names"]].astype(str).tolist() - -# Model files -logger.info("Loading model, vocab and configs") -model_config_file = par["model_config"] -model_file = par["model"] -vocab_file = par["model_vocab"] - -# Load vocab -vocab = GeneVocab.from_file(vocab_file) -for s in special_tokens: - if s not in vocab: - vocab.append_token(s) - -vocab.set_default_index(vocab[""]) -ntokens = len(vocab) -gene_ids = np.array(vocab(genes), dtype=int) - -# Load model configs -with open(model_config_file, "r") as f: - model_configs = json.load(f) -embsize = model_configs["embsize"] -nhead = model_configs["nheads"] -d_hid = model_configs["d_hid"] -nlayers = model_configs["nlayers"] - -# Instantiate model -logger.info("Initializing transformer model") -model = TransformerModel( - ntokens, - d_model=embsize, - nhead=nhead, - d_hid=d_hid, - nlayers=nlayers, - vocab=vocab, - dropout=0.5, # scGPT default, only relevant for fine-tuning applications - pad_token=pad_token, - pad_value=pad_value, - nlayers_cls=3, # only applicable for decoder-based operations - n_cls=1, # only applicable for decoder-based operations - do_mvc=False, # only applicable for decoder-based operations - ecs_threshold=0.8, # only applicable for decoder-based operations - do_dab=False, # only applicable for decoder-based operations - use_batch_labels=False, # only applicable for decoder-based operations - num_batch_labels=num_batch_types if par["dsbn"] else None, - domain_spec_batchnorm=par["dsbn"], - input_emb_style="continuous", # scGPT default - explicit_zero_prob=False, # TODO: Parametrize when GPU-based machine types are supported - use_fast_transformer=False, # TODO: Parametrize when GPU-based machine types are supported - # fast_transformer_backend="flash", #TODO: Parametrize when GPU-based machine types are supported - pre_norm=False, # TODO: Parametrize when GPU-based machine types are supported -) - - -logger.info("Loading model") -model_file = par["model"] -model_dict = torch.load(model_file, map_location=device) - -# Ensure the provided model has the correct architecture -finetuned_checkpoints_key = par.get("finetuned_checkpoints_key") -if finetuned_checkpoints_key: - try: - model_dict = model_dict[finetuned_checkpoints_key] - except KeyError as e: - raise ValueError( - f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." 
- ) from e - -# Load model -load_pretrained(model, model_dict, verbose=False) - -# Embed tokenized data -logger.info("Converting tokenized input data to embeddings") -model.to(device) -model.eval() - -cell_embeddings = model.encode_batch( - torch.from_numpy(all_gene_ids), - torch.from_numpy(all_values).float(), - src_key_padding_mask=torch.from_numpy(padding_mask), - batch_size=par["batch_size"], - batch_labels=torch.from_numpy(batch_ids).long() if par["dsbn"] else None, - output_to_cpu=True, - time_step=0, - return_np=True, -) - -cell_embeddings = cell_embeddings / np.linalg.norm( - cell_embeddings, axis=1, keepdims=True -) - -# Write output -logger.info("Writing output data") -adata.obsm[par["obsm_embeddings"]] = cell_embeddings -mdata.mod[par["modality"]] = adata -mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/embedding/test.py b/src/scgpt/embedding/test.py deleted file mode 100644 index 8ba9b190920..00000000000 --- a/src/scgpt/embedding/test.py +++ /dev/null @@ -1,350 +0,0 @@ -import pytest -import subprocess -import re -import sys -import mudata as mu -import numpy as np - - -## VIASH START -meta = { - "resources_dir": "resources_test", -} -## VIASH END - -input = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" -model_file = f"{meta['resources_dir']}/source/best_model.pt" -ft_model_file = f"{meta['resources_dir']}/finetuned_model/best_model.pt" -vocab_file = f"{meta['resources_dir']}/source/vocab.json" -model_config_file = f"{meta['resources_dir']}/source/args.json" -input_file = mu.read(input) - - -def test_integration_embedding(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component( - [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - "--batch_size", - "4", - ] - ) - - # Read output file - output_mdata = mu.read(output_embedding_file) - output_adata = output_mdata.mod["rna"] - - # check that embedding obs is present - assert "X_scGPT" in output_adata.obsm.keys(), ( - "X_scGPT is not present in anndata obsm keys" - ) - - # check embedding size - assert output_adata.obsm["X_scGPT"].shape[1] == 512, ( - "Embedding size does not equal 512" - ) - - # check embedding value range - assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), ( - "Embedding values are nan" - ) - assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), ( - "Range of embedding values is outside of [-1, 1]" - ) - - # Run embeddings without dsbn - output_embedding_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component( - [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "False", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file_without_dsbn, - "--batch_size", - "4", - ] - ) - - # Read output file - output_mdata_no_dsbn = mu.read(output_embedding_file_without_dsbn) - output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] - - # Assert that embeddings without dsbn are different - assert 
not ( - output_adata.obsm["X_scGPT"] == output_adata_no_dsbn.obsm["X_scGPT"] - ).all(), "Embeddings with and without dsbn are the same" - - -def test_integration_embedding_dsbn_without_batch_labels(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - args = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: When dsbn is set to True, you are required to provide batch labels \(input_obs_batch_labels\)\.", - err.value.stdout.decode("utf-8"), - ) - - -def test_integration_embedding_non_existing_keys(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - # Test for non-existing gene names key - args_1 = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--var_gene_names", - "dummy_gene_name_key", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args_1) - assert re.search( - r"KeyError: \'dummy_gene_name_key\'", err.value.stdout.decode("utf-8") - ) - - # Test for non-existing batch label key - args_2 = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "dummy_batch_label_key", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args_2) - assert re.search( - r"KeyError: \'dummy_batch_label_key\'", err.value.stdout.decode("utf-8") - ) - - # Test for non-existing tokenized values key - args_3 = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "dummy_values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args_3) - assert re.search( - r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', - err.value.stdout.decode("utf-8"), - ) - - -def test_finetuned_model(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component( - [ - "--input", - input, - "--modality", - "rna", - "--model", - ft_model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - 
"values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--finetuned_checkpoints_key", - "model_state_dict", - "--output", - output_embedding_file, - "--batch_size", - "4", - ] - ) - - # Read output file - output_mdata = mu.read(output_embedding_file) - output_adata = output_mdata.mod["rna"] - - # check that embedding obs is present - assert "X_scGPT" in output_adata.obsm.keys(), ( - "X_scGPT is not present in anndata obsm keys" - ) - - # check embedding size - assert output_adata.obsm["X_scGPT"].shape[1] == 512, ( - "Embedding size does not equal 512" - ) - - # check embedding value range - assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), ( - "Embedding values are nan" - ) - assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), ( - "Range of embedding values is outside of [-1, 1]" - ) - - -def test_finetuned_model_architecture(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - args = [ - "--input", - input, - "--modality", - "rna", - "--model", - ft_model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--finetuned_checkpoints_key", - "dummy_checkpoints_key", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper.", - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/pad_tokenize/config.vsh.yaml b/src/scgpt/pad_tokenize/config.vsh.yaml deleted file mode 100644 index 6efa9a01948..00000000000 --- a/src/scgpt/pad_tokenize/config.vsh.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: pad_tokenize -namespace: "scgpt" -scope: "public" -description: | - Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - The input h5mu file of pre-processed data. - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--model_vocab" - type: file - direction: input - required: true - example: vocab.json - description: | - Path to model vocabulary file. - - name: "--var_gene_names" - type: string - required: false - description: | - The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used. - - name: "--var_input" - type: string - default: "id_in_vocab" - description: | - The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes. 
- - name: "--input_obsm_binned_counts" - type: string - default: "binned_counts" - required: false - description: | - The name of the .obsm field containing the binned counts to be padded and tokenized. - - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - description: | - The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask. - direction: output - example: output.h5mu - - name: "--obsm_gene_tokens" - type: string - default: "gene_id_tokens" - description: | - The key of the .obsm array containing the gene token ids - example: values.pt - - name: "--obsm_tokenized_values" - type: string - default: values_tokenized - description: | - The key of the .obsm array containing the count values of the tokenized genes - - name: "--obsm_padding_mask" - type: string - default: padding_mask - description: | - The key of the .obsm array containing the padding mask. - __merge__: [., /src/base/h5_compression_argument.yaml] - - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - Token used for padding. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding token. - - name: "--max_seq_len" - type: integer - description: | - The maximum sequence length of the tokenized data. Defaults to the number of features if not provided. - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py - - path: /src/utils/subset_vars.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/ - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] - - type: python - packages: - - scgpt==0.2.1 - - ipython~=8.5.0 - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] 
-runners: - - type: executable - - type: nextflow - directives: - label: [ lowmem, lowcpu ] diff --git a/src/scgpt/pad_tokenize/script.py b/src/scgpt/pad_tokenize/script.py deleted file mode 100644 index 641e28a189e..00000000000 --- a/src/scgpt/pad_tokenize/script.py +++ /dev/null @@ -1,111 +0,0 @@ -import sys -import mudata as mu -import numpy as np -from scipy.sparse import issparse -from scgpt.tokenizer import tokenize_and_pad_batch -from scgpt.tokenizer.gene_tokenizer import GeneVocab - - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu", - "model_vocab": "resources_test/scgpt/source/vocab.json", - "output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "pad_token": "", - "pad_value": -2, - "modality": "rna", - "input_obsm_binned_counts": "binned_counts", - "max_seq_len": None, - "var_gene_names": None, - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask", - "output_compression": None, - "var_input": "id_in_vocab", -} -meta = {"resources_dir": "src/utils/"} - -# mdata = mu.read(par["input"]) -# mdata.mod["rna"].obsm["binned_counts"] = mdata.mod["rna"].layers["binned"] -# mdata.write_h5mu(par["input"]) -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger -from subset_vars import subset_vars - -logger = setup_logger() - -logger.info("Reading in data") - -# Read in data -mdata = mu.read(par["input"]) -input_adata = mdata.mod[par["modality"]] -adata = input_adata.copy() - -adata = subset_vars(adata, par["var_input"]) - -# Set padding specs -pad_token = par["pad_token"] -special_tokens = [pad_token, "", ""] -pad_value = -2 - -logger.info("Fetching counts and gene names") -# Fetch counts -all_counts = ( - adata.obsm[par["input_obsm_binned_counts"]].toarray() - if issparse(adata.obsm[par["input_obsm_binned_counts"]]) - else adata.obsm[par["input_obsm_binned_counts"]] -) - -# Fetching gene names -if not par["var_gene_names"]: - genes = adata.var.index.astype(str).tolist() -else: - genes = adata.var[par["var_gene_names"]].astype(str).tolist() - -# Fetch gene names and look up tokens in vocab -logger.info("Reading in vocab and fetching gene tokens") -vocab_file = par["model_vocab"] -vocab = GeneVocab.from_file(vocab_file) -for s in special_tokens: - if s not in vocab: - vocab.append_token(s) - -vocab.set_default_index(vocab[""]) -ntokens = len(vocab) -gene_ids = np.array(vocab(genes), dtype=int) - -# Fetch max seq len -if not par["max_seq_len"]: - max_seq_len = adata.var.shape[0] + 1 -else: - max_seq_len = par["max_seq_len"] - -# Tokenize and pad data -logger.info( - f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}." 
-) -tokenized_data = tokenize_and_pad_batch( - all_counts, - gene_ids, - max_len=max_seq_len, - vocab=vocab, - pad_token=pad_token, - pad_value=pad_value, - append_cls=True, # append token at the beginning, - include_zero_gene=False, - return_pt=True, - mod_type=None, - vocab_mod=None, -) - -all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] -padding_mask = all_gene_ids.eq(vocab[pad_token]) - -logger.info("Writing output data") -input_adata.obsm[par["obsm_gene_tokens"]] = all_gene_ids.numpy() -input_adata.obsm[par["obsm_tokenized_values"]] = all_values.numpy() -input_adata.obsm[par["obsm_padding_mask"]] = padding_mask.numpy() - -mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/pad_tokenize/test.py b/src/scgpt/pad_tokenize/test.py deleted file mode 100644 index e79a9b6cce6..00000000000 --- a/src/scgpt/pad_tokenize/test.py +++ /dev/null @@ -1,113 +0,0 @@ -import pytest -import sys -import mudata as mu -from scgpt.tokenizer.gene_tokenizer import GeneVocab - -## VIASH START -meta = { - "resources_dir": "resources_test/scgpt", - "executable": "./target/docker/scgpt/integration_pad_tokenize/integration_pad_tokenize", - "temp_dir": "tmp", - "config": "./target/docker/scgpt/integration_pad_tokenize/.config.vsh.yaml", -} -## VIASH END - -input_file = ( - f"{meta['resources_dir']}/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu" -) -vocab_file = f"{meta['resources_dir']}/scgpt/source/vocab.json" -vocab = GeneVocab.from_file(vocab_file) - - -def test_integration_pad_tokenize(run_component, tmp_path): - output = tmp_path / "Kim2020_Lung_tokenized.h5mu" - - run_component( - [ - "--input", - input_file, - "--output", - output, - "--modality", - "rna", - "--var_input", - "scgpt_cross_checked_genes", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--pad_token", - "", - "--pad_value", - "-2", - "--input_obsm_binned_counts", - "binned_counts", - "--model_vocab", - vocab_file, - ] - ) - - output_file = mu.read(output) - output_adata = output_file.mod["rna"] - - gene_ids = output_adata.obsm["gene_id_tokens"] - values = output_adata.obsm["values_tokenized"] - padding_mask = output_adata.obsm["padding_mask"] - - # check output dimensions - ## nr of genes that are tokenized - assert gene_ids.shape[1] <= output_adata.var.shape[0] + 1, ( - "gene_ids shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" - ) - assert values.shape[1] <= output_adata.var.shape[0] + 1, ( - "values shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" - ) - assert padding_mask.shape[1] <= output_adata.var.shape[0] + 1, ( - "padding_mask shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" - ) - - ## equal size of output tensors - assert gene_ids.shape == values.shape, ( - "gene_ids shape[1] does not match values shape[1]" - ) - assert gene_ids.shape == padding_mask.shape, ( - "gene_ids shape[1] does not match padding_mask shape[1]" - ) - - ## check values of output tensors - assert gene_ids.dtype == "int64", "tokenized gene_ids are not integers" - assert (gene_ids > 0).all(), "not all gene id tokens are higher than 0" - - assert values.dtype == "float32", "tokenized values are not floats" - assert (values >= -2).all(), "not all tokenized values are higher than/equal to -2" - - assert padding_mask.dtype == bool, "padding mask is not boolean" - - ## check cls token - assert (gene_ids[:, 0] == vocab[""]).all(), ( - "cls token was not correctly appended at the beginning of 
the gene_ids tensor" - ) - assert (values[:, 0] == 0).all(), ( - "cls token was not correctly appended at the beginning of the values tensors" - ) - - # check padding values - masked_gene_ids = gene_ids[padding_mask] - unmasked_gene_ids = gene_ids[~padding_mask] - assert all(masked_gene_ids == vocab[""]), ( - "masked gene_ids contain non-pad tokens" - ) - assert all(unmasked_gene_ids != vocab[""]), ( - "unmasked gene_ids contain pad tokens" - ) - - masked_values = values[padding_mask] - unmasked_values = values[~padding_mask] - assert all(masked_values == -2), "masked values contain non-pad values" - assert all(unmasked_values != -2), "unmasked values contain pad values" - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/workflows/annotation/scgpt_annotation/config.vsh.yaml b/src/workflows/annotation/scgpt_annotation/config.vsh.yaml deleted file mode 100644 index 81321b482ab..00000000000 --- a/src/workflows/annotation/scgpt_annotation/config.vsh.yaml +++ /dev/null @@ -1,211 +0,0 @@ -name: "scgpt_annotation" -namespace: "workflows/annotation" -scope: "public" -description: | - Cell type annotation workflow using scGPT. - The workflow takes a pre-processed h5mu file as query input, and performs - - subsetting for HVG - - cross-checking of genes with the model vocabulary - - binning of gene counts - - padding and tokenizing of genes - - transformer-based cell type prediction - Note that cell-type prediction using scGPT is only possible using a fine-tuned scGPT model. -info: - name: "scGPT Annotation" - test_dependencies: - - name: scgpt_annotation_test - namespace: test_workflows/annotation - -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ author, maintainer ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ contributor ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - -argument_groups: - - name: "Query input" - arguments: - - name: "--id" - required: true - type: string - description: ID of the sample. - example: foo - - name: "--input" - type: file - required: true - description: Path to the input file. - example: input.h5mu - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--input_layer" - type: string - required: False - description: | - The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. - - name: "--input_var_gene_names" - type: string - required: false - description: | - The .var field in the input (query) containing gene names; if not provided, the var index will be used. - - name: "--input_obs_batch_label" - type: string - required: true - description: | - The .obs field in the input (query) dataset containing the batch labels. - - - name: "Model input" - arguments: - - name: "--model" - type: file - required: true - example: best_model.pt - description: | - The scGPT model file. - Must be a fine-tuned model that contains keys for checkpoints (--finetuned_checkpoints_key) and cell type label mapper(--label_mapper_key). - - name: "--model_config" - type: file - required: true - example: args.json - description: | - The scGPT model configuration file. - - name: "--model_vocab" - type: file - required: true - example: vocab.json - description: | - The scGPT model vocabulary file. 
- - name: "--finetuned_checkpoints_key" - type: string - default: model_state_dict - description: | - Key in the model file containing the pre-trained checkpoints. - - name: "--label_mapper_key" - type: string - default: id_to_class - description: | - Key in the model file containing the cell type class to label mapper dictionary. - - - name: "Outputs" - arguments: - - name: "--output" - type: file - required: true - direction: output - description: Output file path - example: output.h5mu - - name: "--output_compression" - type: string - example: "gzip" - required: false - choices: ["gzip", "lzf"] - description: | - The compression algorithm to use for the output h5mu file. - - name: "--output_obs_predictions" - type: string - default: "scgpt_pred" - required: false - description: | - The name of the adata.obs column to write predicted cell type labels to. - - name: "--output_obs_probability" - type: string - default: "scgpt_probability" - required: false - description: | - The name of the adata.obs column to write predicted cell type labels to. - - - name: "Padding arguments" - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - Token used for padding. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding token. - - - name: "HVG subset arguments" - arguments: - - name: "--n_hvg" - type: integer - default: 1200 - description: | - Number of highly variable genes to subset for. - - name: "--hvg_flavor" - type: string - choices: ["cell_ranger", "seurat"] - default: "cell_ranger" - description: | - Method to be used for identifying highly variable genes. - Note that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`). - - - name: "Tokenization arguments" - arguments: - - name: "--max_seq_len" - type: integer - required: false - description: | - The maximum sequence length of the tokenized data. - - - name: "Embedding arguments" - arguments: - - name: --dsbn - type: boolean - default: true - description: | - Apply domain-specific batch normalization - - name: "--batch_size" - type: integer - default: 64 - min: 1 - description: | - The batch size to be used for embedding inference. - - - name: "Binning arguments" - arguments: - - name: "--n_input_bins" - type: integer - default: 51 - required: False - min: 1 - description: | - The number of bins to discretize the data into; When no value is provided, data won't be binned. - - name: "--seed" - type: integer - min: 0 - required: false - description: | - Seed for random number generation used for binning. If not set, no seed is used. 
- -resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - -test_resources: - - type: nextflow_script - path: test.nf - entrypoint: test_wf - - path: /resources_test/scgpt - -dependencies: - - name: scgpt/cross_check_genes - - name: scgpt/binning - - name: feature_annotation/highly_variable_features_scanpy - - name: filter/do_filter - - name: scgpt/pad_tokenize - - name: scgpt/cell_type_annotation - alias: scgpt_celltype_annotation - -runners: - - type: nextflow diff --git a/src/workflows/annotation/scgpt_annotation/integration_test.sh b/src/workflows/annotation/scgpt_annotation/integration_test.sh deleted file mode 100755 index 108a4c4db47..00000000000 --- a/src/workflows/annotation/scgpt_annotation/integration_test.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -nextflow run . \ - -main-script src/workflows/annotation/scgpt_annotation/test.nf \ - -profile docker,no_publish \ - -entry test_wf \ - -c src/workflows/utils/labels_ci.config \ - -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/annotation/scgpt_annotation/main.nf b/src/workflows/annotation/scgpt_annotation/main.nf deleted file mode 100644 index 010ed7b188f..00000000000 --- a/src/workflows/annotation/scgpt_annotation/main.nf +++ /dev/null @@ -1,112 +0,0 @@ -workflow run_wf { - - take: - input_ch - - main: - output_ch = input_ch - // Set aside the output for this workflow to avoid conflicts - | map {id, state -> - def new_state = state + ["workflow_output": state.output] - [id, new_state] - } - // Annotate the mudata object with highly variable genes. - | highly_variable_features_scanpy.run( - fromState: [ - "input": "input", - "layer": "input_layer", - "modality": "modality", - "n_top_features": "n_hvg", - "flavor": "hvg_flavor" - ], - args: [ - "var_name_filter": "scgpt_filter_with_hvg" - ], - toState: ["input": "output"] - ) - // Check whether the genes are part of the provided vocabulary. - // Subsets for genes present in vocab only. - | cross_check_genes.run( - fromState: [ - "input": "input", - "modality": "modality", - "vocab_file": "model_vocab", - "input_var_gene_names": "input_var_gene_names", - "output": "output", - "pad_token": "pad_token" - ], - args: [ - "var_input": "scgpt_filter_with_hvg", - "output_var_filter": "scgpt_cross_checked_genes" - ], - toState: ["input": "output"] - ) - // Bins the data into a fixed number of bins. - | binning.run( - fromState: [ - "input": "input", - "modality": "modality", - "input_layer": "input_layer", - "n_input_bins": "n_input_bins", - "output": "output", - "seed": "seed" - ], - args: [ - "output_obsm_binned_counts": "binned_counts", - "var_input": "scgpt_cross_checked_genes" - ], - toState: ["input": "output"] - ) - // Padding and tokenization of gene count values. - | pad_tokenize.run( - fromState: [ - "input": "input", - "modality": "modality", - "model_vocab": "model_vocab", - "var_gene_names": "input_var_gene_names", - "pad_token": "pad_token", - "pad_value": "pad_value", - "max_seq_len": "max_seq_len", - "output": "output" - ], - args: [ - "input_obsm_binned_counts": "binned_counts", - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask", - "var_input": "scgpt_cross_checked_genes" - ], - toState: ["input": "output"] - ) - // scGPT decoder-based cell type annotation. 
- | scgpt_celltype_annotation.run( - fromState: [ - "model": "model", - "model_vocab": "model_vocab", - "model_config": "model_config", - "label_mapper_key": "label_mapper_key", - "finetuned_checkpoints_key": "finetuned_checkpoints_key", - "input": "input", - "modality": "modality", - "obs_batch_label": "input_obs_batch_label", - "pad_token": "pad_token", - "pad_value": "pad_value", - "n_input_bins": "n_input_bins", - "dsbn": "dsbn", - "batch_size": "batch_size", - "seed": "seed", - "output_obs_predictions": "output_obs_predictions", - "output_obs_probability": "output_obs_probability", - "output": "workflow_output", - "output_compression": "output_compression" - ], - args: [ - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized" - ], - toState: {id, output, state -> ["output": output.output]} - ) - - emit: - output_ch -} diff --git a/src/workflows/annotation/scgpt_annotation/nextflow.config b/src/workflows/annotation/scgpt_annotation/nextflow.config deleted file mode 100644 index 059100c489c..00000000000 --- a/src/workflows/annotation/scgpt_annotation/nextflow.config +++ /dev/null @@ -1,10 +0,0 @@ -manifest { - nextflowVersion = '!>=20.12.1-edge' -} - -params { - rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() -} - -// include common settings -includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/scgpt_annotation/test.nf b/src/workflows/annotation/scgpt_annotation/test.nf deleted file mode 100644 index 5e86cc16b9e..00000000000 --- a/src/workflows/annotation/scgpt_annotation/test.nf +++ /dev/null @@ -1,58 +0,0 @@ -nextflow.enable.dsl=2 - -include { scgpt_annotation } from params.rootDir + "/target/nextflow/workflows/annotation/scgpt_annotation/main.nf" -include { scgpt_annotation_test } from params.rootDir + "/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf" - -params.resources_test = params.rootDir + "/resources_test" - -workflow test_wf { - resources_test = file(params.resources_test) - scgpt_test_resources = resources_test / "scgpt" - - output_ch = Channel.fromList([ - [ - id: "simple_execution_test", - input: scgpt_test_resources.resolve("test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: scgpt_test_resources.resolve("finetuned_model/best_model.pt"), - model_config: scgpt_test_resources.resolve("source/args.json"), - model_vocab: scgpt_test_resources.resolve("source/vocab.json"), - input_layer: "log_normalized", - input_obs_batch_label: "sample", - // change default to reduce resource requirements - n_hvg: 400, - seed: 1 - ] - ]) - | map{ state -> [state.id, state] } - | scgpt_annotation - | view { output -> - assert output.size() == 2 : "Outputs should contain two elements; [id, state]" - - // check id - def id = output[0] - assert id.endsWith("_test") - - // check output - def state = output[1] - assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" - - "Output: $output" - } - | scgpt_annotation_test.run( - fromState: [ - "input": "output" - ], - args: [ - "n_hvg": 400 - ] - ) - | toSortedList() - | map { output_list -> - assert output_list.size() == 1 : "output channel should contain 1 event" - assert output_list.collect{it[0]} == ["simple_execution_test"] - } - -} diff --git a/src/workflows/integration/scgpt_leiden/config.vsh.yaml b/src/workflows/integration/scgpt_leiden/config.vsh.yaml deleted file mode 100644 index 256dca6a887..00000000000 --- a/src/workflows/integration/scgpt_leiden/config.vsh.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "scgpt_leiden" -namespace: "workflows/integration" -scope: "public" -description: "Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result." -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] -info: - test_dependencies: -argument_groups: - - name: "Inputs" - arguments: - - name: "--id" - required: true - type: string - description: ID of the sample. - example: foo - - name: "--input" - type: file - required: true - description: Path to the input file. - example: input.h5mu - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--input_layer" - type: string - required: False - description: | - The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. - - name: "--var_gene_names" - type: string - required: false - description: | - The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used. - - name: "--obs_batch_label" - type: string - description: | - The name of the adata obs column containing the batch labels. - - name: Model - arguments: - - name: "--model" - type: file - required: true - example: resources_test/scgpt/best_model.pt - description: | - Path to scGPT model file. - - name: "--model_vocab" - type: file - direction: input - required: true - example: resources_test/scgpt/vocab.json - description: | - Path to scGPT model vocabulary file. - - name: "--model_config" - type: file - direction: input - required: true - example: args.json - description: | - Path to scGPT model config file. - - name: "--finetuned_checkpoints_key" - type: string - required: false - example: model_state_dict - description: | - Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models. - - name: "Outputs" - arguments: - - name: "--output" - type: file - required: true - direction: output - description: Output file path - example: output.h5mu - - name: "--obsm_integrated" - type: string - default: "X_scgpt" - required: false - description: "In which .obsm slot to store the resulting integrated embedding." - - - name: "Padding arguments" - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - Token used for padding. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding token. - - - name: "HVG subset arguments" - arguments: - - name: "--n_hvg" - type: integer - default: 1200 - description: | - Number of highly variable genes to subset for. 
- - name: "--hvg_flavor" - type: string - choices: ["cell_ranger", "seurat"] - default: "cell_ranger" - description: | - Method to be used for identifying highly variable genes. - Note that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`). - - - name: "Tokenization arguments" - arguments: - - name: "--max_seq_len" - type: integer - required: false - description: | - The maximum sequence length of the tokenized data. Defaults to the number of features if not provided. - - name: "Embedding arguments" - arguments: - - name: --dsbn - type: boolean - default: true - description: | - Apply domain-specific batch normalization - - name: "--batch_size" - type: integer - default: 64 - description: | - The batch size to be used for embedding inference. - - - name: "Binning arguments" - arguments: - - name: "--n_input_bins" - type: integer - default: 51 - required: False - min: 1 - description: | - The number of bins to discretize the data into; When no value is provided, data won't be binned. - - name: "--seed" - type: integer - required: false - description: | - Seed for random number generation used for binning. If not set, no seed is used. - - - name: "Clustering arguments" - arguments: - - name: "--leiden_resolution" - type: double - description: Control the coarseness of the clustering. Higher values lead to more clusters. - default: [1] - multiple: true - -resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - -dependencies: - - name: scgpt/cross_check_genes - - name: scgpt/binning - - name: feature_annotation/highly_variable_features_scanpy - - name: scgpt/pad_tokenize - - name: scgpt/embedding - - name: workflows/multiomics/neighbors_leiden_umap - -test_resources: - - type: nextflow_script - path: test.nf - entrypoint: test_wf - - type: nextflow_script - path: test.nf - entrypoint: test_wf2 - - path: /resources_test/scgpt - -runners: - - type: nextflow diff --git a/src/workflows/integration/scgpt_leiden/integration_test.sh b/src/workflows/integration/scgpt_leiden/integration_test.sh deleted file mode 100755 index 001299c408e..00000000000 --- a/src/workflows/integration/scgpt_leiden/integration_test.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -nextflow \ - run . \ - -main-script src/workflows/integration/scgpt_leiden/test.nf \ - -entry test_wf \ - -profile docker,no_publish \ - -c src/workflows/utils/labels_ci.config \ - -c src/workflows/utils/integration_tests.config - -nextflow \ - run . \ - -main-script src/workflows/integration/scgpt_leiden/test.nf \ - -entry test_wf2 \ - -profile docker,no_publish \ - -c src/workflows/utils/labels_ci.config \ - -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/scgpt_leiden/main.nf b/src/workflows/integration/scgpt_leiden/main.nf deleted file mode 100644 index ac5df109aa8..00000000000 --- a/src/workflows/integration/scgpt_leiden/main.nf +++ /dev/null @@ -1,138 +0,0 @@ -workflow run_wf { - - take: - input_ch - - main: - output_ch = input_ch - // Set aside the output for this workflow to avoid conflicts - | map {id, state -> - def new_state = state + ["workflow_output": state.output] - [id, new_state] - } - // Annotates the mudata object with highly variable genes. 
- | highly_variable_features_scanpy.run( - fromState: [ - "input": "input", - "layer": "input_layer", - "modality": "modality", - "n_top_features": "n_hvg", - "flavor": "hvg_flavor" - ], - args: ["var_name_filter": "scgpt_filter_with_hvg"], - toState: ["input": "output"] - ) - // Check whether the genes are part of the provided vocabulary. - | cross_check_genes.run( - fromState: [ - "input": "input", - "modality": "modality", - "vocab_file": "model_vocab", - "input_var_gene_names": "var_gene_names", - "output": "output", - "pad_token": "pad_token" - ], - args: [ - "var_input": "scgpt_filter_with_hvg", - "output_var_filter": "scgpt_cross_checked_genes" - ], - toState: [ - "input": "output" - ] - ) - // Bins the data into a fixed number of bins. - | binning.run( - fromState: [ - "input": "input", - "modality": "modality", - "input_layer": "input_layer", - "n_input_bins": "n_input_bins", - "output": "output" - ], - args: [ - "output_obsm_binned_counts": "binned_counts", - "var_input": "scgpt_cross_checked_genes" - ], - toState: [ - "input": "output" - ] - ) - // Padding and tokenization of gene count values. - | pad_tokenize.run( - fromState: [ - "input": "input", - "modality": "modality", - "model_vocab": "model_vocab", - "var_gene_names": "var_gene_names", - "pad_token": "pad_token", - "pad_value": "pad_value", - "max_seq_len": "max_seq_len", - "output": "output" - ], - args: [ - "input_obsm_binned_counts": "binned_counts", - "var_input": "scgpt_cross_checked_genes", - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask" - ], - toState: [ - "input": "output" - ] - ) - // Generation of cell embedings from the tokenized gene counts values. - | embedding.run( - fromState: [ - "input": "input", - "modality": "modality", - "model": "model", - "model_vocab": "model_vocab", - "model_config": "model_config", - "var_gene_names": "var_gene_names", - "obs_batch_label": "obs_batch_label", - "pad_token": "pad_token", - "pad_value": "pad_value", - "dsbn": "dsbn", - "batch_size": "batch_size", - "obsm_embeddings": "obsm_integrated", - "finetuned_checkpoints_key": "finetuned_checkpoints_key", - "output": "output" - ], - args: [ - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask" - ], - toState: [ - "input": "output" - ] - ) - // Calculation of neighbors, leiden clustering and UMAP. 
- | neighbors_leiden_umap.run( - fromState: [ - "input": "input", - "obsm_input": "obsm_integrated", - "modality": "modality", - "uns_neighbors": "uns_neighbors", - "obsp_neighbor_distances": "obsp_neighbor_distances", - "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", - "output": "workflow_output", - "leiden_resolution": "leiden_resolution", - "obsm_umap": "obsm_integrated", - ], - toState: [ - "output": "output" - ], - args: [ - "uns_neighbors": "scGPT_integration_neighbors", - "obsp_neighbor_distances": "scGPT_integration_distances", - "obsp_neighbor_connectivities": "scGPT_integration_connectivities", - "obs_cluster": "scGPT_integration_leiden", - "obsm_umap": "X_scGPT_umap" - ] - ) - | setState(["output"]) - - emit: - output_ch -} \ No newline at end of file diff --git a/src/workflows/integration/scgpt_leiden/nextflow.config b/src/workflows/integration/scgpt_leiden/nextflow.config deleted file mode 100644 index 8108bc25e84..00000000000 --- a/src/workflows/integration/scgpt_leiden/nextflow.config +++ /dev/null @@ -1,10 +0,0 @@ -manifest { - nextflowVersion = '!>=20.12.1-edge' -} - -params { - rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() -} - -// include common settings -includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/scgpt_leiden/test.nf b/src/workflows/integration/scgpt_leiden/test.nf deleted file mode 100644 index 064cf4a5b49..00000000000 --- a/src/workflows/integration/scgpt_leiden/test.nf +++ /dev/null @@ -1,100 +0,0 @@ -nextflow.enable.dsl=2 - -include { scgpt_leiden } from params.rootDir + "/target/nextflow/workflows/integration/scgpt_leiden/main.nf" - -params.resources_test = params.rootDir + "/resources_test" - -workflow test_wf { - - resources_test = file(params.resources_test) - - output_ch = Channel.fromList([ - [ - id: "simple_execution_test", - input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: resources_test.resolve("scgpt/source/best_model.pt"), - model_config: resources_test.resolve("scgpt/source/args.json"), - model_vocab: resources_test.resolve("scgpt/source/vocab.json"), - input_layer: "log_normalized", - obs_batch_label: "sample", - n_hvg: 400, - seed: 1, - leiden_resolution: [1.0, 0.25] - ], - [ - id: "no_leiden_resolutions_test", - input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: resources_test.resolve("scgpt/source/best_model.pt"), - model_config: resources_test.resolve("scgpt/source/args.json"), - model_vocab: resources_test.resolve("scgpt/source/vocab.json"), - obs_batch_label: "sample", - n_hvg: 400, - seed: 1, - input_layer: "log_normalized", - leiden_resolution: [] - ] - ]) - | map{ state -> [state.id, state] } - | scgpt_leiden - | view { output -> - assert output.size() == 2 : "Outputs should contain two elements; [id, state]" - - // check id - def id = output[0] - assert id.endsWith("_test") - - // check output - def state = output[1] - assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" - - "Output: $output" - } - | toSortedList{a, b -> a[0] <=> b[0]} - | map { output_list -> - assert output_list.size() == 2 : "output channel should contain 2 events" - assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] - } -} - - -workflow test_wf2 { - - resources_test = file(params.resources_test) - - output_ch = Channel.fromList([ - [ - id: "test_output_arg", - input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: resources_test.resolve("scgpt/source/best_model.pt"), - model_config: resources_test.resolve("scgpt/source/args.json"), - model_vocab: resources_test.resolve("scgpt/source/vocab.json"), - input_layer: "log_normalized", - obs_batch_label: "sample", - n_hvg: 400, - leiden_resolution: [1.0, 0.25], - output: "test.h5mu" - ], - ]) - | map{ state -> [state.id, state] } - | scgpt_leiden - | view { output -> - assert output.size() == 2 : "Outputs should contain two elements; [id, state]" - - // check output - def state = output[1] - assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith("test.h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" - - "Output: $output" - } - | toSortedList({a, b -> a[0] <=> b[0]}) - | map { output_list -> - assert output_list.size() == 1 : "output channel should contain 1 event" - assert output_list.collect{it[0]} == ["test_output_arg"] - } - } From 52f30ac09c452ba2aa23c83a476fa9f5b8465f70 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 16 Sep 2025 19:50:40 +0200 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3963eca7859..f7a4c92279f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ * `differential_expression/create_pseudobulks`: Removed functionality to filter psuedobulk samples based on number of aggregated samples threshold, as this functionality is now covered in `filter/delimit_count` (PR #1044). +* Deprecated all scGPT functionality (PR #1075). + ## NEW FUNCTIONALITY * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). From 8201cb5bf637938bef5ace9b3688dfd43bd34cc4 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 16 Sep 2025 19:53:01 +0200 Subject: [PATCH 3/4] remove test resources --- resources_test_scripts/scgpt.sh | 135 -------------------------------- 1 file changed, 135 deletions(-) delete mode 100755 resources_test_scripts/scgpt.sh diff --git a/resources_test_scripts/scgpt.sh b/resources_test_scripts/scgpt.sh deleted file mode 100755 index b8bf34f47a3..00000000000 --- a/resources_test_scripts/scgpt.sh +++ /dev/null @@ -1,135 +0,0 @@ -set -eo pipefail - -# ensure that the command below is run from the root of the repository -REPO_ROOT=$(git rev-parse --show-toplevel) -cd "$REPO_ROOT" - -# settings -ID=scgpt -OUT=resources_test/$ID - -# create foundational model directory -foundation_model_dir="$OUT/source" -mkdir -p "$foundation_model_dir" -export foundation_model_dir - -# create finetuned model directory -finetuned_model_dir="$OUT/finetuned_model" -mkdir -p "$finetuned_model_dir" -export finetuned_model_dir - -# install gdown if necessary -# Check whether gdown is available -if ! command -v gdown &> /dev/null; then - echo "This script requires gdown. 
Please make sure the binary is added to your PATH." - exit 1 -fi - -# install torch if necessary -# Check whether torch is available -if ! python -c "import torch"; then - echo "This script requires torch. Please make sure it is available in your python environment." - exit 1 -fi - -echo "> Downloading scGPT foundation model (full_human)" -# download foundational model files (full_human) -# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y -gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json" -gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json" -gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt" - -echo "> Converting to finetuned model format" -python < Downloading test resources" -# download test data -# https://drive.google.com/file/d/1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL/view?usp=drive_link -gdown '1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL' -O "${test_resources_dir}/Kim2020_Lung.h5ad" - -echo "> Converting to h5mu" -python < Subsetting datasets" -viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ - --number_of_observations 4000 - -rm "${test_resources_dir}/Kim2020_Lung.h5ad" -rm "${test_resources_dir}/Kim2020_Lung.h5mu" - -echo "> Preprocessing datasets" -nextflow \ - run . \ - -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \ - -profile docker \ - -c src/workflows/utils/labels_ci.config \ - --input "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ - --output "Kim2020_Lung_subset_preprocessed.h5mu" \ - --publish_dir "${test_resources_dir}" - -echo "> Filtering highly variable features" -viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ - --layer "log_normalized" \ - --var_name_filter "scgpt_filter_with_hvg" \ - --n_top_features 1200 \ - --flavor "cell_ranger" - -echo "> Running scGPT cross check genes" -viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ - --vocab_file "${foundation_model_dir}/vocab.json" \ - --var_input "scgpt_filter_with_hvg" \ - --output_var_filter "scgpt_cross_checked_genes" - -echo "> Running scGPT binning" -viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ - --input_layer "log_normalized" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ - --output_obsm_binned_counts "binned_counts" \ - --var_input "scgpt_cross_checked_genes" - -echo "> Running scGPT tokenizing" -viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ - --input_obsm_binned_counts "binned_counts" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ - --model_vocab "${foundation_model_dir}/vocab.json" \ - --var_input "scgpt_cross_checked_genes" \ - - -echo "> Removing unnecessary files in test resources dir" -find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete - -echo "> scGPT test resources are ready!" 
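Note: for readers tracing what the removed `pad_tokenize` component (invoked as the last step of the deleted test-resource script above) actually did, the step reduces to a handful of direct `scgpt` calls. A minimal sketch, condensed from the deleted `src/scgpt/pad_tokenize/script.py`; it assumes `scgpt==0.2.1` as pinned in the removed config, the paths and the `binned_counts` key are the removed defaults, and the special tokens (which render as empty strings in this diff) are written here as scGPT's `<pad>`, `<cls>` and `<eoc>`:

import mudata as mu
import numpy as np
from scipy.sparse import issparse
from scgpt.tokenizer import tokenize_and_pad_batch
from scgpt.tokenizer.gene_tokenizer import GeneVocab

# Assumes the AnnData has already been subset to vocabulary-checked genes
# (the removed component used a boolean .var mask for this).
adata = mu.read("Kim2020_Lung_subset_binned.h5mu").mod["rna"]

# Load the model vocabulary and register the special tokens.
vocab = GeneVocab.from_file("vocab.json")
for tok in ["<pad>", "<cls>", "<eoc>"]:
    if tok not in vocab:
        vocab.append_token(tok)
vocab.set_default_index(vocab["<pad>"])

# Binned counts (as produced by the removed binning component) and gene ids.
counts = adata.obsm["binned_counts"]
counts = counts.toarray() if issparse(counts) else counts
gene_ids = np.array(vocab(adata.var.index.astype(str).tolist()), dtype=int)

# Tokenize and pad; a <cls> token is prepended to every cell.
tokenized = tokenize_and_pad_batch(
    counts,
    gene_ids,
    max_len=adata.var.shape[0] + 1,
    vocab=vocab,
    pad_token="<pad>",
    pad_value=-2,
    append_cls=True,
    include_zero_gene=False,
    return_pt=True,
    mod_type=None,
    vocab_mod=None,
)

adata.obsm["gene_id_tokens"] = tokenized["genes"].numpy()
adata.obsm["values_tokenized"] = tokenized["values"].numpy()
adata.obsm["padding_mask"] = tokenized["genes"].eq(vocab["<pad>"]).numpy()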
From ee0f3f88b3675d95a4bb42e3dfdee5333b0f9f23 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Fri, 10 Oct 2025 09:38:55 +0200 Subject: [PATCH 4/4] update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7a4c92279f..c01567f5e57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,6 @@ * `differential_expression/create_pseudobulks`: Removed functionality to filter psuedobulk samples based on number of aggregated samples threshold, as this functionality is now covered in `filter/delimit_count` (PR #1044). -* Deprecated all scGPT functionality (PR #1075). - ## NEW FUNCTIONALITY * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). @@ -18,6 +16,8 @@ * `workflows/differential_expression/pseudobulk_deseq2`: Workflow for generating pseudobulk samples from single-cell data followed by DESeq2 differential expression analysis (PR #1044) +* Deprecated all scGPT functionality (PR #1075). + ## MINOR CHANGES * `transform/normalize_total`, `transform/clr`, `transform/log1p`: Add disk resource labels (PR #1073).
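Since this patch series removes all scGPT components, pipelines that relied on the deleted `cross_check_genes` step can approximate the vocabulary check directly. A minimal, hypothetical sketch using only the `GeneVocab` calls that appear in the deleted scripts (the `id_in_vocab` column name mirrors the removed components' default; file paths are placeholders):

import mudata as mu
from scgpt.tokenizer.gene_tokenizer import GeneVocab

# scGPT model vocabulary (vocab.json from the model directory).
vocab = GeneVocab.from_file("vocab.json")

mdata = mu.read("input.h5mu")
adata = mdata.mod["rna"]

# Flag genes present in the model vocabulary; downstream steps can subset
# on this boolean .var column, as the removed components did by default.
genes = adata.var.index.astype(str)
adata.var["id_in_vocab"] = [gene in vocab for gene in genes]

mdata.write("output.h5mu")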