From 70d83d2389d116f75d9771b9f561435a7ae6d689 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 16 Sep 2025 19:48:46 +0200 Subject: [PATCH 1/4] remove scgpt components and workflows --- src/scgpt/binning/config.vsh.yaml | 93 ----- src/scgpt/binning/script.py | 129 ------- src/scgpt/binning/test.py | 59 --- .../cell_type_annotation/config.vsh.yaml | 164 -------- src/scgpt/cell_type_annotation/script.py | 255 ------------- src/scgpt/cell_type_annotation/test.py | 215 ----------- src/scgpt/cross_check_genes/config.vsh.yaml | 99 ----- src/scgpt/cross_check_genes/script.py | 70 ---- src/scgpt/cross_check_genes/test.py | 96 ----- src/scgpt/embedding/config.vsh.yaml | 147 -------- src/scgpt/embedding/script.py | 187 ---------- src/scgpt/embedding/test.py | 350 ------------------ src/scgpt/pad_tokenize/config.vsh.yaml | 125 ------- src/scgpt/pad_tokenize/script.py | 111 ------ src/scgpt/pad_tokenize/test.py | 113 ------ .../scgpt_annotation/config.vsh.yaml | 211 ----------- .../scgpt_annotation/integration_test.sh | 14 - .../annotation/scgpt_annotation/main.nf | 112 ------ .../scgpt_annotation/nextflow.config | 10 - .../annotation/scgpt_annotation/test.nf | 58 --- .../integration/scgpt_leiden/config.vsh.yaml | 185 --------- .../scgpt_leiden/integration_test.sh | 23 -- .../integration/scgpt_leiden/main.nf | 138 ------- .../integration/scgpt_leiden/nextflow.config | 10 - .../integration/scgpt_leiden/test.nf | 100 ----- 25 files changed, 3074 deletions(-) delete mode 100644 src/scgpt/binning/config.vsh.yaml delete mode 100644 src/scgpt/binning/script.py delete mode 100644 src/scgpt/binning/test.py delete mode 100644 src/scgpt/cell_type_annotation/config.vsh.yaml delete mode 100644 src/scgpt/cell_type_annotation/script.py delete mode 100644 src/scgpt/cell_type_annotation/test.py delete mode 100644 src/scgpt/cross_check_genes/config.vsh.yaml delete mode 100644 src/scgpt/cross_check_genes/script.py delete mode 100644 src/scgpt/cross_check_genes/test.py delete mode 100644 src/scgpt/embedding/config.vsh.yaml delete mode 100644 src/scgpt/embedding/script.py delete mode 100644 src/scgpt/embedding/test.py delete mode 100644 src/scgpt/pad_tokenize/config.vsh.yaml delete mode 100644 src/scgpt/pad_tokenize/script.py delete mode 100644 src/scgpt/pad_tokenize/test.py delete mode 100644 src/workflows/annotation/scgpt_annotation/config.vsh.yaml delete mode 100755 src/workflows/annotation/scgpt_annotation/integration_test.sh delete mode 100644 src/workflows/annotation/scgpt_annotation/main.nf delete mode 100644 src/workflows/annotation/scgpt_annotation/nextflow.config delete mode 100644 src/workflows/annotation/scgpt_annotation/test.nf delete mode 100644 src/workflows/integration/scgpt_leiden/config.vsh.yaml delete mode 100755 src/workflows/integration/scgpt_leiden/integration_test.sh delete mode 100644 src/workflows/integration/scgpt_leiden/main.nf delete mode 100644 src/workflows/integration/scgpt_leiden/nextflow.config delete mode 100644 src/workflows/integration/scgpt_leiden/test.nf diff --git a/src/scgpt/binning/config.vsh.yaml b/src/scgpt/binning/config.vsh.yaml deleted file mode 100644 index 2d3fcdb0486..00000000000 --- a/src/scgpt/binning/config.vsh.yaml +++ /dev/null @@ -1,93 +0,0 @@ -name: binning -namespace: "scgpt" -scope: "public" -description: | - Conversion of (pre-processed) expression count data into relative values (bins) to address scale differences across sequencing batches. 
-authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - Input h5mu file. - - name: "--modality" - description: - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--input_layer" - type: string - required: False - description: | - Mudata layer (key from .layers) to use as input data for binning. If not specified, .X is used. - - name: "--var_input" - type: string - default: "id_in_vocab" - description: | - The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes. - - name: "--n_input_bins" - type: integer - default: 51 - required: False - min: 1 - description: | - The number of bins to discretize the data into. When no value is provided, data won't be binned. - - - name: Outputs - arguments: - - name: "--output" - direction: output - type: file - example: output.h5mu - required: true - description: | - The output h5mu file containing the binned data. - - name: "--output_obsm_binned_counts" - type: string - default: "binned_counts" - description: | - The name of the adata layer to write the binned data to. - - name: "--seed" - type: integer - description: | - Seed for random number generation. - __merge__: [., /src/base/h5_compression_argument.yaml] - - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py - - path: /src/utils/subset_vars.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu - -engines: - - type: docker - image: python:3.11-slim - setup: - - type: apt - packages: - - procps - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] 
- __merge__: [ /src/base/requirements/python_test_setup.yaml ] -runners: - - type: executable - - type: nextflow - directives: - label: [ midcpu, midmem ] diff --git a/src/scgpt/binning/script.py b/src/scgpt/binning/script.py deleted file mode 100644 index cac9312b576..00000000000 --- a/src/scgpt/binning/script.py +++ /dev/null @@ -1,129 +0,0 @@ -import sys -import mudata as mu -import numpy as np -from scipy.sparse import csr_matrix -import warnings - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_genes_cross_checked.h5mu", - "output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu", - "modality": "rna", - "input_layer": None, - "output_obsm_binned_counts": "binned_counts", - "n_input_bins": 51, - "output_compression": None, - "var_input": "id_in_vocab", - "seed": 0, -} -meta = {"resources_dir": "src/utils"} -## VIASH END - -if par["seed"]: - np.random.seed(par["seed"]) - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger -from subset_vars import subset_vars - -logger = setup_logger() - -logger.info("Reading in data") -# Read in data -mdata = mu.read(par["input"]) -input_adata = mdata.mod[par["modality"]] -adata = input_adata.copy() - -logger.info("Subsetting data based on highly variable gene and/or cross-checked genes") -adata = subset_vars(adata, par["var_input"]) - -logger.info("Converting the input layer into a CSR matrix") -if not par["input_layer"] or par["input_layer"] == "X": - layer_data = adata.X -else: - layer_data = adata.layers[par["input_layer"]] -layer_data = csr_matrix(layer_data) - -if layer_data.min() < 0: - raise ValueError( - f"Assuming non-negative data, but got min value {layer_data.min()}." - ) - -n_bins = par["n_input_bins"] # NOTE: the first bin is always a spectial for zero -logger.info(f"Binning data into {par['n_input_bins']} bins.") - - -def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: - assert x.ndim == 1 and bins.ndim == 1 - - left_digits = np.digitize(x, bins) - right_difits = np.digitize(x, bins, right=True) - - rands = np.random.rand(len(x)) # uniform random numbers - - digits = rands * (right_difits - left_digits) + left_digits - digits = np.ceil(digits) - smallest_dtype = np.min_scalar_type( - digits.max().astype(np.uint) - ) # Already checked for non-negative values - digits = digits.astype(smallest_dtype) - - return digits - - -with warnings.catch_warnings(): - # Make sure warnings are displayed once. - warnings.simplefilter("once") - # layer_data.indptr.size is the number of rows in the sparse matrix - binned_rows = [] - bin_edges = [] - logger.info( - "Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix" - ) - for row_number in range(layer_data.indptr.size - 1): - row_start_index, row_end_index = ( - layer_data.indptr[row_number], - layer_data.indptr[row_number + 1], - ) - # These are all non-zero counts in the row - non_zero_row = layer_data.data[row_start_index:row_end_index] - if len(non_zero_row) == 0: - logger.warning( - "The input data contains all zero rows. Please make sure " - "this is expected. You can use the `filter_cell_by_counts` " - "arg to filter out all zero rows." 
- ) - - # Add binned_rows and bin_edges as all 0 - # np.stack will upcast the dtype later - binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8)) - bin_edges.append(np.array([0] * n_bins)) - continue - - # Binning of non-zero values - bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1)) - non_zero_digits = _digitize(non_zero_row, bins) - assert non_zero_digits.min() >= 1 - assert non_zero_digits.max() <= n_bins - 1 - binned_rows.append(non_zero_digits) - - bin_edges.append(np.concatenate([[0], bins])) - -# Create new CSR matrix -logger.info("Creating a new CSR matrix of the binned count values") -binned_counts = csr_matrix( - ( - np.concatenate(binned_rows, casting="same_kind"), - layer_data.indices, - layer_data.indptr, - ), - shape=layer_data.shape, -) - -# Set binned values and bin edges layers to adata object -input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts -input_adata.obsm["bin_edges"] = np.stack(bin_edges) - -# Write mudata output -logger.info("Writing output data") -mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/binning/test.py b/src/scgpt/binning/test.py deleted file mode 100644 index 4cfad752966..00000000000 --- a/src/scgpt/binning/test.py +++ /dev/null @@ -1,59 +0,0 @@ -import pytest -import sys -import mudata as mu -from scipy.sparse import issparse - -## VIASH START -meta = { - "resources_dir": "resources_test", - "executable": "./target/docker/scgpt/binning/binning", - "temp_dir": "tmp", - "config": "./target/docker/scgpt/binning/.config.vsh.yaml", -} -## VIASH END - - -def test_binning(run_component, tmp_path): - input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_preprocessed.h5mu" - output_file_path = tmp_path / "Kim2020_Lung_subset_binned.h5mu" - - run_component( - [ - "--input", - input_file_path, - "--modality", - "rna", - "--output_obsm_binned_counts", - "binned_counts", - "--n_input_bins", - "51", - "--var_input", - "filter_with_hvg", - "--output", - output_file_path, - ] - ) - - # Read output file - output_mdata = mu.read(output_file_path) - output_adata = output_mdata.mod["rna"] - - # Check presence of binning layers - assert {"bin_edges", "binned_counts"}.issubset(output_adata.obsm.keys()), ( - "Binning obsm fields were not added." - ) - - # Check bin edges - bin_edges = output_adata.obsm["bin_edges"] - assert all(bin_edges[:, 0] == 0) - assert bin_edges.shape[1] == 51 - assert all(all(i >= 0) for i in bin_edges) - - # Check binned values - binned_values = output_adata.obsm["binned_counts"] - assert issparse(binned_values) - assert (binned_values.data <= 51).all(axis=None) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cell_type_annotation/config.vsh.yaml b/src/scgpt/cell_type_annotation/config.vsh.yaml deleted file mode 100644 index 165cb3b74d8..00000000000 --- a/src/scgpt/cell_type_annotation/config.vsh.yaml +++ /dev/null @@ -1,164 +0,0 @@ -name: cell_type_annotation -namespace: "scgpt" -scope: "public" -description: | - Annotate gene expression data with cell type classes through the scGPT model. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/jakub_majercik.yaml - roles: [ author ] - -argument_groups: - - name: Model input - arguments: - - name: "--model" - type: file - required: true - example: best_model.pt - description: | - The model file containing checkpoints and cell type label mapper. 
- - name: "--model_config" - type: file - required: true - example: args.json - description: | - The model configuration file. - - name: "--model_vocab" - type: file - required: true - example: vocab.json - description: | - Model vocabulary file directory. - - name: "--finetuned_checkpoints_key" - type: string - default: model_state_dict - description: | - Key in the model file containing the pretrained checkpoints. - - name: "--label_mapper_key" - type: string - default: id_to_class - description: | - Key in the model file containing the cell type class to label mapper dictionary. - - - name: Query input - arguments: - - name: "--input" - type: file - direction: input - required: true - example: scgpt_preprocess_ouput.h5mu - description: | - The input h5mu file containing of data that have been pre-processed (normalized, binned, genes cross-checked and tokenized). - - name: "--modality" - description: - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--obs_batch_label" - type: string - required: false - description: | - The name of the adata.obs column containing the batch labels. Required if dsbn is set to true. - - name: "--obsm_gene_tokens" - type: string - default: "gene_id_tokens" - description: | - The key of the .obsm array containing the gene token ids - - name: "--obsm_tokenized_values" - type: string - default: values_tokenized - description: | - The key of the .obsm array containing the count values of the tokenized genes - - - name: Outputs - arguments: - - name: "--output" - type: file - direction: output - required: true - example: output.h5mu - description: | - The output mudata file. - - name: "--output_obs_predictions" - type: string - default: "scgpt_pred" - required: false - description: | - The name of the adata.obs column to write predicted cell type labels to. - - name: "--output_obs_probability" - type: string - default: "scgpt_probability" - required: false - description: | - The name of the adata.obs column to write the probabilities of the predicted cell type labels to. - __merge__: [., /src/base/h5_compression_argument.yaml] - - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - The padding token used in the model. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding. - - name: "--n_input_bins" - type: integer - default: 51 - required: false - description: | - The number of input bins. - - name: "--batch_size" - type: integer - default: 64 - required: false - description: | - The batch size. - - name: "--dsbn" - type: boolean - default: true - required: false - description: | - Whether to use domain-specific batch normalization. - - name: "--seed" - type: integer - description: | - Seed for random number generation. If not specified, no seed is used. 
- -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu - - path: /resources_test/scgpt/source/args.json - - path: /resources_test/scgpt/source/vocab.json - - path: /resources_test/scgpt/finetuned_model/best_model.pt - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] - - type: python - packages: - - scgpt==0.2.1 - test_setup: - - type: python - __merge__: [/src/base/requirements/scanpy.yaml] - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] -runners: - - type: executable - - type: nextflow - directives: - label: [ highmem, highcpu, gpu ] diff --git a/src/scgpt/cell_type_annotation/script.py b/src/scgpt/cell_type_annotation/script.py deleted file mode 100644 index c3487712512..00000000000 --- a/src/scgpt/cell_type_annotation/script.py +++ /dev/null @@ -1,255 +0,0 @@ -import sys -import json -from multiprocessing import freeze_support -import os -import mudata as mu -from typing import Dict -import warnings -import torch -import numpy as np -from torch.nn import functional -from torch.utils.data import Dataset, DataLoader -from scgpt.model import TransformerModel -from scgpt.tokenizer.gene_tokenizer import GeneVocab -from scgpt.utils import set_seed -from tqdm import tqdm - -## VIASH START -par = { - "input": r"resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "modality": r"rna", - "model": r"resources_test/scgpt/finetuned_model/best_model.pt", - "model_config": r"resources_test/scgpt/source/args.json", - "model_vocab": r"resources_test/scgpt/source/vocab.json", - "obs_batch_label": r"sample", - "obsm_gene_tokens": r"gene_id_tokens", - "obsm_tokenized_values": r"values_tokenized", - "output": r"output.h5mu", - "output_compression": None, - "output_obs_predictions": r"predictions", - "output_obs_probability": r"probabilities", - "dsbn": True, - "seed": 0, - "pad_token": "", - "pad_value": -2, - "n_input_bins": 51, - "batch_size": 64, - "finetuned_checkpoints_key": "model_state_dict", - "label_mapper_key": "id_to_class", -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - - -class SeqDataset(Dataset): - def __init__(self, data: Dict[str, torch.Tensor]): - self.data = data - - def __len__(self): - return self.data["gene_ids"].shape[0] - - def __getitem__(self, idx): - return {k: v[idx] for k, v in self.data.items()} - - -def main(): - # Setting seed - if par["seed"]: - set_seed(par["seed"]) - - # Setting device - logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Read in data - logger.info("Reading in data") - mdata = mu.read(par["input"]) - input_adata = mdata.mod[par["modality"]] - adata = input_adata.copy() - - # Fetch batch ids for domain-specific batch normalization - if par["dsbn"] and not par["obs_batch_label"]: - raise ValueError( - "When dsbn is set to True, you are required to provide batch labels (obs_batch_labels)." 
- ) - elif par["dsbn"] and par["obs_batch_label"]: - logger.info("Fetching batch id's for domain-specific batch normalization") - batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") - batch_id_labels = batch_id_cats.cat.codes.values - batch_ids = batch_id_labels.tolist() - batch_ids = np.array(batch_ids) - num_batch_types = len(set(batch_ids)) - elif not par["dsbn"]: - # forward pass requires a tensor as input - batch_ids = np.zeros(adata.shape[0]) - - # Vocabulary configuration - logger.info("Loading model vocabulary") - special_tokens = [par["pad_token"], "", ""] - logger.info(f"Loading model vocab from {par['model_vocab']}") - vocab_file = par["model_vocab"] - vocab = GeneVocab.from_file(vocab_file) - [vocab.append_token(s) for s in special_tokens if s not in vocab] - vocab.set_default_index(vocab[par["pad_token"]]) - ntokens = len(vocab) - - # Model configuration - logger.info("Loading model and configurations") - model_config_file = par["model_config"] - with open(model_config_file, "r") as f: - model_configs = json.load(f) - embsize = model_configs["embsize"] - nhead = model_configs["nheads"] - d_hid = model_configs["d_hid"] - nlayers = model_configs["nlayers"] - - # Ensure the provided model has the correct architecture - logger.info("Loading model") - model_file = par["model"] - model_dict = torch.load(model_file, map_location=device) - for k, v in { - "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], - "--label_mapper_key": par["label_mapper_key"], - }.items(): - if v not in model_dict.keys(): - raise KeyError( - f"The key '{v}' provided for '{k}' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." - ) - pretrained_dict = model_dict[par["finetuned_checkpoints_key"]] - - # Label mapper configuration - logger.info("Loading label mapper") - label_mapper = model_dict[par["label_mapper_key"]] - cell_type_mapper = {int(k): v for k, v in label_mapper.items()} - n_cls = len(cell_type_mapper) - - # Model instatiation - logger.info("Instantiating model") - model = TransformerModel( - ntokens, - d_model=embsize, # self.encoder (GenEncoder), self.value_encoder (ContinuousValueEncoder), self.transformerencoder(TransformerEncoderLayer) - nhead=nhead, # self.transformer_encoder(TransformerEncoderLayer) - d_hid=d_hid, # self.transformer_encoder(TransformerEncoderLayer) - nlayers=nlayers, # self.transformer_encoder(TransformerEncoderLayer), self.cls_decoder - nlayers_cls=3, # self.cls_decoder - n_cls=n_cls, # self.cls_decoder - vocab=vocab, - dropout=0.2, # self.transformer_encoder - pad_token=par["pad_token"], - pad_value=par["pad_value"], - do_mvc=False, - do_dab=False, - use_batch_labels=par["dsbn"], - num_batch_labels=num_batch_types if par["dsbn"] else None, - domain_spec_batchnorm=par["dsbn"], - input_emb_style="continuous", - n_input_bins=par["n_input_bins"], - cell_emb_style="cls", # required for cell-type annotation - use_fast_transformer=False, # TODO: parametrize when GPU is available - fast_transformer_backend="flash", # TODO: parametrize when GPU is available - pre_norm=False, # TODO: parametrize when GPU is available - ) - - # Load model params - logger.info(f"Loading model params from {model_file}") - try: - model.load_state_dict(pretrained_dict) - except RuntimeError: - logger.info("only load params that are in the model and match the size") - model_dict = model.state_dict() - pretrained_dict = { - k: v - for k, v in pretrained_dict.items() - 
if k in model_dict and v.shape == model_dict[k].shape - } - for k, v in pretrained_dict.items(): - logger.info(f"Loading params {k} with shape {v.shape}") - model_dict.update(pretrained_dict) - model.load_state_dict(model_dict) - - model.to(device) - - # Load tokenized gene data - logger.info("Loading data for inference") - for k, v in { - "--obsm_gene_tokens": par["obsm_gene_tokens"], - "--obsm_tokenized_values": par["obsm_tokenized_values"], - }.items(): - if v not in adata.obsm.keys(): - raise KeyError( - f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" - ) - - input_gene_ids = adata.obsm[par["obsm_gene_tokens"]] - input_values = adata.obsm[par["obsm_tokenized_values"]] - - data_pt = { - "gene_ids": input_gene_ids, - "values": input_values, - "batch_labels": torch.from_numpy(batch_ids).long(), - } - - data_loader = DataLoader( - dataset=SeqDataset(data_pt), - batch_size=par["batch_size"], - num_workers=min(os.cpu_count(), par["batch_size"] // 2), - pin_memory=True, - ) - - # Inference - logger.info("Predicting cell type classes") - model.eval() - predictions = [] - probabilities = [] - with torch.no_grad(): - for batch_data in tqdm(data_loader): - input_gene_ids = batch_data["gene_ids"].to(device) - input_values = batch_data["values"].to(device) - batch_labels = batch_data["batch_labels"].to(device) - - src_key_padding_mask = input_gene_ids.eq(vocab[par["pad_token"]]) - with torch.cuda.amp.autocast(enabled=False): - output_dict = model( - input_gene_ids, - input_values, - src_key_padding_mask=src_key_padding_mask, - batch_labels=batch_labels if par["dsbn"] else None, - CLS=True, # Return celltype classification objective output - CCE=False, - MVC=False, - ECS=False, - ) - output_values = output_dict["cls_output"] - - preds = output_values.argmax(1).cpu().numpy() - predictions.append(preds) - - probs = functional.softmax(output_values, dim=1).max(1)[0] - probabilities.append(probs.cpu().numpy()) - - predictions = np.concatenate(predictions, axis=0) - probabilities = np.concatenate(probabilities, axis=0) - - # Assign cell type labels to predicted classes - logger.info("Assigning cell type predictions and probabilities") - adata.obs["scgpt_class_pred"] = predictions - adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map( - lambda x: cell_type_mapper[x] - ) - adata.obs[par["output_obs_probability"]] = probabilities - - # Write output - logger.info("Writing output data") - mdata.mod[par["modality"]] = adata - mdata.write(par["output"], compression=par["output_compression"]) - - -if __name__ == "__main__": - freeze_support() - warnings.filterwarnings("ignore") - main() diff --git a/src/scgpt/cell_type_annotation/test.py b/src/scgpt/cell_type_annotation/test.py deleted file mode 100644 index 2d2eb9bd9ca..00000000000 --- a/src/scgpt/cell_type_annotation/test.py +++ /dev/null @@ -1,215 +0,0 @@ -import pytest -from mudata import read_h5mu -import sys -import subprocess -import re - - -input_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" -ft_model = f"{meta['resources_dir']}/best_model.pt" -model_config = f"{meta['resources_dir']}/args.json" -model_vocab = f"{meta['resources_dir']}/vocab.json" - - -def test_cell_type_inference(run_component, tmp_path): - output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu" - - args = [ - "--input", - input_path, - "--output", - output_annotation_file, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - 
"--model", - ft_model, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--obs_batch_label", - "sample", - "--dsbn", - "True", - ] - run_component(args) - - output_mudata = read_h5mu(output_annotation_file) - output_adata = output_mudata.mod["rna"] - assert "scgpt_pred" in output_adata.obs.keys(), ( - "scgpt_pred is not present in anndata obs keys" - ) - assert "scgpt_probability" in output_adata.obs.keys(), ( - "scgpt_probability is not present in anndata obs keys" - ) - - # run withou dsbn - output_annotation_file_without_dsbn = ( - tmp_path / "Kim2020_Lung_subset_annotated_no_dsbn.h5mu" - ) - args = [ - "--input", - input_path, - "--output", - output_annotation_file_without_dsbn, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--obs_batch_label", - "sample", - "--dsbn", - "False", - ] - run_component(args) - # Read output file - output_mdata_no_dsbn = read_h5mu(output_annotation_file_without_dsbn) - output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] - - # Assert that embeddings without dsbn are different - assert not ( - output_adata.obs["scgpt_pred"].astype(str) - == output_adata_no_dsbn.obs["scgpt_pred"].astype(str) - ).all(), "Cell type predictions with and without dsbn are the same" - - -def test_annotation_dsbn_without_batch_labels(run_component, tmp_path): - output_annotation_labels_without_dsbn = ( - tmp_path / "Kim2020_Lung_subset_annotated_labels_without_dsbn.h5mu" - ) - - args = [ - "--input", - input_path, - "--output", - output_annotation_labels_without_dsbn, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--dsbn", - "True", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: When dsbn is set to True, you are required to provide batch labels \(obs_batch_labels\)\.", - err.value.stdout.decode("utf-8"), - ) - - -def test_annotation_non_existing_keys(run_component, tmp_path): - output_annotation_dummy_values = ( - tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" - ) - - # Test for non-existing tokenized values key - args = [ - "--input", - input_path, - "--output", - output_annotation_dummy_values, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "dummy_values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "model_state_dict", - "--label_mapper_key", - "id_to_class", - "--obs_batch_label", - "sample", - "--dsbn", - "True", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', - err.value.stdout.decode("utf-8"), - ) - - -def test_checkpoint_architecture(run_component, tmp_path): - output_dummy_model_key = tmp_path / 
"Kim2020_Lung_subset_annotated_dummy_key.h5mu" - - # Test for non-existing model file keys - args = [ - "--input", - input_path, - "--output", - output_dummy_model_key, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--model", - ft_model, - "--model_vocab", - model_vocab, - "--model_config", - model_config, - "--finetuned_checkpoints_key", - "dummy_checkpoints_key", - "--label_mapper_key", - "id_to_class", - "--obs_batch_label", - "sample", - "--dsbn", - "True", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r'KeyError: "The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper."', - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cross_check_genes/config.vsh.yaml b/src/scgpt/cross_check_genes/config.vsh.yaml deleted file mode 100644 index c51bd606b6f..00000000000 --- a/src/scgpt/cross_check_genes/config.vsh.yaml +++ /dev/null @@ -1,99 +0,0 @@ -name: cross_check_genes -namespace: "scgpt" -scope: "public" -description: | - Cross-check genes with pre-trained scGPT model. -authors: - - __merge__: /src/authors/jakub_majercik.yaml - roles: [ author ] - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - The input h5mu file containing of pre-processed data. - - name: "--modality" - type: string - default: "rna" - required: false - description: | - The modality key of the MuData object containing the RNA AnnData object. - - name: "--vocab_file" - type: file - direction: input - required: true - example: resources_test/scgpt/vocab.json - description: | - Model vocabulary file path. - - name: "--input_var_gene_names" - type: string - example: "gene_name" - required: false - description: | - The name of the adata.var column containing gene names. By default the .var index will be used. - - name: "--var_input" - type: string - required: false - description: ".var column containing highly variable genes. If provided, will only cross-check HVG filtered genes with model vocabulary." - - name: Outputs - arguments: - - name: "--output" - type: file - direction: output - required: true - example: output.h5mu - description: | - The output cross-checked anndata file. - - name: "--output_var_filter" - type: string - default: "id_in_vocab" - description: In which .var slot to store a boolean array corresponding to which observations should be filtered out based on HVG and model vocabulary. - __merge__: [., /src/base/h5_compression_argument.yaml] - - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - The padding token used in the model. 
-resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu - - path: /resources_test/scgpt/source/vocab.json - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] - - type: python - packages: - - scgpt==0.2.1 - test_setup: - - type: python - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] -runners: - - type: executable - - type: nextflow - directives: - label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/scgpt/cross_check_genes/script.py b/src/scgpt/cross_check_genes/script.py deleted file mode 100644 index c181d81534e..00000000000 --- a/src/scgpt/cross_check_genes/script.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -import mudata as mu -from scgpt.tokenizer.gene_tokenizer import GeneVocab - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu", - "output": "output.h5mu", - "modality": "rna", - "input_var_gene_names": None, - "output_var_filter": "id_in_vocab", - "pad_token": "", - "var_input": "filter_with_hvg", - "vocab_file": "resources_test/scgpt/source/vocab.json", - "output_compression": None, -} - -meta = {"resources_dir": "src/utils"} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - -# Read in data -logger.info(f"Reading {par['input']}") -mudata = mu.read_h5mu(par["input"]) -adata = mudata.mod[par["modality"]].copy() - -pad_token = par["pad_token"] -special_tokens = [pad_token, "", ""] - -# Fetching gene names -if not par["input_var_gene_names"]: - genes = adata.var.index.astype(str).tolist() -elif par["input_var_gene_names"] not in adata.var.columns: - raise ValueError( - f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs." 
- ) -else: - genes = adata.var[par["input_var_gene_names"]].astype(str).tolist() - -# Cross-check genes with pre-trained model -logger.info(f"Loading model vocab from {par['vocab_file']}") -vocab_file = par["vocab_file"] -vocab = GeneVocab.from_file(vocab_file) -[vocab.append_token(s) for s in special_tokens if s not in vocab] - -if par["var_input"]: - logger.info("Filtering genes based on model vocab and HVG") - filter_with_hvg = adata.var[par["var_input"]].tolist() - gene_filter_mask = [ - 1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg) - ] - logger.info( - f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}" - ) -else: - logger.info("Filtering genes based on model vocab") - gene_filter_mask = [1 if gene in vocab else 0 for gene in genes] - logger.info( - f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}" - ) - -logger.info(f"Writing to {par['output']}") -adata.var[par["output_var_filter"]] = gene_filter_mask -adata.var[par["output_var_filter"]] = adata.var[par["output_var_filter"]].astype("bool") -mudata.mod[par["modality"]] = adata -mudata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/cross_check_genes/test.py b/src/scgpt/cross_check_genes/test.py deleted file mode 100644 index bb8c53a1349..00000000000 --- a/src/scgpt/cross_check_genes/test.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -import subprocess -from mudata import read_h5mu -import re -import sys - -## VIASH START -meta = { - "executable": "./target/docker/scgpt/cross_check/cross_check", - "resources_dir": "./resources_test/scgpt/", - "config": "./src/scgpt/cross_check/config.vsh.yaml", -} -## VIASH END - -input_path = meta["resources_dir"] + "/Kim2020_Lung_subset_preprocessed.h5mu" -vocab_path = meta["resources_dir"] + "/vocab.json" - - -def test_cross_check(run_component, random_path): - output_path = random_path(extension="h5mu") - args = [ - "--input", - input_path, - "--output", - output_path, - "--modality", - "rna", - "--vocab_file", - vocab_path, - "--output_compression", - "gzip", - ] - run_component(args) - - output_mudata = read_h5mu(output_path) - - # Check added columns - assert {"gene_name", "id_in_vocab"}.issubset( - set(output_mudata.mod["rna"].var.columns) - ), "Gene columns were not added." - # Check if genes were filtered - assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( - output_mudata.mod["rna"].var["id_in_vocab"] - ), "Genes were not filtered." - - output_hvg_path = random_path(extension="h5mu") - args_hvg = [ - "--input", - input_path, - "--output", - output_hvg_path, - "--modality", - "rna", - "--var_input", - "filter_with_hvg", - "--vocab_file", - vocab_path, - "--output_compression", - "gzip", - ] - - run_component(args_hvg) - - output_mudata_hvg = read_h5mu(output_hvg_path) - # Check if genes were filtered based on HVG - assert sum(output_mudata_hvg.mod["rna"].var["id_in_vocab"]) != len( - output_mudata_hvg.mod["rna"].var["id_in_vocab"] - ), "Genes were not filtered." - assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( - output_mudata_hvg.mod["rna"].var["id_in_vocab"] - ), "Genes were not filtered based on HVG." 
- - -def test_cross_check_invalid_gene_layer_raises(run_component, random_path): - output_path = random_path(extension="h5mu") - args = [ - "--input", - input_path, - "--output", - output_path, - "--vocab_file", - vocab_path, - "--input_var_gene_names", - "dummy_var", - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: Gene name column 'dummy_var' not found in .mod\['rna'\]\.obs\.", - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/embedding/config.vsh.yaml b/src/scgpt/embedding/config.vsh.yaml deleted file mode 100644 index fd238696c1e..00000000000 --- a/src/scgpt/embedding/config.vsh.yaml +++ /dev/null @@ -1,147 +0,0 @@ -name: embedding -namespace: scgpt -scope: "public" -description: | - Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - The input h5mu file containing tokenized gene and count data. - - name: "--modality" - description: | - Which modality from the input MuData file to process. - - type: string - default: "rna" - required: false - - name: "--model" - type: file - direction: input - required: true - example: best_model.pt - description: | - Path to scGPT model file. - - name: "--model_vocab" - type: file - direction: input - required: true - example: vocab.json - description: | - Path to scGPT model vocabulary file. - - name: "--model_config" - type: file - direction: input - required: true - example: args.json - description: | - Path to scGPT model config file. - - name: "--obsm_gene_tokens" - type: string - default: "gene_id_tokens" - description: | - The key of the .obsm array containing the gene token ids - example: values.pt - - name: "--obsm_tokenized_values" - type: string - default: values_tokenized - description: | - The key of the .obsm array containing the count values of the tokenized genes - - name: "--obsm_padding_mask" - type: string - default: padding_mask - description: | - The key of the .obsm array containing the padding mask. - - name: "--var_gene_names" - type: string - description: | - The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used. - - name: "--obs_batch_label" - type: string - description: | - The name of the adata.obs column containing the batch labels. Must be provided when 'dsbn' is set to True. - - name: "--finetuned_checkpoints_key" - type: string - required: false - example: model_state_dict - description: | - Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models. - - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - description: | - Path to output anndata file containing pre-processed data as well as scGPT embeddings. - direction: output - example: output.h5mu - - name: "--obsm_embeddings" - type: string - default: "X_scGPT" - description: | - The name of the adata.obsm array to which scGPT embeddings will be written. 
- __merge__: [., /src/base/h5_compression_argument.yaml] - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - description: | - The token to be used for padding. - - name: "--pad_value" - type: integer - default: -2 - description: | - The value of the padding token. - - name: "--dsbn" - type: boolean - default: true - description: | - Whether to apply domain-specific batch normalization for generating embeddings. When set to True, 'obs_batch_labels' must be set as well. - - name: "--batch_size" - type: integer - default: 64 - description: | - The batch size to be used for inference - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/source - - path: /resources_test/scgpt/finetuned_model - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] - - type: python - packages: - - scgpt==0.2.1 - test_setup: - - type: python - __merge__: [ /src/base/requirements/viashpy.yaml ] -runners: - - type: executable - - type: nextflow - directives: - label: [ highmem, highcpu, gpu ] diff --git a/src/scgpt/embedding/script.py b/src/scgpt/embedding/script.py deleted file mode 100644 index b78d42c3f61..00000000000 --- a/src/scgpt/embedding/script.py +++ /dev/null @@ -1,187 +0,0 @@ -import sys -import numpy as np -import mudata as mu -import json -from scgpt.tokenizer.gene_tokenizer import GeneVocab -from scgpt.model import TransformerModel -from scgpt.utils.util import load_pretrained -import torch - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask", - "model": "resources_test/scgpt/source/best_model.pt", - "model_config": "resources_test/scgpt/source/args.json", - "model_vocab": "resources_test/scgpt/source/vocab.json", - "output": "Kim2020_Lung_embedded.h5ad", - "var_gene_names": "gene_name", - "obs_batch_label": "sample", - "obsm_embeddings": "X_scGPT", - "pad_token": "", - "pad_value": -2, - "batch_size": 64, - "modality": "rna", - "dsbn": True, - "n_input_bins": 51, -} -meta = { - "resources_dir": "src/utils", -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - -logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -logger.info("Reading in data") - -# Read in data -mdata = mu.read(par["input"]) -input_adata = mdata.mod[par["modality"]] -adata = input_adata.copy() - -for k, v in { - "--obsm_gene_tokens": par["obsm_gene_tokens"], - "--obsm_tokenized_values": par["obsm_tokenized_values"], - "--obsm_padding_mask": par["obsm_padding_mask"], -}.items(): - if v not in adata.obsm.keys(): - raise KeyError( - f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" - ) - -all_gene_ids = adata.obsm[par["obsm_gene_tokens"]] -all_values = adata.obsm[par["obsm_tokenized_values"]] -padding_mask = adata.obsm[par["obsm_padding_mask"]] - -# Fetch batch ids for domain-specific batch normalization -if par["dsbn"] and not par["obs_batch_label"]: - raise ValueError( - "When 
dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels)." - ) -elif par["dsbn"] and par["obs_batch_label"]: - logger.info("Fetching batch id's for domain-specific batch normalization") - batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") - batch_id_labels = batch_id_cats.cat.codes.values - batch_ids = batch_id_labels.tolist() - batch_ids = np.array(batch_ids) - num_batch_types = len(set(batch_ids)) -elif not par["dsbn"] and par["obs_batch_label"]: - logger.info( - "Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed." - ) - -# Set padding specs -logger.info("Setting padding specs") -pad_token = par["pad_token"] -pad_value = par["pad_value"] -special_tokens = [pad_token, "", ""] - -# Fetching gene names -logger.info("Fetching gene names") -if not par["var_gene_names"]: - genes = adata.var.index.astype(str).tolist() -else: - genes = adata.var[par["var_gene_names"]].astype(str).tolist() - -# Model files -logger.info("Loading model, vocab and configs") -model_config_file = par["model_config"] -model_file = par["model"] -vocab_file = par["model_vocab"] - -# Load vocab -vocab = GeneVocab.from_file(vocab_file) -for s in special_tokens: - if s not in vocab: - vocab.append_token(s) - -vocab.set_default_index(vocab[""]) -ntokens = len(vocab) -gene_ids = np.array(vocab(genes), dtype=int) - -# Load model configs -with open(model_config_file, "r") as f: - model_configs = json.load(f) -embsize = model_configs["embsize"] -nhead = model_configs["nheads"] -d_hid = model_configs["d_hid"] -nlayers = model_configs["nlayers"] - -# Instantiate model -logger.info("Initializing transformer model") -model = TransformerModel( - ntokens, - d_model=embsize, - nhead=nhead, - d_hid=d_hid, - nlayers=nlayers, - vocab=vocab, - dropout=0.5, # scGPT default, only relevant for fine-tuning applications - pad_token=pad_token, - pad_value=pad_value, - nlayers_cls=3, # only applicable for decoder-based operations - n_cls=1, # only applicable for decoder-based operations - do_mvc=False, # only applicable for decoder-based operations - ecs_threshold=0.8, # only applicable for decoder-based operations - do_dab=False, # only applicable for decoder-based operations - use_batch_labels=False, # only applicable for decoder-based operations - num_batch_labels=num_batch_types if par["dsbn"] else None, - domain_spec_batchnorm=par["dsbn"], - input_emb_style="continuous", # scGPT default - explicit_zero_prob=False, # TODO: Parametrize when GPU-based machine types are supported - use_fast_transformer=False, # TODO: Parametrize when GPU-based machine types are supported - # fast_transformer_backend="flash", #TODO: Parametrize when GPU-based machine types are supported - pre_norm=False, # TODO: Parametrize when GPU-based machine types are supported -) - - -logger.info("Loading model") -model_file = par["model"] -model_dict = torch.load(model_file, map_location=device) - -# Ensure the provided model has the correct architecture -finetuned_checkpoints_key = par.get("finetuned_checkpoints_key") -if finetuned_checkpoints_key: - try: - model_dict = model_dict[finetuned_checkpoints_key] - except KeyError as e: - raise ValueError( - f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." 
- ) from e - -# Load model -load_pretrained(model, model_dict, verbose=False) - -# Embed tokenized data -logger.info("Converting tokenized input data to embeddings") -model.to(device) -model.eval() - -cell_embeddings = model.encode_batch( - torch.from_numpy(all_gene_ids), - torch.from_numpy(all_values).float(), - src_key_padding_mask=torch.from_numpy(padding_mask), - batch_size=par["batch_size"], - batch_labels=torch.from_numpy(batch_ids).long() if par["dsbn"] else None, - output_to_cpu=True, - time_step=0, - return_np=True, -) - -cell_embeddings = cell_embeddings / np.linalg.norm( - cell_embeddings, axis=1, keepdims=True -) - -# Write output -logger.info("Writing output data") -adata.obsm[par["obsm_embeddings"]] = cell_embeddings -mdata.mod[par["modality"]] = adata -mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/embedding/test.py b/src/scgpt/embedding/test.py deleted file mode 100644 index 8ba9b190920..00000000000 --- a/src/scgpt/embedding/test.py +++ /dev/null @@ -1,350 +0,0 @@ -import pytest -import subprocess -import re -import sys -import mudata as mu -import numpy as np - - -## VIASH START -meta = { - "resources_dir": "resources_test", -} -## VIASH END - -input = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" -model_file = f"{meta['resources_dir']}/source/best_model.pt" -ft_model_file = f"{meta['resources_dir']}/finetuned_model/best_model.pt" -vocab_file = f"{meta['resources_dir']}/source/vocab.json" -model_config_file = f"{meta['resources_dir']}/source/args.json" -input_file = mu.read(input) - - -def test_integration_embedding(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component( - [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - "--batch_size", - "4", - ] - ) - - # Read output file - output_mdata = mu.read(output_embedding_file) - output_adata = output_mdata.mod["rna"] - - # check that embedding obs is present - assert "X_scGPT" in output_adata.obsm.keys(), ( - "X_scGPT is not present in anndata obsm keys" - ) - - # check embedding size - assert output_adata.obsm["X_scGPT"].shape[1] == 512, ( - "Embedding size does not equal 512" - ) - - # check embedding value range - assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), ( - "Embedding values are nan" - ) - assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), ( - "Range of embedding values is outside of [-1, 1]" - ) - - # Run embeddings without dsbn - output_embedding_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component( - [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "False", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file_without_dsbn, - "--batch_size", - "4", - ] - ) - - # Read output file - output_mdata_no_dsbn = mu.read(output_embedding_file_without_dsbn) - output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] - - # Assert that embeddings without dsbn are different - assert 
not ( - output_adata.obsm["X_scGPT"] == output_adata_no_dsbn.obsm["X_scGPT"] - ).all(), "Embeddings with and without dsbn are the same" - - -def test_integration_embedding_dsbn_without_batch_labels(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - args = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: When dsbn is set to True, you are required to provide batch labels \(input_obs_batch_labels\)\.", - err.value.stdout.decode("utf-8"), - ) - - -def test_integration_embedding_non_existing_keys(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - # Test for non-existing gene names key - args_1 = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--var_gene_names", - "dummy_gene_name_key", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args_1) - assert re.search( - r"KeyError: \'dummy_gene_name_key\'", err.value.stdout.decode("utf-8") - ) - - # Test for non-existing batch label key - args_2 = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "dummy_batch_label_key", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args_2) - assert re.search( - r"KeyError: \'dummy_batch_label_key\'", err.value.stdout.decode("utf-8") - ) - - # Test for non-existing tokenized values key - args_3 = [ - "--input", - input, - "--modality", - "rna", - "--model", - model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "dummy_values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args_3) - assert re.search( - r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', - err.value.stdout.decode("utf-8"), - ) - - -def test_finetuned_model(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component( - [ - "--input", - input, - "--modality", - "rna", - "--model", - ft_model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - 
"values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--finetuned_checkpoints_key", - "model_state_dict", - "--output", - output_embedding_file, - "--batch_size", - "4", - ] - ) - - # Read output file - output_mdata = mu.read(output_embedding_file) - output_adata = output_mdata.mod["rna"] - - # check that embedding obs is present - assert "X_scGPT" in output_adata.obsm.keys(), ( - "X_scGPT is not present in anndata obsm keys" - ) - - # check embedding size - assert output_adata.obsm["X_scGPT"].shape[1] == 512, ( - "Embedding size does not equal 512" - ) - - # check embedding value range - assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), ( - "Embedding values are nan" - ) - assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), ( - "Range of embedding values is outside of [-1, 1]" - ) - - -def test_finetuned_model_architecture(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - args = [ - "--input", - input, - "--modality", - "rna", - "--model", - ft_model_file, - "--model_vocab", - vocab_file, - "--model_config", - model_config_file, - "--dsbn", - "True", - "--obs_batch_label", - "sample", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--finetuned_checkpoints_key", - "dummy_checkpoints_key", - "--output", - output_embedding_file, - ] - - with pytest.raises(subprocess.CalledProcessError) as err: - run_component(args) - assert re.search( - r"ValueError: The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper.", - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/pad_tokenize/config.vsh.yaml b/src/scgpt/pad_tokenize/config.vsh.yaml deleted file mode 100644 index 6efa9a01948..00000000000 --- a/src/scgpt/pad_tokenize/config.vsh.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: pad_tokenize -namespace: "scgpt" -scope: "public" -description: | - Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - direction: input - required: true - example: input.h5mu - description: | - The input h5mu file of pre-processed data. - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--model_vocab" - type: file - direction: input - required: true - example: vocab.json - description: | - Path to model vocabulary file. - - name: "--var_gene_names" - type: string - required: false - description: | - The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used. - - name: "--var_input" - type: string - default: "id_in_vocab" - description: | - The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes. 
- - name: "--input_obsm_binned_counts" - type: string - default: "binned_counts" - required: false - description: | - The name of the .obsm field containing the binned counts to be padded and tokenized. - - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - description: | - The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask. - direction: output - example: output.h5mu - - name: "--obsm_gene_tokens" - type: string - default: "gene_id_tokens" - description: | - The key of the .obsm array containing the gene token ids - example: values.pt - - name: "--obsm_tokenized_values" - type: string - default: values_tokenized - description: | - The key of the .obsm array containing the count values of the tokenized genes - - name: "--obsm_padding_mask" - type: string - default: padding_mask - description: | - The key of the .obsm array containing the padding mask. - __merge__: [., /src/base/h5_compression_argument.yaml] - - - name: Arguments - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - Token used for padding. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding token. - - name: "--max_seq_len" - type: integer - description: | - The maximum sequence length of the tokenized data. Defaults to the number of features if not provided. - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py - - path: /src/utils/subset_vars.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/scgpt/ - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:23.09-py3 - setup: - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] - - type: python - packages: - - scgpt==0.2.1 - - ipython~=8.5.0 - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] 
-runners: - - type: executable - - type: nextflow - directives: - label: [ lowmem, lowcpu ] diff --git a/src/scgpt/pad_tokenize/script.py b/src/scgpt/pad_tokenize/script.py deleted file mode 100644 index 641e28a189e..00000000000 --- a/src/scgpt/pad_tokenize/script.py +++ /dev/null @@ -1,111 +0,0 @@ -import sys -import mudata as mu -import numpy as np -from scipy.sparse import issparse -from scgpt.tokenizer import tokenize_and_pad_batch -from scgpt.tokenizer.gene_tokenizer import GeneVocab - - -## VIASH START -par = { - "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu", - "model_vocab": "resources_test/scgpt/source/vocab.json", - "output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "pad_token": "", - "pad_value": -2, - "modality": "rna", - "input_obsm_binned_counts": "binned_counts", - "max_seq_len": None, - "var_gene_names": None, - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask", - "output_compression": None, - "var_input": "id_in_vocab", -} -meta = {"resources_dir": "src/utils/"} - -# mdata = mu.read(par["input"]) -# mdata.mod["rna"].obsm["binned_counts"] = mdata.mod["rna"].layers["binned"] -# mdata.write_h5mu(par["input"]) -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger -from subset_vars import subset_vars - -logger = setup_logger() - -logger.info("Reading in data") - -# Read in data -mdata = mu.read(par["input"]) -input_adata = mdata.mod[par["modality"]] -adata = input_adata.copy() - -adata = subset_vars(adata, par["var_input"]) - -# Set padding specs -pad_token = par["pad_token"] -special_tokens = [pad_token, "", ""] -pad_value = -2 - -logger.info("Fetching counts and gene names") -# Fetch counts -all_counts = ( - adata.obsm[par["input_obsm_binned_counts"]].toarray() - if issparse(adata.obsm[par["input_obsm_binned_counts"]]) - else adata.obsm[par["input_obsm_binned_counts"]] -) - -# Fetching gene names -if not par["var_gene_names"]: - genes = adata.var.index.astype(str).tolist() -else: - genes = adata.var[par["var_gene_names"]].astype(str).tolist() - -# Fetch gene names and look up tokens in vocab -logger.info("Reading in vocab and fetching gene tokens") -vocab_file = par["model_vocab"] -vocab = GeneVocab.from_file(vocab_file) -for s in special_tokens: - if s not in vocab: - vocab.append_token(s) - -vocab.set_default_index(vocab[""]) -ntokens = len(vocab) -gene_ids = np.array(vocab(genes), dtype=int) - -# Fetch max seq len -if not par["max_seq_len"]: - max_seq_len = adata.var.shape[0] + 1 -else: - max_seq_len = par["max_seq_len"] - -# Tokenize and pad data -logger.info( - f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}." 
-) -tokenized_data = tokenize_and_pad_batch( - all_counts, - gene_ids, - max_len=max_seq_len, - vocab=vocab, - pad_token=pad_token, - pad_value=pad_value, - append_cls=True, # append token at the beginning, - include_zero_gene=False, - return_pt=True, - mod_type=None, - vocab_mod=None, -) - -all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] -padding_mask = all_gene_ids.eq(vocab[pad_token]) - -logger.info("Writing output data") -input_adata.obsm[par["obsm_gene_tokens"]] = all_gene_ids.numpy() -input_adata.obsm[par["obsm_tokenized_values"]] = all_values.numpy() -input_adata.obsm[par["obsm_padding_mask"]] = padding_mask.numpy() - -mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/pad_tokenize/test.py b/src/scgpt/pad_tokenize/test.py deleted file mode 100644 index e79a9b6cce6..00000000000 --- a/src/scgpt/pad_tokenize/test.py +++ /dev/null @@ -1,113 +0,0 @@ -import pytest -import sys -import mudata as mu -from scgpt.tokenizer.gene_tokenizer import GeneVocab - -## VIASH START -meta = { - "resources_dir": "resources_test/scgpt", - "executable": "./target/docker/scgpt/integration_pad_tokenize/integration_pad_tokenize", - "temp_dir": "tmp", - "config": "./target/docker/scgpt/integration_pad_tokenize/.config.vsh.yaml", -} -## VIASH END - -input_file = ( - f"{meta['resources_dir']}/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu" -) -vocab_file = f"{meta['resources_dir']}/scgpt/source/vocab.json" -vocab = GeneVocab.from_file(vocab_file) - - -def test_integration_pad_tokenize(run_component, tmp_path): - output = tmp_path / "Kim2020_Lung_tokenized.h5mu" - - run_component( - [ - "--input", - input_file, - "--output", - output, - "--modality", - "rna", - "--var_input", - "scgpt_cross_checked_genes", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--pad_token", - "", - "--pad_value", - "-2", - "--input_obsm_binned_counts", - "binned_counts", - "--model_vocab", - vocab_file, - ] - ) - - output_file = mu.read(output) - output_adata = output_file.mod["rna"] - - gene_ids = output_adata.obsm["gene_id_tokens"] - values = output_adata.obsm["values_tokenized"] - padding_mask = output_adata.obsm["padding_mask"] - - # check output dimensions - ## nr of genes that are tokenized - assert gene_ids.shape[1] <= output_adata.var.shape[0] + 1, ( - "gene_ids shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" - ) - assert values.shape[1] <= output_adata.var.shape[0] + 1, ( - "values shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" - ) - assert padding_mask.shape[1] <= output_adata.var.shape[0] + 1, ( - "padding_mask shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" - ) - - ## equal size of output tensors - assert gene_ids.shape == values.shape, ( - "gene_ids shape[1] does not match values shape[1]" - ) - assert gene_ids.shape == padding_mask.shape, ( - "gene_ids shape[1] does not match padding_mask shape[1]" - ) - - ## check values of output tensors - assert gene_ids.dtype == "int64", "tokenized gene_ids are not integers" - assert (gene_ids > 0).all(), "not all gene id tokens are higher than 0" - - assert values.dtype == "float32", "tokenized values are not floats" - assert (values >= -2).all(), "not all tokenized values are higher than/equal to -2" - - assert padding_mask.dtype == bool, "padding mask is not boolean" - - ## check cls token - assert (gene_ids[:, 0] == vocab[""]).all(), ( - "cls token was not correctly appended at the beginning of 
the gene_ids tensor" - ) - assert (values[:, 0] == 0).all(), ( - "cls token was not correctly appended at the beginning of the values tensors" - ) - - # check padding values - masked_gene_ids = gene_ids[padding_mask] - unmasked_gene_ids = gene_ids[~padding_mask] - assert all(masked_gene_ids == vocab[""]), ( - "masked gene_ids contain non-pad tokens" - ) - assert all(unmasked_gene_ids != vocab[""]), ( - "unmasked gene_ids contain pad tokens" - ) - - masked_values = values[padding_mask] - unmasked_values = values[~padding_mask] - assert all(masked_values == -2), "masked values contain non-pad values" - assert all(unmasked_values != -2), "unmasked values contain pad values" - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__])) diff --git a/src/workflows/annotation/scgpt_annotation/config.vsh.yaml b/src/workflows/annotation/scgpt_annotation/config.vsh.yaml deleted file mode 100644 index 81321b482ab..00000000000 --- a/src/workflows/annotation/scgpt_annotation/config.vsh.yaml +++ /dev/null @@ -1,211 +0,0 @@ -name: "scgpt_annotation" -namespace: "workflows/annotation" -scope: "public" -description: | - Cell type annotation workflow using scGPT. - The workflow takes a pre-processed h5mu file as query input, and performs - - subsetting for HVG - - cross-checking of genes with the model vocabulary - - binning of gene counts - - padding and tokenizing of genes - - transformer-based cell type prediction - Note that cell-type prediction using scGPT is only possible using a fine-tuned scGPT model. -info: - name: "scGPT Annotation" - test_dependencies: - - name: scgpt_annotation_test - namespace: test_workflows/annotation - -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ author, maintainer ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ contributor ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] - -argument_groups: - - name: "Query input" - arguments: - - name: "--id" - required: true - type: string - description: ID of the sample. - example: foo - - name: "--input" - type: file - required: true - description: Path to the input file. - example: input.h5mu - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--input_layer" - type: string - required: False - description: | - The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. - - name: "--input_var_gene_names" - type: string - required: false - description: | - The .var field in the input (query) containing gene names; if not provided, the var index will be used. - - name: "--input_obs_batch_label" - type: string - required: true - description: | - The .obs field in the input (query) dataset containing the batch labels. - - - name: "Model input" - arguments: - - name: "--model" - type: file - required: true - example: best_model.pt - description: | - The scGPT model file. - Must be a fine-tuned model that contains keys for checkpoints (--finetuned_checkpoints_key) and cell type label mapper(--label_mapper_key). - - name: "--model_config" - type: file - required: true - example: args.json - description: | - The scGPT model configuration file. - - name: "--model_vocab" - type: file - required: true - example: vocab.json - description: | - The scGPT model vocabulary file. 
- - name: "--finetuned_checkpoints_key" - type: string - default: model_state_dict - description: | - Key in the model file containing the pre-trained checkpoints. - - name: "--label_mapper_key" - type: string - default: id_to_class - description: | - Key in the model file containing the cell type class to label mapper dictionary. - - - name: "Outputs" - arguments: - - name: "--output" - type: file - required: true - direction: output - description: Output file path - example: output.h5mu - - name: "--output_compression" - type: string - example: "gzip" - required: false - choices: ["gzip", "lzf"] - description: | - The compression algorithm to use for the output h5mu file. - - name: "--output_obs_predictions" - type: string - default: "scgpt_pred" - required: false - description: | - The name of the adata.obs column to write predicted cell type labels to. - - name: "--output_obs_probability" - type: string - default: "scgpt_probability" - required: false - description: | - The name of the adata.obs column to write predicted cell type labels to. - - - name: "Padding arguments" - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - Token used for padding. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding token. - - - name: "HVG subset arguments" - arguments: - - name: "--n_hvg" - type: integer - default: 1200 - description: | - Number of highly variable genes to subset for. - - name: "--hvg_flavor" - type: string - choices: ["cell_ranger", "seurat"] - default: "cell_ranger" - description: | - Method to be used for identifying highly variable genes. - Note that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`). - - - name: "Tokenization arguments" - arguments: - - name: "--max_seq_len" - type: integer - required: false - description: | - The maximum sequence length of the tokenized data. - - - name: "Embedding arguments" - arguments: - - name: --dsbn - type: boolean - default: true - description: | - Apply domain-specific batch normalization - - name: "--batch_size" - type: integer - default: 64 - min: 1 - description: | - The batch size to be used for embedding inference. - - - name: "Binning arguments" - arguments: - - name: "--n_input_bins" - type: integer - default: 51 - required: False - min: 1 - description: | - The number of bins to discretize the data into; When no value is provided, data won't be binned. - - name: "--seed" - type: integer - min: 0 - required: false - description: | - Seed for random number generation used for binning. If not set, no seed is used. 
- -resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - -test_resources: - - type: nextflow_script - path: test.nf - entrypoint: test_wf - - path: /resources_test/scgpt - -dependencies: - - name: scgpt/cross_check_genes - - name: scgpt/binning - - name: feature_annotation/highly_variable_features_scanpy - - name: filter/do_filter - - name: scgpt/pad_tokenize - - name: scgpt/cell_type_annotation - alias: scgpt_celltype_annotation - -runners: - - type: nextflow diff --git a/src/workflows/annotation/scgpt_annotation/integration_test.sh b/src/workflows/annotation/scgpt_annotation/integration_test.sh deleted file mode 100755 index 108a4c4db47..00000000000 --- a/src/workflows/annotation/scgpt_annotation/integration_test.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -nextflow run . \ - -main-script src/workflows/annotation/scgpt_annotation/test.nf \ - -profile docker,no_publish \ - -entry test_wf \ - -c src/workflows/utils/labels_ci.config \ - -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/annotation/scgpt_annotation/main.nf b/src/workflows/annotation/scgpt_annotation/main.nf deleted file mode 100644 index 010ed7b188f..00000000000 --- a/src/workflows/annotation/scgpt_annotation/main.nf +++ /dev/null @@ -1,112 +0,0 @@ -workflow run_wf { - - take: - input_ch - - main: - output_ch = input_ch - // Set aside the output for this workflow to avoid conflicts - | map {id, state -> - def new_state = state + ["workflow_output": state.output] - [id, new_state] - } - // Annotate the mudata object with highly variable genes. - | highly_variable_features_scanpy.run( - fromState: [ - "input": "input", - "layer": "input_layer", - "modality": "modality", - "n_top_features": "n_hvg", - "flavor": "hvg_flavor" - ], - args: [ - "var_name_filter": "scgpt_filter_with_hvg" - ], - toState: ["input": "output"] - ) - // Check whether the genes are part of the provided vocabulary. - // Subsets for genes present in vocab only. - | cross_check_genes.run( - fromState: [ - "input": "input", - "modality": "modality", - "vocab_file": "model_vocab", - "input_var_gene_names": "input_var_gene_names", - "output": "output", - "pad_token": "pad_token" - ], - args: [ - "var_input": "scgpt_filter_with_hvg", - "output_var_filter": "scgpt_cross_checked_genes" - ], - toState: ["input": "output"] - ) - // Bins the data into a fixed number of bins. - | binning.run( - fromState: [ - "input": "input", - "modality": "modality", - "input_layer": "input_layer", - "n_input_bins": "n_input_bins", - "output": "output", - "seed": "seed" - ], - args: [ - "output_obsm_binned_counts": "binned_counts", - "var_input": "scgpt_cross_checked_genes" - ], - toState: ["input": "output"] - ) - // Padding and tokenization of gene count values. - | pad_tokenize.run( - fromState: [ - "input": "input", - "modality": "modality", - "model_vocab": "model_vocab", - "var_gene_names": "input_var_gene_names", - "pad_token": "pad_token", - "pad_value": "pad_value", - "max_seq_len": "max_seq_len", - "output": "output" - ], - args: [ - "input_obsm_binned_counts": "binned_counts", - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask", - "var_input": "scgpt_cross_checked_genes" - ], - toState: ["input": "output"] - ) - // scGPT decoder-based cell type annotation. 
- | scgpt_celltype_annotation.run( - fromState: [ - "model": "model", - "model_vocab": "model_vocab", - "model_config": "model_config", - "label_mapper_key": "label_mapper_key", - "finetuned_checkpoints_key": "finetuned_checkpoints_key", - "input": "input", - "modality": "modality", - "obs_batch_label": "input_obs_batch_label", - "pad_token": "pad_token", - "pad_value": "pad_value", - "n_input_bins": "n_input_bins", - "dsbn": "dsbn", - "batch_size": "batch_size", - "seed": "seed", - "output_obs_predictions": "output_obs_predictions", - "output_obs_probability": "output_obs_probability", - "output": "workflow_output", - "output_compression": "output_compression" - ], - args: [ - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized" - ], - toState: {id, output, state -> ["output": output.output]} - ) - - emit: - output_ch -} diff --git a/src/workflows/annotation/scgpt_annotation/nextflow.config b/src/workflows/annotation/scgpt_annotation/nextflow.config deleted file mode 100644 index 059100c489c..00000000000 --- a/src/workflows/annotation/scgpt_annotation/nextflow.config +++ /dev/null @@ -1,10 +0,0 @@ -manifest { - nextflowVersion = '!>=20.12.1-edge' -} - -params { - rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() -} - -// include common settings -includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/scgpt_annotation/test.nf b/src/workflows/annotation/scgpt_annotation/test.nf deleted file mode 100644 index 5e86cc16b9e..00000000000 --- a/src/workflows/annotation/scgpt_annotation/test.nf +++ /dev/null @@ -1,58 +0,0 @@ -nextflow.enable.dsl=2 - -include { scgpt_annotation } from params.rootDir + "/target/nextflow/workflows/annotation/scgpt_annotation/main.nf" -include { scgpt_annotation_test } from params.rootDir + "/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf" - -params.resources_test = params.rootDir + "/resources_test" - -workflow test_wf { - resources_test = file(params.resources_test) - scgpt_test_resources = resources_test / "scgpt" - - output_ch = Channel.fromList([ - [ - id: "simple_execution_test", - input: scgpt_test_resources.resolve("test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: scgpt_test_resources.resolve("finetuned_model/best_model.pt"), - model_config: scgpt_test_resources.resolve("source/args.json"), - model_vocab: scgpt_test_resources.resolve("source/vocab.json"), - input_layer: "log_normalized", - input_obs_batch_label: "sample", - // change default to reduce resource requirements - n_hvg: 400, - seed: 1 - ] - ]) - | map{ state -> [state.id, state] } - | scgpt_annotation - | view { output -> - assert output.size() == 2 : "Outputs should contain two elements; [id, state]" - - // check id - def id = output[0] - assert id.endsWith("_test") - - // check output - def state = output[1] - assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" - - "Output: $output" - } - | scgpt_annotation_test.run( - fromState: [ - "input": "output" - ], - args: [ - "n_hvg": 400 - ] - ) - | toSortedList() - | map { output_list -> - assert output_list.size() == 1 : "output channel should contain 1 event" - assert output_list.collect{it[0]} == ["simple_execution_test"] - } - -} diff --git a/src/workflows/integration/scgpt_leiden/config.vsh.yaml b/src/workflows/integration/scgpt_leiden/config.vsh.yaml deleted file mode 100644 index 256dca6a887..00000000000 --- a/src/workflows/integration/scgpt_leiden/config.vsh.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "scgpt_leiden" -namespace: "workflows/integration" -scope: "public" -description: "Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result." -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ maintainer, author ] - - __merge__: /src/authors/elizabeth_mlynarski.yaml - roles: [ author ] - - __merge__: /src/authors/weiwei_schultz.yaml - roles: [ contributor ] -info: - test_dependencies: -argument_groups: - - name: "Inputs" - arguments: - - name: "--id" - required: true - type: string - description: ID of the sample. - example: foo - - name: "--input" - type: file - required: true - description: Path to the input file. - example: input.h5mu - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - required: false - - name: "--input_layer" - type: string - required: False - description: | - The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. - - name: "--var_gene_names" - type: string - required: false - description: | - The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used. - - name: "--obs_batch_label" - type: string - description: | - The name of the adata obs column containing the batch labels. - - name: Model - arguments: - - name: "--model" - type: file - required: true - example: resources_test/scgpt/best_model.pt - description: | - Path to scGPT model file. - - name: "--model_vocab" - type: file - direction: input - required: true - example: resources_test/scgpt/vocab.json - description: | - Path to scGPT model vocabulary file. - - name: "--model_config" - type: file - direction: input - required: true - example: args.json - description: | - Path to scGPT model config file. - - name: "--finetuned_checkpoints_key" - type: string - required: false - example: model_state_dict - description: | - Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models. - - name: "Outputs" - arguments: - - name: "--output" - type: file - required: true - direction: output - description: Output file path - example: output.h5mu - - name: "--obsm_integrated" - type: string - default: "X_scgpt" - required: false - description: "In which .obsm slot to store the resulting integrated embedding." - - - name: "Padding arguments" - arguments: - - name: "--pad_token" - type: string - default: "" - required: false - description: | - Token used for padding. - - name: "--pad_value" - type: integer - default: -2 - required: false - description: | - The value of the padding token. - - - name: "HVG subset arguments" - arguments: - - name: "--n_hvg" - type: integer - default: 1200 - description: | - Number of highly variable genes to subset for. 
- - name: "--hvg_flavor" - type: string - choices: ["cell_ranger", "seurat"] - default: "cell_ranger" - description: | - Method to be used for identifying highly variable genes. - Note that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`). - - - name: "Tokenization arguments" - arguments: - - name: "--max_seq_len" - type: integer - required: false - description: | - The maximum sequence length of the tokenized data. Defaults to the number of features if not provided. - - name: "Embedding arguments" - arguments: - - name: --dsbn - type: boolean - default: true - description: | - Apply domain-specific batch normalization - - name: "--batch_size" - type: integer - default: 64 - description: | - The batch size to be used for embedding inference. - - - name: "Binning arguments" - arguments: - - name: "--n_input_bins" - type: integer - default: 51 - required: False - min: 1 - description: | - The number of bins to discretize the data into; When no value is provided, data won't be binned. - - name: "--seed" - type: integer - required: false - description: | - Seed for random number generation used for binning. If not set, no seed is used. - - - name: "Clustering arguments" - arguments: - - name: "--leiden_resolution" - type: double - description: Control the coarseness of the clustering. Higher values lead to more clusters. - default: [1] - multiple: true - -resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - -dependencies: - - name: scgpt/cross_check_genes - - name: scgpt/binning - - name: feature_annotation/highly_variable_features_scanpy - - name: scgpt/pad_tokenize - - name: scgpt/embedding - - name: workflows/multiomics/neighbors_leiden_umap - -test_resources: - - type: nextflow_script - path: test.nf - entrypoint: test_wf - - type: nextflow_script - path: test.nf - entrypoint: test_wf2 - - path: /resources_test/scgpt - -runners: - - type: nextflow diff --git a/src/workflows/integration/scgpt_leiden/integration_test.sh b/src/workflows/integration/scgpt_leiden/integration_test.sh deleted file mode 100755 index 001299c408e..00000000000 --- a/src/workflows/integration/scgpt_leiden/integration_test.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -nextflow \ - run . \ - -main-script src/workflows/integration/scgpt_leiden/test.nf \ - -entry test_wf \ - -profile docker,no_publish \ - -c src/workflows/utils/labels_ci.config \ - -c src/workflows/utils/integration_tests.config - -nextflow \ - run . \ - -main-script src/workflows/integration/scgpt_leiden/test.nf \ - -entry test_wf2 \ - -profile docker,no_publish \ - -c src/workflows/utils/labels_ci.config \ - -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/scgpt_leiden/main.nf b/src/workflows/integration/scgpt_leiden/main.nf deleted file mode 100644 index ac5df109aa8..00000000000 --- a/src/workflows/integration/scgpt_leiden/main.nf +++ /dev/null @@ -1,138 +0,0 @@ -workflow run_wf { - - take: - input_ch - - main: - output_ch = input_ch - // Set aside the output for this workflow to avoid conflicts - | map {id, state -> - def new_state = state + ["workflow_output": state.output] - [id, new_state] - } - // Annotates the mudata object with highly variable genes. 
- | highly_variable_features_scanpy.run( - fromState: [ - "input": "input", - "layer": "input_layer", - "modality": "modality", - "n_top_features": "n_hvg", - "flavor": "hvg_flavor" - ], - args: ["var_name_filter": "scgpt_filter_with_hvg"], - toState: ["input": "output"] - ) - // Check whether the genes are part of the provided vocabulary. - | cross_check_genes.run( - fromState: [ - "input": "input", - "modality": "modality", - "vocab_file": "model_vocab", - "input_var_gene_names": "var_gene_names", - "output": "output", - "pad_token": "pad_token" - ], - args: [ - "var_input": "scgpt_filter_with_hvg", - "output_var_filter": "scgpt_cross_checked_genes" - ], - toState: [ - "input": "output" - ] - ) - // Bins the data into a fixed number of bins. - | binning.run( - fromState: [ - "input": "input", - "modality": "modality", - "input_layer": "input_layer", - "n_input_bins": "n_input_bins", - "output": "output" - ], - args: [ - "output_obsm_binned_counts": "binned_counts", - "var_input": "scgpt_cross_checked_genes" - ], - toState: [ - "input": "output" - ] - ) - // Padding and tokenization of gene count values. - | pad_tokenize.run( - fromState: [ - "input": "input", - "modality": "modality", - "model_vocab": "model_vocab", - "var_gene_names": "var_gene_names", - "pad_token": "pad_token", - "pad_value": "pad_value", - "max_seq_len": "max_seq_len", - "output": "output" - ], - args: [ - "input_obsm_binned_counts": "binned_counts", - "var_input": "scgpt_cross_checked_genes", - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask" - ], - toState: [ - "input": "output" - ] - ) - // Generation of cell embedings from the tokenized gene counts values. - | embedding.run( - fromState: [ - "input": "input", - "modality": "modality", - "model": "model", - "model_vocab": "model_vocab", - "model_config": "model_config", - "var_gene_names": "var_gene_names", - "obs_batch_label": "obs_batch_label", - "pad_token": "pad_token", - "pad_value": "pad_value", - "dsbn": "dsbn", - "batch_size": "batch_size", - "obsm_embeddings": "obsm_integrated", - "finetuned_checkpoints_key": "finetuned_checkpoints_key", - "output": "output" - ], - args: [ - "obsm_gene_tokens": "gene_id_tokens", - "obsm_tokenized_values": "values_tokenized", - "obsm_padding_mask": "padding_mask" - ], - toState: [ - "input": "output" - ] - ) - // Calculation of neighbors, leiden clustering and UMAP. 
- | neighbors_leiden_umap.run( - fromState: [ - "input": "input", - "obsm_input": "obsm_integrated", - "modality": "modality", - "uns_neighbors": "uns_neighbors", - "obsp_neighbor_distances": "obsp_neighbor_distances", - "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", - "output": "workflow_output", - "leiden_resolution": "leiden_resolution", - "obsm_umap": "obsm_integrated", - ], - toState: [ - "output": "output" - ], - args: [ - "uns_neighbors": "scGPT_integration_neighbors", - "obsp_neighbor_distances": "scGPT_integration_distances", - "obsp_neighbor_connectivities": "scGPT_integration_connectivities", - "obs_cluster": "scGPT_integration_leiden", - "obsm_umap": "X_scGPT_umap" - ] - ) - | setState(["output"]) - - emit: - output_ch -} \ No newline at end of file diff --git a/src/workflows/integration/scgpt_leiden/nextflow.config b/src/workflows/integration/scgpt_leiden/nextflow.config deleted file mode 100644 index 8108bc25e84..00000000000 --- a/src/workflows/integration/scgpt_leiden/nextflow.config +++ /dev/null @@ -1,10 +0,0 @@ -manifest { - nextflowVersion = '!>=20.12.1-edge' -} - -params { - rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() -} - -// include common settings -includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/scgpt_leiden/test.nf b/src/workflows/integration/scgpt_leiden/test.nf deleted file mode 100644 index 064cf4a5b49..00000000000 --- a/src/workflows/integration/scgpt_leiden/test.nf +++ /dev/null @@ -1,100 +0,0 @@ -nextflow.enable.dsl=2 - -include { scgpt_leiden } from params.rootDir + "/target/nextflow/workflows/integration/scgpt_leiden/main.nf" - -params.resources_test = params.rootDir + "/resources_test" - -workflow test_wf { - - resources_test = file(params.resources_test) - - output_ch = Channel.fromList([ - [ - id: "simple_execution_test", - input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: resources_test.resolve("scgpt/source/best_model.pt"), - model_config: resources_test.resolve("scgpt/source/args.json"), - model_vocab: resources_test.resolve("scgpt/source/vocab.json"), - input_layer: "log_normalized", - obs_batch_label: "sample", - n_hvg: 400, - seed: 1, - leiden_resolution: [1.0, 0.25] - ], - [ - id: "no_leiden_resolutions_test", - input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: resources_test.resolve("scgpt/source/best_model.pt"), - model_config: resources_test.resolve("scgpt/source/args.json"), - model_vocab: resources_test.resolve("scgpt/source/vocab.json"), - obs_batch_label: "sample", - n_hvg: 400, - seed: 1, - input_layer: "log_normalized", - leiden_resolution: [] - ] - ]) - | map{ state -> [state.id, state] } - | scgpt_leiden - | view { output -> - assert output.size() == 2 : "Outputs should contain two elements; [id, state]" - - // check id - def id = output[0] - assert id.endsWith("_test") - - // check output - def state = output[1] - assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. 
Found: ${state.output}" - - "Output: $output" - } - | toSortedList{a, b -> a[0] <=> b[0]} - | map { output_list -> - assert output_list.size() == 2 : "output channel should contain 2 events" - assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] - } -} - - -workflow test_wf2 { - - resources_test = file(params.resources_test) - - output_ch = Channel.fromList([ - [ - id: "test_output_arg", - input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), - model: resources_test.resolve("scgpt/source/best_model.pt"), - model_config: resources_test.resolve("scgpt/source/args.json"), - model_vocab: resources_test.resolve("scgpt/source/vocab.json"), - input_layer: "log_normalized", - obs_batch_label: "sample", - n_hvg: 400, - leiden_resolution: [1.0, 0.25], - output: "test.h5mu" - ], - ]) - | map{ state -> [state.id, state] } - | scgpt_leiden - | view { output -> - assert output.size() == 2 : "Outputs should contain two elements; [id, state]" - - // check output - def state = output[1] - assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith("test.h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" - - "Output: $output" - } - | toSortedList({a, b -> a[0] <=> b[0]}) - | map { output_list -> - assert output_list.size() == 1 : "output channel should contain 1 event" - assert output_list.collect{it[0]} == ["test_output_arg"] - } - } From 52f30ac09c452ba2aa23c83a476fa9f5b8465f70 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 16 Sep 2025 19:50:40 +0200 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3963eca7859..f7a4c92279f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ * `differential_expression/create_pseudobulks`: Removed functionality to filter psuedobulk samples based on number of aggregated samples threshold, as this functionality is now covered in `filter/delimit_count` (PR #1044). +* Deprecated all scGPT functionality (PR #1075). + ## NEW FUNCTIONALITY * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). From 8201cb5bf637938bef5ace9b3688dfd43bd34cc4 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 16 Sep 2025 19:53:01 +0200 Subject: [PATCH 3/4] remove test resources --- resources_test_scripts/scgpt.sh | 135 -------------------------------- 1 file changed, 135 deletions(-) delete mode 100755 resources_test_scripts/scgpt.sh diff --git a/resources_test_scripts/scgpt.sh b/resources_test_scripts/scgpt.sh deleted file mode 100755 index b8bf34f47a3..00000000000 --- a/resources_test_scripts/scgpt.sh +++ /dev/null @@ -1,135 +0,0 @@ -set -eo pipefail - -# ensure that the command below is run from the root of the repository -REPO_ROOT=$(git rev-parse --show-toplevel) -cd "$REPO_ROOT" - -# settings -ID=scgpt -OUT=resources_test/$ID - -# create foundational model directory -foundation_model_dir="$OUT/source" -mkdir -p "$foundation_model_dir" -export foundation_model_dir - -# create finetuned model directory -finetuned_model_dir="$OUT/finetuned_model" -mkdir -p "$finetuned_model_dir" -export finetuned_model_dir - -# install gdown if necessary -# Check whether gdown is available -if ! command -v gdown &> /dev/null; then - echo "This script requires gdown. 
Please make sure the binary is added to your PATH." - exit 1 -fi - -# install torch if necessary -# Check whether torch is available -if ! python -c "import torch"; then - echo "This script requires torch. Please make sure it is available in your python environment." - exit 1 -fi - -echo "> Downloading scGPT foundation model (full_human)" -# download foundational model files (full_human) -# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y -gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json" -gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json" -gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt" - -echo "> Converting to finetuned model format" -python < Downloading test resources" -# download test data -# https://drive.google.com/file/d/1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL/view?usp=drive_link -gdown '1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL' -O "${test_resources_dir}/Kim2020_Lung.h5ad" - -echo "> Converting to h5mu" -python < Subsetting datasets" -viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ - --number_of_observations 4000 - -rm "${test_resources_dir}/Kim2020_Lung.h5ad" -rm "${test_resources_dir}/Kim2020_Lung.h5mu" - -echo "> Preprocessing datasets" -nextflow \ - run . \ - -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \ - -profile docker \ - -c src/workflows/utils/labels_ci.config \ - --input "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ - --output "Kim2020_Lung_subset_preprocessed.h5mu" \ - --publish_dir "${test_resources_dir}" - -echo "> Filtering highly variable features" -viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ - --layer "log_normalized" \ - --var_name_filter "scgpt_filter_with_hvg" \ - --n_top_features 1200 \ - --flavor "cell_ranger" - -echo "> Running scGPT cross check genes" -viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ - --vocab_file "${foundation_model_dir}/vocab.json" \ - --var_input "scgpt_filter_with_hvg" \ - --output_var_filter "scgpt_cross_checked_genes" - -echo "> Running scGPT binning" -viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ - --input_layer "log_normalized" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ - --output_obsm_binned_counts "binned_counts" \ - --var_input "scgpt_cross_checked_genes" - -echo "> Running scGPT tokenizing" -viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ - --input_obsm_binned_counts "binned_counts" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ - --model_vocab "${foundation_model_dir}/vocab.json" \ - --var_input "scgpt_cross_checked_genes" \ - - -echo "> Removing unnecessary files in test resources dir" -find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete - -echo "> scGPT test resources are ready!" 
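Note: for readers tracing what the removed `pad_tokenize` component (invoked as the last step of the deleted test-resource script above) actually did, the step reduces to a handful of direct `scgpt` calls. A minimal sketch, condensed from the deleted `src/scgpt/pad_tokenize/script.py`; it assumes `scgpt==0.2.1` as pinned in the removed config, the paths and the `binned_counts` key are the removed defaults, and the special tokens (which render as empty strings in this diff) are written here as scGPT's `<pad>`, `<cls>` and `<eoc>`:

import mudata as mu
import numpy as np
from scipy.sparse import issparse
from scgpt.tokenizer import tokenize_and_pad_batch
from scgpt.tokenizer.gene_tokenizer import GeneVocab

# Assumes the AnnData has already been subset to vocabulary-checked genes
# (the removed component used a boolean .var mask for this).
adata = mu.read("Kim2020_Lung_subset_binned.h5mu").mod["rna"]

# Load the model vocabulary and register the special tokens.
vocab = GeneVocab.from_file("vocab.json")
for tok in ["<pad>", "<cls>", "<eoc>"]:
    if tok not in vocab:
        vocab.append_token(tok)
vocab.set_default_index(vocab["<pad>"])

# Binned counts (as produced by the removed binning component) and gene ids.
counts = adata.obsm["binned_counts"]
counts = counts.toarray() if issparse(counts) else counts
gene_ids = np.array(vocab(adata.var.index.astype(str).tolist()), dtype=int)

# Tokenize and pad; a <cls> token is prepended to every cell.
tokenized = tokenize_and_pad_batch(
    counts,
    gene_ids,
    max_len=adata.var.shape[0] + 1,
    vocab=vocab,
    pad_token="<pad>",
    pad_value=-2,
    append_cls=True,
    include_zero_gene=False,
    return_pt=True,
    mod_type=None,
    vocab_mod=None,
)

adata.obsm["gene_id_tokens"] = tokenized["genes"].numpy()
adata.obsm["values_tokenized"] = tokenized["values"].numpy()
adata.obsm["padding_mask"] = tokenized["genes"].eq(vocab["<pad>"]).numpy()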
From ee0f3f88b3675d95a4bb42e3dfdee5333b0f9f23 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Fri, 10 Oct 2025 09:38:55 +0200 Subject: [PATCH 4/4] update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7a4c92279f..c01567f5e57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,6 @@ * `differential_expression/create_pseudobulks`: Removed functionality to filter psuedobulk samples based on number of aggregated samples threshold, as this functionality is now covered in `filter/delimit_count` (PR #1044). -* Deprecated all scGPT functionality (PR #1075). - ## NEW FUNCTIONALITY * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). @@ -18,6 +16,8 @@ * `workflows/differential_expression/pseudobulk_deseq2`: Workflow for generating pseudobulk samples from single-cell data followed by DESeq2 differential expression analysis (PR #1044) +* Deprecated all scGPT functionality (PR #1075). + ## MINOR CHANGES * `transform/normalize_total`, `transform/clr`, `transform/log1p`: Add disk resource labels (PR #1073).
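Since this patch series removes all scGPT components, pipelines that relied on the deleted `cross_check_genes` step can approximate the vocabulary check directly. A minimal, hypothetical sketch using only the `GeneVocab` calls that appear in the deleted scripts (the `id_in_vocab` column name mirrors the removed components' default; file paths are placeholders):

import mudata as mu
from scgpt.tokenizer.gene_tokenizer import GeneVocab

# scGPT model vocabulary (vocab.json from the model directory).
vocab = GeneVocab.from_file("vocab.json")

mdata = mu.read("input.h5mu")
adata = mdata.mod["rna"]

# Flag genes present in the model vocabulary; downstream steps can subset
# on this boolean .var column, as the removed components did by default.
genes = adata.var.index.astype(str)
adata.var["id_in_vocab"] = [gene in vocab for gene in genes]

mdata.write("output.h5mu")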