diff --git a/docs/cli.md b/docs/cli.md index 2c73847..2f9bd7a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -125,6 +125,10 @@ cas anndata2cas --anndata path/to/anndata.h5ad --labelsets item1 item2 item3 --o names. The labelsets should be provided in order, starting from rank 0 (leaf nodes) and ascending to higher ranks. - `--output` : Output CAS file name (default: output.json). - `--hierarchy`: Flag indicating whether to include hierarchy in the output. +- `--accession_columns`: List of columns in the AnnData obs that contain accession ID information. + This list should match the order and length of the labelsets argument. + If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set. + Defaults to None. ## Convert ABC to CAS diff --git a/src/cas/__main__.py b/src/cas/__main__.py index e937adb..362ef2e 100644 --- a/src/cas/__main__.py +++ b/src/cas/__main__.py @@ -107,8 +107,9 @@ def main(): labelsets = args.labelsets output_file_path = args.output include_hierarchy = args.hierarchy + accession_columns = args.accession_columns - anndata2cas(anndata_file_path, labelsets, output_file_path, include_hierarchy) + anndata2cas(anndata_file_path, labelsets, output_file_path, include_hierarchy, accession_columns) elif args.action == "abc2cas": args = parser.parse_args() cat_set_file_path = args.catset @@ -390,6 +391,10 @@ def create_anndata2cas_operation_parser(subparsers): --output : Output CAS file name (default: output.json). --hierarchy : Flag indicating whether to include hierarchy in the output. + --accession_columns : List of columns in the AnnData obs that contain accession information. + This list should match the order and length of the labelsets argument. + If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set. + Defaults to None. 
Usage Example: @@ -427,6 +432,14 @@ def create_anndata2cas_operation_parser(subparsers): action="store_true", help="Include hierarchy in the output.", ) + parser_anndata2cas.add_argument( + "--accession_columns", + nargs="+", + default=None, + help="An optional list of accession_id columns to populate cell_set_accession. " + "This list should match the order and length of the labelsets argument. " + "If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.", + ) def create_abc2cas_operation_parser(subparsers): diff --git a/src/cas/accession/base_accession_manager.py b/src/cas/accession/base_accession_manager.py index cf55c5d..31cced1 100644 --- a/src/cas/accession/base_accession_manager.py +++ b/src/cas/accession/base_accession_manager.py @@ -8,7 +8,7 @@ class BaseAccessionManager(metaclass=abc.ABCMeta): @abc.abstractmethod def generate_accession_id( - self, id_recommendation: str = None, labelset: str = None + self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None ) -> str: """ Generates an auto-increment based accession id. If the recommended accession_id is available, uses it. @@ -16,6 +16,7 @@ def generate_accession_id( id_recommendation: accession id recommendation. Function uses this id if it is available, provides an auto-incremented id otherwise. labelset: Labelset name. If provided, uses it as a prefix to the accession id. + cellset_name: Name of the cell set for which the accession ID is being generated. 
Return: accession_id """ pass diff --git a/src/cas/accession/hash_accession_manager.py b/src/cas/accession/hash_accession_manager.py index b3dcce0..55267c8 100644 --- a/src/cas/accession/hash_accession_manager.py +++ b/src/cas/accession/hash_accession_manager.py @@ -18,7 +18,7 @@ def __init__(self, accession_prefix=None, digest_size=5): self.accession_ids = list() def generate_accession_id( - self, id_recommendation: str = None, cell_ids: List = None, labelset: str = None, suppress_warnings=False + self, id_recommendation: str = None, cell_ids: List = None, labelset: str = None, suppress_warnings=False, cellset_name: str = None ) -> str: """ Generates a Blake2b hashing algorithm based hash for the given cell IDs. @@ -28,6 +28,7 @@ def generate_accession_id( cell_ids: Cell IDs list. Algorithm sorts cell ids internally. labelset: Labelset name. If provided, uses it as a prefix to the accession id. suppress_warnings: If True, suppresses warnings. + cellset_name: this parameter is not utilized in this implementation. Return: accession_id """ if id_recommendation and labelset and ":" not in id_recommendation: diff --git a/src/cas/accession/incremental_accession_manager.py b/src/cas/accession/incremental_accession_manager.py index 6834055..821083c 100644 --- a/src/cas/accession/incremental_accession_manager.py +++ b/src/cas/accession/incremental_accession_manager.py @@ -19,7 +19,7 @@ def __init__(self, accession_prefix=None, last_accession_id=0): self.accession_ids = list() def generate_accession_id( - self, id_recommendation: str = None, labelset: str = None + self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None ) -> str: """ Generates an auto-increment based accession id. If the recommended accession_id is available, uses it. @@ -27,6 +27,7 @@ def generate_accession_id( id_recommendation: accession id recommendation. Function uses this id if it is available, provides an auto-incremented id otherwise. 
labelset: this parameter is not utilized in this implementation. + cellset_name: this parameter is not utilized in this implementation. Return: accession_id """ if id_recommendation:
diff --git a/src/cas/accession/mapped_accession_manager.py b/src/cas/accession/mapped_accession_manager.py
new file mode 100644
index 0000000..7b35602
--- /dev/null
+++ b/src/cas/accession/mapped_accession_manager.py
@@ -0,0 +1,37 @@
+from cas.accession.base_accession_manager import BaseAccessionManager
+
+
+class MappedAccessionManager(BaseAccessionManager):
+    """
+    Predefined Accession ID provider based on a mapping of cell set names to accession IDs.
+    This accession manager is used when the accession IDs are already defined and mapped to specific cell sets.
+    """
+
+    def __init__(self, accession_map):
+        """
+        Initializer.
+        Params:
+            accession_map: map of cell set names to their corresponding accession IDs.
+            (To enable usage of same names across different labelsets, key is identified as labelset:cell_label).
+        """
+        self.accession_map = accession_map
+
+    def generate_accession_id(
+        self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None, **kwargs
+    ) -> str:
+        """
+        Returns the predefined accession id that the accession map holds for the given cell set.
+        Params:
+            id_recommendation: this parameter is not utilized in this implementation.
+            labelset: Labelset name. Used as the prefix of the lookup key (labelset:cellset_name).
+            cellset_name: Name of the cell set for which the accession ID is being looked up.
+            kwargs: additional keyword arguments (e.g. cell_ids) are accepted and ignored.
+        Return: accession_id
+        Raises: ValueError if the labelset:cellset_name key is not present in the accession map.
+        """
+        # Build the key with an f-string, the same way create_accession_mapping builds the map keys,
+        # so None or non-string labels end in the ValueError below instead of a TypeError on `+`.
+        key = f"{labelset}:{cellset_name}"
+        try:
+            return self.accession_map[key]
+        except KeyError:
+            raise ValueError(f"Cell set name '{key}' not found in the accession map.") from None
diff --git a/src/cas/anndata_to_cas.py b/src/cas/anndata_to_cas.py index d0ba9be..08bfcd6 100644 --- a/src/cas/anndata_to_cas.py +++ b/src/cas/anndata_to_cas.py @@ -9,6 +9,7 @@ calculate_labelset, generate_parent_cell_lookup, get_authors_from_doi, + create_accession_mapping ) @@ -17,6 +18,7 @@ def anndata2cas( labelsets: List[str], output_file_path: str, include_hierarchy: bool, + accession_columns: List[str] = None, ): """ Convert an AnnData file to Cell Annotation Schema (CAS) JSON. @@ -28,17 +30,23 @@ def anndata2cas( to higher ranks. output_file_path (str): Output CAS file name. include_hierarchy (bool): Flag indicating whether to include hierarchy in the output. + accession_columns (List[str], optional): List of columns in the AnnData obs that contain accession information. + If provided, these columns will be used to populate the 'cell_set_accession' field in the CAS annotations. + Otherwise, accession IDs will be automatically generated using a hash of the cells in each cell set. + Defaults to None.
""" anndata = read_anndata_file(anndata_file_path) labelset_dict = calculate_labelset(anndata.obs, labelsets) + accessions_mapping = create_accession_mapping(anndata.obs, labelsets, accession_columns) + cas = generate_cas_metadata(dict(anndata.uns)) add_labelsets_to_cas(cas, labelset_dict) - parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict) + parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict, accessions_mapping) add_annotations_to_cas(cas, labelset_dict, parent_cell_look_up) diff --git a/src/cas/utils/conversion_utils.py b/src/cas/utils/conversion_utils.py index d5460e8..bf59667 100644 --- a/src/cas/utils/conversion_utils.py +++ b/src/cas/utils/conversion_utils.py @@ -11,6 +11,7 @@ from cas_schema import schemas from cas.accession.hash_accession_manager import HashAccessionManager +from cas.accession.mapped_accession_manager import MappedAccessionManager from cas.dataset_retrieval.dataset_retriever import DatasetRetriever from cas.file_utils import get_cas_schema_names @@ -201,7 +202,7 @@ def collect_parent_cell_ids(cas: Dict[str, Any]) -> Dict[str, Set]: return parent_cell_ids -def generate_parent_cell_lookup(anndata, labelset_dict): +def generate_parent_cell_lookup(anndata, labelset_dict, accessions_mapping: Dict[str, str] = None,): """ Generates a lookup dictionary mapping cell labels to various metadata, including cell IDs, rank, and cell ontology terms. This function is designed to precompute the lookup information needed for @@ -212,13 +213,18 @@ def generate_parent_cell_lookup(anndata, labelset_dict): including metadata in anndata.obs. labelset_dict (Dict[str, Any]): A dictionary where keys are labelset names and values are dictionaries containing members and their ranks. + accessions_mapping (Dict[str, str], optional): Mapping of cellset names to accession IDs. + (To enable usage of same names accross different labelsets, key is identified as labelset:cell_label). 
Returns: Dict[str, Any]: A dictionary where each key is a cell label and each value is another dictionary containing keys for 'cell_ids' (a set of cell IDs associated with the label), 'rank', 'cell_ontology_term_id', and 'cell_ontology_term'. """ - accession_manager = HashAccessionManager() + if accessions_mapping is None: + accession_manager = HashAccessionManager() + else: + accession_manager = MappedAccessionManager(accession_map=accessions_mapping) parent_cell_look_up = {} for k, v in labelset_dict.items(): for label in v["members"]: @@ -227,7 +233,7 @@ def generate_parent_cell_lookup(anndata, labelset_dict): ) cell_ids = get_cell_ids(anndata.obs, k, label) cell_set_accession = accession_manager.generate_accession_id( - cell_ids=cell_ids, labelset=k + cell_ids=cell_ids, labelset=k, cellset_name=label ) if label in parent_cell_look_up: @@ -326,7 +332,6 @@ def add_parent_hierarchy_to_annotations( # Add parent data to the annotation annotation.update( { - "parent_cell_set_name": parent, "parent_cell_set_accession": p_accession, } )
@@ -483,3 +488,37 @@ def retrieve_schema(schema_name):
     with schema_file.open("rt") as f:
         schema = json.loads(f.read())
     return schema
+
+
+def create_accession_mapping(adata_obs: pd.DataFrame, labelsets: list, accession_columns: list) -> Optional[Dict[str, str]]:
+    """
+    Creates a mapping of cellset names to accession IDs based on the provided labelsets and accession columns.
+    Args:
+        adata_obs: The observations DataFrame (`obs`) of an AnnData object containing the dataset.
+        labelsets: List of labelset names to be used for mapping.
+        accession_columns: List of columns in the AnnData obs that contain accession information.
+            Must have the same length and order as labelsets. If None or empty, no mapping is created.
+
+    Returns: Map of cellset names to accession IDs, where keys are formatted as "labelset:cell_label",
+        or None if no accession columns are provided.
+
+    Raises:
+        ValueError: If the lists differ in length, or a cell set does not map to exactly one accession ID.
+    """
+    if not accession_columns:
+        return None
+    if len(labelsets) != len(accession_columns):
+        raise ValueError("The labelsets and accession_columns lists must have the same length.")
+
+    mapping = {}
+    for labelset, acc_col in zip(labelsets, accession_columns):
+        # Group the obs by the labelset column and get unique values from the corresponding accession column.
+        # observed=True skips categories of a categorical labelset column that no cell uses; with
+        # observed=False they can show up as empty groups and be rejected by the one-to-one check below.
+        groups = adata_obs.groupby(labelset, observed=True)[acc_col].unique()
+        for cellset_name, acc_vals in groups.items():
+            if len(acc_vals) != 1:
+                raise ValueError(f"Non one-to-one mapping for '{labelset}' value '{cellset_name}'.")
+            # Create a combined key to enable usage of same names across different labelsets
+            mapping[f"{labelset}:{cellset_name}"] = acc_vals[0]
+    return mapping
diff --git a/src/test/conversion_utils_test.py b/src/test/conversion_utils_test.py index 1f88642..a6af102 100644 --- a/src/test/conversion_utils_test.py +++ b/src/test/conversion_utils_test.py @@ -184,7 +184,7 @@ def test_add_parent_hierarchy(self): "cell_ontology_term_id": "CL:1234567", "labelset": "labelset1", "parent_cell_set_accession": "P_123", - "parent_cell_set_name": "P", + # "parent_cell_set_name": "P", } ] diff --git a/src/test/spreadsheet_to_cas_test.py b/src/test/spreadsheet_to_cas_test.py index 0b74862..0bae1c8 100644 --- a/src/test/spreadsheet_to_cas_test.py +++ b/src/test/spreadsheet_to_cas_test.py @@ -228,7 +228,7 @@ def test_spreadsheet2cas(self, mock_read_anndata_file, mock_download_source_h5ad self.assertEqual(len(json_data), 9) self.assertEqual(len(json_data["annotations"]), 8) - self.assertEqual(len(json_data["annotations"][0]), 7) + self.assertEqual(len(json_data["annotations"][0]), 6) self.assertEqual(len(json_data["labelsets"]), 2) finally: # Remove the JSON file after the test