cellannotation · hkir-dev · Jun 4, 2025 · Jun 2, 2025 · Jun 2, 2025
diff --git a/docs/cli.md b/docs/cli.md
@@ -125,6 +125,10 @@ cas anndata2cas --anndata path/to/anndata.h5ad --labelsets item1 item2 item3 --o
     names. The labelsets should be provided in order, starting from rank 0 (leaf nodes) and ascending to higher ranks.
 - `--output` : Output CAS file name (default: output.json).
 - `--hierarchy`: Flag indicating whether to include hierarchy in the output.
+- `--accession_columns`: List of columns in the AnnData obs that contain accession ID information.
+            This list should match the order and length of the labelsets argument.
+            If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.
+            Defaults to None.
 
 ## Convert ABC to CAS
 

diff --git a/src/cas/__main__.py b/src/cas/__main__.py
@@ -107,8 +107,9 @@ def main():
         labelsets = args.labelsets
         output_file_path = args.output
         include_hierarchy = args.hierarchy
+        accession_columns = args.accession_columns
 
-        anndata2cas(anndata_file_path, labelsets, output_file_path, include_hierarchy)
+        anndata2cas(anndata_file_path, labelsets, output_file_path, include_hierarchy, accession_columns)
     elif args.action == "abc2cas":
         args = parser.parse_args()
         cat_set_file_path = args.catset
@@ -390,6 +391,10 @@ def create_anndata2cas_operation_parser(subparsers):
 
     --output        : Output CAS file name (default: output.json).
     --hierarchy     : Flag indicating whether to include hierarchy in the output.
+    --accession_columns : List of columns in the AnnData obs that contain accession information.
+            This list should match the order and length of the labelsets argument.
+            If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.
+            Defaults to None.
 
 
     Usage Example:
@@ -427,6 +432,14 @@ def create_anndata2cas_operation_parser(subparsers):
         action="store_true",
         help="Include hierarchy in the output.",
     )
+    parser_anndata2cas.add_argument(
+        "--accession_columns",
+        nargs="+",
+        default=None,
+        help="An optional list of accession_id columns to populate cell_set_accession. "
+             "This list should match the order and length of the labelsets argument. "
+             "If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.",
+    )
 
 
 def create_abc2cas_operation_parser(subparsers):

diff --git a/src/cas/accession/base_accession_manager.py b/src/cas/accession/base_accession_manager.py
@@ -8,14 +8,15 @@ class BaseAccessionManager(metaclass=abc.ABCMeta):
 
     @abc.abstractmethod
     def generate_accession_id(
-        self, id_recommendation: str = None, labelset: str = None
+        self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None
     ) -> str:
         """
         Generates an auto-increment based accession id. If the recommended accession_id is available, uses it.
         Params:
             id_recommendation: accession id recommendation. Function uses this id if it is available,
             provides an auto-incremented id otherwise.
             labelset: Labelset name. If provided, uses it as a prefix to the accession id.
+            cellset_name: Name of the cell set for which the accession ID is being generated.
         Return: accession_id
         """
         pass
diff --git a/src/cas/accession/hash_accession_manager.py b/src/cas/accession/hash_accession_manager.py
@@ -18,7 +18,7 @@ def __init__(self, accession_prefix=None, digest_size=5):
         self.accession_ids = list()
 
     def generate_accession_id(
-        self, id_recommendation: str = None, cell_ids: List = None, labelset: str = None, suppress_warnings=False
+        self, id_recommendation: str = None, cell_ids: List = None, labelset: str = None, suppress_warnings=False, cellset_name: str = None
     ) -> str:
         """
         Generates a Blake2b hashing algorithm based hash for the given cell IDs.
@@ -28,6 +28,7 @@ def generate_accession_id(
             cell_ids: Cell IDs list. Algorithm sorts cell ids internally.
             labelset: Labelset name. If provided, uses it as a prefix to the accession id.
             suppress_warnings: If True, suppresses warnings.
+            cellset_name: this parameter is not utilized in this implementation.
         Return: accession_id
         """
         if id_recommendation and labelset and ":" not in id_recommendation:

diff --git a/src/cas/accession/incremental_accession_manager.py b/src/cas/accession/incremental_accession_manager.py
@@ -19,14 +19,15 @@ def __init__(self, accession_prefix=None, last_accession_id=0):
         self.accession_ids = list()
 
     def generate_accession_id(
-        self, id_recommendation: str = None, labelset: str = None
+        self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None
     ) -> str:
         """
         Generates an auto-increment based accession id. If the recommended accession_id is available, uses it.
         Params:
             id_recommendation: accession id recommendation. Function uses this id if it is available,
             provides an auto-incremented id otherwise.
             labelset: this parameter is not utilized in this implementation.
+            cellset_name: this parameter is not utilized in this implementation.
         Return: accession_id
         """
         if id_recommendation:

diff --git a/src/cas/accession/mapped_accession_manager.py b/src/cas/accession/mapped_accession_manager.py
@@ -0,0 +1,33 @@
+from cas.accession.base_accession_manager import BaseAccessionManager
+
+
+class MappedAccessionManager(BaseAccessionManager):
+    """
+    Predefined Accession ID generator based on a mapping of cell set names to accession IDs.
+    This accession manager is used when the accession IDs are already defined and mapped to specific cell sets.
+    """
+
+    def __init__(self, accession_map):
+        """
+        Initializer.
+        Params:
+            accession_map: map of cell set names to their corresponding accession IDs.
+            (To enable usage of same names across different labelsets, key is identified as labelset:cell_label).
+        """
+        self.accession_map = accession_map
+
+    def generate_accession_id(
+        self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None, **kwargs
+    ) -> str:
+        """
+        Returns the predefined accession id mapped to the "labelset:cellset_name" key. Raises ValueError if the key is not in the accession map.
+        Params:
+            id_recommendation: this parameter is not utilized in this implementation.
+            labelset: Labelset name. Combined with cellset_name to build the "labelset:cellset_name" lookup key.
+            cellset_name: Name of the cell set for which the accession ID is being generated.
+        Return: accession_id
+        """
+        if labelset + ':' + cellset_name in self.accession_map:
+            return self.accession_map[labelset + ':' +cellset_name]
+        else:
+            raise ValueError(f"Cell set name '{labelset}:{cellset_name}' not found in the accession map.")
diff --git a/src/cas/anndata_to_cas.py b/src/cas/anndata_to_cas.py
@@ -9,6 +9,7 @@
     calculate_labelset,
     generate_parent_cell_lookup,
     get_authors_from_doi,
+    create_accession_mapping
 )
 
 
@@ -17,6 +18,7 @@ def anndata2cas(
     labelsets: List[str],
     output_file_path: str,
     include_hierarchy: bool,
+    accession_columns: List[str] = None,
 ):
     """
     Convert an AnnData file to Cell Annotation Schema (CAS) JSON.
@@ -28,17 +30,23 @@ def anndata2cas(
         to higher ranks.
         output_file_path (str): Output CAS file name.
         include_hierarchy (bool): Flag indicating whether to include hierarchy in the output.
+        accession_columns (List[str], optional): List of columns in the AnnData obs that contain accession information.
+            This list must match the order and length of labelsets. If provided, these columns will be used to populate the 'cell_set_accession' field in the CAS annotations.
+            Otherwise, accession IDs will be automatically generated using a hash of the cells in each cell set.
+            Defaults to None.
     """
 
     anndata = read_anndata_file(anndata_file_path)
 
     labelset_dict = calculate_labelset(anndata.obs, labelsets)
 
+    accessions_mapping = create_accession_mapping(anndata.obs, labelsets, accession_columns)
+
     cas = generate_cas_metadata(dict(anndata.uns))
 
     add_labelsets_to_cas(cas, labelset_dict)
 
-    parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict)
+    parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict, accessions_mapping)
 
     add_annotations_to_cas(cas, labelset_dict, parent_cell_look_up)
 

diff --git a/src/cas/utils/conversion_utils.py b/src/cas/utils/conversion_utils.py
@@ -11,6 +11,7 @@
 from cas_schema import schemas
 
 from cas.accession.hash_accession_manager import HashAccessionManager
+from cas.accession.mapped_accession_manager import MappedAccessionManager
 from cas.dataset_retrieval.dataset_retriever import DatasetRetriever
 from cas.file_utils import get_cas_schema_names
 
@@ -201,7 +202,7 @@ def collect_parent_cell_ids(cas: Dict[str, Any]) -> Dict[str, Set]:
     return parent_cell_ids
 
 
-def generate_parent_cell_lookup(anndata, labelset_dict):
+def generate_parent_cell_lookup(anndata, labelset_dict, accessions_mapping: Dict[str, str] = None,):
     """
     Generates a lookup dictionary mapping cell labels to various metadata, including cell IDs, rank,
     and cell ontology terms. This function is designed to precompute the lookup information needed for
@@ -212,13 +213,18 @@ def generate_parent_cell_lookup(anndata, labelset_dict):
                               including metadata in anndata.obs.
         labelset_dict (Dict[str, Any]): A dictionary where keys are labelset names and values
                                         are dictionaries containing members and their ranks.
+        accessions_mapping (Dict[str, str], optional): Mapping of cellset names to accession IDs.
+        (To enable usage of same names across different labelsets, key is identified as labelset:cell_label).
 
     Returns:
         Dict[str, Any]: A dictionary where each key is a cell label and each value is another
                         dictionary containing keys for 'cell_ids' (a set of cell IDs associated
                         with the label), 'rank', 'cell_ontology_term_id', and 'cell_ontology_term'.
     """
-    accession_manager = HashAccessionManager()
+    if accessions_mapping is None:
+        accession_manager = HashAccessionManager()
+    else:
+        accession_manager = MappedAccessionManager(accession_map=accessions_mapping)
     parent_cell_look_up = {}
     for k, v in labelset_dict.items():
         for label in v["members"]:
@@ -227,7 +233,7 @@ def generate_parent_cell_lookup(anndata, labelset_dict):
             )
             cell_ids = get_cell_ids(anndata.obs, k, label)
             cell_set_accession = accession_manager.generate_accession_id(
-                cell_ids=cell_ids, labelset=k
+                cell_ids=cell_ids, labelset=k, cellset_name=label
             )
 
             if label in parent_cell_look_up:
@@ -326,7 +332,6 @@ def add_parent_hierarchy_to_annotations(
             # Add parent data to the annotation
             annotation.update(
                 {
-                    "parent_cell_set_name": parent,
                     "parent_cell_set_accession": p_accession,
                 }
             )
@@ -483,3 +488,31 @@ def retrieve_schema(schema_name):
     with schema_file.open("rt") as f:
         schema = json.loads(f.read())
     return schema
+
+
+def create_accession_mapping(adata_obs: pd.DataFrame, labelsets: list, accession_columns: list) -> Optional[Dict[str, str]]:
+    """
+    Creates a mapping of cellset names to accession IDs based on the provided labelsets and accession columns.
+    Args:
+        adata_obs: The observations DataFrame (`obs`) of an AnnData object containing the dataset.
+        labelsets: List of labelset names to be used for mapping.
+        accession_columns: List of columns in the AnnData obs that contain accession information.
+
+    Returns: Map of cellset names to accession IDs, where keys are formatted as "labelset:cell_label", or None if accession_columns is empty or None.
+    """
+    if accession_columns:
+        if len(labelsets) != len(accession_columns):
+            raise ValueError("The labelsets and accession_columns lists must have the same length.")
+
+        mapping = {}
+        for labelset, acc_col in zip(labelsets, accession_columns):
+            # Group the obs by the labelset column and get unique values from the corresponding accession column.
+            groups = adata_obs.groupby(labelset)[acc_col].unique()
+            for cellset_name, acc_vals in groups.items():
+                if len(acc_vals) != 1:
+                    raise ValueError(f"Non one-to-one mapping for '{labelset}' value '{cellset_name}'.")
+                # Create a combined key to enable usage of same names across different labelsets
+                mapping[f"{labelset}:{cellset_name}"] = acc_vals[0]
+        return mapping
+    else:
+        return None
diff --git a/src/test/conversion_utils_test.py b/src/test/conversion_utils_test.py
@@ -184,7 +184,7 @@ def test_add_parent_hierarchy(self):
                 "cell_ontology_term_id": "CL:1234567",
                 "labelset": "labelset1",
                 "parent_cell_set_accession": "P_123",
-                "parent_cell_set_name": "P",
+                # NOTE: "parent_cell_set_name" is no longer added by add_parent_hierarchy_to_annotations.
             }
         ]
 

diff --git a/src/test/spreadsheet_to_cas_test.py b/src/test/spreadsheet_to_cas_test.py
@@ -228,7 +228,7 @@ def test_spreadsheet2cas(self, mock_read_anndata_file, mock_download_source_h5ad
 
             self.assertEqual(len(json_data), 9)
             self.assertEqual(len(json_data["annotations"]), 8)
-            self.assertEqual(len(json_data["annotations"][0]), 7)
+            self.assertEqual(len(json_data["annotations"][0]), 6)
             self.assertEqual(len(json_data["labelsets"]), 2)
         finally:
             # Remove the JSON file after the test