Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ cas anndata2cas --anndata path/to/anndata.h5ad --labelsets item1 item2 item3 --o
names. The labelsets should be provided in order, starting from rank 0 (leaf nodes) and ascending to higher ranks.
- `--output` : Output CAS file name (default: output.json).
- `--hierarchy`: Flag indicating whether to include hierarchy in the output.
- `--accession_columns`: List of columns in the AnnData obs that contain accession ID information.
This list should match the order and length of the labelsets argument.
If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.
Defaults to None.

## Convert ABC to CAS

Expand Down
15 changes: 14 additions & 1 deletion src/cas/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,9 @@ def main():
labelsets = args.labelsets
output_file_path = args.output
include_hierarchy = args.hierarchy
accession_columns = args.accession_columns

anndata2cas(anndata_file_path, labelsets, output_file_path, include_hierarchy)
anndata2cas(anndata_file_path, labelsets, output_file_path, include_hierarchy, accession_columns)
elif args.action == "abc2cas":
args = parser.parse_args()
cat_set_file_path = args.catset
Expand Down Expand Up @@ -390,6 +391,10 @@ def create_anndata2cas_operation_parser(subparsers):

--output : Output CAS file name (default: output.json).
--hierarchy : Flag indicating whether to include hierarchy in the output.
--accession_columns : List of columns in the AnnData obs that contain accession information.
This list should match the order and length of the labelsets argument.
If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.
Defaults to None.


Usage Example:
Expand Down Expand Up @@ -427,6 +432,14 @@ def create_anndata2cas_operation_parser(subparsers):
action="store_true",
help="Include hierarchy in the output.",
)
parser_anndata2cas.add_argument(
"--accession_columns",
nargs="+",
default=None,
help="An optional list of accession_id columns to populate cell_set_accession. "
"This list should match the order and length of the labelsets argument. "
"If not provided, accession IDs will be automatically generated using a hash of the cells in each cell set.",
)


def create_abc2cas_operation_parser(subparsers):
Expand Down
3 changes: 2 additions & 1 deletion src/cas/accession/base_accession_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@ class BaseAccessionManager(metaclass=abc.ABCMeta):

@abc.abstractmethod
def generate_accession_id(
self, id_recommendation: str = None, labelset: str = None
self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None
) -> str:
"""
Generates an auto-increment based accession id. If the recommended accession_id is available, uses it.
Params:
id_recommendation: accession id recommendation. Function uses this id if it is available,
provides an auto-incremented id otherwise.
labelset: Labelset name. If provided, uses it as a prefix to the accession id.
cellset_name: Name of the cell set for which the accession ID is being generated.
Return: accession_id
"""
pass
3 changes: 2 additions & 1 deletion src/cas/accession/hash_accession_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self, accession_prefix=None, digest_size=5):
self.accession_ids = list()

def generate_accession_id(
self, id_recommendation: str = None, cell_ids: List = None, labelset: str = None, suppress_warnings=False
self, id_recommendation: str = None, cell_ids: List = None, labelset: str = None, suppress_warnings=False, cellset_name: str = None
) -> str:
"""
Generates a Blake2b hashing algorithm based hash for the given cell IDs.
Expand All @@ -28,6 +28,7 @@ def generate_accession_id(
cell_ids: Cell IDs list. Algorithm sorts cell ids internally.
labelset: Labelset name. If provided, uses it as a prefix to the accession id.
suppress_warnings: If True, suppresses warnings.
cellset_name: this parameter is not utilized in this implementation.
Return: accession_id
"""
if id_recommendation and labelset and ":" not in id_recommendation:
Expand Down
3 changes: 2 additions & 1 deletion src/cas/accession/incremental_accession_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ def __init__(self, accession_prefix=None, last_accession_id=0):
self.accession_ids = list()

def generate_accession_id(
self, id_recommendation: str = None, labelset: str = None
self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None
) -> str:
"""
Generates an auto-increment based accession id. If the recommended accession_id is available, uses it.
Params:
id_recommendation: accession id recommendation. Function uses this id if it is available,
provides an auto-incremented id otherwise.
labelset: this parameter is not utilized in this implementation.
cellset_name: this parameter is not utilized in this implementation.
Return: accession_id
"""
if id_recommendation:
Expand Down
33 changes: 33 additions & 0 deletions src/cas/accession/mapped_accession_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from cas.accession.base_accession_manager import BaseAccessionManager


class MappedAccessionManager(BaseAccessionManager):
    """
    Accession ID provider backed by a predefined mapping of cell sets to accession IDs.
    This accession manager is used when the accession IDs are already defined and mapped to
    specific cell sets, so IDs are looked up rather than generated.
    """

    def __init__(self, accession_map):
        """
        Initializer.
        Params:
            accession_map: map of cell set names to their corresponding accession IDs.
                To allow the same name in different labelsets, each key has the form
                "labelset:cell_label".
        """
        self.accession_map = accession_map

    def generate_accession_id(
        self, id_recommendation: str = None, labelset: str = None, cellset_name: str = None, **kwargs
    ) -> str:
        """
        Looks up the predefined accession id of a cell set in the accession map.
        Params:
            id_recommendation: this parameter is not utilized in this implementation.
            labelset: Labelset name. Forms the first part of the "labelset:cell_label" lookup key.
            cellset_name: Name of the cell set for which the accession ID is being looked up.
                Forms the second part of the lookup key.
            kwargs: extra keyword arguments accepted by other accession managers
                (e.g. cell_ids). They are ignored by this implementation.
        Return: accession_id
        Raises:
            ValueError: if labelset or cellset_name is missing, or if the
                "labelset:cellset_name" key is not present in the accession map.
        """
        if labelset is None or cellset_name is None:
            raise ValueError(
                "Both 'labelset' and 'cellset_name' are required to look up a mapped accession id."
            )

        # Build the key with an f-string (not '+') so non-string labels are formatted
        # the same way as when the accession map keys were created.
        key = f"{labelset}:{cellset_name}"
        try:
            return self.accession_map[key]
        except KeyError:
            raise ValueError(f"Cell set name '{key}' not found in the accession map.") from None
10 changes: 9 additions & 1 deletion src/cas/anndata_to_cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
calculate_labelset,
generate_parent_cell_lookup,
get_authors_from_doi,
create_accession_mapping
)


Expand All @@ -17,6 +18,7 @@ def anndata2cas(
labelsets: List[str],
output_file_path: str,
include_hierarchy: bool,
accession_columns: List[str] = None,
):
"""
Convert an AnnData file to Cell Annotation Schema (CAS) JSON.
Expand All @@ -28,17 +30,23 @@ def anndata2cas(
to higher ranks.
output_file_path (str): Output CAS file name.
include_hierarchy (bool): Flag indicating whether to include hierarchy in the output.
accession_columns (List[str], optional): List of columns in the AnnData obs that contain accession information.
If provided, these columns will be used to populate the 'cell_set_accession' field in the CAS annotations.
Otherwise, accession IDs will be automatically generated using a hash of the cells in each cell set.
Defaults to None.
"""

anndata = read_anndata_file(anndata_file_path)

labelset_dict = calculate_labelset(anndata.obs, labelsets)

accessions_mapping = create_accession_mapping(anndata.obs, labelsets, accession_columns)

cas = generate_cas_metadata(dict(anndata.uns))

add_labelsets_to_cas(cas, labelset_dict)

parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict)
parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict, accessions_mapping)

add_annotations_to_cas(cas, labelset_dict, parent_cell_look_up)

Expand Down
41 changes: 37 additions & 4 deletions src/cas/utils/conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from cas_schema import schemas

from cas.accession.hash_accession_manager import HashAccessionManager
from cas.accession.mapped_accession_manager import MappedAccessionManager
from cas.dataset_retrieval.dataset_retriever import DatasetRetriever
from cas.file_utils import get_cas_schema_names

Expand Down Expand Up @@ -201,7 +202,7 @@ def collect_parent_cell_ids(cas: Dict[str, Any]) -> Dict[str, Set]:
return parent_cell_ids


def generate_parent_cell_lookup(anndata, labelset_dict):
def generate_parent_cell_lookup(anndata, labelset_dict, accessions_mapping: Dict[str, str] = None,):
"""
Generates a lookup dictionary mapping cell labels to various metadata, including cell IDs, rank,
and cell ontology terms. This function is designed to precompute the lookup information needed for
Expand All @@ -212,13 +213,18 @@ def generate_parent_cell_lookup(anndata, labelset_dict):
including metadata in anndata.obs.
labelset_dict (Dict[str, Any]): A dictionary where keys are labelset names and values
are dictionaries containing members and their ranks.
accessions_mapping (Dict[str, str], optional): Mapping of cellset names to accession IDs.
(To allow the same name to be used across different labelsets, each key has the form labelset:cell_label).

Returns:
Dict[str, Any]: A dictionary where each key is a cell label and each value is another
dictionary containing keys for 'cell_ids' (a set of cell IDs associated
with the label), 'rank', 'cell_ontology_term_id', and 'cell_ontology_term'.
"""
accession_manager = HashAccessionManager()
if accessions_mapping is None:
accession_manager = HashAccessionManager()
else:
accession_manager = MappedAccessionManager(accession_map=accessions_mapping)
parent_cell_look_up = {}
for k, v in labelset_dict.items():
for label in v["members"]:
Expand All @@ -227,7 +233,7 @@ def generate_parent_cell_lookup(anndata, labelset_dict):
)
cell_ids = get_cell_ids(anndata.obs, k, label)
cell_set_accession = accession_manager.generate_accession_id(
cell_ids=cell_ids, labelset=k
cell_ids=cell_ids, labelset=k, cellset_name=label
)

if label in parent_cell_look_up:
Expand Down Expand Up @@ -326,7 +332,6 @@ def add_parent_hierarchy_to_annotations(
# Add parent data to the annotation
annotation.update(
{
"parent_cell_set_name": parent,
"parent_cell_set_accession": p_accession,
}
)
Expand Down Expand Up @@ -483,3 +488,31 @@ def retrieve_schema(schema_name):
with schema_file.open("rt") as f:
schema = json.loads(f.read())
return schema


def create_accession_mapping(adata_obs: pd.DataFrame, labelsets: list, accession_columns: list) -> Optional[Dict[str, str]]:
    """
    Creates a mapping of cellset names to accession IDs based on the provided labelsets and accession columns.
    Args:
        adata_obs: The observations DataFrame (`obs`) of an AnnData object containing the dataset.
        labelsets: List of labelset names (obs columns) to be used for mapping.
        accession_columns: List of columns in the AnnData obs that contain accession information.
            Must be in the same order and of the same length as `labelsets`.

    Returns: Map of cellset names to accession IDs, where keys are formatted as "labelset:cell_label".
        None if `accession_columns` is None or empty.

    Raises:
        ValueError: if `labelsets` and `accession_columns` differ in length, or if a cell set
            is not associated with exactly one accession value.
    """
    if not accession_columns:
        return None

    if len(labelsets) != len(accession_columns):
        raise ValueError("The labelsets and accession_columns lists must have the same length.")

    mapping = {}
    for labelset, acc_col in zip(labelsets, accession_columns):
        # Group the obs by the labelset column and get unique values from the corresponding accession column.
        # observed=True: for categorical columns, skip categories that have no cells. Otherwise they
        # produce empty groups and are wrongly reported as a non one-to-one mapping.
        groups = adata_obs.groupby(labelset, observed=True)[acc_col].unique()
        for cellset_name, acc_vals in groups.items():
            if len(acc_vals) != 1:
                raise ValueError(f"Non one-to-one mapping for '{labelset}' value '{cellset_name}'.")
            # Create a combined key to enable usage of same names across different labelsets
            mapping[f"{labelset}:{cellset_name}"] = acc_vals[0]
    return mapping
2 changes: 1 addition & 1 deletion src/test/conversion_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def test_add_parent_hierarchy(self):
"cell_ontology_term_id": "CL:1234567",
"labelset": "labelset1",
"parent_cell_set_accession": "P_123",
"parent_cell_set_name": "P",
# "parent_cell_set_name": "P",
}
]

Expand Down
2 changes: 1 addition & 1 deletion src/test/spreadsheet_to_cas_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def test_spreadsheet2cas(self, mock_read_anndata_file, mock_download_source_h5ad

self.assertEqual(len(json_data), 9)
self.assertEqual(len(json_data["annotations"]), 8)
self.assertEqual(len(json_data["annotations"][0]), 7)
self.assertEqual(len(json_data["annotations"][0]), 6)
self.assertEqual(len(json_data["labelsets"]), 2)
finally:
# Remove the JSON file after the test
Expand Down