minor changes to components
rcannood committed Aug 29, 2024
1 parent 441f50c commit 497ed55
Showing 2 changed files with 68 additions and 41 deletions.
17 changes: 16 additions & 1 deletion scripts/create_test_resources.sh
@@ -8,6 +8,7 @@ cd "$REPO_ROOT"
 
 set -e
 
+###################################################################################
 DATASET_ID="10x_xenium/2023_10x_mouse_brain_xenium"
 TMP_DIR="temp/datasets/$DATASET_ID"
 OUT_DIR="resources_test/common/2023_10x_mouse_brain_xenium"
@@ -76,11 +77,25 @@ viash run src/data_processors/crop_region/config.vsh.yaml -- \
   --min_y 10000 \
   --max_y 12000
 
+
+###################################################################################
 DATASET_ID="allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2"
 TMP_DIR="temp/datasets/$DATASET_ID"
 OUT_DIR="resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2"
 
+
 # generate sc reference
 VIASH_TEMP=/tmp/allen_brain_cell_atlas \
 viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml -- \
-  --output "$OUT_DIR/sc_reference.h5ad" --regions "OLF;TH"
+  --output "$TMP_DIR/tmp_sc_reference.h5ad" --regions "OLF;TH"
+
+viash run src/data_processors/subset_reference/config.vsh.yaml -- \
+  --input "$TMP_DIR/tmp_sc_reference.h5ad" \
+  --output "$OUT_DIR/sc_reference.h5ad"
+
+
+
+###################################################################################
+aws s3 sync --profile op \
+  "resources_test/common/2023_10x_mouse_brain_xenium" \
+  "s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium" \
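The new flow in this script downloads the full single-cell reference into "$TMP_DIR" and only publishes a subsetted copy to "$OUT_DIR", which keeps the checked-in test resources small. The src/data_processors/subset_reference component itself is not part of this diff; a minimal sketch of what such a step could do, with an assumed downsampling strategy and parameters:

# Hypothetical sketch only; the real subset_reference component is not shown in this diff.
import anndata as ad
import numpy as np

def subset_reference(input_path: str, output_path: str, n_cells: int = 500, seed: int = 0) -> None:
    """Downsample a reference AnnData so it is small enough for test resources."""
    adata = ad.read_h5ad(input_path)
    rng = np.random.default_rng(seed)
    # sample cells without replacement, capped at the number available
    keep = rng.choice(adata.n_obs, size=min(n_cells, adata.n_obs), replace=False)
    adata[keep].copy().write_h5ad(output_path)

subset_reference(
    "temp/datasets/allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2/tmp_sc_reference.h5ad",
    "resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2/sc_reference.h5ad",
)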
92 changes: 52 additions & 40 deletions src/data_loaders/download_allen_brain_cell_atlas/script.py
@@ -5,17 +5,14 @@
 import anndata as ad
 from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache
 
-# env setup:
-# pip install -U git+https://github.com/alleninstitute/abc_atlas_access
-
 ## VIASH START
 par = {
     "version": "20230630",
     "regions": ["OLF", "TH"],
-    "output": f"abc_atlas_20230630.h5ad",
+    "output": "temp/datasets/allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2/dataset.h5ad",
 }
 meta = {
-    "temp_dir": "/tmp",
+    "temp_dir": "/tmp/allen_brain_cell_atlas",
 }
 ## VIASH END

@@ -24,35 +21,37 @@
 REGIONS = par["regions"]
 TMP_DIR = Path(meta["temp_dir"] or "/tmp")
 
+print("Loading manifest", flush=True)
+# saved to TMPDIR / releases/{VERSION}/manifest.json
 abc_cache = AbcProjectCache.from_cache_dir(TMP_DIR)
 abc_cache.load_manifest(
     f"releases/{VERSION}/manifest.json"
-)  # saved to TMPDIR / releases/{VERSION}/manifest.json
-
-# From abc_cache.list_data_files('WMB-10Xv2') # TODO: potentially also load other chemistries (currently only 10Xv2)
-count_matrix_files = [f'WMB-10Xv2-{region}/raw' for region in REGIONS]
+)
 
+print("Downloading metadata", flush=True)
 # From abc_cache.list_metadata_files('WMB-10X')
 metadata_files = [
     'cell_metadata_with_cluster_annotation',
     #'gene',
     #'region_of_interest_metadata'
 ]
+for file in metadata_files:
+    abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)
 
-# Download data
+print("Downloading expression matrices", flush=True)
+# From abc_cache.list_data_files('WMB-10Xv2') # TODO: potentially also load other chemistries (currently only 10Xv2)
+count_matrix_files = [f'WMB-10Xv2-{region}/raw' for region in REGIONS]
+
 for file in count_matrix_files:
     abc_cache.get_data_path(directory='WMB-10Xv2', file_name=file)
 
-for file in metadata_files:
-    abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)
-
-# Read an concatenate the data
+print("Reading obs", flush=True)
 obs = pd.read_csv(
-    TMP_DIR
-    / f"metadata/WMB-10X/{VERSION}/views/cell_metadata_with_cluster_annotation.csv",
+    TMP_DIR / f"metadata/WMB-10X/{VERSION}/views/cell_metadata_with_cluster_annotation.csv",
    index_col=0,
 )
 
+print("Reading expression matrices", flush=True)
 adatas = []
 for region in REGIONS:
     adata = ad.read_h5ad(
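For orientation: get_metadata_path() and get_data_path() download a file into the cache directory on first use. Assuming they also return the local path of the downloaded file, as their names suggest, the loader could collect those paths directly instead of re-deriving them from TMP_DIR; a sketch under that assumption:

# Sketch, assuming the get_*_path helpers return the downloaded file's local path.
metadata_paths = {
    file: abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)
    for file in metadata_files
}
data_paths = {
    file: abc_cache.get_data_path(directory='WMB-10Xv2', file_name=file)
    for file in count_matrix_files
}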
@@ -62,42 +61,59 @@
     adata.obs["region"] = region
     adatas.append(adata)
 
-adata = ad.concat(adatas)
+print("Concatenating data", flush=True)
+adata = ad.concat(adatas, merge="first")
 
-# Renaming etc. to match the api
-
-# Layers
+print("Processing .layers")
 adata.layers["counts"] = adata.X
-
-# Obs
-new_to_old_obs_keys = {
-    "dataset_id": "dataset_label", "assay": "library_method", "cell_type":'class',
-    "cell_type_level2": "subclass", "cell_type_level3": "supertype", "cell_type_level4": "cluster",
-    "donor_id":'donor_label', "sex": "donor_sex", "tissue": "region_of_interest_acronym", "batch": "library_label",
+del adata.X
+
+print("Processing .obs")
+adata.obs = obs.loc[adata.obs.index]
+
+# rename fields
+rename_obs_keys = {
+    "dataset_id": "dataset_label",
+    "assay": "library_method",
+    "cell_type": 'class',
+    "cell_type_level2": "subclass",
+    "cell_type_level3": "supertype",
+    "cell_type_level4": "cluster",
+    "donor_id": 'donor_label',
+    "sex": "donor_sex",
+    "tissue": "region_of_interest_acronym",
+    "batch": "library_label",
     # #TODO "cell_type_unified" (?), maybe call the unified one "cell_type" and the original one "cell_type_level1"
     # other keys: "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id"
     # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity",
    # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type",
     # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid"
 }
-new_key_to_value = {
-    "disease": "healthy", "organism": "Mus musculus", "tissue_general": "brain",
-}
+adata.obs = adata.obs.rename(columns={old:new for new,old in rename_obs_keys.items()})
+
+# add additional information to obs
+store_info = {
+    "disease": "normal",
+    "disease_ontology_term_id": "PATO:0000461",
+    "organism": "Mus musculus",
+    "organism_ontology_term_id": "NCBITaxon:10090",
+    "tissue_general": "brain",
+    "tissue_general_ontology_term_id": "UBERON:0000955",
+    "development_stage": "adult",  # from metadata at GEO GSE246717: all ages >= 51 days
+    "development_stage_ontology_term_id": "MmusDv:0000110"
+}
 
-adata.obs = obs.rename(columns={old:new for new,old in new_to_old_obs_keys.items()})
-for key, value in new_key_to_value.items():
+for key, value in store_info.items():
     adata.obs[key] = value
 
+# remove undesired columns
 for key in adata.obs.columns:
-    if (key not in new_to_old_obs_keys.keys()) and (key not in new_key_to_value.keys()):
+    if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()):
         print(f"Removing .obs['{key}']")
         del adata.obs[key]
 
 # Var
 adata.var["feature_id"] = adata.var_names
 adata.var = adata.var.rename(columns={"gene_symbol":"feature_name"})
 adata.var_names = adata.var["feature_name"]
 adata.var_names_make_unique()
+adata.var.index.name = None
 
 # Uns
 adata.uns["dataset_id"] = "2023_Yao_mouse_brain_scRNAseq_10Xv2"
@@ -110,7 +126,3 @@
 
 # Write data
 adata.write_h5ad(par["output"])
-
-# Delete the temporary files and directories
-import shutil
-shutil.rmtree(TMP_DIR)
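Dropping shutil.rmtree(TMP_DIR) means the cache directory now survives the run, presumably so repeated invocations can reuse the downloaded Allen Brain Cell Atlas files instead of re-fetching them. If cleanup is still wanted for one-off runs, an opt-in variant would be safer; a sketch with a hypothetical parameter that is not part of this component's config:

import shutil

if par.get("cleanup_temp", False):  # hypothetical opt-in flag
    shutil.rmtree(TMP_DIR, ignore_errors=True)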
