minor changes to components
rcannood committed Aug 29, 2024
1 parent 441f50c commit 497ed55
Showing 2 changed files with 68 additions and 41 deletions.
17 changes: 16 additions & 1 deletion scripts/create_test_resources.sh
@@ -8,6 +8,7 @@ cd "$REPO_ROOT"
 
 set -e
 
+###################################################################################
 DATASET_ID="10x_xenium/2023_10x_mouse_brain_xenium"
 TMP_DIR="temp/datasets/$DATASET_ID"
 OUT_DIR="resources_test/common/2023_10x_mouse_brain_xenium"
@@ -76,11 +77,25 @@ viash run src/data_processors/crop_region/config.vsh.yaml -- \
   --min_y 10000 \
   --max_y 12000
 
+
+###################################################################################
 DATASET_ID="allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2"
 TMP_DIR="temp/datasets/$DATASET_ID"
 OUT_DIR="resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2"
 
+
 # generate sc reference
 VIASH_TEMP=/tmp/allen_brain_cell_atlas \
 viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml -- \
-  --output "$OUT_DIR/sc_reference.h5ad" --regions "OLF;TH"
+  --output "$TMP_DIR/tmp_sc_reference.h5ad" --regions "OLF;TH"
+
+viash run src/data_processors/subset_reference/config.vsh.yaml -- \
+  --input "$TMP_DIR/tmp_sc_reference.h5ad" \
+  --output "$OUT_DIR/sc_reference.h5ad"
+
+
+
+###################################################################################
+aws s3 sync --profile op \
+  "resources_test/common/2023_10x_mouse_brain_xenium" \
+  "s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium" \
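The new flow in this script downloads the full single-cell reference into "$TMP_DIR" and only publishes a subsetted copy to "$OUT_DIR", which keeps the checked-in test resources small. The src/data_processors/subset_reference component itself is not part of this diff; a minimal sketch of what such a step could do, with an assumed downsampling strategy and parameters:

# Hypothetical sketch only; the real subset_reference component is not shown in this diff.
import anndata as ad
import numpy as np

def subset_reference(input_path: str, output_path: str, n_cells: int = 500, seed: int = 0) -> None:
    """Downsample a reference AnnData so it is small enough for test resources."""
    adata = ad.read_h5ad(input_path)
    rng = np.random.default_rng(seed)
    # sample cells without replacement, capped at the number available
    keep = rng.choice(adata.n_obs, size=min(n_cells, adata.n_obs), replace=False)
    adata[keep].copy().write_h5ad(output_path)

subset_reference(
    "temp/datasets/allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2/tmp_sc_reference.h5ad",
    "resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2/sc_reference.h5ad",
)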
92 changes: 52 additions & 40 deletions src/data_loaders/download_allen_brain_cell_atlas/script.py
@@ -5,17 +5,14 @@
 import anndata as ad
 from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache
 
-# env setup:
-# pip install -U git+https://github.com/alleninstitute/abc_atlas_access
-
 ## VIASH START
 par = {
     "version": "20230630",
     "regions": ["OLF", "TH"],
-    "output": f"abc_atlas_20230630.h5ad",
+    "output": "temp/datasets/allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2/dataset.h5ad",
 }
 meta = {
-    "temp_dir": "/tmp",
+    "temp_dir": "/tmp/allen_brain_cell_atlas",
 }
 ## VIASH END

@@ -24,35 +21,37 @@
 REGIONS = par["regions"]
 TMP_DIR = Path(meta["temp_dir"] or "/tmp")
 
+print("Loading manifest", flush=True)
+# saved to TMPDIR / releases/{VERSION}/manifest.json
 abc_cache = AbcProjectCache.from_cache_dir(TMP_DIR)
 abc_cache.load_manifest(
     f"releases/{VERSION}/manifest.json"
-)  # saved to TMPDIR / releases/{VERSION}/manifest.json
-
-# From abc_cache.list_data_files('WMB-10Xv2') # TODO: potentially also load other chemistries (currently only 10Xv2)
-count_matrix_files = [f'WMB-10Xv2-{region}/raw' for region in REGIONS]
+)
 
+print("Downloading metadata", flush=True)
 # From abc_cache.list_metadata_files('WMB-10X')
 metadata_files = [
     'cell_metadata_with_cluster_annotation',
     #'gene',
     #'region_of_interest_metadata'
 ]
+for file in metadata_files:
+    abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)
 
-# Download data
+print("Downloading expression matrices", flush=True)
+# From abc_cache.list_data_files('WMB-10Xv2') # TODO: potentially also load other chemistries (currently only 10Xv2)
+count_matrix_files = [f'WMB-10Xv2-{region}/raw' for region in REGIONS]
+
 for file in count_matrix_files:
     abc_cache.get_data_path(directory='WMB-10Xv2', file_name=file)
 
-for file in metadata_files:
-    abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)
-
-# Read an concatenate the data
+print("Reading obs", flush=True)
 obs = pd.read_csv(
-    TMP_DIR
-    / f"metadata/WMB-10X/{VERSION}/views/cell_metadata_with_cluster_annotation.csv",
+    TMP_DIR / f"metadata/WMB-10X/{VERSION}/views/cell_metadata_with_cluster_annotation.csv",
    index_col=0,
 )
 
+print("Reading expression matrices", flush=True)
 adatas = []
 for region in REGIONS:
     adata = ad.read_h5ad(
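For orientation: get_metadata_path() and get_data_path() download a file into the cache directory on first use. Assuming they also return the local path of the downloaded file, as their names suggest, the loader could collect those paths directly instead of re-deriving them from TMP_DIR; a sketch under that assumption:

# Sketch, assuming the get_*_path helpers return the downloaded file's local path.
metadata_paths = {
    file: abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)
    for file in metadata_files
}
data_paths = {
    file: abc_cache.get_data_path(directory='WMB-10Xv2', file_name=file)
    for file in count_matrix_files
}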
@@ -62,42 +61,59 @@
     adata.obs["region"] = region
     adatas.append(adata)
 
-adata = ad.concat(adatas)
+print("Concatenating data", flush=True)
+adata = ad.concat(adatas, merge="first")
 
-# Renaming etc. to match the api
-
-# Layers
+print("Processing .layers")
 adata.layers["counts"] = adata.X
-
-# Obs
-new_to_old_obs_keys = {
-    "dataset_id": "dataset_label", "assay": "library_method", "cell_type":'class',
-    "cell_type_level2": "subclass", "cell_type_level3": "supertype", "cell_type_level4": "cluster",
-    "donor_id":'donor_label', "sex": "donor_sex", "tissue": "region_of_interest_acronym", "batch": "library_label",
+del adata.X
+
+print("Processing .obs")
+adata.obs = obs.loc[adata.obs.index]
+
+# rename fields
+rename_obs_keys = {
+    "dataset_id": "dataset_label",
+    "assay": "library_method",
+    "cell_type": 'class',
+    "cell_type_level2": "subclass",
+    "cell_type_level3": "supertype",
+    "cell_type_level4": "cluster",
+    "donor_id": 'donor_label',
+    "sex": "donor_sex",
+    "tissue": "region_of_interest_acronym",
+    "batch": "library_label",
     # #TODO "cell_type_unified" (?), maybe call the unified one "cell_type" and the original one "cell_type_level1"
     # other keys: "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id"
     # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity",
    # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type",
     # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid"
 }
-new_key_to_value = {
-    "disease": "healthy", "organism": "Mus musculus", "tissue_general": "brain",
-}
+adata.obs = adata.obs.rename(columns={old:new for new,old in rename_obs_keys.items()})
+
+# add additional information to obs
+store_info = {
+    "disease": "normal",
+    "disease_ontology_term_id": "PATO:0000461",
+    "organism": "Mus musculus",
+    "organism_ontology_term_id": "NCBITaxon:10090",
+    "tissue_general": "brain",
+    "tissue_general_ontology_term_id": "UBERON:0000955",
+    "development_stage": "adult",  # from metadata at GEO GSE246717: all ages >= 51 days
+    "development_stage_ontology_term_id": "MmusDv:0000110"
+}
 
-adata.obs = obs.rename(columns={old:new for new,old in new_to_old_obs_keys.items()})
-for key, value in new_key_to_value.items():
+for key, value in store_info.items():
     adata.obs[key] = value
 
+# remove undesired columns
 for key in adata.obs.columns:
-    if (key not in new_to_old_obs_keys.keys()) and (key not in new_key_to_value.keys()):
+    if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()):
         print(f"Removing .obs['{key}']")
         del adata.obs[key]
 
 # Var
 adata.var["feature_id"] = adata.var_names
 adata.var = adata.var.rename(columns={"gene_symbol":"feature_name"})
 adata.var_names = adata.var["feature_name"]
 adata.var_names_make_unique()
+adata.var.index.name = None
 
 # Uns
 adata.uns["dataset_id"] = "2023_Yao_mouse_brain_scRNAseq_10Xv2"
@@ -110,7 +126,3 @@
 
 # Write data
 adata.write_h5ad(par["output"])
-
-# Delete the temporary files and directories
-import shutil
-shutil.rmtree(TMP_DIR)
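Dropping shutil.rmtree(TMP_DIR) means the cache directory now survives the run, presumably so repeated invocations can reuse the downloaded Allen Brain Cell Atlas files instead of re-fetching them. If cleanup is still wanted for one-off runs, an opt-in variant would be safer; a sketch with a hypothetical parameter that is not part of this component's config:

import shutil

if par.get("cleanup_temp", False):  # hypothetical opt-in flag
    shutil.rmtree(TMP_DIR, ignore_errors=True)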
