Initial commit for RDF (see notebook eutr-rdf-first-try.ipynb) #10

Open · wants to merge 3 commits into main
2 changes: 1 addition & 1 deletion .devcontainer/requirements.txt
@@ -4,4 +4,4 @@ biobricks==0.3.7
 fastparquet==2024.5.0
 pyarrow==16.1.0
 dvc==3.51.1
-dvc-s3==3.2.0
\ No newline at end of file
+dvc-s3==3.2.0
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "githubPullRequests.ignoredPullRequestBranches": [
+        "main"
+    ]
+}
18 changes: 16 additions & 2 deletions dvc.yaml
@@ -10,10 +10,24 @@ stages:
     outs:
       - download
 
+  process:
+    cmd: python stages/02_process.py
+    deps:
+      - stages/02_process.py
+    outs:
+      - download
+
+  verify:
+    cmd: python stages/03_verify.py
+    deps:
+      - stages/02_process.py
+    outs:
+      - download
+
   build:
-    cmd: python stages/02_build.py
+    cmd: python stages/04_build.py
     deps:
-      - stages/02_build.py
+      - stages/03_verify.py
+      - download
     outs:
       - brick
 
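Note (review): DVC requires each path under outs to be produced by exactly one stage, and stage outputs may not overlap. As written, download is declared as an output of the download, process, and verify stages, so dvc repro will reject the pipeline with an overlapping-outputs error. The verify stage also lists stages/02_process.py rather than stages/03_verify.py as its script dependency, which looks like a copy-paste slip. A sketch of stage definitions with disjoint outputs; the processed/ directory and the file placement are assumptions for illustration (the scripts would need to write there accordingly), not part of this PR:

      process:
        cmd: python stages/02_process.py
        deps:
          - stages/02_process.py
          - download
        outs:
          - processed          # hypothetical home for pathways.csv, processed_files.txt

      verify:
        cmd: python stages/03_verify.py
        deps:
          - stages/03_verify.py
          - processed
        outs:
          - processed/verification_success.txt   # hypothetical path

With disjoint outputs, DVC can infer the download → process → verify → build order from the deps/outs graph, so a bare dvc repro runs the whole pipeline.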
Binary file added pathways.parquet
Binary file not shown.
1 change: 1 addition & 0 deletions stages/00_environment.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
 
 pip install edelweiss-data
+pip install gseapy
58 changes: 2 additions & 56 deletions stages/01_download.py
@@ -11,7 +11,7 @@
 overview_path = pathlib.Path("download")
 dataset_path = overview_path / "temposeq"
 pathways_path = overview_path / "pathways"
-log_file_path = overview_path / "files.txt"
+log_file_path = overview_path / "downloaded_files.txt"
 log_fold_change_threshold = 2
 adjusted_p_value_threshold = 0.05
 
@@ -35,63 +35,9 @@
     log_file.write(str(file_path) + "\n")
 
 # process individual datasets
-for dataset_id in tqdm(datasets['Dataset id'], desc="Processing individual datasets"):
+for dataset_id in tqdm(datasets['Dataset id'], desc="Download individual datasets"):
     file_path = dataset_path / f'{dataset_id}.csv'
     dataset = api.get_published_dataset(id=dataset_id, version='latest')
     data_frame = dataset.get_data()
     data_frame.to_csv(file_path, index=False)
     log_file.write(str(file_path) + "\n")
-
-# process pathways
-def get_pathways_for_gene(gene_symbol):
-    """Fetch pathways associated with a gene symbol using the Enrichr API."""
-    ENRICHR_URL = "https://amp.pharm.mssm.edu/Enrichr/addList"
-    ENRICHR_URL_A = 'https://amp.pharm.mssm.edu/Enrichr/enrich?userListId=%s&backgroundType=%s'
-    GENE_SET_LIBRARY = "KEGG_2015"
-
-    payload = { 'list': (None, str(gene_symbol)), 'description': (None, "No description") }
-    response = requests.post(url=ENRICHR_URL, files=payload)
-    response.raise_for_status()
-
-    user_list_id = json.loads(response.text)['userListId']
-
-    time.sleep(2)
-
-    response_gene_list = requests.get(ENRICHR_URL_A % (str(user_list_id), GENE_SET_LIBRARY))
-    response_gene_list.raise_for_status()
-
-    enrichr_results = json.loads(response_gene_list.text)[GENE_SET_LIBRARY]
-
-    return pd.DataFrame([{
-        'Gene symbol': gene_symbol,
-        'Rank': result[0],
-        'Pathway': result[1],
-        'p-value': result[2],
-        'Adj. p-value': result[6],
-        'Odds Ratio': result[3],
-        'Combined score': result[4]
-    } for result in enrichr_results])
-
-columns = ['Gene symbol', 'Rank', 'Pathway', 'p-value', 'Adj. p-value', 'Odds Ratio', 'Combined score']
-degs_pathways = pd.DataFrame(columns=columns)
-
-for dataset_id in tqdm(datasets['Dataset id'], desc="Processing pathways"):
-    file_path = pathways_path / f'{dataset_id}_pathways.csv'
-    dataset = api.get_published_dataset(id=dataset_id, version='latest')
-    data_frame = dataset.get_data()
-
-    is_significant_fold_change = (data_frame['logFC'] > log_fold_change_threshold) | \
-                                 (data_frame['logFC'] < -log_fold_change_threshold)
-    is_significant_p_value = data_frame['padj'] < adjusted_p_value_threshold
-
-    degs_data_frame = data_frame[is_significant_fold_change & is_significant_p_value]
-
-    if not degs_data_frame.empty:
-        for _, gene_row in degs_data_frame.iterrows():
-            pathways = get_pathways_for_gene(gene_row['SYMBOL'])
-            degs_pathways = pd.concat([degs_pathways, pathways], ignore_index=True)
-
-        degs_pathways.to_csv(file_path, index=False)
-        log_file.write(str(file_path) + "\n")
-    else:
-        print(f"Missing required columns in dataset {dataset_id}")
23 changes: 0 additions & 23 deletions stages/02_build.py

This file was deleted.

84 changes: 84 additions & 0 deletions stages/02_process.py
@@ -0,0 +1,84 @@
+import pandas as pd
+import pathlib
+from tqdm import tqdm
+import gseapy as gp
+
+# Define paths and variables
+overview_path = pathlib.Path("download")
+dataset_path = overview_path / "temposeq"
+downloaded_file_path = overview_path / "downloaded_files.txt"
+overview_file_path = overview_path / "overview.csv"
+log_file_path = overview_path / "processed_files.txt"
+pathways_file_path = overview_path / "pathways.csv"
+
+LOG_FOLD_CHANGE_THRESHOLD = 2
+ADJUSTED_P_VALUE_THRESHOLD = 0.05
+GENE_SET_LIBRARY = "WikiPathway_2023_Human"
+
+# Ensure directories exist
+dataset_path.mkdir(parents=True, exist_ok=True)
+
+# Read the overview dataset
+datasets = pd.read_csv(overview_file_path)
+
+# Check that all temposeq datasets have been downloaded in the previous step
+downloaded_file = open(downloaded_file_path, "r")
+downloaded_files = downloaded_file.read().splitlines()
+downloaded_files = downloaded_files[1 : ] # remove the first file, which is the overview file
+
+if len(downloaded_files) == datasets.shape[0]:
+    print("All temposeq datasets have been downloaded. Proceed with processing.")
+else:
+    print("""Number of temposeq datasets and downloaded files do not match.\n
+Number of temposeq datasets: {0}\n
+Number of downloaded files: {1}""".format(datasets.shape[0], len(downloaded_files)))
+    quit()
+
+
+# Initiate the log file that will store processed filepaths
+log_file = open(log_file_path, "w")
+
+degs_data_frame = pd.DataFrame()
+
+# Loop over downloaded temposeq files and identify deg and process the pathways
+for dataset_id in tqdm(datasets['Dataset id'], desc="Collecting de genes"):
+    temposeq_path = dataset_path / f'{dataset_id}.csv'
+    data_frame = pd.read_csv(temposeq_path)
+
+    is_significant_fold_change = (data_frame['logFC'] > LOG_FOLD_CHANGE_THRESHOLD) | \
+                                 (data_frame['logFC'] < -LOG_FOLD_CHANGE_THRESHOLD)
+
+    is_significant_p_value = data_frame['padj'] < ADJUSTED_P_VALUE_THRESHOLD
+
+    tmp_degs_data_frame = data_frame[is_significant_fold_change & is_significant_p_value][['SYMBOL']]
+    degs_data_frame = pd.concat([degs_data_frame, tmp_degs_data_frame], axis=0, ignore_index=True)
+
+    log_file.write(str(temposeq_path) + "\n")
+
+degs_data_frame = degs_data_frame.drop_duplicates(subset=['SYMBOL'])
+
+# process pathways
+def get_pathways_for_gene(gene_symbol, gene_set_library):
+    enr = gp.enrich(gene_list=[gene_symbol],
+                    gene_sets=gene_set_library,
+                    background=None,
+                    outdir=None,
+                    verbose=False)
+    return enr.results
+
+# Process deg genes
+degs_pathways = pd.DataFrame()
+for _, gene_row in tqdm(degs_data_frame.iterrows(), desc=f"Processing {degs_data_frame.shape[0]} de genes"):
+
+    if pd.isna(gene_row['SYMBOL']):
+        continue
+
+    if gene_row['SYMBOL'] == "":
+        continue
+
+    pathways = get_pathways_for_gene(gene_row['SYMBOL'], GENE_SET_LIBRARY)
+    if isinstance(pathways, pd.DataFrame):
+        if not pathways.empty:
+            degs_pathways = pd.concat([degs_pathways, pathways], ignore_index=True)
+
+degs_pathways.to_csv(pathways_file_path, index=False)
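Note (review): gp.enrich is gseapy's offline enrichment entry point, and it is called here once per gene; an enrichment test on a single-gene list has little statistical meaning, and the loop scales poorly with the number of DEGs. A sketch of a single batched call instead, assuming the Enrichr-backed gp.enrichr API accepts the same "WikiPathway_2023_Human" library name used above:

    import gseapy as gp

    # One enrichment call over the whole deduplicated DEG list
    # instead of one call per gene.
    gene_list = (degs_data_frame['SYMBOL']
                 .dropna()
                 .loc[lambda s: s != ""]
                 .tolist())

    # gp.enrichr (Enrichr web API) is an assumption here, swapped in for the
    # offline gp.enrich used in this PR.
    enr = gp.enrichr(gene_list=gene_list,
                     gene_sets=GENE_SET_LIBRARY,
                     outdir=None)  # outdir=None keeps results in memory

    enr.results.to_csv(pathways_file_path, index=False)  # same output file as above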
51 changes: 51 additions & 0 deletions stages/03_verify.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import pathlib
+
+# Define paths and variables
+overview_path = pathlib.Path("download")
+downloaded_file_path = overview_path / "downloaded_files.txt"
+processed_file_path = overview_path / "processed_files.txt"
+overview_file_path = overview_path / "overview.csv"
+verification_file_path = overview_path / "verification_success.txt"
+
+# Check that verification file does not exist
+try:
+    verification_file_path.unlink()
+    print('Verification file deleted')
+except:
+    print("Verification file doesn't exist")
+
+
+# Read the overview dataset
+overview_file_path = overview_path / 'overview.csv'
+datasets = pd.read_csv(overview_file_path)
+
+# Read the downloaded file list
+downloaded_file = open(downloaded_file_path, "r")
+downloaded_files = downloaded_file.read().splitlines()
+downloaded_files = downloaded_files[1 : ] # remove the first file, which is the overview file
+
+# Read the processed file list
+processed_file = open(processed_file_path, "r")
+processed_files = processed_file.read().splitlines()
+
+# Check that all temposeq datasets have been downloaded
+if len(downloaded_files) == datasets.shape[0]:
+    print("Verification step 1: OK.\nAll temposeq datasets have been downloaded.")
+else:
+    print("""Verification failed.\nNumber of temposeq datasets and downloaded files do not match.\n
+Number of temposeq datasets: {0}\n
+Number of downloaded files: {1}""".format(datasets.shape[0], len(downloaded_files)))
+    quit()
+
+# Check that all temposeq datasets have been processed
+if len(processed_files) == datasets.shape[0]:
+    print("Verification step 2: OK.\nAll temposeq datasets have been processed.")
+else:
+    print("""Verification failed.\nNumber of temposeq datasets and processed files do not match.\n
+Number of temposeq datasets: {0}\n
+Number of processed files: {1}""".format(datasets.shape[0], len(processed_files)))
+    quit()
+
+log_file = open(verification_file_path, "w")
+log_file.write(f"Verification successful.")
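Note (review): the try/except around unlink() uses a bare except:, which would also swallow unrelated errors such as permission problems, and the final f-string has no placeholders. A smaller sketch of the same steps, assuming Python 3.8+:

    # Remove a stale success marker; no error if the file is absent (Python 3.8+).
    verification_file_path.unlink(missing_ok=True)

    # ... the two count checks as above ...

    # Write the success marker; the context manager closes the file reliably.
    with open(verification_file_path, "w") as marker:
        marker.write("Verification successful.\n")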
32 changes: 32 additions & 0 deletions stages/04_build.py
@@ -0,0 +1,32 @@
+import os, pathlib
+import pandas as pd
+
+# Set list path where you can store additional information or lists, if needed
+dldir = pathlib.Path("download")
+
+# Check if verification file exists
+verification_success = dldir / "verification_success.txt"
+if not verification_success.is_file():
+    print("Stop building because verification failed.")
+    quit()
+
+# Create brick directory to store Parquet files
+brickpath = pathlib.Path("brick")
+(brickpath / "temposeq.parquet").mkdir(parents=True, exist_ok=True)
+
+# Build temposeq parquet files
+# Read the list of temposeq files
+file_paths = [line.strip() for line in open(dldir / "downloaded_files.txt", "r")]
+
+# Process each temposeq file
+for inputpath in file_paths:
+    outputpath = inputpath.replace("download", "brick")\
+                          .replace("temposeq", "temposeq.parquet")\
+                          .replace(".csv", ".parquet")
+
+    pd.read_csv(inputpath).to_parquet(outputpath)
+
+# Build pathways parquet file
+inputpath = dldir / "pathways.csv"
+outputpath = inputpath.name.replace("download", "brick").replace(".csv", ".parquet")
+pd.read_csv(inputpath).to_parquet(outputpath)
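Note (review): in the last two lines, inputpath.name is just "pathways.csv", so replace("download", "brick") never matches and the Parquet file lands in the current working directory instead of brick/; that is presumably how the stray pathways.parquet at the repository root entered this PR. A minimal sketch of the fix:

    # Write the pathways table into brick/ alongside the temposeq Parquet files.
    inputpath = dldir / "pathways.csv"
    outputpath = brickpath / inputpath.name.replace(".csv", ".parquet")  # brick/pathways.parquet
    pd.read_csv(inputpath).to_parquet(outputpath)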