Initial commit for RDF (see notebook eutr-rdf-first-try.ipynb) #10

Open · wants to merge 3 commits into main
2 changes: 1 addition & 1 deletion .devcontainer/requirements.txt
@@ -4,4 +4,4 @@ biobricks==0.3.7
 fastparquet==2024.5.0
 pyarrow==16.1.0
 dvc==3.51.1
-dvc-s3==3.2.0
\ No newline at end of file
+dvc-s3==3.2.0
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "githubPullRequests.ignoredPullRequestBranches": [
+        "main"
+    ]
+}
18 changes: 16 additions & 2 deletions dvc.yaml
@@ -10,10 +10,24 @@ stages:
     outs:
       - download
 
+  process:
+    cmd: python stages/02_process.py
+    deps:
+      - stages/02_process.py
+    outs:
+      - download
+
+  verify:
+    cmd: python stages/03_verify.py
+    deps:
+      - stages/02_process.py
+    outs:
+      - download
+
   build:
-    cmd: python stages/02_build.py
+    cmd: python stages/04_build.py
     deps:
-      - stages/02_build.py
+      - stages/03_verify.py
+      - download
     outs:
       - brick
 
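Note (review): DVC requires each path under outs to be produced by exactly one stage, and stage outputs may not overlap. As written, download is declared as an output of the download, process, and verify stages, so dvc repro will reject the pipeline with an overlapping-outputs error. The verify stage also lists stages/02_process.py rather than stages/03_verify.py as its script dependency, which looks like a copy-paste slip. A sketch of stage definitions with disjoint outputs; the processed/ directory and the file placement are assumptions for illustration (the scripts would need to write there accordingly), not part of this PR:

      process:
        cmd: python stages/02_process.py
        deps:
          - stages/02_process.py
          - download
        outs:
          - processed          # hypothetical home for pathways.csv, processed_files.txt

      verify:
        cmd: python stages/03_verify.py
        deps:
          - stages/03_verify.py
          - processed
        outs:
          - processed/verification_success.txt   # hypothetical path

With disjoint outputs, DVC can infer the download → process → verify → build order from the deps/outs graph, so a bare dvc repro runs the whole pipeline.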
Binary file added pathways.parquet
Binary file not shown.
1 change: 1 addition & 0 deletions stages/00_environment.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
 
 pip install edelweiss-data
+pip install gseapy
58 changes: 2 additions & 56 deletions stages/01_download.py
@@ -11,7 +11,7 @@
 overview_path = pathlib.Path("download")
 dataset_path = overview_path / "temposeq"
 pathways_path = overview_path / "pathways"
-log_file_path = overview_path / "files.txt"
+log_file_path = overview_path / "downloaded_files.txt"
 log_fold_change_threshold = 2
 adjusted_p_value_threshold = 0.05
 
@@ -35,63 +35,9 @@
     log_file.write(str(file_path) + "\n")
 
 # process individual datasets
-for dataset_id in tqdm(datasets['Dataset id'], desc="Processing individual datasets"):
+for dataset_id in tqdm(datasets['Dataset id'], desc="Download individual datasets"):
     file_path = dataset_path / f'{dataset_id}.csv'
     dataset = api.get_published_dataset(id=dataset_id, version='latest')
     data_frame = dataset.get_data()
     data_frame.to_csv(file_path, index=False)
     log_file.write(str(file_path) + "\n")
-
-# process pathways
-def get_pathways_for_gene(gene_symbol):
-    """Fetch pathways associated with a gene symbol using the Enrichr API."""
-    ENRICHR_URL = "https://amp.pharm.mssm.edu/Enrichr/addList"
-    ENRICHR_URL_A = 'https://amp.pharm.mssm.edu/Enrichr/enrich?userListId=%s&backgroundType=%s'
-    GENE_SET_LIBRARY = "KEGG_2015"
-
-    payload = { 'list': (None, str(gene_symbol)), 'description': (None, "No description") }
-    response = requests.post(url=ENRICHR_URL, files=payload)
-    response.raise_for_status()
-
-    user_list_id = json.loads(response.text)['userListId']
-
-    time.sleep(2)
-
-    response_gene_list = requests.get(ENRICHR_URL_A % (str(user_list_id), GENE_SET_LIBRARY))
-    response_gene_list.raise_for_status()
-
-    enrichr_results = json.loads(response_gene_list.text)[GENE_SET_LIBRARY]
-
-    return pd.DataFrame([{
-        'Gene symbol': gene_symbol,
-        'Rank': result[0],
-        'Pathway': result[1],
-        'p-value': result[2],
-        'Adj. p-value': result[6],
-        'Odds Ratio': result[3],
-        'Combined score': result[4]
-    } for result in enrichr_results])
-
-columns = ['Gene symbol', 'Rank', 'Pathway', 'p-value', 'Adj. p-value', 'Odds Ratio', 'Combined score']
-degs_pathways = pd.DataFrame(columns=columns)
-
-for dataset_id in tqdm(datasets['Dataset id'], desc="Processing pathways"):
-    file_path = pathways_path / f'{dataset_id}_pathways.csv'
-    dataset = api.get_published_dataset(id=dataset_id, version='latest')
-    data_frame = dataset.get_data()
-
-    is_significant_fold_change = (data_frame['logFC'] > log_fold_change_threshold) | \
-                                 (data_frame['logFC'] < -log_fold_change_threshold)
-    is_significant_p_value = data_frame['padj'] < adjusted_p_value_threshold
-
-    degs_data_frame = data_frame[is_significant_fold_change & is_significant_p_value]
-
-    if not degs_data_frame.empty:
-        for _, gene_row in degs_data_frame.iterrows():
-            pathways = get_pathways_for_gene(gene_row['SYMBOL'])
-            degs_pathways = pd.concat([degs_pathways, pathways], ignore_index=True)
-
-        degs_pathways.to_csv(file_path, index=False)
-        log_file.write(str(file_path) + "\n")
-    else:
-        print(f"Missing required columns in dataset {dataset_id}")
23 changes: 0 additions & 23 deletions stages/02_build.py

This file was deleted.

84 changes: 84 additions & 0 deletions stages/02_process.py
@@ -0,0 +1,84 @@
+import pandas as pd
+import pathlib
+from tqdm import tqdm
+import gseapy as gp
+
+# Define paths and variables
+overview_path = pathlib.Path("download")
+dataset_path = overview_path / "temposeq"
+downloaded_file_path = overview_path / "downloaded_files.txt"
+overview_file_path = overview_path / "overview.csv"
+log_file_path = overview_path / "processed_files.txt"
+pathways_file_path = overview_path / "pathways.csv"
+
+LOG_FOLD_CHANGE_THRESHOLD = 2
+ADJUSTED_P_VALUE_THRESHOLD = 0.05
+GENE_SET_LIBRARY = "WikiPathway_2023_Human"
+
+# Ensure directories exist
+dataset_path.mkdir(parents=True, exist_ok=True)
+
+# Read the overview dataset
+datasets = pd.read_csv(overview_file_path)
+
+# Check that all temposeq datasets have been downloaded in the previous step
+downloaded_file = open(downloaded_file_path, "r")
+downloaded_files = downloaded_file.read().splitlines()
+downloaded_files = downloaded_files[1 : ] # remove the first file, which is the overview file
+
+if len(downloaded_files) == datasets.shape[0]:
+    print("All temposeq datasets have been downloaded. Proceed with processing.")
+else:
+    print("""Number of temposeq datasets and downloaded files do not match.\n
+Number of temposeq datasets: {0}\n
+Number of downloaded files: {1}""".format(datasets.shape[0], len(downloaded_files)))
+    quit()
+
+
+# Initiate the log file that will store processed filepaths
+log_file = open(log_file_path, "w")
+
+degs_data_frame = pd.DataFrame()
+
+# Loop over downloaded temposeq files and identify deg and process the pathways
+for dataset_id in tqdm(datasets['Dataset id'], desc="Collecting de genes"):
+    temposeq_path = dataset_path / f'{dataset_id}.csv'
+    data_frame = pd.read_csv(temposeq_path)
+
+    is_significant_fold_change = (data_frame['logFC'] > LOG_FOLD_CHANGE_THRESHOLD) | \
+                                 (data_frame['logFC'] < -LOG_FOLD_CHANGE_THRESHOLD)
+
+    is_significant_p_value = data_frame['padj'] < ADJUSTED_P_VALUE_THRESHOLD
+
+    tmp_degs_data_frame = data_frame[is_significant_fold_change & is_significant_p_value][['SYMBOL']]
+    degs_data_frame = pd.concat([degs_data_frame, tmp_degs_data_frame], axis=0, ignore_index=True)
+
+    log_file.write(str(temposeq_path) + "\n")
+
+degs_data_frame = degs_data_frame.drop_duplicates(subset=['SYMBOL'])
+
+# process pathways
+def get_pathways_for_gene(gene_symbol, gene_set_library):
+    enr = gp.enrich(gene_list=[gene_symbol],
+                    gene_sets=gene_set_library,
+                    background=None,
+                    outdir=None,
+                    verbose=False)
+    return enr.results
+
+# Process deg genes
+degs_pathways = pd.DataFrame()
+for _, gene_row in tqdm(degs_data_frame.iterrows(), desc=f"Processing {degs_data_frame.shape[0]} de genes"):
+
+    if pd.isna(gene_row['SYMBOL']):
+        continue
+
+    if gene_row['SYMBOL'] == "":
+        continue
+
+    pathways = get_pathways_for_gene(gene_row['SYMBOL'], GENE_SET_LIBRARY)
+    if isinstance(pathways, pd.DataFrame):
+        if not pathways.empty:
+            degs_pathways = pd.concat([degs_pathways, pathways], ignore_index=True)
+
+degs_pathways.to_csv(pathways_file_path, index=False)
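Note (review): gp.enrich is gseapy's offline enrichment entry point, and it is called here once per gene; an enrichment test on a single-gene list has little statistical meaning, and the loop scales poorly with the number of DEGs. A sketch of a single batched call instead, assuming the Enrichr-backed gp.enrichr API accepts the same "WikiPathway_2023_Human" library name used above:

    import gseapy as gp

    # One enrichment call over the whole deduplicated DEG list
    # instead of one call per gene.
    gene_list = (degs_data_frame['SYMBOL']
                 .dropna()
                 .loc[lambda s: s != ""]
                 .tolist())

    # gp.enrichr (Enrichr web API) is an assumption here, swapped in for the
    # offline gp.enrich used in this PR.
    enr = gp.enrichr(gene_list=gene_list,
                     gene_sets=GENE_SET_LIBRARY,
                     outdir=None)  # outdir=None keeps results in memory

    enr.results.to_csv(pathways_file_path, index=False)  # same output file as above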
51 changes: 51 additions & 0 deletions stages/03_verify.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import pathlib
+
+# Define paths and variables
+overview_path = pathlib.Path("download")
+downloaded_file_path = overview_path / "downloaded_files.txt"
+processed_file_path = overview_path / "processed_files.txt"
+overview_file_path = overview_path / "overview.csv"
+verification_file_path = overview_path / "verification_success.txt"
+
+# Check that verification file does not exist
+try:
+    verification_file_path.unlink()
+    print('Verification file deleted')
+except:
+    print("Verification file doesn't exist")
+
+
+# Read the overview dataset
+overview_file_path = overview_path / 'overview.csv'
+datasets = pd.read_csv(overview_file_path)
+
+# Read the downloaded file list
+downloaded_file = open(downloaded_file_path, "r")
+downloaded_files = downloaded_file.read().splitlines()
+downloaded_files = downloaded_files[1 : ] # remove the first file, which is the overview file
+
+# Read the processed file list
+processed_file = open(processed_file_path, "r")
+processed_files = processed_file.read().splitlines()
+
+# Check that all temposeq datasets have been downloaded
+if len(downloaded_files) == datasets.shape[0]:
+    print("Verification step 1: OK.\nAll temposeq datasets have been downloaded.")
+else:
+    print("""Verification failed.\nNumber of temposeq datasets and downloaded files do not match.\n
+Number of temposeq datasets: {0}\n
+Number of downloaded files: {1}""".format(datasets.shape[0], len(downloaded_files)))
+    quit()
+
+# Check that all temposeq datasets have been processed
+if len(processed_files) == datasets.shape[0]:
+    print("Verification step 2: OK.\nAll temposeq datasets have been processed.")
+else:
+    print("""Verification failed.\nNumber of temposeq datasets and processed files do not match.\n
+Number of temposeq datasets: {0}\n
+Number of processed files: {1}""".format(datasets.shape[0], len(processed_files)))
+    quit()
+
+log_file = open(verification_file_path, "w")
+log_file.write(f"Verification successful.")
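Note (review): the try/except around unlink() uses a bare except:, which would also swallow unrelated errors such as permission problems, and the final f-string has no placeholders. A smaller sketch of the same steps, assuming Python 3.8+:

    # Remove a stale success marker; no error if the file is absent (Python 3.8+).
    verification_file_path.unlink(missing_ok=True)

    # ... the two count checks as above ...

    # Write the success marker; the context manager closes the file reliably.
    with open(verification_file_path, "w") as marker:
        marker.write("Verification successful.\n")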
32 changes: 32 additions & 0 deletions stages/04_build.py
@@ -0,0 +1,32 @@
+import os, pathlib
+import pandas as pd
+
+# Set list path where you can store additional information or lists, if needed
+dldir = pathlib.Path("download")
+
+# Check if verification file exists
+verification_success = dldir / "verification_success.txt"
+if not verification_success.is_file():
+    print("Stop building because verification failed.")
+    quit()
+
+# Create brick directory to store Parquet files
+brickpath = pathlib.Path("brick")
+(brickpath / "temposeq.parquet").mkdir(parents=True, exist_ok=True)
+
+# Build temposeq parquet files
+# Read the list of temposeq files
+file_paths = [line.strip() for line in open(dldir / "downloaded_files.txt", "r")]
+
+# Process each temposeq file
+for inputpath in file_paths:
+    outputpath = inputpath.replace("download", "brick")\
+                          .replace("temposeq", "temposeq.parquet")\
+                          .replace(".csv", ".parquet")
+
+    pd.read_csv(inputpath).to_parquet(outputpath)
+
+# Build pathways parquet file
+inputpath = dldir / "pathways.csv"
+outputpath = inputpath.name.replace("download", "brick").replace(".csv", ".parquet")
+pd.read_csv(inputpath).to_parquet(outputpath)
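Note (review): in the last two lines, inputpath.name is just "pathways.csv", so replace("download", "brick") never matches and the Parquet file lands in the current working directory instead of brick/; that is presumably how the stray pathways.parquet at the repository root entered this PR. A minimal sketch of the fix:

    # Write the pathways table into brick/ alongside the temposeq Parquet files.
    inputpath = dldir / "pathways.csv"
    outputpath = brickpath / inputpath.name.replace(".csv", ".parquet")  # brick/pathways.parquet
    pd.read_csv(inputpath).to_parquet(outputpath)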