CHB-MIT pipeline #13

Open: wants to merge 5 commits into main
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
model/__pycache__/*
__pycache__/*
log/*
log-pretrain/*
**/config_test.ini
2 changes: 2 additions & 0 deletions config.ini
@@ -0,0 +1,2 @@
[Paths]
root = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_segments
16 changes: 16 additions & 0 deletions datasets/CHB-MIT/config.ini
@@ -0,0 +1,16 @@
[Paths]
signals_path = /srv/local/data/physionet.org/files/chbmit/1.0.0
clean_path = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals
root = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals
out = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_segments

[Patients]
test_pats = ["chb23", "chb24"]
val_pats = ["chb21", "chb22"]
train_pats = ["chb01", "chb02", "chb03", "chb04", "chb05", "chb06", "chb07", "chb08", "chb09", "chb10", "chb11", "chb12", "chb13", "chb14", "chb15", "chb16", "chb17", "chb18", "chb19", "chb20"]

[Channels]
channels = ["FP1-F7", "F7-T7", "T7-P7", "P7-O1", "FP2-F8", "F8-T8", "T8-P8", "P8-O2", "FP1-F3", "F3-C3", "C3-P3", "P3-O1", "FP2-F4", "F4-C4", "C4-P4", "P4-O2"]

[SAMPLING_RATE]
rate = 256
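
For reference, a minimal sketch of how this config could be consumed with the standard library. The section and key names come from the file above; ast.literal_eval is shown as a safer stand-in for the eval() calls used in process2.py below, since it parses Python literals without executing arbitrary code.

import ast
import configparser

config = configparser.ConfigParser()
config.read("datasets/CHB-MIT/config.ini")

# Plain string paths.
signals_path = config.get("Paths", "signals_path")
out = config.get("Paths", "out")

# List-valued options are stored as Python literals; ast.literal_eval
# parses them without executing code, unlike eval().
test_pats = ast.literal_eval(config.get("Patients", "test_pats"))
channels = ast.literal_eval(config.get("Channels", "channels"))

# Typed getter for the integer sampling rate.
rate = config.getint("SAMPLING_RATE", "rate")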
124 changes: 71 additions & 53 deletions datasets/CHB-MIT/process1.py
@@ -1,14 +1,14 @@
import ast
import os
import argparse
import configparser
import multiprocessing as mp
from collections import defaultdict
import pyedflib
import pyedflib.highlevel as hl
import numpy as np
import copy
import shutil
import bz2
import pickle
import _pickle as cPickle


# Pickle a file and then compress it into a file with extension
@@ -101,7 +101,9 @@ def move_channels(clean_dict, channels, target):


# Process the EDF files of a patient from start number to end number
def process_files(pacient, valid_channels, channels, start, end):
def process_files(
pacient, valid_channels, channels, start, end, signals_path, clean_path
):
for num in range(start, end + 1):
to_keep = []

@@ -162,7 +164,7 @@ def process_files(pacient, valid_channels, channels, start, end):
move_channels(clean_dict, channels, target)


def start_process(pacient, num, start, end, sum_ind):
def start_process(pacient, num, start, end, sum_ind, signals_path, clean_path):
# Summary file
f = open(
"{path}/chb{p}/chb{p}-summary.txt".format(path=signals_path, p=pacient), "r"
@@ -227,50 +229,66 @@ def start_process(pacient, num, start, end, sum_ind):
compressed_pickle(target + ".pkl", clean_dict)

# Process the rest of the files to get same channels as reference file
process_files(pacient, valid_channels, channels, start, end)


# PARAMETERS
signals_path = "/srv/local/data/physionet.org/files/chbmit/1.0.0" # Path to the data main directory
clean_path = "/srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals" # Path where to store clean data

if not os.path.exists(clean_path):
os.makedirs(clean_path)

# Clean patients one by one manually with these parameters
pacient = "04"
num = "01" # Reference file
summary_index = 0 # Index of channels summary reference
start = 28 # Number of first file to process
end = 28 # Number of last file to process
# Start the process
# start_process(pacient, num, start, end, summary_index)


# FULL DATA PROCESS
parameters = [
("01", "01", 2, 46, 0),
("02", "01", 2, 35, 0),
("03", "01", 2, 38, 0),
("05", "01", 2, 39, 0),
("06", "01", 2, 24, 0),
("07", "01", 2, 19, 0),
("08", "02", 3, 29, 0),
("10", "01", 2, 89, 0),
("11", "01", 2, 99, 0),
("14", "01", 2, 42, 0),
("20", "01", 2, 68, 0),
("21", "01", 2, 33, 0),
("22", "01", 2, 77, 0),
("23", "06", 7, 20, 0),
("24", "01", 3, 21, 0),
("04", "07", 1, 43, 1),
("09", "02", 1, 19, 1),
("15", "02", 1, 63, 1),
("16", "01", 2, 19, 0),
("18", "02", 1, 36, 1),
("19", "02", 1, 30, 1),
]

with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(start_process, parameters)
process_files(
pacient, valid_channels, channels, start, end, signals_path, clean_path
)


def parse_args():
parser = argparse.ArgumentParser(description="Process EDF files")
parser.add_argument(
"--config",
type=str,
default="config.ini",
help="Path to the configuration file",
)
return parser.parse_args()


def read_config(config_path):
config = configparser.ConfigParser()
config.read(config_path)
return config


def main(
parameters=[
("01", "01", 2, 46, 0),
("02", "01", 2, 35, 0),
("03", "01", 2, 38, 0),
("05", "01", 2, 39, 0),
("06", "01", 2, 24, 0),
("07", "01", 2, 19, 0),
("08", "02", 3, 29, 0),
("10", "01", 2, 89, 0),
("11", "01", 2, 99, 0),
("14", "01", 2, 42, 0),
("20", "01", 2, 68, 0),
("21", "01", 2, 33, 0),
("22", "01", 2, 77, 0),
("23", "06", 7, 20, 0),
("24", "01", 3, 21, 0),
("04", "07", 1, 43, 1),
("09", "02", 1, 19, 1),
("15", "02", 1, 63, 1),
("16", "01", 2, 19, 0),
("18", "02", 1, 36, 1),
("19", "02", 1, 30, 1),
]
):
args = parse_args()
config = read_config(args.config)
signals_path = config.get("Paths", "signals_path")
clean_path = config.get("Paths", "clean_path")
if not os.path.exists(clean_path):
os.makedirs(clean_path)

parameters = [
(p[0], p[1], p[2], p[3], p[4], signals_path, clean_path) for p in parameters
]
with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(start_process, parameters)


if __name__ == "__main__":
main()
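
Each tuple in the default parameters list is (pacient, reference file, first file, last file, summary index), matching the manual parameters documented in the removed block above. The compressed_pickle helper called from start_process is collapsed in this diff; what follows is a plausible sketch assuming only the bz2 and _pickle imports shown at the top of the file (whether the helper appends a further extension beyond the ".pkl" at the call site is not visible here).

import bz2
import _pickle as cPickle

def compressed_pickle(title, data):
    # Serialize data and write it bz2-compressed to the path title.
    with bz2.BZ2File(title, "wb") as f:
        cPickle.dump(data, f)

def decompress_pickle(path):
    # Inverse helper: load a bz2-compressed pickle back into memory.
    with bz2.BZ2File(path, "rb") as f:
        return cPickle.load(f)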
133 changes: 59 additions & 74 deletions datasets/CHB-MIT/process2.py
@@ -3,63 +3,11 @@
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
import argparse
import configparser

root = "/srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals"
out = "/srv/local/data/physionet.org/files/chbmit/1.0.0/clean_segments"

# root = 'clean_signals'
# out = 'clean_segments'

if not os.path.exists(out):
os.makedirs(out)

# dump chb23 and chb24 to test, chb21 and chb22 to val, and the rest to train
test_pats = ["chb23", "chb24"]
val_pats = ["chb21", "chb22"]
train_pats = [
"chb01",
"chb02",
"chb03",
"chb04",
"chb05",
"chb06",
"chb07",
"chb08",
"chb09",
"chb10",
"chb11",
"chb12",
"chb13",
"chb14",
"chb15",
"chb16",
"chb17",
"chb18",
"chb19",
"chb20",
]
channels = [
"FP1-F7",
"F7-T7",
"T7-P7",
"P7-O1",
"FP2-F8",
"F8-T8",
"T8-P8",
"P8-O2",
"FP1-F3",
"F3-C3",
"C3-P3",
"P3-O1",
"FP2-F4",
"F4-C4",
"C4-P4",
"P4-O2",
]
SAMPLING_RATE = 256


def sub_to_segments(folder, out_folder):
def sub_to_segments(folder, out_folder, root, channels, SAMPLING_RATE):
print(f"Processing {folder}...")
# each recording
for f in tqdm(os.listdir(os.path.join(root, folder))):
@@ -148,22 +96,59 @@ def sub_to_segments(folder, out_folder):
)


# parallel parameters
folders = os.listdir(root)
out_folders = []
for folder in folders:
if folder in test_pats:
out_folder = os.path.join(out, "test")
elif folder in val_pats:
out_folder = os.path.join(out, "val")
else:
out_folder = os.path.join(out, "train")

if not os.path.exists(out_folder):
os.makedirs(out_folder)

out_folders.append(out_folder)

# process in parallel
with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(sub_to_segments, zip(folders, out_folders))
def parse_args():
parser = argparse.ArgumentParser(description="Process EDF files")
parser.add_argument(
"--config",
type=str,
default="config.ini",
help="Path to the configuration file",
)
return parser.parse_args()


def read_config(config_path):
config = configparser.ConfigParser()
config.read(config_path)
return config


def main():
args = parse_args()
config = read_config(args.config)
root = config.get("Paths", "root")
out = config.get("Paths", "out")
test_pats = eval(config.get("Patients", "test_pats"))
val_pats = eval(config.get("Patients", "val_pats"))
channels = eval(config.get("Channels", "channels"))
SAMPLING_RATE = config.getint("SAMPLING_RATE", "rate")
if not os.path.exists(out):
os.makedirs(out)
folders = os.listdir(root)
out_folders = []
for folder in folders:
if folder in test_pats:
out_folder = os.path.join(out, "test")
elif folder in val_pats:
out_folder = os.path.join(out, "val")
else:
out_folder = os.path.join(out, "train")
if not os.path.exists(out_folder):
os.makedirs(out_folder)
out_folders.append(out_folder)
# process in parallel
with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(
sub_to_segments,
zip(
folders,
out_folders,
[root] * len(folders),
[channels] * len(folders),
[SAMPLING_RATE] * len(folders),
),
)


if __name__ == "__main__":
main()
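
A note on the fan-out in main() above: [root] * len(folders) materializes a list per shared argument just so starmap can zip them. An equivalent sketch using itertools.repeat (run_pool is a hypothetical wrapper name, not part of the PR) streams the shared values instead; zip stops at the shortest iterable, so the repeats are trimmed to len(folders) automatically.

from itertools import repeat
import multiprocessing as mp

def run_pool(folders, out_folders, root, channels, sampling_rate):
    # Same work distribution as main(), without building throwaway lists.
    with mp.Pool(mp.cpu_count()) as pool:
        pool.starmap(
            sub_to_segments,
            zip(folders, out_folders, repeat(root), repeat(channels), repeat(sampling_rate)),
        )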
7 changes: 5 additions & 2 deletions requirements.txt
@@ -1,6 +1,9 @@
torch
numpy
numpy==1.26.4
linear_attention_transformer
einops
pytorch_lightning
pytorch_lightning==1.9.5
tqdm
pyedflib==0.1.38
pyhealth==1.1.6
tensorboardX==2.6.2.2
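
A quick sanity check that the pins above resolved in the active environment; this assumes each package exposes __version__, which numpy, pyedflib, and pytorch_lightning all do.

import numpy
import pyedflib
import pytorch_lightning

# Expected values per the pins in requirements.txt.
assert numpy.__version__ == "1.26.4"
assert pyedflib.__version__ == "0.1.38"
assert pytorch_lightning.__version__ == "1.9.5"
print("pinned versions OK")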