CHB-MIT pipeline #13

Open: wants to merge 5 commits into main
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
model/__pycache__/*
__pycache__/*
log/*
log-pretrain/*
**/config_test.ini
2 changes: 2 additions & 0 deletions config.ini
@@ -0,0 +1,2 @@
[Paths]
root = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_segments
16 changes: 16 additions & 0 deletions datasets/CHB-MIT/config.ini
@@ -0,0 +1,16 @@
[Paths]
signals_path = /srv/local/data/physionet.org/files/chbmit/1.0.0
clean_path = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals
root = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals
out = /srv/local/data/physionet.org/files/chbmit/1.0.0/clean_segments

[Patients]
test_pats = ["chb23", "chb24"]
val_pats = ["chb21", "chb22"]
train_pats = ["chb01", "chb02", "chb03", "chb04", "chb05", "chb06", "chb07", "chb08", "chb09", "chb10", "chb11", "chb12", "chb13", "chb14", "chb15", "chb16", "chb17", "chb18", "chb19", "chb20"]

[Channels]
channels = ["FP1-F7", "F7-T7", "T7-P7", "P7-O1", "FP2-F8", "F8-T8", "T8-P8", "P8-O2", "FP1-F3", "F3-C3", "C3-P3", "P3-O1", "FP2-F4", "F4-C4", "C4-P4", "P4-O2"]

[SAMPLING_RATE]
rate = 256
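
For reference, a minimal sketch of how this config could be consumed with the standard library. The section and key names come from the file above; ast.literal_eval is shown as a safer stand-in for the eval() calls used in process2.py below, since it parses Python literals without executing arbitrary code.

import ast
import configparser

config = configparser.ConfigParser()
config.read("datasets/CHB-MIT/config.ini")

# Plain string paths.
signals_path = config.get("Paths", "signals_path")
out = config.get("Paths", "out")

# List-valued options are stored as Python literals; ast.literal_eval
# parses them without executing code, unlike eval().
test_pats = ast.literal_eval(config.get("Patients", "test_pats"))
channels = ast.literal_eval(config.get("Channels", "channels"))

# Typed getter for the integer sampling rate.
rate = config.getint("SAMPLING_RATE", "rate")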
124 changes: 71 additions & 53 deletions datasets/CHB-MIT/process1.py
@@ -1,14 +1,14 @@
import ast
import os
import argparse
import configparser
import multiprocessing as mp
from collections import defaultdict
import pyedflib
import pyedflib.highlevel as hl
import numpy as np
import copy
import shutil
import bz2
import pickle
import _pickle as cPickle


# Pickle a file and then compress it into a file with extension
@@ -101,7 +101,9 @@ def move_channels(clean_dict, channels, target):


# Process the EDF files of a patient from start number to end number
def process_files(pacient, valid_channels, channels, start, end):
def process_files(
pacient, valid_channels, channels, start, end, signals_path, clean_path
):
for num in range(start, end + 1):
to_keep = []

@@ -162,7 +164,7 @@ def process_files(pacient, valid_channels, channels, start, end):
move_channels(clean_dict, channels, target)


def start_process(pacient, num, start, end, sum_ind):
def start_process(pacient, num, start, end, sum_ind, signals_path, clean_path):
# Summary file
f = open(
"{path}/chb{p}/chb{p}-summary.txt".format(path=signals_path, p=pacient), "r"
@@ -227,50 +229,66 @@ def start_process(pacient, num, start, end, sum_ind):
compressed_pickle(target + ".pkl", clean_dict)

# Process the rest of the files to get same channels as reference file
process_files(pacient, valid_channels, channels, start, end)


# PARAMETERS
signals_path = "/srv/local/data/physionet.org/files/chbmit/1.0.0" # Path to the data main directory
clean_path = "/srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals" # Path where to store clean data

if not os.path.exists(clean_path):
os.makedirs(clean_path)

# Clean patients one by one manually with these parameters
pacient = "04"
num = "01" # Reference file
summary_index = 0 # Index of channels summary reference
start = 28 # Number of first file to process
end = 28 # Number of last file to process
# Start the process
# start_process(pacient, num, start, end, summary_index)


# FULL DATA PROCESS
parameters = [
("01", "01", 2, 46, 0),
("02", "01", 2, 35, 0),
("03", "01", 2, 38, 0),
("05", "01", 2, 39, 0),
("06", "01", 2, 24, 0),
("07", "01", 2, 19, 0),
("08", "02", 3, 29, 0),
("10", "01", 2, 89, 0),
("11", "01", 2, 99, 0),
("14", "01", 2, 42, 0),
("20", "01", 2, 68, 0),
("21", "01", 2, 33, 0),
("22", "01", 2, 77, 0),
("23", "06", 7, 20, 0),
("24", "01", 3, 21, 0),
("04", "07", 1, 43, 1),
("09", "02", 1, 19, 1),
("15", "02", 1, 63, 1),
("16", "01", 2, 19, 0),
("18", "02", 1, 36, 1),
("19", "02", 1, 30, 1),
]

with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(start_process, parameters)
process_files(
pacient, valid_channels, channels, start, end, signals_path, clean_path
)


def parse_args():
parser = argparse.ArgumentParser(description="Process EDF files")
parser.add_argument(
"--config",
type=str,
default="config.ini",
help="Path to the configuration file",
)
return parser.parse_args()


def read_config(config_path):
config = configparser.ConfigParser()
config.read(config_path)
return config


def main(
parameters=[
("01", "01", 2, 46, 0),
("02", "01", 2, 35, 0),
("03", "01", 2, 38, 0),
("05", "01", 2, 39, 0),
("06", "01", 2, 24, 0),
("07", "01", 2, 19, 0),
("08", "02", 3, 29, 0),
("10", "01", 2, 89, 0),
("11", "01", 2, 99, 0),
("14", "01", 2, 42, 0),
("20", "01", 2, 68, 0),
("21", "01", 2, 33, 0),
("22", "01", 2, 77, 0),
("23", "06", 7, 20, 0),
("24", "01", 3, 21, 0),
("04", "07", 1, 43, 1),
("09", "02", 1, 19, 1),
("15", "02", 1, 63, 1),
("16", "01", 2, 19, 0),
("18", "02", 1, 36, 1),
("19", "02", 1, 30, 1),
]
):
args = parse_args()
config = read_config(args.config)
signals_path = config.get("Paths", "signals_path")
clean_path = config.get("Paths", "clean_path")
if not os.path.exists(clean_path):
os.makedirs(clean_path)

parameters = [
(p[0], p[1], p[2], p[3], p[4], signals_path, clean_path) for p in parameters
]
with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(start_process, parameters)


if __name__ == "__main__":
main()
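
Each tuple in the default parameters list is (pacient, reference file, first file, last file, summary index), matching the manual parameters documented in the removed block above. The compressed_pickle helper called from start_process is collapsed in this diff; what follows is a plausible sketch assuming only the bz2 and _pickle imports shown at the top of the file (whether the helper appends a further extension beyond the ".pkl" at the call site is not visible here).

import bz2
import _pickle as cPickle

def compressed_pickle(title, data):
    # Serialize data and write it bz2-compressed to the path title.
    with bz2.BZ2File(title, "wb") as f:
        cPickle.dump(data, f)

def decompress_pickle(path):
    # Inverse helper: load a bz2-compressed pickle back into memory.
    with bz2.BZ2File(path, "rb") as f:
        return cPickle.load(f)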
133 changes: 59 additions & 74 deletions datasets/CHB-MIT/process2.py
@@ -3,63 +3,11 @@
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
import argparse
import configparser

root = "/srv/local/data/physionet.org/files/chbmit/1.0.0/clean_signals"
out = "/srv/local/data/physionet.org/files/chbmit/1.0.0/clean_segments"

# root = 'clean_signals'
# out = 'clean_segments'

if not os.path.exists(out):
os.makedirs(out)

# dump chb23 and chb24 to test, chb21 and chb22 to val, and the rest to train
test_pats = ["chb23", "chb24"]
val_pats = ["chb21", "chb22"]
train_pats = [
"chb01",
"chb02",
"chb03",
"chb04",
"chb05",
"chb06",
"chb07",
"chb08",
"chb09",
"chb10",
"chb11",
"chb12",
"chb13",
"chb14",
"chb15",
"chb16",
"chb17",
"chb18",
"chb19",
"chb20",
]
channels = [
"FP1-F7",
"F7-T7",
"T7-P7",
"P7-O1",
"FP2-F8",
"F8-T8",
"T8-P8",
"P8-O2",
"FP1-F3",
"F3-C3",
"C3-P3",
"P3-O1",
"FP2-F4",
"F4-C4",
"C4-P4",
"P4-O2",
]
SAMPLING_RATE = 256


def sub_to_segments(folder, out_folder):
def sub_to_segments(folder, out_folder, root, channels, SAMPLING_RATE):
print(f"Processing {folder}...")
# each recording
for f in tqdm(os.listdir(os.path.join(root, folder))):
@@ -148,22 +96,59 @@ def sub_to_segments(folder, out_folder):
)


# parallel parameters
folders = os.listdir(root)
out_folders = []
for folder in folders:
if folder in test_pats:
out_folder = os.path.join(out, "test")
elif folder in val_pats:
out_folder = os.path.join(out, "val")
else:
out_folder = os.path.join(out, "train")

if not os.path.exists(out_folder):
os.makedirs(out_folder)

out_folders.append(out_folder)

# process in parallel
with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(sub_to_segments, zip(folders, out_folders))
def parse_args():
parser = argparse.ArgumentParser(description="Process EDF files")
parser.add_argument(
"--config",
type=str,
default="config.ini",
help="Path to the configuration file",
)
return parser.parse_args()


def read_config(config_path):
config = configparser.ConfigParser()
config.read(config_path)
return config


def main():
args = parse_args()
config = read_config(args.config)
root = config.get("Paths", "root")
out = config.get("Paths", "out")
test_pats = eval(config.get("Patients", "test_pats"))
val_pats = eval(config.get("Patients", "val_pats"))
channels = eval(config.get("Channels", "channels"))
SAMPLING_RATE = config.getint("SAMPLING_RATE", "rate")
if not os.path.exists(out):
os.makedirs(out)
folders = os.listdir(root)
out_folders = []
for folder in folders:
if folder in test_pats:
out_folder = os.path.join(out, "test")
elif folder in val_pats:
out_folder = os.path.join(out, "val")
else:
out_folder = os.path.join(out, "train")
if not os.path.exists(out_folder):
os.makedirs(out_folder)
out_folders.append(out_folder)
# process in parallel
with mp.Pool(mp.cpu_count()) as pool:
res = pool.starmap(
sub_to_segments,
zip(
folders,
out_folders,
[root] * len(folders),
[channels] * len(folders),
[SAMPLING_RATE] * len(folders),
),
)


if __name__ == "__main__":
main()
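
A note on the fan-out in main() above: [root] * len(folders) materializes a list per shared argument just so starmap can zip them. An equivalent sketch using itertools.repeat (run_pool is a hypothetical wrapper name, not part of the PR) streams the shared values instead; zip stops at the shortest iterable, so the repeats are trimmed to len(folders) automatically.

from itertools import repeat
import multiprocessing as mp

def run_pool(folders, out_folders, root, channels, sampling_rate):
    # Same work distribution as main(), without building throwaway lists.
    with mp.Pool(mp.cpu_count()) as pool:
        pool.starmap(
            sub_to_segments,
            zip(folders, out_folders, repeat(root), repeat(channels), repeat(sampling_rate)),
        )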
7 changes: 5 additions & 2 deletions requirements.txt
@@ -1,6 +1,9 @@
torch
numpy
numpy==1.26.4
linear_attention_transformer
einops
pytorch_lightning
pytorch_lightning==1.9.5
tqdm
pyedflib==0.1.38
pyhealth==1.1.6
tensorboardX==2.6.2.2
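
A quick sanity check that the pins above resolved in the active environment; this assumes each package exposes __version__, which numpy, pyedflib, and pytorch_lightning all do.

import numpy
import pyedflib
import pytorch_lightning

# Expected values per the pins in requirements.txt.
assert numpy.__version__ == "1.26.4"
assert pyedflib.__version__ == "0.1.38"
assert pytorch_lightning.__version__ == "1.9.5"
print("pinned versions OK")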