Commit: module_run_issues

DaruiJin committed Dec 13, 2024
1 parent 6cfc13b commit 39360e7

Showing 11 changed files with 46 additions and 37 deletions.
11 changes: 6 additions & 5 deletions README.md
@@ -51,7 +51,8 @@ Each module is designed for a specific task. Below are the basic functionalities
#### :scissors: Tiling
Purpose: Converts whole slide images (WSI) into manageable image tiles for further processing.
```bash
-python preprocessing/tiling/slide_tiling.py --source_dir <WSIs_store_path> --source_list <slide_path_list.txt> --save_dir <tiles_path> --patch_size 256 --step_size 256 --mag 20
+cd preprocessing
+python -m tiling.main_create_tiles --source_dir <WSIs_store_path> --source_list <slide_path_list.txt> --save_dir <tiles_path> --patch_size 256 --step_size 256 --mag 20
```
Key arguments:
- `source_dir`: Path to the source slide image (.svs/.ndpi/...) directory.
@@ -61,7 +62,7 @@ Key arguments:
- `step_size`: Step size between neighboring tiles (default: 256).
- `mag`: Nominal magnification level of the slide (default: 20).

-If you have access to an LSF cluster, you can use `python preprocessing/tiling/run.py` to parallelize the tiling process, significantly reducing the processing latency. Before that, please update the `preprocessing/tiling/run.py` file with the correct paths and parameters.
+If you have access to an LSF cluster, you can use `python run_tiling.py` (run from the `preprocessing` directory) to parallelize the tiling process, significantly reducing processing latency. Before that, update the `preprocessing/run_tiling.py` file with the correct paths and parameters.
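For reference, a concrete single-machine invocation of the new module-style tiling command looks like this (all paths are hypothetical placeholders):

```bash
cd preprocessing
python -m tiling.main_create_tiles --source_dir /data/wsi \
    --source_list /data/wsi/slide_list.txt --save_dir /data/tiles \
    --patch_size 256 --step_size 256 --mag 20
```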

#### :wrench: Feature extraction
Purpose: Extracts features from the image tiles using a pre-trained model.
@@ -73,12 +74,12 @@ Key arguments:
- `feature_dir`: Path to the directory where the extracted features will be saved.
- `batchsize`: Batch size for feature extraction (default: 384).

-Similarly, if an LSF cluster is available, you can use `python preprocessing/feature_extraction/run.py`. Update the parameters in `preprocessing/feature_extraction/run.py` file before running.
+Similarly, if an LSF cluster is available, you can use `python run_extracting.py` (also run from the `preprocessing` directory). Update the parameters in the `preprocessing/run_extracting.py` file before running.
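The underlying extraction script can also be called directly on a single node; this invocation is inferred from the `run_extracting.py` command updated in this commit, with hypothetical paths and the README's default batch size:

```bash
cd preprocessing
python -W ignore feature_extraction/get_features.py \
    --split /data/splits/tile_list_0.txt --batchsize 384 --feature_dir /data/features
```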

#### :robot: Model training/evaluation
Purpose: Trains and evaluates the Hetairos model using features extracted in the previous step.
```bash
-python aggregator_train_val/model_run.py --dataset <dataset_path> --label <label.csv> --label_map <label_mapping.yaml> --split <split_file.yaml> --mode <train/test> --data_aug --soft_labels --exp_name <experiment_name>
+python -m aggregator_train_val.model_run --dataset <dataset_path> --label <label.csv> --label_map <label_mapping.yaml> --split <split_file.yaml> --mode <train/test> --data_aug --soft_labels --exp_name <experiment_name>
```
Key arguments:
- `dataset`: Path to the directory containing the extracted features (saved in .pt format).
@@ -113,7 +114,7 @@ The tumor locations that are available are:
The `pipeline.py` script is designed to run the complete pipeline from slide tiling to model training and evaluation in one go.

```bash
-python pipeline.py --tiling --model_run --slide_dir <WSIs_store_path> --slide_list <slide_path_list.txt> --tile_savedir <tiles_path> --feature_extraction --batchsize 256 --feature_dir <features_path> --model_run --dataset <dataset_path> --label <label.csv> --label_map <label_mapping.yaml> --split <split_file.yaml> --mode <train/test> --data_aug --soft_labels --exp_name <experiment_name>
+python pipeline.py --tiling --slide_dir <WSIs_store_path> --slide_list <slide_path_list.txt> --tile_savedir <tiles_path> --feature_extraction --batchsize 256 --feature_dir <features_path> --model_run --dataset <dataset_path> --label <label.csv> --label_map <label_mapping.yaml> --split <split_file.yaml> --mode <train/test> --data_aug --soft_labels --exp_name <experiment_name>
```

The key arguments `--tiling`, `--feature_extraction`, and `--model_run` specify which tasks to execute; at least one of them must be set when running the script. The rest of the arguments are the same as described in the individual modules.
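Since the stage flags are independent, the script can also run a single stage in isolation, for example tiling only (hypothetical paths):

```bash
python pipeline.py --tiling --slide_dir /data/wsi \
    --slide_list /data/wsi/slide_list.txt --tile_savedir /data/tiles
```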
Empty file.
12 changes: 6 additions & 6 deletions aggregator_train_val/config.yaml
@@ -4,17 +4,17 @@ General:
  grad_acc: 2
  patience: 10
  mode: test
-  log_path: logs/
+  log_path: ./aggregator_train_val/logs/
  seed: 13

Data:
-  data_dir: ./data/dataset
-  data_split: ./split/split.yaml
-  label_file: ./labels/labels.csv
+  data_dir: ./aggregator_train_val/data/dataset
+  data_split: ./aggregator_train_val/annot_files/split.yaml
+  label_file: ./aggregator_train_val/annot_files/labels.csv
  aug: True
  soft_labels: False
-  preds_save: ./results/
-  label_mapping: ./102_classes.yaml
+  preds_save: ./aggregator_train_val/results/
+  label_mapping: ./aggregator_train_val/102_classes.yaml
  batch_size: 1
  age_loc_drop_prob: 0.7
  aug_prob: 0.7
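All of the new paths are relative to the repository root, matching the module-style invocation introduced by this commit. A minimal sketch of loading this config, assuming the `read_yaml` helper used in `pipeline.py` is a thin wrapper over `yaml.safe_load` (the helper shown here is that assumption, not the repo's code):

```python
import yaml

def read_yaml(path):
    # Assumed equivalent of the repo's read_yaml helper.
    with open(path) as f:
        return yaml.safe_load(f)

cfg = read_yaml('./aggregator_train_val/config.yaml')
print(cfg['Data']['data_dir'])  # ./aggregator_train_val/data/dataset
print(cfg['General']['mode'])   # test
```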
12 changes: 6 additions & 6 deletions aggregator_train_val/model_run.py
@@ -13,25 +13,25 @@
def parse_arguments():
    parser = argparse.ArgumentParser(description='Parameters for CNS tumor classification')
    parser.add_argument('--dataset', type=str,
-                       default='./data/dataset', help='Path to the dataset directory containing embeddings generated by a feature extractor, saved in .pt format')
+                       default='./aggregator_train_val/data/dataset', help='Path to the dataset directory containing embeddings generated by a feature extractor, saved in .pt format')
    parser.add_argument('--label', type=str,
-                       default='./labels/labels.csv',
+                       default='./aggregator_train_val/annot_files/labels.csv',
                        help='Path to the slide label CSV file, which should contain columns including slide, family, probability vector, age, and location')
-   parser.add_argument('--split', type=str, default='./split/split.yaml',
+   parser.add_argument('--split', type=str, default='./aggregator_train_val/annot_files/split.yaml',
                        help='Path to the dataset split file (YAML) containing train and test slide IDs, structured as {"train": [slide_id], "test": [slide_id]}')
    parser.add_argument('--mode', type=str, default='train', help='Operation mode: train or test')
    parser.add_argument('--data_aug', action='store_true', help='Apply data augmentation during training')
    parser.add_argument('--soft_labels', action='store_true', help='Use soft labels during training')
    parser.add_argument('--exp_name', type=str, default='default_exp', help='Identifier for the experiment')

-   parser.add_argument('--output_dir', type=str, default='./predictions', help='Directory to save predictions')
+   parser.add_argument('--output_dir', type=str, default='./aggregator_train_val/predictions', help='Directory to save predictions')

    parser.add_argument('--model', type=str, default='ATransMIL', help='Model architecture to use')
    parser.add_argument('--groups', type=int, default=3, help='Number of slide matrix divisions')
    parser.add_argument('--classes', type=int, default=186, help='Output class number by the classifier')
    parser.add_argument('--cl_weight', type=float, default=20, help='Weight for contrastive loss')
-   parser.add_argument('--config', type=str, default='config.yaml', help='Path to configuration file')
-   parser.add_argument('--label_map', type=str, default='./annot_files/class_ID.yaml', help='Path to label mapping file')
+   parser.add_argument('--config', type=str, default='./aggregator_train_val/config.yaml', help='Path to configuration file')
+   parser.add_argument('--label_map', type=str, default='./aggregator_train_val/annot_files/class_ID.yaml', help='Path to label mapping file')
    parser.add_argument('--resume', action='store_true', help='Resume training from the latest checkpoint')
    return parser.parse_args()
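With these repo-root-relative defaults, a training run can now be launched from the repository root without overriding every path, e.g. (assuming the feature files, labels, and split file sit at the default locations):

```bash
python -m aggregator_train_val.model_run --mode train --data_aug --exp_name baseline_run
```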
2 changes: 1 addition & 1 deletion aggregator_train_val/utils.py
@@ -51,7 +51,7 @@ def load_loggers(cfg):

    log_dir = cfg.log_path / 'wandb'
    Path(log_dir).mkdir(exist_ok=True, parents=True)  # in case wandb fails to store the logs
-   wandb_logger = pl_loggers.WandbLogger(project='Paion', save_dir=str(cfg.log_path), name=cfg.Model.exp_name)
+   wandb_logger = pl_loggers.WandbLogger(project='Hetairos', save_dir=str(cfg.log_path), name=cfg.Model.exp_name)
    return wandb_logger
34 changes: 21 additions & 13 deletions pipeline.py
@@ -34,15 +34,18 @@ def parse_arguments():
    model_group = parser.add_argument_group('Model run arguments')
    model_group.add_argument('--dataset', type=str, default=None,
                             help='Path to the dataset directory. If not provided, the dataset will be generated from the feature extraction results')
-   model_group.add_argument('--label', type=str, default='./labels/labels.csv',
+   model_group.add_argument('--label', type=str, default='./aggregator_train_val/labels/labels.csv',
                             help='Path to the slide label CSV file, which should contain columns including slide, family, probability vector, age, and location')
+   model_group.add_argument('--label_map', type=str, default='./aggregator_train_val/annot_files/class_ID.yaml', help='Path to label mapping file')
    model_group.add_argument('--split', type=str, default=None,
                             help='Path to the dataset split file (YAML) containing train and test slide IDs, structured as {"train": [slide_id], "test": [slide_id]}. If not provided, the file will be generated from the dataset')
    model_group.add_argument('--mode', type=str, default='train', help='Operation mode: train or test')
    model_group.add_argument('--exp_name', type=str, default='default_exp', help='Identifier for the experiment')
-   model_group.add_argument('--output_dir', type=str, default='./predictions', help='Directory to save predictions')
+   model_group.add_argument('--output_dir', type=str, default='./aggregator_train_val/predictions', help='Directory to save predictions')
    model_group.add_argument('--resume', action='store_true', help='Resume training from the latest checkpoint')
+   model_group.add_argument('--config', type=str, default='./aggregator_train_val/config.yaml', help='Path to configuration file')
+   model_group.add_argument('--data_aug', action='store_true', help='Apply data augmentation during training')
+   model_group.add_argument('--soft_labels', action='store_true', help='Use soft labels during training')

    return parser.parse_args()

@@ -78,18 +81,23 @@
    try:
        testset = os.listdir(args.dataset)
        testset = [os.path.splitext(item)[0] for item in testset]
-       with open('./split.yaml', 'w') as f:
+       with open('./aggregator_train_val/split.yaml', 'w') as f:
            yaml.dump({'train': [], 'test': testset}, f)
-       args.split = './split.yaml'
+       args.split = './aggregator_train_val/split.yaml'
    except:
        print('No split file found, please provide the split file path')
        exit()

-   cfg = read_yaml(args.config)
-   cfg['Data']['data_dir'] = args.dataset
-   cfg['Data']['data_split'] = args.split
-   cfg['Data']['label_file'] = args.label
-   cfg['General']['mode'] = args.mode
-   cfg['Model']['exp_name'] = args.exp_name
-   cfg['Model']['preds_save'] = args.output_dir
-   cfg['resume'] = args.resume
-   model_run(cfg)
+   cfg = read_yaml(args.config)
+   cfg['Data']['data_dir'] = args.dataset
+   cfg['Data']['data_split'] = args.split
+   cfg['Data']['label_file'] = args.label
+   cfg['Data']['soft_labels'] = args.soft_labels
+   cfg['Data']['aug'] = args.data_aug
+   cfg['Data']['label_mapping'] = args.label_map
+   cfg['General']['mode'] = args.mode
+   cfg['Model']['exp_name'] = args.exp_name
+   cfg['Model']['preds_save'] = args.output_dir
+   cfg['resume'] = args.resume
+   model_run(cfg)
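When `--split` is not provided, the script derives one from the dataset directory and puts every slide in the test partition. For a dataset directory containing `slide_a.pt` and `slide_b.pt` (hypothetical names), the generated `./aggregator_train_val/split.yaml` would read (PyYAML sorts keys alphabetically by default):

```yaml
test:
- slide_a
- slide_b
train: []
```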
2 changes: 1 addition & 1 deletion preprocessing/feature_extraction/get_features.py
@@ -72,8 +72,8 @@ def extract_features(split, batchsize=768, feature_dir='./features'):
    database_loader = torch.utils.data.DataLoader(test_datat, batch_size=batchsize, shuffle=False)

    # change the name to the model you want to use here
-   os.environ['HF_HOME'] = './model_cache'
    model = timm.create_model("hf_hub:prov-gigapath/prov-gigapath", pretrained=True)
+
    model.cuda()
    model.eval()
preprocessing/feature_extraction/run.py → preprocessing/run_extracting.py
@@ -29,7 +29,7 @@
        f.write("%s\n" % item)

    batchsize = 768
-   cmd = f"python -W ignore get_features.py --split '{list_loc_tmp}' --batchsize {batchsize} --feature_dir {save_dir}"
+   cmd = f"python -W ignore feature_extraction/get_features.py --split '{list_loc_tmp}' --batchsize {batchsize} --feature_dir {save_dir}"
    bsub_cmd = f'bsub -gpu num=1:j_exclusive=yes:gmem=23.5G -R "rusage[mem=20G]" -L /bin/bash -q gpu -J {dataset}_{i} -o ./log_{i}.log -e ./log_{i}.err "source ~/.bashrc && {cmd}"'
    try:
        subprocess.run(bsub_cmd, shell=True)
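Filled in with hypothetical values (`dataset='brain'`, `i=0`, a temporary split list, and a feature directory), the submitted command expands to roughly:

```bash
bsub -gpu num=1:j_exclusive=yes:gmem=23.5G -R "rusage[mem=20G]" -L /bin/bash -q gpu \
    -J brain_0 -o ./log_0.log -e ./log_0.err \
    "source ~/.bashrc && python -W ignore feature_extraction/get_features.py --split '/tmp/slide_list_0.txt' --batchsize 768 --feature_dir /data/features"
```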
4 changes: 2 additions & 2 deletions preprocessing/tiling/run.py → preprocessing/run_tiling.py
@@ -37,8 +37,8 @@
    for item in slide_sub_list:
        f.write("%s\n" % item)

-   cmd = f'python -W ignore main_create_tiles.py --index {i} --source_list {list_loc_tmp} --save_dir {save_dir} --patch_size 256 --step_size 256 --mag 20'
-   bsub_cmd = f'bsub -R "rusage[mem=30G]" -J {dataset}_{i} -q long -o ./log_{i}.out -e ./log_{i}.err {cmd}'
+   cmd = f'python -W ignore -m tiling.main_create_tiles --index {i} --source_list {list_loc_tmp} --save_dir {save_dir} --patch_size 256 --step_size 256 --mag 20'
+   bsub_cmd = f'bsub -R "rusage[mem=30G]" -J {dataset}_{i} -q long -o tiling/log_file/log_{i}.out -e tiling/log_file/log_{i}.err {cmd}'
    try:
        subprocess.run(bsub_cmd, shell=True)
        time.sleep(1)
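One caveat with the new log destination: `bsub` does not create the directory passed to `-o`/`-e`, so if `tiling/log_file/` does not exist the job logs may never be written. A small guard before the submission loop (an addition suggested here, not part of the commit) avoids this:

```python
import os

# Make sure the LSF log destination exists before submitting jobs.
os.makedirs('tiling/log_file', exist_ok=True)
```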
Empty file.
4 changes: 2 additions & 2 deletions requirements.txt
@@ -11,7 +11,7 @@ pytorch-lightning==2.0.8
PyYAML==6.0
scikit-image==0.21.0
timm==1.0.8
-torch==2.5.1
+torch==1.13.0
torchmetrics==1.1.2
-torchvision==0.20.1
+torchvision==0.14.0
wandb==0.18.7
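torch 1.13.0 and torchvision 0.14.0 are a matching release pair. After `pip install -r requirements.txt`, a quick sanity check (a CUDA-capable machine is assumed, since `get_features.py` calls `model.cuda()`):

```python
import torch
import torchvision

print(torch.__version__, torchvision.__version__)  # expect 1.13.0 and 0.14.0
print(torch.cuda.is_available())                   # should be True on a GPU node
```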
