Skip to content

Commit

Permalink
latest change
Browse files Browse the repository at this point in the history
  • Loading branch information
dungscout96 committed Sep 25, 2023
1 parent 5497c12 commit 79527dc
Show file tree
Hide file tree
Showing 22 changed files with 752 additions and 24 deletions.
11 changes: 11 additions & 0 deletions HED/summary/column_values_summary_cmd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[{
"operation": "summarize_column_values",
"description": "Summarize the column values in an excerpt.",
"parameters": {
"summary_name": "column_values",
"summary_filename": "column_values",
"skip_columns": ["onset", "duration", "sample", "HED"],
"value_columns": ["stim_file", "response_time"],
"max_categorical": 100
}
}]
21 changes: 21 additions & 0 deletions HED/summary/hed_summary_cmd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[{
"operation": "summarize_hed_tags",
"description": "Summarize the HED tags in the dataset.",
"parameters": {
"summary_name": "summarize_hed_tags",
"summary_filename": "summarize_hed_tags",
"tags": {
"Sensory events": ["Sensory-event", "Sensory-presentation", "Sensory-attribute",
"Experimental-stimulus", "Task-stimulus-role",
"Task-attentional-demand", "Incidental", "Instructional", "Warning"],
"Agent actions": ["Agent-action", "Agent", "Action", "Agent-task-role",
"Task-action-type", "Participant-response"],
"Objects": ["Item"],
"Other events": ["Event", "Task-event-role", "Mishap"],
"Exclude tags": ["Def", "Definition", "Event-context", "Def-expand", "Label", "Description"]
},
"include_context": true,
"replace_defs": true,
"remove_types": ["Condition-variable", "Task"]
}
}]
13 changes: 13 additions & 0 deletions HED/summary/hed_summary_sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Slurm batch script: run the HED tag summary pipeline (run_hed_summary.py)
# on the Expanse cluster. Submit with: sbatch hed_summary_sbatch
#SBATCH -J hed-summary
#SBATCH --partition=shared
#SBATCH --nodes=1
#SBATCH --mem=128G
#SBATCH --account=csd403
#SBATCH --no-requeue
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=1
#SBATCH --output="/home/dtyoung/NEMAR-pipeline/HED/summary/hed_summary.out"
#SBATCH -e /home/dtyoung/NEMAR-pipeline/HED/summary/hed_summary.err
# Run from the summary directory so the script's relative paths
# (./hed_summary_cmd.json, ./word_mask.png, ./run_hed_summary.err) resolve.
cd /home/dtyoung/NEMAR-pipeline/HED/summary
python run_hed_summary.py
10 changes: 10 additions & 0 deletions HED/summary/hed_type_summary_cmd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[{
"operation": "summarize_hed_type",
"description": "Summarize conditional variable",
"parameters": {
"summary_name": "hed_type_summary",
"summary_filename": "hed_type_summary",
"type_tag": "condition-variable"
}
}]

3 changes: 3 additions & 0 deletions HED/summary/notes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Bookkeeping from HED summary batch runs over OpenNeuro datasets
# (see run_hed_summary.py / run_event_summary.py in this directory).
# NOTE(review): the lists are hand-maintained; names suggest "failed_and_has_events"
# are failed runs whose datasets do contain events files — confirm with the run logs.
# 'metadata' in `failed` looks like a stray directory name, not a dataset id.
failed_and_has_events = ['ds004043', 'ds002691', 'ds004278', 'ds004011', 'ds004033', 'ds004603', 'ds000117', 'ds004368', 'ds003987', 'ds004019', 'ds003885', 'ds004315', 'ds003602', 'ds003844', 'ds002723', 'ds004252', 'ds004577', 'ds004078', 'ds004561', 'ds004256', 'ds002034', 'ds004317', 'ds002725', 'ds004080', 'ds003638', 'ds004357', 'ds003352', 'ds003710', 'ds004330', 'ds003848', 'ds003766', 'ds002761', 'ds004346', 'ds004212', 'ds004505', 'ds003195', 'ds004477', 'ds004152', 'ds004515', 'ds004264', 'ds004196', 'ds004395', 'ds002721', 'ds001787', 'ds001810', 'ds002893', 'ds004018', 'ds003816', 'ds004519', 'ds004554', 'ds004574', 'ds004107', 'ds004446', 'ds004572', 'ds003505', 'ds003801', 'ds004532', 'ds003570', 'ds004262', 'ds003800', 'ds004100', 'ds004147', 'ds004295', 'ds004306', 'ds004580', 'ds004444', 'ds004511', 'ds004197', 'ds002720', 'ds004473', 'ds002158', 'ds003194', 'ds004215', 'ds002833', 'ds004367', 'ds003670', 'ds004369', 'ds004151', 'ds004194', 'ds003039', 'ds004579', 'ds002778', 'ds002718', 'ds004460', 'ds003374', 'ds000248', 'ds003753', 'ds004229', 'ds004575', 'ds004457', 'ds004347', 'ds002791', 'ds001971', 'ds004017', 'ds003751', 'ds003876', 'ds003688', 'ds003754', 'ds003694', 'ds004502', 'ds003822', 'ds004356', 'ds003922', 'ds004148', 'ds002799', 'ds002722', 'ds002094', 'ds004024', 'ds003838', 'ds004584', 'ds003739', 'ds004040', 'ds004521', 'ds004276', 'ds004015', 'ds003190', 'ds004448', 'ds002680', 'ds004010', 'ds004588', 'ds002578', 'ds004520', 'ds004284', 'ds002218', 'ds004348', 'ds003846', 'ds002724', 'ds003887', 'ds004447', 'ds003774']
# All dataset runs that failed (superset of the list above).
failed = ['ds004043', 'ds002691', 'ds004278', 'ds004033', 'ds004011', 'ds004603', 'ds000117', 'ds004368', 'ds003775', 'ds003987', 'ds004019', 'ds004551', 'ds004315', 'ds003602', 'ds003885', 'ds003844', 'ds002723', 'ds003947', 'ds002885', 'ds004252', 'ds004577', 'ds002001', 'ds004078', 'ds004166', 'ds004561', 'ds004256', 'ds002034', 'ds003702', 'ds004317', 'ds002725', 'ds004080', 'ds004200', 'ds003638', 'ds004357', 'ds003352', 'ds003710', 'ds004330', 'ds003848', 'ds002336', 'ds003766', 'ds002761', 'ds004346', 'ds004212', 'ds004447', 'ds003195', 'ds004477', 'ds004152', 'ds003810', 'ds004515', 'ds004264', 'ds004196', 'ds004395', 'ds002721', 'ds001787', 'ds001810', 'ds002893', 'ds004018', 'ds003816', 'ds004519', 'ds004554', 'ds004574', 'ds003555', 'ds004381', 'ds004107', 'ds004446', 'ds004572', 'ds003505', 'ds003801', 'ds004532', 'ds003570', 'ds004262', 'ds004398', 'ds004127', 'ds003800', 'ds004100', 'ds004147', 'ds004295', 'ds004306', 'ds004580', 'ds004444', 'ds004511', 'ds004197', 'ds004000', 'ds002720', 'ds004473', 'ds002158', 'ds003194', 'ds004215', 'ds003944', 'ds002833', 'ds004367', 'ds003670', 'ds004369', 'ds003078', 'ds004151', 'ds003969', 'ds004075', 'ds004408', 'ds004194', 'ds003039', 'ds004579', 'ds003626', 'ds002718', 'ds002778', 'ds004460', 'ds003374', 'ds000248', 'ds001784', 'ds000246', 'ds003753', 'ds003768', 'ds004229', 'ds002908', 'ds004575', 'ds004457', 'ds004347', 'ds002791', 'ds001971', 'ds004017', 'ds003751', 'ds002338', 'ds003876', 'ds003688', 'ds003754', 'ds003694', 'ds004502', 'ds003822', 'ds004356', 'ds003922', 'metadata', 'ds001849', 'ds004148', 'ds002799', 'ds002722', 'ds002094', 'ds004024', 'ds003838', 'ds003805', 'ds004022', 'ds004584', 'ds003739', 'ds004040', 'ds004521', 'ds004276', 'ds003190', 'ds000247', 'ds004015', 'ds004448', 'ds002680', 'ds004067', 'ds004010', 'ds004588', 'ds002578', 'ds004520', 'ds004284', 'ds004186', 'ds002218', 'ds004504', 'ds004348', 'ds003846', 'ds002724', 'ds003887', 'ds003380', 'ds004505', 'ds003774']
# Dataset runs that completed successfully.
finished = ['ds001785', 'ds002550', 'ds002712', 'ds002814', 'ds003004', 'ds003029', 'ds003061', 'ds003082', 'ds003104', 'ds003343', 'ds003392', 'ds003420', 'ds003421', 'ds003458', 'ds003474', 'ds003478', 'ds003483', 'ds003490', 'ds003498', 'ds003506', 'ds003509', 'ds003516', 'ds003517', 'ds003518', 'ds003519', 'ds003522', 'ds003523', 'ds003568', 'ds003574', 'ds003620', 'ds003633', 'ds003645', 'ds003655', 'ds003682', 'ds003690', 'ds003703', 'ds003708', 'ds003825', 'ds004105', 'ds004106', 'ds004117', 'ds004118', 'ds004119', 'ds004120', 'ds004121', 'ds004122', 'ds004123', 'ds004350', 'ds004362']
36 changes: 36 additions & 0 deletions HED/summary/run_event_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

import sys
sys.path.insert(0, "../hed_python")
from hed.tools.remodeling.cli.run_remodel import main
import os
import re
import shutil

# Run the column-values event summary remodeler over OpenNeuro datasets and
# copy the newest summary into each dataset's work dir as events_report.json.
raw_dir = '/expanse/projects/nemar/openneuro'
model_path = './column_values_summary_cmd.json'
outputdir = '/expanse/projects/nemar/openneuro/processed/event_summaries'

# Datasets to process. TODO: pull this from the NEMAR database; earlier
# batches are recorded in notes.py.
dsnumbers = ['ds003645']
for f in dsnumbers:
    print(f'processing {f}')
    data_root = os.path.join(raw_dir, f)      # raw BIDS dataset root
    work_dir = os.path.join(outputdir, f)     # per-dataset output directory
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the original.
    os.makedirs(work_dir, exist_ok=True)
    if os.path.isdir(data_root):
        # CLI flags for hed.tools.remodeling.cli.run_remodel — see its docs
        # for exact semantics (presumably -x excludes dirs, -w is work dir).
        arg_list1 = [data_root, model_path, '-x', 'derivatives', 'code', 'stimuli',
                     '-nb', '-nu', '-w', work_dir, '-b', '-i', 'none', '-v']
        try:
            main(arg_list1)
            summary_outputdir = os.path.join(work_dir, 'remodel', 'summaries', 'column_values')
            # The remodeler writes timestamped files; escape the dot so the
            # pattern really requires a .json suffix, then keep the newest.
            summaries = [file for file in os.listdir(summary_outputdir)
                         if re.match(r'column_values.*\.json', file)]
            if summaries:
                summaries.sort()
                summary_outputfile = os.path.join(summary_outputdir, summaries[-1])
                shutil.copyfile(summary_outputfile,
                                os.path.join(work_dir, 'events_report.json'))
        except Exception as e:
            print(f"Error for {f}")
            print(e)

109 changes: 109 additions & 0 deletions HED/summary/run_hed_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@

import sys
sys.path.insert(0, "../hed_python")
from hed.tools.remodeling.cli.run_remodel import main
from hed.tools.visualization import summary_to_dict, create_wordcloud, word_cloud_to_svg
import os
import re
import json

def generate_json_report(hed_summary, output):
    """Condense a remodeler ``summarize_hed_tags`` JSON summary into a report.

    Parameters
    ----------
    hed_summary : str
        Path to the summarize_hed_tags JSON file produced by the HED remodeler.
    output : str
        Path where the condensed JSON report is written.

    Returns
    -------
    dict
        The condensed summary that was also written to ``output``. Contains
        'event files', 'events', 'events/file', 'Main tags', 'Other tags' and
        a (currently unpopulated) 'Condition variables' entry.
    """
    summary = {'Main tags': {}, 'Other tags': [],
               # TODO: populate from the hed_type_summary output once the
               # condition-variable summary is re-enabled in the pipeline.
               'Condition variables': {}}
    with open(hed_summary, 'r') as f:
        data = json.load(f)

    overall = data['Overall summary']
    nfiles = overall['Total files']
    nevents = overall['Total events']
    summary['event files'] = nfiles
    summary['events'] = nevents
    # Guard against an empty dataset (original raised ZeroDivisionError).
    summary['events/file'] = nevents / nfiles if nfiles else 0

    # Keep only the tag name and event count for each reported tag.
    for main_tag, tag_list in overall['Specifics']['Main tags'].items():
        summary['Main tags'][main_tag] = [
            {'tag': t['tag'], 'events': t['events']} for t in tag_list]
    summary['Other tags'] = [
        {'tag': t['tag'], 'events': t['events']}
        for t in overall['Specifics']['Other tags']]

    with open(output, 'w') as out:
        json.dump(summary, out)

    return summary

def generate_wordcloud(summary_file, work_dir):
    """Render a word-cloud SVG from a summarize_hed_tags JSON summary.

    Reads ``summary_file``, converts it to word frequencies via
    ``summary_to_dict``, and writes ``word_cloud.svg`` into ``work_dir``.
    """
    with open(summary_file) as fin:
        tag_summary = json.load(fin)

    frequencies = summary_to_dict(tag_summary)
    # word_mask.png shapes the cloud; width=None lets the library derive it.
    cloud = create_wordcloud(frequencies, mask_path="./word_mask.png",
                             height=400, width=None)
    svg_text = word_cloud_to_svg(cloud)
    with open(work_dir + "/word_cloud.svg", "w") as outfile:
        outfile.writelines(svg_text)


# Run the HED tag summary remodeler over each dataset and condense the newest
# summary into hed_report.json; failures are appended to an error log.
raw_dir = '/expanse/projects/nemar/openneuro'
hed_summary_model_path = './hed_summary_cmd.json'
outputdir = '/expanse/projects/nemar/openneuro/processed/event_summaries'
error_logfile = './run_hed_summary.err'
run_wordcloud = False

# TODO: use NEMAR database instead of a hard-coded list.
dsnumbers = ['ds004635','ds004588','ds004554','ds004521','ds004520','ds004519','ds004362','ds004350','ds004166','ds004123','ds004122','ds004121','ds004120','ds004119','ds004118','ds004117','ds004106','ds004105','ds003645','ds003061','ds002718']

# `with` guarantees the log is flushed/closed even if the loop raises
# (the original left fid_err open on an unexpected error).
with open(error_logfile, 'w') as fid_err:
    for f in dsnumbers:
        print(f'processing {f}')
        try:
            data_root = os.path.join(raw_dir, f)
            work_dir = os.path.join(outputdir, f)
            # makedirs(exist_ok=True) avoids the exists()/mkdir() race.
            os.makedirs(work_dir, exist_ok=True)
            if os.path.isdir(data_root):
                # CLI flags for hed.tools.remodeling.cli.run_remodel — see its
                # docs for exact semantics.
                arg_list1 = [data_root, hed_summary_model_path, '-x', 'derivatives',
                             'code', 'stimuli', 'sourcedata', '.datalad',
                             '-nu', '-nb', '-w', work_dir, '-b', '-i', 'none', '-v']
                main(arg_list1)
                hed_summary_outputdir = os.path.join(
                    work_dir, 'remodel', 'summaries', 'summarize_hed_tags')
                # Bug fix: the original pattern 'summarize_hed_tags.json' never
                # matched the timestamped files the sort/[-1] logic expects;
                # also escape the dot so .json is matched literally.
                hed_summaries = [file for file in os.listdir(hed_summary_outputdir)
                                 if re.match(r'summarize_hed_tags.*\.json', file)]
                if hed_summaries:
                    hed_summaries.sort()
                    hed_summary_outputfile = os.path.join(
                        hed_summary_outputdir, hed_summaries[-1])
                    generate_json_report(hed_summary_outputfile,
                                         os.path.join(work_dir, 'hed_report.json'))
                    # Inside the if-branch so a missing summary can no longer
                    # trigger a NameError when run_wordcloud is enabled.
                    if run_wordcloud:
                        generate_wordcloud(hed_summary_outputfile, work_dir)
        except Exception as e:
            fid_err.write(f'Error processing {f}: {e}\n')

def get_hed_datasets():
    """Placeholder: will query the NEMAR database for HED-annotated datasets."""
    # TODO: implement the lookup and replace the hard-coded dsnumbers above.
    return None
39 changes: 39 additions & 0 deletions HED/summary/run_word_cloud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

import sys
sys.path.insert(0, "../hed_python")
from hed.tools.visualization import summary_to_dict, create_wordcloud, word_cloud_to_svg
import os
import re
import shutil
import json

# Re-render word-cloud SVGs from previously generated summarize_hed_tags
# summaries, without re-running the remodeler.
raw_dir = '/expanse/projects/nemar/openneuro'
outputdir = '/expanse/projects/nemar/openneuro/processed/event_summaries'

# TODO: use NEMAR database instead of a hard-coded list.
dsnumbers = ['ds004635','ds004588','ds004554','ds004521','ds004520','ds004519','ds004362','ds004350','ds004166','ds004123','ds004122','ds004121','ds004120','ds004119','ds004118','ds004117','ds004106','ds004105','ds003645','ds003061','ds002718']
for f in dsnumbers:
    print(f'processing {f}')
    data_root = os.path.join(raw_dir, f)
    work_dir = os.path.join(outputdir, f)
    hed_summary_outputfile = os.path.join(
        work_dir, 'remodel', 'summaries', 'summarize_hed_tags',
        'summarize_hed_tags.json')
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the original.
    os.makedirs(work_dir, exist_ok=True)
    if os.path.isdir(data_root):
        try:
            with open(hed_summary_outputfile, 'r') as fin:
                hed_summary = json.load(fin)
            loaded_dict = summary_to_dict(hed_summary)

            # word_mask.png shapes the cloud; width=None lets the library
            # derive it from the mask.
            word_cloud = create_wordcloud(loaded_dict, mask_path="./word_mask.png",
                                          height=400, width=None)
            svg_data = word_cloud_to_svg(word_cloud)
            # write() the whole string instead of writelines() iterating chars.
            with open(os.path.join(work_dir, "word_cloud.svg"), "w") as outfile:
                outfile.write(svg_data)
        except Exception as e:
            print(f"Error for {f}")
            print(e)


Binary file added HED/summary/word_mask.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 79527dc

Please sign in to comment.