97 changes: 97 additions & 0 deletions arccnet/catalogs/datasets.py
@@ -0,0 +1,97 @@
import numpy as np

from astropy.table import QTable

from arccnet.data_generation.utils.utils import grouped_stratified_split

# CLI to tar and compress
# tar -cf - fits quicklook cutout-magnetic-catalog-v20231027.parq cutout-mcintosh-catalog-v20231027.parq | pigz > arccnet-cutout-dataset-v20231027.tar.gz
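# (pigz is a parallel gzip; the archive bundles the fits/ and quicklook/ directories
# together with the two parquet catalogs written below)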


def create_cutout_classification_datasets(cutout_catalog_path, random_seed=42):
    # Set the seed so the splits are reproducible
    np.random.seed(random_seed)

    cutout_cat = QTable.read(cutout_catalog_path)

    filtered_cat = cutout_cat[~(cutout_cat["filtered_mdi"] | cutout_cat["filtered_hmi"])]

    # Only 1D columns can be converted to pandas
    cols = [name for name in filtered_cat.colnames if len(filtered_cat[name].shape) <= 1]

    filtered_cat_df = filtered_cat[cols].to_pandas()

    # Add new QS (quiet Sun) and PLG (plage) classes to both the magnetic (Hale) and
    # McIntosh classifications
    filtered_cat_df.loc[filtered_cat_df.region_type == "QS", "magnetic_class"] = "QS"
    filtered_cat_df.loc[filtered_cat_df.region_type == "QS", "mcintosh_class"] = "QS"

    filtered_cat_df.loc[filtered_cat_df.region_type == "IA", "magnetic_class"] = "PLG"  # plage
    filtered_cat_df.loc[filtered_cat_df.region_type == "IA", "mcintosh_class"] = "PLG"

    # Only the local FITS files (not the source files) are tarred up, so rewrite the
    # paths to be relative to that folder
    def make_paths_relative(cat, col):
        relpaths = ["/".join(p.split("/")[-2:]) for p in cat[col][~cat[col].mask]]
        cat[col][~cat[col].mask] = relpaths
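        # e.g. "/data/arccnet/fits/cutout_12345.fits" -> "fits/cutout_12345.fits"
        # (hypothetical path; only the last two path components are kept)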

    make_paths_relative(filtered_cat, "path_image_cutout_hmi")
    make_paths_relative(filtered_cat, "quicklook_path_hmi")
    make_paths_relative(filtered_cat, "path_image_cutout_mdi")
    make_paths_relative(filtered_cat, "quicklook_path_mdi")

    # Make the train / test split
    mag_train, mag_test = grouped_stratified_split(
        filtered_cat_df, group_col="number", class_col="magnetic_class", train_size=0.8, test_size=0.2
    )
    hale_train, hale_test = grouped_stratified_split(
        filtered_cat_df, group_col="number", class_col="mcintosh_class", train_size=0.8, test_size=0.2
    )
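    # Grouping on NOAA number keeps all cutouts of a given active region in the same
    # split (no train/test leakage), while stratification preserves the class balance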

    mag_train_cat = filtered_cat[mag_train]
    # mag_test_cat = filtered_cat[mag_test]

    hale_train_cat = filtered_cat[hale_train]
    # hale_test_cat = filtered_cat[hale_test]

    mag_train_cat.write("cutout-magnetic-catalog-v20231027.parq")
    hale_train_cat.write("cutout-mcintosh-catalog-v20231027.parq")


def create_detection_datasets(detection_catalog, random_seed=42):
    # Set the seed so the splits are reproducible
    np.random.seed(random_seed)

    extraction_cat = QTable.read(detection_catalog)
    extraction_cat.rename_column("NOAA", "number")

    # Multi-dimensional columns can't be converted to pandas, so drop them first
    cols = [name for name in extraction_cat.colnames if name not in ("top_right_cutout", "bottom_left_cutout")]

    extraction_df = extraction_cat[cols].to_pandas()

    # Only the local FITS files (not the source files) are tarred up, so rewrite the
    # paths to be relative to that folder
    # def make_paths_relative(cat, col):
    #     relpaths = ["/".join(p.split("/")[-2:]) for p in cat[col]]
    #     cat[col] = relpaths
    #
    # make_paths_relative(extraction_cat, "processed_path")

    # Make the train / test split
    mag_train, mag_test = grouped_stratified_split(extraction_df, group_col="number", class_col="magnetic_class")
    hale_train, hale_test = grouped_stratified_split(extraction_df, group_col="number", class_col="mcintosh_class")

    mag_train_cat = extraction_cat[mag_train]
    # mag_test_cat = extraction_cat[mag_test]

    hale_train_cat = extraction_cat[hale_train]
    # hale_test_cat = extraction_cat[hale_test]

    mag_train_cat.write("cutout-magnetic-catalog-v20231027.parq")
    hale_train_cat.write("cutout-mcintosh-catalog-v20231027.parq")
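
# Example usage (hypothetical input paths):
# create_cutout_classification_datasets("data/cutout-catalog-v20231027.parq")
# create_detection_datasets("data/detection-catalog-v20231027.parq")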


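Note: grouped_stratified_split is imported from arccnet.data_generation.utils.utils and is not shown in this diff. A minimal sketch of the behaviour relied on above, built on scikit-learn's StratifiedGroupKFold (an assumption for illustration; the actual arccnet implementation may differ):

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold


def grouped_stratified_split_sketch(df, group_col, class_col, n_splits=5, random_seed=42):
    """Return boolean train/test masks (~80/20 for n_splits=5) with no group in both sets."""
    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    # Take the first fold: one fold becomes the test set, the rest the train set
    train_idx, test_idx = next(sgkf.split(df, y=df[class_col], groups=df[group_col]))
    train = np.zeros(len(df), dtype=bool)
    test = np.zeros(len(df), dtype=bool)
    train[train_idx] = True
    test[test_idx] = True
    return train, test

Boolean masks like these can index an astropy QTable directly, matching how the split results are used above.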
13 changes: 6 additions & 7 deletions arccnet/catalogs/flares/helio.py
@@ -7,7 +7,6 @@
from sunpy.net.helio import HECResponse
from sunpy.time import TimeRange

-import astropy.units as u
from astropy.table import QTable, vstack
from astropy.time import Time

@@ -87,12 +86,12 @@ def search(

        # There is a hard server-side limit of 20,000 records, and making many small queries
        # will get blocked, so use different window sizes for different time periods; for
        # 2022 onwards use 15-day intervals
-        years = [tr.end.to_datetime().year for tr in windows]
-        if 2022 in years:
-            first_2022 = years.index(2022)
-            tr_2022_plus = TimeRange(windows[first_2022].start, windows[first_2022 + 1].end)
-            new_windows = tr_2022_plus.window(5 * u.day, 5 * u.day)
-            windows = windows[: first_2022 - 1] + new_windows + windows[first_2022 + 1 :]
+        # years = [tr.end.to_datetime().year for tr in windows]
+        # if 2022 in years:
+        #     first_2022 = years.index(2022)
+        #     tr_2022_plus = TimeRange(windows[first_2022].start, windows[first_2022 + 1].end)
+        #     new_windows = tr_2022_plus.window(5 * u.day, 5 * u.day)
+        #     windows = windows[: first_2022 - 1] + new_windows + windows[first_2022 + 1 :]

        flares = []
        for i, window in enumerate(windows):
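For reference, sunpy's TimeRange.window (used in the disabled block above) splits a time range into equally spaced sub-ranges; a minimal, illustrative sketch with made-up dates:

import astropy.units as u
from sunpy.time import TimeRange

tr = TimeRange("2022-01-01", "2022-03-01")
# Contiguous 5-day windows: the step (cadence) and the window length are both 5 days
windows = tr.window(5 * u.day, 5 * u.day)
print(len(windows), windows[0])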