1 change: 0 additions & 1 deletion arccnet/data_generation/tests/test_data_manager.py
@@ -42,7 +42,6 @@ def test_query_missing(example_query):

# Define a fixture for creating a DataManager instance with default arguments
@pytest.fixture
@pytest.mark.remote_data
def data_manager_default():
dm = DataManager(
start_date=str(datetime(2010, 4, 15)),
1 change: 0 additions & 1 deletion arccnet/data_generation/tests/test_mag_processing.py
@@ -33,7 +33,6 @@ def cleanup():
return (Path(temp_dir), Path(raw_data_dir), Path(processed_data_dir))


@pytest.mark.remote_data
@pytest.fixture
def sunpy_hmi_copies(temp_path_fixture):
n = 5
245 changes: 203 additions & 42 deletions arccnet/notebooks/analysis/EDA_cutouts.py
@@ -9,7 +9,7 @@
# format_version: '1.3'
# jupytext_version: 1.11.2
# kernelspec:
# display_name: py_3.11
# display_name: venv
# language: python
# name: python3
# ---
@@ -29,6 +29,8 @@
import seaborn as sns
from p_tqdm import p_map

from astropy.time import Time

from arccnet import load_config
from arccnet.models import dataset_utils as ut_d
from arccnet.visualisation import utils as ut_v
@@ -50,50 +52,123 @@
dataset_title = "arccnet v20251017"

# %%
df, _, filtered_ql_df = ut_d.make_dataframe(data_folder, dataset_folder, df_file_name)
ut_v.make_classes_histogram(df["label"], figsz=(18, 6), text_fontsize=11, title=dataset_title)
# complete dataframe
df_raw = pd.read_parquet(os.path.join(data_folder, dataset_folder, df_file_name))
df_raw["label"] = np.where(df_raw["magnetic_class"] == "", df_raw["region_type"], df_raw["magnetic_class"])
ut_v.make_classes_histogram(df_raw["label"], figsz=(18, 6), text_fontsize=11, title=dataset_title)
plt.show()
df_raw


# %% [markdown]
# # Quality Flags


# %%
# Filter out rows without either quicklook path
def has_quicklook_path(df: pd.DataFrame) -> pd.Series:
"""Return boolean mask for rows with at least one valid quicklook path."""
has_mdi = df["quicklook_path_mdi"].notna() & (df["quicklook_path_mdi"] != "") & (df["quicklook_path_mdi"] != "None")
has_hmi = df["quicklook_path_hmi"].notna() & (df["quicklook_path_hmi"] != "") & (df["quicklook_path_hmi"] != "None")
return has_mdi | has_hmi


df_with_quicklook = df_raw[has_quicklook_path(df_raw)].copy()
print(
f"Filtered dataframe: {len(df_with_quicklook):,} rows retained from {len(df_raw):,} ({len(df_with_quicklook) / len(df_raw) * 100:.1f}%)"
)
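
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff) of how
# `has_quicklook_path` treats the three "missing" encodings: NaN, the empty
# string, and the literal string "None".

# %%
_demo = pd.DataFrame(
    {
        "quicklook_path_mdi": ["a.png", "", "None", np.nan],
        "quicklook_path_hmi": [np.nan, "b.png", "", "None"],
    }
)
has_quicklook_path(_demo)  # expected mask: [True, True, False, False]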


# %%
def propagate_quality(df: pd.DataFrame, instrument: str = "hmi") -> pd.DataFrame:
qual_col = f"QUALITY_{instrument}"
qual_mask_col = f"{qual_col}.mask"
qlk_col = f"quicklook_path_{instrument}"
qlk_mask_col = f"{qlk_col}.mask"

# keep rows with a usable quicklook key
keyed = df[~df[qlk_mask_col] & df[qlk_col].notna()].copy()

# canonical quality values (mask/empty/nan -> NA)
qvals = keyed[qual_col].mask(keyed[qual_mask_col])
qvals = qvals.astype(str).str.strip().replace({"": pd.NA, "nan": pd.NA})

# consensus quality per quicklook (only if exactly one non-empty unique value)
consensus = (
qvals.groupby(keyed[qlk_col])
.agg(lambda s: s.dropna().unique())
.loc[lambda s: s.str.len() == 1]
.map(lambda arr: arr[0])
)

# map consensus back to all rows sharing that quicklook
fill_values = df[qlk_col].map(consensus)
to_fill = fill_values.notna()
df.loc[to_fill, qual_col] = fill_values[to_fill]
df.loc[to_fill, qual_mask_col] = False
return df


df_quality_propagated = propagate_quality(df_with_quicklook.copy(), "hmi")
df_quality_propagated = propagate_quality(df_quality_propagated, "mdi")
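
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff) of the consensus
# step inside `propagate_quality`: rows sharing a quicklook inherit a quality
# value only when exactly one distinct non-empty value exists for that quicklook.

# %%
_q = pd.Series(["0x0", None, "0x0", "0x4", "0x8"])
_k = pd.Series(["ql1", "ql1", "ql1", "ql2", "ql2"])
(
    _q.groupby(_k)
    .agg(lambda s: s.dropna().unique())
    .loc[lambda s: s.str.len() == 1]
    .map(lambda arr: arr[0])
)  # expected: ql1 -> "0x0"; ql2 is dropped (two conflicting values)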

# %%
mdi_color = "royalblue"
hmi_color = "tomato"

df_MDI = df[df["path_image_cutout_hmi"] == ""].copy()
df_HMI = df[df["path_image_cutout_mdi"] == ""].copy()
df_raw_mdi_only = df_raw[df_raw["path_image_cutout_hmi"] == ""].copy()
df_raw_hmi_only = df_raw[df_raw["path_image_cutout_mdi"] == ""].copy()

# %% [markdown]
# # Quality Flags
df_quality_propagated_mdi_only = df_quality_propagated[df_quality_propagated["path_image_cutout_hmi"] == ""].copy()
df_quality_propagated_hmi_only = df_quality_propagated[df_quality_propagated["path_image_cutout_mdi"] == ""].copy()

# %%
quality_mdi_df = analyze_quality_flags(df_MDI, "SOHO/MDI")
quality_mdi_df = analyze_quality_flags(df_raw_mdi_only, "SOHO/MDI")
quality_mdi_df

# %%
quality_hmi_df = analyze_quality_flags(df_HMI, "SDO/HMI")
quality_hmi_df = analyze_quality_flags(df_raw_hmi_only, "SDO/HMI")
quality_hmi_df


# %%
# Quality flags after propagation
analyze_quality_flags(df_quality_propagated_hmi_only, "SDO/HMI")

# %%
analyze_quality_flags(df_quality_propagated_mdi_only, "SOHO/MDI")

# %% [markdown]
# ## Data Filtering

# %%
# Remove bad quality data
hmi_good_flags = ["", "0x00000000", "0x00000400"]
mdi_good_flags = ["", "00000000", "00000200"]
# Remove bad quality data using df_quality_propagated (with propagated quality flags)
hmi_good_flags = ["0x00000000", "0x00000400"]
mdi_good_flags = ["00000000", "00000200"]

df_clean = df[df["QUALITY_hmi"].isin(hmi_good_flags) & df["QUALITY_mdi"].isin(mdi_good_flags)]
df_HMI_clean = df_HMI[df_HMI["QUALITY_hmi"].isin(hmi_good_flags)]
df_MDI_clean = df_MDI[df_MDI["QUALITY_mdi"].isin(mdi_good_flags)]
# Filter on quality flags: keep a row only if each instrument is either absent (empty cutout path) or has a good flag
hmi_good = df_quality_propagated["QUALITY_hmi"].isin(hmi_good_flags) | (
df_quality_propagated["path_image_cutout_hmi"] == ""
)
mdi_good = df_quality_propagated["QUALITY_mdi"].isin(mdi_good_flags) | (
df_quality_propagated["path_image_cutout_mdi"] == ""
)
df_quality_filtered = df_quality_propagated[hmi_good & mdi_good].copy()

df_quality_filtered_hmi = df_quality_propagated_hmi_only[
df_quality_propagated_hmi_only["QUALITY_hmi"].isin(hmi_good_flags)
]
df_quality_filtered_mdi = df_quality_propagated_mdi_only[
df_quality_propagated_mdi_only["QUALITY_mdi"].isin(mdi_good_flags)
]

print("DATA FILTERING Stats")
print("-" * 40)
hmi_orig, hmi_clean = len(df_HMI), len(df_HMI_clean)
mdi_orig, mdi_clean = len(df_MDI), len(df_MDI_clean)
total_orig, total_clean = len(df), len(df_clean)
hmi_orig, hmi_clean_count = len(df_quality_propagated_hmi_only), len(df_quality_filtered_hmi)
mdi_orig, mdi_clean_count = len(df_quality_propagated_mdi_only), len(df_quality_filtered_mdi)
total_orig, total_clean = len(df_quality_propagated), len(df_quality_filtered)

print(f"HMI: {hmi_clean:,}/{hmi_orig:,} ({hmi_clean / hmi_orig * 100:.1f}% retained)")
print(f"MDI: {mdi_clean:,}/{mdi_orig:,} ({mdi_clean / mdi_orig * 100:.1f}% retained)")
print(f"HMI: {hmi_clean_count:,}/{hmi_orig:,} ({hmi_clean_count / hmi_orig * 100:.1f}% retained)")
print(f"MDI: {mdi_clean_count:,}/{mdi_orig:,} ({mdi_clean_count / mdi_orig * 100:.1f}% retained)")
print(f"Total: {total_clean:,}/{total_orig:,} ({total_clean / total_orig * 100:.1f}% retained)")
print("-" * 40)

@@ -107,43 +182,64 @@ def is_missing(series):
return series.isna() | (series == "") | (series == "None")


hmi_missing = is_missing(df_clean["path_image_cutout_hmi"])
mdi_missing = is_missing(df_clean["path_image_cutout_mdi"])
hmi_missing = is_missing(df_quality_filtered["path_image_cutout_hmi"])
mdi_missing = is_missing(df_quality_filtered["path_image_cutout_mdi"])
both_missing = hmi_missing & mdi_missing

# Count statistics
stats = {
"total": len(df_clean),
"hmi_none": (df_clean["path_image_cutout_hmi"] == "None").sum(),
"hmi_empty": (df_clean["path_image_cutout_hmi"] == "").sum(),
"mdi_none": (df_clean["path_image_cutout_mdi"] == "None").sum(),
"mdi_empty": (df_clean["path_image_cutout_mdi"] == "").sum(),
"total": len(df_quality_filtered),
"hmi_none": (df_quality_filtered["path_image_cutout_hmi"] == "None").sum(),
"hmi_empty": (df_quality_filtered["path_image_cutout_hmi"] == "").sum(),
"mdi_none": (df_quality_filtered["path_image_cutout_mdi"] == "None").sum(),
"mdi_empty": (df_quality_filtered["path_image_cutout_mdi"] == "").sum(),
"both_missing": both_missing.sum(),
"at_least_one": (~hmi_missing | ~mdi_missing).sum(),
"both_exist": (~hmi_missing & ~mdi_missing).sum(),
}

print("PATH ANALYSIS:")
print("PATH ANALYSIS (path_image_cutout columns):")
print("-" * 40)
print(f"Total rows in df_clean: {stats['total']:,}")
print(f"Total rows in quality-filtered df: {stats['total']:,}")
print(f"HMI paths - None: {stats['hmi_none']:,}, Empty: {stats['hmi_empty']:,}")
print(f"MDI paths - None: {stats['mdi_none']:,}, Empty: {stats['mdi_empty']:,}")
print(f"Both paths missing: {stats['both_missing']:,} ({stats['both_missing'] / stats['total'] * 100:.1f}%)")
print(f"At least one path exists: {stats['at_least_one']:,}")
print(f"Both paths exist: {stats['both_exist']:,}")

# Remove rows where both paths are missing
df_clean = df_clean[~both_missing].copy()
df_final = df_quality_filtered[~both_missing].copy()
df_final = df_final.reset_index(drop=True)

print(f"\nAfter removing rows with both paths missing: {len(df_final):,} rows")


# %%
df_quality_filtered.loc[both_missing]


# %%
def _convert_jd_to_datetime(df):
"""Convert Julian dates to datetime objects."""
df["time"] = df["target_time.jd1"] + df["target_time.jd2"]
times = Time(df["time"], format="jd")
df["dates"] = pd.to_datetime(times.iso)
return df

df_clean = df_clean.reset_index(drop=True)

final_df = _convert_jd_to_datetime(df_final)
final_df
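
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff): `target_time` is
# stored as a split Julian date (astropy's two-double jd1/jd2 representation);
# summing the parts and round-tripping through `astropy.time.Time` yields
# pandas datetimes.

# %%
_jd = 2455301.5 + 0.0  # jd1=2455301.5, jd2=0.0
pd.to_datetime(Time(_jd, format="jd").iso)  # Timestamp('2010-04-15 00:00:00')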

# %%
ut_v.make_classes_histogram(final_df["label"], figsz=(14, 6), text_fontsize=11, title=f"{dataset_title} - Processed")
plt.show()

# %% [markdown]
# # Location of ARs on the Sun

# %%
AR_IA_lbs = ["Alpha", "Beta", "IA", "Beta-Gamma-Delta", "Beta-Gamma", "Beta-Delta", "Gamma-Delta", "Gamma"]
AR_IA_df = df_clean[df_clean["label"].isin(AR_IA_lbs)].reset_index(drop=True)
AR_IA_df = final_df[final_df["label"].isin(AR_IA_lbs)].reset_index(drop=True)

# %%
ut_v.make_classes_histogram(AR_IA_df["label"], figsz=(12, 7), text_fontsize=11, title=f"{dataset_title} ARs", y_off=100)
@@ -158,10 +254,11 @@ def get_coordinates(df, coord_type):
return np.deg2rad(np.where(df["path_image_cutout_hmi"] != "", df[hmi_col], df[mdi_col]))


def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
def plot_histogram(ax, data, degree_ticks, degree_bins, title, color="#4C72B0"):
"""Plot histogram with degree labels."""
rad_ticks = np.deg2rad(degree_ticks)
ax.hist(data, bins=rad_ticks, color=color, edgecolor="black")
rad_bins = np.deg2rad(degree_bins)
ax.hist(data, bins=rad_bins, color=color, edgecolor="black", linewidth=0.5)
ax.set_xticks(rad_ticks)
ax.set_xticklabels([f"{deg}°" for deg in degree_ticks])
ax.set_xlabel(f"{title} (degrees)")
@@ -172,12 +269,15 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
lonV = get_coordinates(AR_IA_df, "longitude")
latV = get_coordinates(AR_IA_df, "latitude")
degree_ticks = np.arange(-90, 91, 15)
degree_bins = np.arange(-90, 91, 2)

# Plot histograms
with plt.style.context("seaborn-v0_8-darkgrid"):
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
plot_histogram(ax1, lonV, degree_ticks, "Longitude")
plot_histogram(ax2, latV, degree_ticks, "Latitude")
plot_histogram(ax1, lonV, degree_ticks, degree_bins, title="Longitude")
plot_histogram(ax2, latV, degree_ticks, degree_bins, title="Latitude")
plt.tight_layout()
plt.show()
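
# %% [markdown]
# Note: `degree_ticks` (15 degree spacing) now only controls the axis labels,
# while the new `degree_bins` (2 degree spacing) sets the histogram edges, so
# the latitude/longitude distributions are resolved more finely without
# crowding the tick labels.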

@@ -221,6 +321,69 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
ut_v.make_classes_histogram(results["df_rear"]["label"], title="Rear ARs", y_off=10, figsz=(11, 5))
plt.show()

# %% [markdown]
# # Quality Data Issues

# %%
problematic_quicklooks = [
ql.strip().lstrip("\\").strip() for ql in config.get("magnetograms", "problematic_quicklooks").split(",")
]

# %%
final_cleaned_df = AR_IA_df[~AR_IA_df["quicklook_path_hmi"].isin(problematic_quicklooks)].reset_index(drop=True)
basenames = final_df["quicklook_path_mdi"].fillna("").apply(os.path.basename)  # guard against missing (NaN) MDI paths
mask = basenames.isin(problematic_quicklooks)
problematic_rows = final_df[mask]
problematic_rows.to_parquet("problematic_rows_good_quality.parq")

# %%
problematic_rows["QUALITY_mdi"].value_counts()

# %%
problematic_rows[problematic_rows["QUALITY_mdi"] == "00000000"]["quicklook_path_mdi"].unique()

# %%
problematic_rows[problematic_rows["QUALITY_mdi"] == "00000200"]["quicklook_path_mdi"].unique()


# %%
def _remove_problematic_quicklooks(df):
"""Remove problematic magnetograms from the dataset."""

def is_problematic(path):
"""Check if a path's basename matches any problematic quicklook."""
if not path or pd.isna(path) or path == "" or path == "None":
return False
return os.path.basename(path) in problematic_quicklooks

# Check both MDI and HMI quicklook paths
mask_mdi = df["quicklook_path_mdi"].apply(is_problematic)
mask_hmi = df["quicklook_path_hmi"].apply(is_problematic)
mask = mask_mdi | mask_hmi
filtered_df = df[mask]
df = df[~mask].reset_index(drop=True)
return df, filtered_df


# Apply the function to final_df
final_df_cleaned, problematic_final_df = _remove_problematic_quicklooks(final_df)

print(f"Original dataframe: {len(final_df):,} rows")
print(f"Cleaned dataframe: {len(final_df_cleaned):,} rows")
print(
f"Problematic rows removed: {len(problematic_final_df):,} rows ({len(problematic_final_df) / len(final_df) * 100:.2f}%)"
)
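
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff): the blocklist is
# matched on basenames, so the same problematic quicklook is caught regardless
# of which directory tree it sits under.

# %%
_blocklist = {"bad_ql.png"}  # hypothetical filename, for illustration
[os.path.basename(p) in _blocklist for p in ["/raw/2011/bad_ql.png", "/raw/2011/good_ql.png"]]
# expected: [True, False]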

# %%
# Create AR_IA_df from cleaned dataframe
AR_IA_df = final_df_cleaned[final_df_cleaned["label"].isin(AR_IA_lbs)].reset_index(drop=True)

print(f"AR_IA_df created from cleaned data: {len(AR_IA_df):,} rows")
ut_v.make_classes_histogram(
AR_IA_df["label"], figsz=(12, 7), text_fontsize=11, title=f"{dataset_title} ARs (Cleaned)", y_off=100
)
plt.show()

# %% [markdown]
# # Time Distribution
# %%
@@ -267,7 +430,7 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
# # McIntosh Classification

# %%
AR_df = df[df["magnetic_class"] != ""].copy()
AR_df = df_with_quicklook[df_with_quicklook["magnetic_class"] != ""].copy()

ut_v.make_classes_histogram(
AR_df["mcintosh_class"],
@@ -282,9 +445,8 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
plt.show()

# %%

# McIntosh classification components
AR_df = df[df["magnetic_class"] != ""].copy()
AR_df = df_with_quicklook[df_with_quicklook["magnetic_class"] != ""].copy()
for comp in ["Z_component", "p_component", "c_component"]:
AR_df[comp] = AR_df["mcintosh_class"].str[{"Z_component": 0, "p_component": 1, "c_component": 2}[comp]]
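
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff) of the positional
# `.str[i]` split: a McIntosh class decomposes into its Zurich (Z), penumbra (p)
# and compactness (c) letters.

# %%
_mc = pd.Series(["Ekc", "Dao"])
pd.DataFrame({"Z": _mc.str[0], "p": _mc.str[1], "c": _mc.str[2]})
# Ekc -> Z='E', p='k', c='c'; Dao -> Z='D', p='a', c='o'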

@@ -435,7 +597,7 @@ def process_row_wrapper(idx):
fig, axes = plt.subplots(2, 4, figsize=(20, 8))
for i, (col, title, color) in enumerate(stats_config):
ax = axes.flat[i]
ax.hist(stats_df[col], bins=50, color=color, alpha=0.7, edgecolor="black", linewidth=0.5, log=True)
ax.hist(stats_df[col], bins=70, color=color, alpha=0.7, edgecolor="black", linewidth=0.5, log=True)
ax.set_title(title, fontsize=16, pad=12)
ax.set_xlabel("Value", fontsize=14)
if i % 4 == 0: # First column gets y-label
@@ -465,4 +627,3 @@ def process_row_wrapper(idx):
fig.autofmt_xdate(ha="right")
plt.tight_layout()
plt.show()
# %%