diff --git a/arccnet/data_generation/tests/test_data_manager.py b/arccnet/data_generation/tests/test_data_manager.py
index e7fade62..0cc9b7dc 100644
--- a/arccnet/data_generation/tests/test_data_manager.py
+++ b/arccnet/data_generation/tests/test_data_manager.py
@@ -42,7 +42,6 @@ def test_query_missing(example_query):
 
 # Define a fixture for creating a DataManager instance with default arguments
 @pytest.fixture
-@pytest.mark.remote_data
 def data_manager_default():
     dm = DataManager(
         start_date=str(datetime(2010, 4, 15)),
diff --git a/arccnet/data_generation/tests/test_mag_processing.py b/arccnet/data_generation/tests/test_mag_processing.py
index 7077907d..4357d92a 100644
--- a/arccnet/data_generation/tests/test_mag_processing.py
+++ b/arccnet/data_generation/tests/test_mag_processing.py
@@ -33,7 +33,6 @@ def cleanup():
     return (Path(temp_dir), Path(raw_data_dir), Path(processed_data_dir))
 
 
-@pytest.mark.remote_data
 @pytest.fixture
 def sunpy_hmi_copies(temp_path_fixture):
     n = 5
diff --git a/arccnet/notebooks/analysis/EDA_cutouts.py b/arccnet/notebooks/analysis/EDA_cutouts.py
index 8e9f5cc5..5f4e3bfd 100644
--- a/arccnet/notebooks/analysis/EDA_cutouts.py
+++ b/arccnet/notebooks/analysis/EDA_cutouts.py
@@ -9,7 +9,7 @@
 #       format_version: '1.3'
 #       jupytext_version: 1.11.2
 #   kernelspec:
-#     display_name: py_3.11
+#     display_name: venv
 #     language: python
 #     name: python3
 # ---
@@ -29,6 +29,8 @@
 import seaborn as sns
 from p_tqdm import p_map
 
+from astropy.time import Time
+
 from arccnet import load_config
 from arccnet.models import dataset_utils as ut_d
 from arccnet.visualisation import utils as ut_v
@@ -50,50 +52,153 @@
 dataset_title = "arccnet v20251017"
 
 # %%
-df, _, filtered_ql_df = ut_d.make_dataframe(data_folder, dataset_folder, df_file_name)
-ut_v.make_classes_histogram(df["label"], figsz=(18, 6), text_fontsize=11, title=dataset_title)
+# complete dataframe
+df_raw = pd.read_parquet(os.path.join(data_folder, dataset_folder, df_file_name))
+df_raw["label"] = np.where(df_raw["magnetic_class"] == "", df_raw["region_type"], df_raw["magnetic_class"])
+ut_v.make_classes_histogram(df_raw["label"], figsz=(18, 6), text_fontsize=11, title=dataset_title)
 plt.show()
-df
+
+
+# %% [markdown]
+# # Quality Flags
+
+
+# %%
+# Filter out rows without either quicklook path
+def has_quicklook_path(df: pd.DataFrame) -> pd.Series:
+    """Return boolean mask for rows with at least one valid quicklook path."""
+    has_mdi = df["quicklook_path_mdi"].notna() & (df["quicklook_path_mdi"] != "") & (df["quicklook_path_mdi"] != "None")
+    has_hmi = df["quicklook_path_hmi"].notna() & (df["quicklook_path_hmi"] != "") & (df["quicklook_path_hmi"] != "None")
+    return has_mdi | has_hmi
+
+
+df_with_quicklook = df_raw[has_quicklook_path(df_raw)].copy()
+print(
+    f"Filtered dataframe: {len(df_with_quicklook):,} rows retained from {len(df_raw):,} ({len(df_with_quicklook) / len(df_raw) * 100:.1f}%)"
+)
+
+
+# %%
+def propagate_quality(df: pd.DataFrame, instrument: str = "hmi") -> pd.DataFrame:
+    """Fill masked QUALITY values with the consensus of rows sharing a quicklook."""
+    qual_col = f"QUALITY_{instrument}"
+    qual_mask_col = f"{qual_col}.mask"
+    qlk_col = f"quicklook_path_{instrument}"
+    qlk_mask_col = f"{qlk_col}.mask"
+
+    # keep rows with a usable quicklook key
+    keyed = df[~df[qlk_mask_col] & df[qlk_col].notna()].copy()
+
+    # canonical quality values (mask/empty/nan -> NA)
+    qvals = keyed[qual_col].mask(keyed[qual_mask_col])
+    qvals = qvals.astype(str).str.strip().replace({"": pd.NA, "nan": pd.NA})
+
+    # consensus quality per quicklook (only if exactly one non-empty unique value)
+    consensus = (
+        qvals.groupby(keyed[qlk_col])
+        .agg(lambda s: s.dropna().unique())
+        .loc[lambda s: s.str.len() == 1]
+        .map(lambda arr: arr[0])
+    )
+
+    # map consensus back to all rows sharing that quicklook
+    fill_values = df[qlk_col].map(consensus)
+    to_fill = fill_values.notna()
+    df.loc[to_fill, qual_col] = fill_values[to_fill]
+    df.loc[to_fill, qual_mask_col] = False
+    return df
+
+
+df_quality_propagated = propagate_quality(df_with_quicklook.copy(), "hmi")
+df_quality_propagated = propagate_quality(df_quality_propagated, "mdi")
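+
+# %%
+# Illustrative sketch (not pipeline code): how the consensus fill above behaves
+# on a toy frame with hypothetical values.
+_toy = pd.DataFrame(
+    {
+        "quicklook": ["a.png", "a.png", "b.png", "b.png"],
+        "quality": ["0x00000000", None, "0x00000000", "0x00000400"],
+    }
+)
+_consensus = (
+    _toy["quality"]
+    .groupby(_toy["quicklook"])
+    .agg(lambda s: s.dropna().unique())
+    .loc[lambda s: s.str.len() == 1]  # keep quicklooks with exactly one value
+    .map(lambda arr: arr[0])
+)
+# "a.png" gains "0x00000000"; "b.png" stays ambiguous and is left untouched
+_toy["quality"] = _toy["quality"].fillna(_toy["quicklook"].map(_consensus))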
 
 # %%
 mdi_color = "royalblue"
 hmi_color = "tomato"
 
-df_MDI = df[df["path_image_cutout_hmi"] == ""].copy()
-df_HMI = df[df["path_image_cutout_mdi"] == ""].copy()
+df_raw_mdi_only = df_raw[df_raw["path_image_cutout_hmi"] == ""].copy()
+df_raw_hmi_only = df_raw[df_raw["path_image_cutout_mdi"] == ""].copy()
 
-# %% [markdown]
-# # Quality Flags
+df_quality_propagated_mdi_only = df_quality_propagated[df_quality_propagated["path_image_cutout_hmi"] == ""].copy()
+df_quality_propagated_hmi_only = df_quality_propagated[df_quality_propagated["path_image_cutout_mdi"] == ""].copy()
 
 # %%
-quality_mdi_df = analyze_quality_flags(df_MDI, "SOHO/MDI")
+quality_mdi_df = analyze_quality_flags(df_raw_mdi_only, "SOHO/MDI")
 quality_mdi_df
 
 # %%
-quality_hmi_df = analyze_quality_flags(df_HMI, "SDO/HMI")
+quality_hmi_df = analyze_quality_flags(df_raw_hmi_only, "SDO/HMI")
 quality_hmi_df
 
 
+# %%
+# same analysis after quality propagation
+analyze_quality_flags(df_quality_propagated_hmi_only, "SDO/HMI")
+
+# %%
+analyze_quality_flags(df_quality_propagated_mdi_only, "SOHO/MDI")
+
 # %% [markdown]
 # ## Data Filtering
 
 # %%
-# Remove bad quality data
-hmi_good_flags = ["", "0x00000000", "0x00000400"]
-mdi_good_flags = ["", "00000000", "00000200"]
+# Remove bad quality data using df_quality_propagated (with propagated quality flags)
+hmi_good_flags = ["0x00000000", "0x00000400"]
+mdi_good_flags = ["00000000", "00000200"]
 
-df_clean = df[df["QUALITY_hmi"].isin(hmi_good_flags) & df["QUALITY_mdi"].isin(mdi_good_flags)]
-df_HMI_clean = df_HMI[df_HMI["QUALITY_hmi"].isin(hmi_good_flags)]
-df_MDI_clean = df_MDI[df_MDI["QUALITY_mdi"].isin(mdi_good_flags)]
+# Keep rows whose available cutouts have good flags (a missing cutout does not disqualify)
+hmi_good = df_quality_propagated["QUALITY_hmi"].isin(hmi_good_flags) | (
+    df_quality_propagated["path_image_cutout_hmi"] == ""
+)
+mdi_good = df_quality_propagated["QUALITY_mdi"].isin(mdi_good_flags) | (
+    df_quality_propagated["path_image_cutout_mdi"] == ""
+)
+df_quality_filtered = df_quality_propagated[hmi_good & mdi_good].copy()
+
+df_quality_filtered_hmi = df_quality_propagated_hmi_only[
+    df_quality_propagated_hmi_only["QUALITY_hmi"].isin(hmi_good_flags)
+]
+df_quality_filtered_mdi = df_quality_propagated_mdi_only[
+    df_quality_propagated_mdi_only["QUALITY_mdi"].isin(mdi_good_flags)
+]
 
 
 print("DATA FILTERING Stats")
 print("-" * 40)
-hmi_orig, hmi_clean = len(df_HMI), len(df_HMI_clean)
-mdi_orig, mdi_clean = len(df_MDI), len(df_MDI_clean)
-total_orig, total_clean = len(df), len(df_clean)
+hmi_orig, hmi_clean_count = len(df_quality_propagated_hmi_only), len(df_quality_filtered_hmi)
+mdi_orig, mdi_clean_count = len(df_quality_propagated_mdi_only), len(df_quality_filtered_mdi)
+total_orig, total_clean = len(df_quality_propagated), len(df_quality_filtered)
 
-print(f"HMI: {hmi_clean:,}/{hmi_orig:,} ({hmi_clean / hmi_orig * 100:.1f}% retained)")
-print(f"MDI: {mdi_clean:,}/{mdi_orig:,} ({mdi_clean / mdi_orig * 100:.1f}% retained)")
+print(f"HMI: {hmi_clean_count:,}/{hmi_orig:,} ({hmi_clean_count / hmi_orig * 100:.1f}% retained)")
+print(f"MDI: {mdi_clean_count:,}/{mdi_orig:,} ({mdi_clean_count / mdi_orig * 100:.1f}% retained)")
 print(f"Total: {total_clean:,}/{total_orig:,} ({total_clean / total_orig * 100:.1f}% retained)")
 print("-" * 40)
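+
+# %%
+# Aside (assumption, illustrative only): QUALITY values look like hex bitmasks,
+# so an alternative to whitelisting strings is a bitwise test on tolerated bits.
+def _is_acceptable_hmi(quality: str, tolerated: int = 0x00000400) -> bool:
+    """True if no bits outside `tolerated` are set (hypothetical helper)."""
+    return (int(quality, 16) & ~tolerated) == 0
+
+
+assert _is_acceptable_hmi("0x00000400") and not _is_acceptable_hmi("0x80000000")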
100:.1f}% retained)") +print(f"MDI: {mdi_clean_count:,}/{mdi_orig:,} ({mdi_clean_count / mdi_orig * 100:.1f}% retained)") print(f"Total: {total_clean:,}/{total_orig:,} ({total_clean / total_orig * 100:.1f}% retained)") print("-" * 40) @@ -107,25 +182,25 @@ def is_missing(series): return series.isna() | (series == "") | (series == "None") -hmi_missing = is_missing(df_clean["path_image_cutout_hmi"]) -mdi_missing = is_missing(df_clean["path_image_cutout_mdi"]) +hmi_missing = is_missing(df_quality_filtered["path_image_cutout_hmi"]) +mdi_missing = is_missing(df_quality_filtered["path_image_cutout_mdi"]) both_missing = hmi_missing & mdi_missing # Count statistics stats = { - "total": len(df_clean), - "hmi_none": (df_clean["path_image_cutout_hmi"] == "None").sum(), - "hmi_empty": (df_clean["path_image_cutout_hmi"] == "").sum(), - "mdi_none": (df_clean["path_image_cutout_mdi"] == "None").sum(), - "mdi_empty": (df_clean["path_image_cutout_mdi"] == "").sum(), + "total": len(df_quality_filtered), + "hmi_none": (df_quality_filtered["path_image_cutout_hmi"] == "None").sum(), + "hmi_empty": (df_quality_filtered["path_image_cutout_hmi"] == "").sum(), + "mdi_none": (df_quality_filtered["path_image_cutout_mdi"] == "None").sum(), + "mdi_empty": (df_quality_filtered["path_image_cutout_mdi"] == "").sum(), "both_missing": both_missing.sum(), "at_least_one": (~hmi_missing | ~mdi_missing).sum(), "both_exist": (~hmi_missing & ~mdi_missing).sum(), } -print("PATH ANALYSIS:") +print("PATH ANALYSIS (path_image_cutout columns):") print("-" * 40) -print(f"Total rows in df_clean: {stats['total']:,}") +print(f"Total rows in quality-filtered df: {stats['total']:,}") print(f"HMI paths - None: {stats['hmi_none']:,}, Empty: {stats['hmi_empty']:,}") print(f"MDI paths - None: {stats['mdi_none']:,}, Empty: {stats['mdi_empty']:,}") print(f"Both paths missing: {stats['both_missing']:,} ({stats['both_missing'] / stats['total'] * 100:.1f}%)") @@ -133,17 +208,38 @@ def is_missing(series): print(f"Both paths exist: {stats['both_exist']:,}") # Remove rows where both paths are missing -df_clean = df_clean[~both_missing].copy() +df_final = df_quality_filtered[~both_missing].copy() +df_final = df_final.reset_index(drop=True) + +print(f"\nAfter removing rows with both paths missing: {len(df_final):,} rows") + + +# %% +df_quality_filtered.loc[both_missing] + + +# %% +def _convert_jd_to_datetime(df): + """Convert Julian dates to datetime objects.""" + df["time"] = df["target_time.jd1"] + df["target_time.jd2"] + times = Time(df["time"], format="jd") + df["dates"] = pd.to_datetime(times.iso) + return df -df_clean = df_clean.reset_index(drop=True) +final_df = _convert_jd_to_datetime(df_final) +final_df + +# %% +ut_v.make_classes_histogram(final_df["label"], figsz=(14, 6), text_fontsize=11, title=f"{dataset_title} - Processed") +plt.show() # %% [markdown] # # Location of ARs on the Sun # %% AR_IA_lbs = ["Alpha", "Beta", "IA", "Beta-Gamma-Delta", "Beta-Gamma", "Beta-Delta", "Gamma-Delta", "Gamma"] -AR_IA_df = df_clean[df_clean["label"].isin(AR_IA_lbs)].reset_index(drop=True) +AR_IA_df = final_df[final_df["label"].isin(AR_IA_lbs)].reset_index(drop=True) # %% ut_v.make_classes_histogram(AR_IA_df["label"], figsz=(12, 7), text_fontsize=11, title=f"{dataset_title} ARs", y_off=100) @@ -158,10 +254,11 @@ def get_coordinates(df, coord_type): return np.deg2rad(np.where(df["path_image_cutout_hmi"] != "", df[hmi_col], df[mdi_col])) -def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"): +def plot_histogram(ax, data, degree_ticks, degree_bins, 
title, color="#4C72B0"): """Plot histogram with degree labels.""" rad_ticks = np.deg2rad(degree_ticks) - ax.hist(data, bins=rad_ticks, color=color, edgecolor="black") + rad_bins = np.deg2rad(degree_bins) + ax.hist(data, bins=rad_bins, color=color, edgecolor="black", linewidth=0.5) ax.set_xticks(rad_ticks) ax.set_xticklabels([f"{deg}°" for deg in degree_ticks]) ax.set_xlabel(f"{title} (degrees)") @@ -172,12 +269,15 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"): lonV = get_coordinates(AR_IA_df, "longitude") latV = get_coordinates(AR_IA_df, "latitude") degree_ticks = np.arange(-90, 91, 15) +degree_bins = np.arange(-90, 91, 2) # Plot histograms with plt.style.context("seaborn-v0_8-darkgrid"): fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6)) - plot_histogram(ax1, lonV, degree_ticks, "Longitude") - plot_histogram(ax2, latV, degree_ticks, "Latitude") + plot_histogram(ax1, lonV, degree_ticks, degree_bins, title="Longitude") + plot_histogram(ax2, latV, degree_ticks, degree_bins, title="Latitude") + plot_histogram(ax1, lonV, degree_ticks, degree_bins, title="Longitude") + plot_histogram(ax2, latV, degree_ticks, degree_bins, title="Latitude") plt.tight_layout() plt.show() @@ -221,6 +321,69 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"): ut_v.make_classes_histogram(results["df_rear"]["label"], title="Rear ARs", y_off=10, figsz=(11, 5)) plt.show() +# %% [markdown] +# # Quality Data Issues + +# %% +problematic_quicklooks = [ + ql.strip().lstrip("\\").strip() for ql in config.get("magnetograms", "problematic_quicklooks").split(",") +] + +# %% +final_cleaned_df = AR_IA_df[~AR_IA_df["quicklook_path_hmi"].isin(problematic_quicklooks)].reset_index(drop=True) +basenames = final_df["quicklook_path_mdi"].apply(os.path.basename) +mask = basenames.isin(problematic_quicklooks) +problematic_rows = final_df[mask] +problematic_rows.to_parquet("problematic_rows_good_quality.parq") + +# %% +problematic_rows["QUALITY_mdi"].value_counts() + +# %% +problematic_rows[problematic_rows["QUALITY_mdi"] == "00000000"]["quicklook_path_mdi"].unique() + +# %% +problematic_rows[problematic_rows["QUALITY_mdi"] == "00000200"]["quicklook_path_mdi"].unique() + + +# %% +def _remove_problematic_quicklooks(df): + """Remove problematic magnetograms from the dataset.""" + + def is_problematic(path): + """Check if a path's basename matches any problematic quicklook.""" + if not path or pd.isna(path) or path == "" or path == "None": + return False + return os.path.basename(path) in problematic_quicklooks + + # Check both MDI and HMI quicklook paths + mask_mdi = df["quicklook_path_mdi"].apply(is_problematic) + mask_hmi = df["quicklook_path_hmi"].apply(is_problematic) + mask = mask_mdi | mask_hmi + filtered_df = df[mask] + df = df[~mask].reset_index(drop=True) + return df, filtered_df + + +# Apply the function to final_df +final_df_cleaned, problematic_final_df = _remove_problematic_quicklooks(final_df) + +print(f"Original dataframe: {len(final_df):,} rows") +print(f"Cleaned dataframe: {len(final_df_cleaned):,} rows") +print( + f"Problematic rows removed: {len(problematic_final_df):,} rows ({len(problematic_final_df) / len(final_df) * 100:.2f}%)" +) + +# %% +# Create AR_IA_df from cleaned dataframe +AR_IA_df = final_df_cleaned[final_df_cleaned["label"].isin(AR_IA_lbs)].reset_index(drop=True) + +print(f"AR_IA_df created from cleaned data: {len(AR_IA_df):,} rows") +ut_v.make_classes_histogram( + AR_IA_df["label"], figsz=(12, 7), text_fontsize=11, title=f"{dataset_title} ARs (Cleaned)", y_off=100 
+
 # %% [markdown]
 # # Time Distribution
 
 # %%
@@ -267,7 +473,7 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
 # # McIntosh Classification
 
 # %%
-AR_df = df[df["magnetic_class"] != ""].copy()
+AR_df = df_with_quicklook[df_with_quicklook["magnetic_class"] != ""].copy()
 
 ut_v.make_classes_histogram(
     AR_df["mcintosh_class"],
@@ -282,9 +488,8 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
 plt.show()
 
 # %%
-
 # McIntosh classification components
-AR_df = df[df["magnetic_class"] != ""].copy()
+AR_df = df_with_quicklook[df_with_quicklook["magnetic_class"] != ""].copy()
 
 for comp in ["Z_component", "p_component", "c_component"]:
     AR_df[comp] = AR_df["mcintosh_class"].str[{"Z_component": 0, "p_component": 1, "c_component": 2}[comp]]
@@ -435,7 +640,7 @@ def process_row_wrapper(idx):
     fig, axes = plt.subplots(2, 4, figsize=(20, 8))
     for i, (col, title, color) in enumerate(stats_config):
         ax = axes.flat[i]
-        ax.hist(stats_df[col], bins=50, color=color, alpha=0.7, edgecolor="black", linewidth=0.5, log=True)
+        ax.hist(stats_df[col], bins=70, color=color, alpha=0.7, edgecolor="black", linewidth=0.5, log=True)
         ax.set_title(title, fontsize=16, pad=12)
         ax.set_xlabel("Value", fontsize=14)
         if i % 4 == 0:  # First column gets y-label
@@ -465,4 +670,3 @@ def process_row_wrapper(idx):
     fig.autofmt_xdate(ha="right")
     plt.tight_layout()
     plt.show()
-# %%
diff --git a/tox.ini b/tox.ini
index ef24b564..245dad28 100644
--- a/tox.ini
+++ b/tox.ini
@@ -32,6 +32,9 @@ commands =
 [testenv:build_docs]
 changedir = docs
 description = Invoke sphinx-build to build the HTML docs
+setenv =
+    LC_ALL = C.UTF-8
+    LANG = C.UTF-8
 extras = docs
 commands =
     pip freeze --all --no-input