1 change: 0 additions & 1 deletion arccnet/data_generation/tests/test_data_manager.py
@@ -42,7 +42,6 @@ def test_query_missing(example_query):

# Define a fixture for creating a DataManager instance with default arguments
@pytest.fixture
@pytest.mark.remote_data
def data_manager_default():
dm = DataManager(
start_date=str(datetime(2010, 4, 15)),
1 change: 0 additions & 1 deletion arccnet/data_generation/tests/test_mag_processing.py
@@ -33,7 +33,6 @@ def cleanup():
return (Path(temp_dir), Path(raw_data_dir), Path(processed_data_dir))


@pytest.mark.remote_data
@pytest.fixture
def sunpy_hmi_copies(temp_path_fixture):
n = 5
245 changes: 203 additions & 42 deletions arccnet/notebooks/analysis/EDA_cutouts.py
@@ -9,7 +9,7 @@
# format_version: '1.3'
# jupytext_version: 1.11.2
# kernelspec:
# display_name: py_3.11
# display_name: venv
# language: python
# name: python3
# ---
@@ -29,6 +29,8 @@
import seaborn as sns
from p_tqdm import p_map

from astropy.time import Time

from arccnet import load_config
from arccnet.models import dataset_utils as ut_d
from arccnet.visualisation import utils as ut_v
@@ -50,50 +52,123 @@
dataset_title = "arccnet v20251017"

# %%
df, _, filtered_ql_df = ut_d.make_dataframe(data_folder, dataset_folder, df_file_name)
ut_v.make_classes_histogram(df["label"], figsz=(18, 6), text_fontsize=11, title=dataset_title)
# complete dataframe
df_raw = pd.read_parquet(os.path.join(data_folder, dataset_folder, df_file_name))
df_raw["label"] = np.where(df_raw["magnetic_class"] == "", df_raw["region_type"], df_raw["magnetic_class"])
ut_v.make_classes_histogram(df_raw["label"], figsz=(18, 6), text_fontsize=11, title=dataset_title)
plt.show()
df_raw


# %% [markdown]
# # Quality Flags


# %%
# Filter out rows without either quicklook path
def has_quicklook_path(df: pd.DataFrame) -> pd.Series:
"""Return boolean mask for rows with at least one valid quicklook path."""
has_mdi = df["quicklook_path_mdi"].notna() & (df["quicklook_path_mdi"] != "") & (df["quicklook_path_mdi"] != "None")
has_hmi = df["quicklook_path_hmi"].notna() & (df["quicklook_path_hmi"] != "") & (df["quicklook_path_hmi"] != "None")
return has_mdi | has_hmi


df_with_quicklook = df_raw[has_quicklook_path(df_raw)].copy()
print(
f"Filtered dataframe: {len(df_with_quicklook):,} rows retained from {len(df_raw):,} ({len(df_with_quicklook) / len(df_raw) * 100:.1f}%)"
)
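
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff) of how
# `has_quicklook_path` treats the three "missing" encodings: NaN, the empty
# string, and the literal string "None".

# %%
_demo = pd.DataFrame(
    {
        "quicklook_path_mdi": ["a.png", "", "None", np.nan],
        "quicklook_path_hmi": [np.nan, "b.png", "", "None"],
    }
)
has_quicklook_path(_demo)  # expected mask: [True, True, False, False]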


# %%
def propagate_quality(df: pd.DataFrame, instrument: str = "hmi") -> pd.DataFrame:
qual_col = f"QUALITY_{instrument}"
qual_mask_col = f"{qual_col}.mask"
qlk_col = f"quicklook_path_{instrument}"
qlk_mask_col = f"{qlk_col}.mask"

# keep rows with a usable quicklook key
keyed = df[~df[qlk_mask_col] & df[qlk_col].notna()].copy()

# canonical quality values (mask/empty/nan -> NA)
qvals = keyed[qual_col].mask(keyed[qual_mask_col])
qvals = qvals.astype(str).str.strip().replace({"": pd.NA, "nan": pd.NA})

# consensus quality per quicklook (only if exactly one non-empty unique value)
consensus = (
qvals.groupby(keyed[qlk_col])
.agg(lambda s: s.dropna().unique())
.loc[lambda s: s.str.len() == 1]
.map(lambda arr: arr[0])
)

# map consensus back to all rows sharing that quicklook
fill_values = df[qlk_col].map(consensus)
to_fill = fill_values.notna()
df.loc[to_fill, qual_col] = fill_values[to_fill]
df.loc[to_fill, qual_mask_col] = False
return df


df_quality_propagated = propagate_quality(df_with_quicklook.copy(), "hmi")
df_quality_propagated = propagate_quality(df_quality_propagated, "mdi")
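
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff) of the consensus
# step inside `propagate_quality`: rows sharing a quicklook inherit a quality
# value only when exactly one distinct non-empty value exists for that quicklook.

# %%
_q = pd.Series(["0x0", None, "0x0", "0x4", "0x8"])
_k = pd.Series(["ql1", "ql1", "ql1", "ql2", "ql2"])
(
    _q.groupby(_k)
    .agg(lambda s: s.dropna().unique())
    .loc[lambda s: s.str.len() == 1]
    .map(lambda arr: arr[0])
)  # expected: ql1 -> "0x0"; ql2 is dropped (two conflicting values)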

# %%
mdi_color = "royalblue"
hmi_color = "tomato"

df_MDI = df[df["path_image_cutout_hmi"] == ""].copy()
df_HMI = df[df["path_image_cutout_mdi"] == ""].copy()
df_raw_mdi_only = df_raw[df_raw["path_image_cutout_hmi"] == ""].copy()
df_raw_hmi_only = df_raw[df_raw["path_image_cutout_mdi"] == ""].copy()

# %% [markdown]
# # Quality Flags
df_quality_propagated_mdi_only = df_quality_propagated[df_quality_propagated["path_image_cutout_hmi"] == ""].copy()
df_quality_propagated_hmi_only = df_quality_propagated[df_quality_propagated["path_image_cutout_mdi"] == ""].copy()

# %%
quality_mdi_df = analyze_quality_flags(df_MDI, "SOHO/MDI")
quality_mdi_df = analyze_quality_flags(df_raw_mdi_only, "SOHO/MDI")
quality_mdi_df

# %%
quality_hmi_df = analyze_quality_flags(df_HMI, "SDO/HMI")
quality_hmi_df = analyze_quality_flags(df_raw_hmi_only, "SDO/HMI")
quality_hmi_df


# %%
# Quality flags after propagation
analyze_quality_flags(df_quality_propagated_hmi_only, "SDO/HMI")

# %%
analyze_quality_flags(df_quality_propagated_mdi_only, "SOHO/MDI")

# %% [markdown]
# ## Data Filtering

# %%
# Remove bad quality data
hmi_good_flags = ["", "0x00000000", "0x00000400"]
mdi_good_flags = ["", "00000000", "00000200"]
# Remove bad quality data using df_quality_propagated (with propagated quality flags)
hmi_good_flags = ["0x00000000", "0x00000400"]
mdi_good_flags = ["00000000", "00000200"]

df_clean = df[df["QUALITY_hmi"].isin(hmi_good_flags) & df["QUALITY_mdi"].isin(mdi_good_flags)]
df_HMI_clean = df_HMI[df_HMI["QUALITY_hmi"].isin(hmi_good_flags)]
df_MDI_clean = df_MDI[df_MDI["QUALITY_mdi"].isin(mdi_good_flags)]
# Filter on quality flags: keep a row only if each instrument is either absent (empty cutout path) or has a good flag
hmi_good = df_quality_propagated["QUALITY_hmi"].isin(hmi_good_flags) | (
df_quality_propagated["path_image_cutout_hmi"] == ""
)
mdi_good = df_quality_propagated["QUALITY_mdi"].isin(mdi_good_flags) | (
df_quality_propagated["path_image_cutout_mdi"] == ""
)
df_quality_filtered = df_quality_propagated[hmi_good & mdi_good].copy()

df_quality_filtered_hmi = df_quality_propagated_hmi_only[
df_quality_propagated_hmi_only["QUALITY_hmi"].isin(hmi_good_flags)
]
df_quality_filtered_mdi = df_quality_propagated_mdi_only[
df_quality_propagated_mdi_only["QUALITY_mdi"].isin(mdi_good_flags)
]

print("DATA FILTERING Stats")
print("-" * 40)
hmi_orig, hmi_clean = len(df_HMI), len(df_HMI_clean)
mdi_orig, mdi_clean = len(df_MDI), len(df_MDI_clean)
total_orig, total_clean = len(df), len(df_clean)
hmi_orig, hmi_clean_count = len(df_quality_propagated_hmi_only), len(df_quality_filtered_hmi)
mdi_orig, mdi_clean_count = len(df_quality_propagated_mdi_only), len(df_quality_filtered_mdi)
total_orig, total_clean = len(df_quality_propagated), len(df_quality_filtered)

print(f"HMI: {hmi_clean:,}/{hmi_orig:,} ({hmi_clean / hmi_orig * 100:.1f}% retained)")
print(f"MDI: {mdi_clean:,}/{mdi_orig:,} ({mdi_clean / mdi_orig * 100:.1f}% retained)")
print(f"HMI: {hmi_clean_count:,}/{hmi_orig:,} ({hmi_clean_count / hmi_orig * 100:.1f}% retained)")
print(f"MDI: {mdi_clean_count:,}/{mdi_orig:,} ({mdi_clean_count / mdi_orig * 100:.1f}% retained)")
print(f"Total: {total_clean:,}/{total_orig:,} ({total_clean / total_orig * 100:.1f}% retained)")
print("-" * 40)

@@ -107,43 +182,64 @@ def is_missing(series):
return series.isna() | (series == "") | (series == "None")


hmi_missing = is_missing(df_clean["path_image_cutout_hmi"])
mdi_missing = is_missing(df_clean["path_image_cutout_mdi"])
hmi_missing = is_missing(df_quality_filtered["path_image_cutout_hmi"])
mdi_missing = is_missing(df_quality_filtered["path_image_cutout_mdi"])
both_missing = hmi_missing & mdi_missing

# Count statistics
stats = {
"total": len(df_clean),
"hmi_none": (df_clean["path_image_cutout_hmi"] == "None").sum(),
"hmi_empty": (df_clean["path_image_cutout_hmi"] == "").sum(),
"mdi_none": (df_clean["path_image_cutout_mdi"] == "None").sum(),
"mdi_empty": (df_clean["path_image_cutout_mdi"] == "").sum(),
"total": len(df_quality_filtered),
"hmi_none": (df_quality_filtered["path_image_cutout_hmi"] == "None").sum(),
"hmi_empty": (df_quality_filtered["path_image_cutout_hmi"] == "").sum(),
"mdi_none": (df_quality_filtered["path_image_cutout_mdi"] == "None").sum(),
"mdi_empty": (df_quality_filtered["path_image_cutout_mdi"] == "").sum(),
"both_missing": both_missing.sum(),
"at_least_one": (~hmi_missing | ~mdi_missing).sum(),
"both_exist": (~hmi_missing & ~mdi_missing).sum(),
}

print("PATH ANALYSIS:")
print("PATH ANALYSIS (path_image_cutout columns):")
print("-" * 40)
print(f"Total rows in df_clean: {stats['total']:,}")
print(f"Total rows in quality-filtered df: {stats['total']:,}")
print(f"HMI paths - None: {stats['hmi_none']:,}, Empty: {stats['hmi_empty']:,}")
print(f"MDI paths - None: {stats['mdi_none']:,}, Empty: {stats['mdi_empty']:,}")
print(f"Both paths missing: {stats['both_missing']:,} ({stats['both_missing'] / stats['total'] * 100:.1f}%)")
print(f"At least one path exists: {stats['at_least_one']:,}")
print(f"Both paths exist: {stats['both_exist']:,}")

# Remove rows where both paths are missing
df_clean = df_clean[~both_missing].copy()
df_final = df_quality_filtered[~both_missing].copy()
df_final = df_final.reset_index(drop=True)

print(f"\nAfter removing rows with both paths missing: {len(df_final):,} rows")


# %%
df_quality_filtered.loc[both_missing]


# %%
def _convert_jd_to_datetime(df):
"""Convert Julian dates to datetime objects."""
df["time"] = df["target_time.jd1"] + df["target_time.jd2"]
times = Time(df["time"], format="jd")
df["dates"] = pd.to_datetime(times.iso)
return df

df_clean = df_clean.reset_index(drop=True)

final_df = _convert_jd_to_datetime(df_final)
final_df
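
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff): `target_time` is
# stored as a split Julian date (astropy's two-double jd1/jd2 representation);
# summing the parts and round-tripping through `astropy.time.Time` yields
# pandas datetimes.

# %%
_jd = 2455301.5 + 0.0  # jd1=2455301.5, jd2=0.0
pd.to_datetime(Time(_jd, format="jd").iso)  # Timestamp('2010-04-15 00:00:00')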

# %%
ut_v.make_classes_histogram(final_df["label"], figsz=(14, 6), text_fontsize=11, title=f"{dataset_title} - Processed")
plt.show()

# %% [markdown]
# # Location of ARs on the Sun

# %%
AR_IA_lbs = ["Alpha", "Beta", "IA", "Beta-Gamma-Delta", "Beta-Gamma", "Beta-Delta", "Gamma-Delta", "Gamma"]
AR_IA_df = df_clean[df_clean["label"].isin(AR_IA_lbs)].reset_index(drop=True)
AR_IA_df = final_df[final_df["label"].isin(AR_IA_lbs)].reset_index(drop=True)

# %%
ut_v.make_classes_histogram(AR_IA_df["label"], figsz=(12, 7), text_fontsize=11, title=f"{dataset_title} ARs", y_off=100)
@@ -158,10 +254,11 @@ def get_coordinates(df, coord_type):
return np.deg2rad(np.where(df["path_image_cutout_hmi"] != "", df[hmi_col], df[mdi_col]))


def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
def plot_histogram(ax, data, degree_ticks, degree_bins, title, color="#4C72B0"):
"""Plot histogram with degree labels."""
rad_ticks = np.deg2rad(degree_ticks)
ax.hist(data, bins=rad_ticks, color=color, edgecolor="black")
rad_bins = np.deg2rad(degree_bins)
ax.hist(data, bins=rad_bins, color=color, edgecolor="black", linewidth=0.5)
ax.set_xticks(rad_ticks)
ax.set_xticklabels([f"{deg}°" for deg in degree_ticks])
ax.set_xlabel(f"{title} (degrees)")
@@ -172,12 +269,15 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
lonV = get_coordinates(AR_IA_df, "longitude")
latV = get_coordinates(AR_IA_df, "latitude")
degree_ticks = np.arange(-90, 91, 15)
degree_bins = np.arange(-90, 91, 2)

# Plot histograms
with plt.style.context("seaborn-v0_8-darkgrid"):
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
plot_histogram(ax1, lonV, degree_ticks, "Longitude")
plot_histogram(ax2, latV, degree_ticks, "Latitude")
plot_histogram(ax1, lonV, degree_ticks, degree_bins, title="Longitude")
plot_histogram(ax2, latV, degree_ticks, degree_bins, title="Latitude")
plt.tight_layout()
plt.show()
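
# %% [markdown]
# Note: `degree_ticks` (15 degree spacing) now only controls the axis labels,
# while the new `degree_bins` (2 degree spacing) sets the histogram edges, so
# the latitude/longitude distributions are resolved more finely without
# crowding the tick labels.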

@@ -221,6 +321,69 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
ut_v.make_classes_histogram(results["df_rear"]["label"], title="Rear ARs", y_off=10, figsz=(11, 5))
plt.show()

# %% [markdown]
# # Quality Data Issues

# %%
problematic_quicklooks = [
ql.strip().lstrip("\\").strip() for ql in config.get("magnetograms", "problematic_quicklooks").split(",")
]

# %%
final_cleaned_df = AR_IA_df[~AR_IA_df["quicklook_path_hmi"].isin(problematic_quicklooks)].reset_index(drop=True)
basenames = final_df["quicklook_path_mdi"].fillna("").apply(os.path.basename)  # guard against missing (NaN) MDI paths
mask = basenames.isin(problematic_quicklooks)
problematic_rows = final_df[mask]
problematic_rows.to_parquet("problematic_rows_good_quality.parq")

# %%
problematic_rows["QUALITY_mdi"].value_counts()

# %%
problematic_rows[problematic_rows["QUALITY_mdi"] == "00000000"]["quicklook_path_mdi"].unique()

# %%
problematic_rows[problematic_rows["QUALITY_mdi"] == "00000200"]["quicklook_path_mdi"].unique()


# %%
def _remove_problematic_quicklooks(df):
"""Remove problematic magnetograms from the dataset."""

def is_problematic(path):
"""Check if a path's basename matches any problematic quicklook."""
if not path or pd.isna(path) or path == "" or path == "None":
return False
return os.path.basename(path) in problematic_quicklooks

# Check both MDI and HMI quicklook paths
mask_mdi = df["quicklook_path_mdi"].apply(is_problematic)
mask_hmi = df["quicklook_path_hmi"].apply(is_problematic)
mask = mask_mdi | mask_hmi
filtered_df = df[mask]
df = df[~mask].reset_index(drop=True)
return df, filtered_df


# Apply the function to final_df
final_df_cleaned, problematic_final_df = _remove_problematic_quicklooks(final_df)

print(f"Original dataframe: {len(final_df):,} rows")
print(f"Cleaned dataframe: {len(final_df_cleaned):,} rows")
print(
f"Problematic rows removed: {len(problematic_final_df):,} rows ({len(problematic_final_df) / len(final_df) * 100:.2f}%)"
)
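
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff): the blocklist is
# matched on basenames, so the same problematic quicklook is caught regardless
# of which directory tree it sits under.

# %%
_blocklist = {"bad_ql.png"}  # hypothetical filename, for illustration
[os.path.basename(p) in _blocklist for p in ["/raw/2011/bad_ql.png", "/raw/2011/good_ql.png"]]
# expected: [True, False]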

# %%
# Create AR_IA_df from cleaned dataframe
AR_IA_df = final_df_cleaned[final_df_cleaned["label"].isin(AR_IA_lbs)].reset_index(drop=True)

print(f"AR_IA_df created from cleaned data: {len(AR_IA_df):,} rows")
ut_v.make_classes_histogram(
AR_IA_df["label"], figsz=(12, 7), text_fontsize=11, title=f"{dataset_title} ARs (Cleaned)", y_off=100
)
plt.show()

# %% [markdown]
# # Time Distribution
# %%
@@ -267,7 +430,7 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
# # McIntosh Classification

# %%
AR_df = df[df["magnetic_class"] != ""].copy()
AR_df = df_with_quicklook[df_with_quicklook["magnetic_class"] != ""].copy()

ut_v.make_classes_histogram(
AR_df["mcintosh_class"],
@@ -282,9 +445,8 @@ def plot_histogram(ax, data, degree_ticks, title, color="#4C72B0"):
plt.show()

# %%

# McIntosh classification components
AR_df = df[df["magnetic_class"] != ""].copy()
AR_df = df_with_quicklook[df_with_quicklook["magnetic_class"] != ""].copy()
for comp in ["Z_component", "p_component", "c_component"]:
AR_df[comp] = AR_df["mcintosh_class"].str[{"Z_component": 0, "p_component": 1, "c_component": 2}[comp]]
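
# %% [markdown]
# A minimal sketch (illustrative only, not part of this diff) of the positional
# `.str[i]` split: a McIntosh class decomposes into its Zurich (Z), penumbra (p)
# and compactness (c) letters.

# %%
_mc = pd.Series(["Ekc", "Dao"])
pd.DataFrame({"Z": _mc.str[0], "p": _mc.str[1], "c": _mc.str[2]})
# Ekc -> Z='E', p='k', c='c'; Dao -> Z='D', p='a', c='o'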

@@ -435,7 +597,7 @@ def process_row_wrapper(idx):
fig, axes = plt.subplots(2, 4, figsize=(20, 8))
for i, (col, title, color) in enumerate(stats_config):
ax = axes.flat[i]
ax.hist(stats_df[col], bins=50, color=color, alpha=0.7, edgecolor="black", linewidth=0.5, log=True)
ax.hist(stats_df[col], bins=70, color=color, alpha=0.7, edgecolor="black", linewidth=0.5, log=True)
ax.set_title(title, fontsize=16, pad=12)
ax.set_xlabel("Value", fontsize=14)
if i % 4 == 0: # First column gets y-label
@@ -465,4 +627,3 @@ def process_row_wrapper(idx):
fig.autofmt_xdate(ha="right")
plt.tight_layout()
plt.show()
# %%