JohT
diff --git a/‎domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb‎
Lines changed: 281 additions & 8 deletions b/‎domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb‎
Lines changed: 281 additions & 8 deletions
@@ -67,6 +67,7 @@
     "from optuna import Study, create_study\n",
     "\n",
     "import shap # Explainable AI tool\n",
+    "import umap\n",
     "\n",
     "import matplotlib.pyplot as plot"
    ]
@@ -921,6 +922,7 @@
     "    cluster_label_column: str = \"clusterLabel\",\n",
     "    cluster_medoid_column: str = \"clusterMedoid\",\n",
     "    cluster_size_column: str = \"clusterSize\",\n",
+    "    cluster_color_map: str = \"tab20\",\n",
     "    anomaly_label_column: str = \"anomalyLabel\",\n",
     "    anomaly_score_column: str = \"anomalyScore\",\n",
     "    size_column: str = \"articleRank\",\n",
@@ -929,6 +931,8 @@
     "    annotate_top_n_anomalies: int = 10,\n",
     "    annotate_top_n_non_anomalies: int = 5,\n",
     "    annotate_top_n_clusters: int = 20,\n",
+    "    percentile_of_distance_to_center: float = 0.8,\n",
+    "    no_cluster_coloring: bool = False,\n",
     ") -> None:\n",
     "    \n",
     "    if clustering_visualization_dataframe.empty:\n",
@@ -966,7 +970,7 @@
     "\n",
     "    distances_to_center = calculate_distances_to_center(clustering_visualization_dataframe, x_position_column, y_position_column)\n",
     "    top_anomaly_columns_mask = mask_top_anomaly_columns(clustering_visualization_dataframe, anomaly_score_column, annotate_top_n_anomalies)\n",
-    "    clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask)\n",
+    "    clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask, percentile_of_distance_to_center)\n",
     "\n",
     "    cluster_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] == 1]\n",
     "    cluster_without_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] != 1]\n",
@@ -982,7 +986,7 @@
     "        y=cluster_noise[y_position_column],\n",
     "        s=cluster_noise[size_column] * 60 + 2,\n",
     "        color='lightgrey',\n",
-    "        alpha=0.4,\n",
+    "        alpha=0.3,\n",
     "        label='Noise'\n",
     "    )\n",
     "\n",
@@ -991,9 +995,9 @@
     "        x=cluster_non_noise[x_position_column],\n",
     "        y=cluster_non_noise[y_position_column],\n",
     "        s=cluster_non_noise[size_column] * 60 + 2,\n",
-    "        c=cluster_non_noise[cluster_label_column],\n",
-    "        cmap='tab20',\n",
-    "        alpha=0.7,\n",
+    "        c=cluster_non_noise[cluster_label_column] if not no_cluster_coloring else 'silver',\n",
+    "        cmap=cluster_color_map if not no_cluster_coloring else None,\n",
+    "        alpha=0.5,\n",
     "        label='Clusters'\n",
     "    )\n",
     "\n",
@@ -1085,7 +1089,7 @@
     "        plot.annotate(\n",
     "            text=f\"#{index + 1}: {truncate(row[code_unit_column])} ({row[anomaly_score_column]:.3f})\",\n",
     "            xy=(row[x_position_column], row[y_position_column]),\n",
-    "            xytext=(5, 5 + (index % 5) * 10),\n",
+    "            xytext=(5, 5 + (index % 5) * 15),\n",
     "            color='red',\n",
     "            **plot_annotation_style\n",
     "        )\n",
@@ -1103,12 +1107,210 @@
     "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "77dee89a",
+   "metadata": {},
+   "source": [
+    "#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
+    "\n",
+    "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
+    "\n",
+    "- Red: detected anomalies  \n",
+    "- Light blue: code units labeled as noise by HDBSCAN (light grey in the zoomed plot)  \n",
+    "- Light grey: code units that belong to a cluster (silver in the zoomed plot)  \n",
+    "- Size: Article Rank (larger = more important)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c30a29f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.\n",
+    "    see https://umap-learn.readthedocs.io\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Check if features are empty\n",
+    "    if features is None or len(features) == 0:\n",
+    "        print(\"No feature data available\")\n",
+    "        return anomaly_detection_results\n",
+    "\n",
+    "    # Check if features and anomaly_detection_results have compatible lengths\n",
+    "    if features.shape[0] != anomaly_detection_results.shape[0]:\n",
+    "        raise ValueError(\"Features and anomaly_detection_results must have the same number of samples.\")\n",
+    "\n",
+    "    # Use UMAP to reduce the dimensionality to 2D for visualization\n",
+    "    umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n",
+    "    two_dimensional_features = umap_reducer.fit_transform(features)\n",
+    "    \n",
+    "    # Convert to dense numpy array (works for both sparse and dense input)\n",
+    "    feature_coordinates = np.asarray(two_dimensional_features)\n",
+    "\n",
+    "    anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0]\n",
+    "    anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1]\n",
+    "\n",
+    "    return anomaly_detection_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f02b5dec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_features_with_anomalies(\n",
+    "    clustering_visualization_dataframe: pd.DataFrame,\n",
+    "    title_prefix: str,\n",
+    "    code_unit_column: str = \"shortCodeUnitName\",\n",
+    "    cluster_label_column: str = \"clusterLabel\",\n",
+    "    anomaly_label_column: str = \"anomalyLabel\",\n",
+    "    anomaly_score_column: str = \"anomalyScore\",\n",
+    "    size_column: str = \"articleRank\",\n",
+    "    x_position_column: str = 'embeddingVisualizationX',\n",
+    "    y_position_column: str = 'embeddingVisualizationY',\n",
+    "    annotate_top_n_anomalies: int = 10,\n",
+    "    annotate_fully_top_n_anomalies: int = 3,\n",
+    ") -> None:\n",
+    "    \n",
+    "    if clustering_visualization_dataframe.empty:\n",
+    "        print(\"No projected data to plot available\")\n",
+    "        return\n",
+    "    \n",
+    "    def truncate(text: str, max_length: int = 22):\n",
+    "        if len(text) <= max_length:\n",
+    "            return text\n",
+    "        return text[:max_length - 3] + \"...\"\n",
+    "\n",
+    "\n",
+    "    cluster_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] == 1]\n",
+    "    cluster_without_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] != 1]\n",
+    "    cluster_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] == -1]\n",
+    "    cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n",
+    "\n",
+    "    plot.figure(figsize=(10, 10))\n",
+    "    plot.title(f\"{title_prefix} (size={size_column}, red=anomaly, blue=noise)\", pad=20)\n",
+    "\n",
+    "    # Plot noise (from clustering)\n",
+    "    plot.scatter(\n",
+    "        x=cluster_noise[x_position_column],\n",
+    "        y=cluster_noise[y_position_column],\n",
+    "        s=cluster_noise[size_column] * 20 + 2,\n",
+    "        color='lightblue',\n",
+    "        alpha=0.4,\n",
+    "        label='Noise'\n",
+    "    )\n",
+    "\n",
+    "    # Plot clusters\n",
+    "    plot.scatter(\n",
+    "        x=cluster_non_noise[x_position_column],\n",
+    "        y=cluster_non_noise[y_position_column],\n",
+    "        s=cluster_non_noise[size_column] * 20 + 2,\n",
+    "        color='lightgrey',\n",
+    "        alpha=0.6,\n",
+    "        label='Clusters'\n",
+    "    )\n",
+    "\n",
+    "    # Plot anomalies\n",
+    "    plot.scatter(\n",
+    "        x=cluster_anomalies[x_position_column],\n",
+    "        y=cluster_anomalies[y_position_column],\n",
+    "        s=cluster_anomalies[size_column] * 10 + 2,\n",
+    "        c=cluster_anomalies[anomaly_score_column],\n",
+    "        cmap=\"Reds\",\n",
+    "        alpha=0.95,\n",
+    "        label='Anomaly',\n",
+    "    )\n",
+    "\n",
+    "    # Annotate top anomalies\n",
+    "    anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(annotate_top_n_anomalies)\n",
+    "    anomalies_in_reversed_order = anomalies.iloc[::-1] # plot most important annotations last to overlap less important ones\n",
+    "    for dataframe_index, row in anomalies_in_reversed_order.iterrows():\n",
+    "        index = typing.cast(int, dataframe_index)\n",
+    "        text = f\"{index + 1}\"\n",
+    "        xytext = (5, 5)\n",
+    "        if index < annotate_fully_top_n_anomalies:\n",
+    "            text = f\"{text}: {truncate(row[code_unit_column])}\"\n",
+    "            xytext = (5, 5 + (index % 4) * 12)\n",
+    "\n",
+    "        plot.annotate(\n",
+    "            text=text,\n",
+    "            xy=(row[x_position_column], row[y_position_column]),\n",
+    "            xytext=xytext,\n",
+    "            color='red',\n",
+    "            **plot_annotation_style\n",
+    "        )\n",
+    "\n",
+    "    plot.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6af9eb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
+    "    java_package_anomaly_detection_features_prepared,\n",
+    "    java_package_anomaly_detection_features\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a679562",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_features_with_anomalies(\n",
+    "    java_package_anomaly_detection_features,\n",
+    "    title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n",
+    "    x_position_column='featureVisualizationX',\n",
+    "    y_position_column='featureVisualizationY',\n",
+    "    annotate_top_n_anomalies=5,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9832cc9",
+   "metadata": {},
+   "source": [
+    "##### 1.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acbe2034",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomalies(\n",
+    "    java_package_anomaly_detection_features,\n",
+    "    title_prefix=\"Java Package Anomalies (2D Feature Visualization Zoomed)\",\n",
+    "    x_position_column='featureVisualizationX',\n",
+    "    y_position_column='featureVisualizationY',\n",
+    "    annotate_top_n_clusters=0,\n",
+    "    annotate_top_n_non_anomalies=0,\n",
+    "    percentile_of_distance_to_center=0.7,\n",
+    "    no_cluster_coloring=True\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "0f1b08b6",
    "metadata": {},
    "source": [
-    "#### 1.4b Plot anomalies solely based on embeddings"
+    "#### 1.4c Plot anomalies solely based on embeddings"
    ]
   },
   {
@@ -1914,12 +2116,83 @@
     "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6eb52ab0",
+   "metadata": {},
+   "source": [
+    "#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
+    "\n",
+    "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
+    "\n",
+    "- Red: detected anomalies  \n",
+    "- Light blue: code units labeled as noise by HDBSCAN (light grey in the zoomed plot)  \n",
+    "- Light grey: code units that belong to a cluster (silver in the zoomed plot)  \n",
+    "- Size: Article Rank (larger = more important)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "129cced0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
+    "    java_type_anomaly_detection_features_prepared,\n",
+    "    java_type_anomaly_detection_features\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f05ef08c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_features_with_anomalies(\n",
+    "    java_type_anomaly_detection_features,\n",
+    "    title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n",
+    "    x_position_column='featureVisualizationX',\n",
+    "    y_position_column='featureVisualizationY',\n",
+    "    annotate_top_n_anomalies=30\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3472efed",
+   "metadata": {},
+   "source": [
+    "##### 2.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c44f04e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomalies(\n",
+    "    java_type_anomaly_detection_features,\n",
+    "    title_prefix=\"Java Type Anomalies (2D Feature Visualization Zoomed)\",\n",
+    "    x_position_column='featureVisualizationX',\n",
+    "    y_position_column='featureVisualizationY',\n",
+    "    annotate_top_n_clusters=0,\n",
+    "    annotate_top_n_non_anomalies=0,\n",
+    "    percentile_of_distance_to_center=0.7,\n",
+    "    no_cluster_coloring=True\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "05275be7",
    "metadata": {},
    "source": [
-    "#### 2.4.b Plot anomalies solely based on embeddings"
+    "#### 2.4c Plot anomalies solely based on embeddings"
    ]
   },
   {