Skip to content

Commit 76e751b

Browse files
committed
Add anomaly detector input feature visualization
1 parent 771a14b commit 76e751b

File tree

2 files changed

+419
-12
lines changed

2 files changed

+419
-12
lines changed

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 281 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
"from optuna import Study, create_study\n",
6868
"\n",
6969
"import shap # Explainable AI tool\n",
70+
"import umap\n",
7071
"\n",
7172
"import matplotlib.pyplot as plot"
7273
]
@@ -921,6 +922,7 @@
921922
" cluster_label_column: str = \"clusterLabel\",\n",
922923
" cluster_medoid_column: str = \"clusterMedoid\",\n",
923924
" cluster_size_column: str = \"clusterSize\",\n",
925+
" cluster_color_map: str = \"tab20\",\n",
924926
" anomaly_label_column: str = \"anomalyLabel\",\n",
925927
" anomaly_score_column: str = \"anomalyScore\",\n",
926928
" size_column: str = \"articleRank\",\n",
@@ -929,6 +931,8 @@
929931
" annotate_top_n_anomalies: int = 10,\n",
930932
" annotate_top_n_non_anomalies: int = 5,\n",
931933
" annotate_top_n_clusters: int = 20,\n",
934+
" percentile_of_distance_to_center: float = 0.8,\n",
935+
" no_cluster_coloring: bool = False,\n",
932936
") -> None:\n",
933937
" \n",
934938
" if clustering_visualization_dataframe.empty:\n",
@@ -966,7 +970,7 @@
966970
"\n",
967971
" distances_to_center = calculate_distances_to_center(clustering_visualization_dataframe, x_position_column, y_position_column)\n",
968972
" top_anomaly_columns_mask = mask_top_anomaly_columns(clustering_visualization_dataframe, anomaly_score_column, annotate_top_n_anomalies)\n",
969-
" clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask)\n",
973+
" clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask, percentile_of_distance_to_center)\n",
970974
"\n",
971975
" cluster_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] == 1]\n",
972976
" cluster_without_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] != 1]\n",
@@ -982,7 +986,7 @@
982986
" y=cluster_noise[y_position_column],\n",
983987
" s=cluster_noise[size_column] * 60 + 2,\n",
984988
" color='lightgrey',\n",
985-
" alpha=0.4,\n",
989+
" alpha=0.3,\n",
986990
" label='Noise'\n",
987991
" )\n",
988992
"\n",
@@ -991,9 +995,9 @@
991995
" x=cluster_non_noise[x_position_column],\n",
992996
" y=cluster_non_noise[y_position_column],\n",
993997
" s=cluster_non_noise[size_column] * 60 + 2,\n",
994-
" c=cluster_non_noise[cluster_label_column],\n",
995-
" cmap='tab20',\n",
996-
" alpha=0.7,\n",
998+
" c=cluster_non_noise[cluster_label_column] if not no_cluster_coloring else 'silver',\n",
999+
" cmap=cluster_color_map if not no_cluster_coloring else None,\n",
1000+
" alpha=0.5,\n",
9971001
" label='Clusters'\n",
9981002
" )\n",
9991003
"\n",
@@ -1085,7 +1089,7 @@
10851089
" plot.annotate(\n",
10861090
" text=f\"#{index + 1}: {truncate(row[code_unit_column])} ({row[anomaly_score_column]:.3f})\",\n",
10871091
" xy=(row[x_position_column], row[y_position_column]),\n",
1088-
" xytext=(5, 5 + (index % 5) * 10),\n",
1092+
" xytext=(5, 5 + (index % 5) * 15),\n",
10891093
" color='red',\n",
10901094
" **plot_annotation_style\n",
10911095
" )\n",
@@ -1103,12 +1107,210 @@
11031107
"plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")"
11041108
]
11051109
},
1110+
{
1111+
"cell_type": "markdown",
1112+
"id": "77dee89a",
1113+
"metadata": {},
1114+
"source": [
1115+
"#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
1116+
"\n",
1117+
"This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
1118+
"\n",
1119+
"- Red: detected anomalies \n",
1120+
"- Light blue (light grey in the zoomed plot): code units labeled as noise by HDBSCAN \n",
1121+
"- Light grey (silver in the zoomed plot): code units assigned to a cluster \n",
1122+
"- Size: Article Rank (larger = more important)"
1123+
]
1124+
},
1125+
{
1126+
"cell_type": "code",
1127+
"execution_count": null,
1128+
"id": "c30a29f8",
1129+
"metadata": {},
1130+
"outputs": [],
1131+
"source": [
1132+
"def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:\n",
1133+
" \"\"\"\n",
1134+
" Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.\n",
1135+
" see https://umap-learn.readthedocs.io\n",
1136+
" \"\"\"\n",
1137+
"\n",
1138+
" # Check if features are empty\n",
1139+
" if features is None or len(features) == 0:\n",
1140+
" print(\"No feature data available\")\n",
1141+
" return anomaly_detection_results\n",
1142+
"\n",
1143+
" # Check if features and anomaly_detection_results have compatible lengths\n",
1144+
" if features.shape[0] != anomaly_detection_results.shape[0]:\n",
1145+
" raise ValueError(\"Features and anomaly_detection_results must have the same number of samples.\")\n",
1146+
"\n",
1147+
" # Use UMAP to reduce the dimensionality to 2D for visualization\n",
1148+
" umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n",
1149+
" two_dimensional_features = umap_reducer.fit_transform(features)\n",
1150+
" \n",
1151+
" # Convert to dense numpy array (works for both sparse and dense input)\n",
1152+
" feature_coordinates = np.asarray(two_dimensional_features)\n",
1153+
"\n",
1154+
" anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0]\n",
1155+
" anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1]\n",
1156+
"\n",
1157+
" return anomaly_detection_results"
1158+
]
1159+
},
1160+
{
1161+
"cell_type": "code",
1162+
"execution_count": null,
1163+
"id": "f02b5dec",
1164+
"metadata": {},
1165+
"outputs": [],
1166+
"source": [
1167+
"def plot_features_with_anomalies(\n",
1168+
" clustering_visualization_dataframe: pd.DataFrame,\n",
1169+
" title_prefix: str,\n",
1170+
" code_unit_column: str = \"shortCodeUnitName\",\n",
1171+
" cluster_label_column: str = \"clusterLabel\",\n",
1172+
" anomaly_label_column: str = \"anomalyLabel\",\n",
1173+
" anomaly_score_column: str = \"anomalyScore\",\n",
1174+
" size_column: str = \"articleRank\",\n",
1175+
" x_position_column: str = 'embeddingVisualizationX',\n",
1176+
" y_position_column: str = 'embeddingVisualizationY',\n",
1177+
" annotate_top_n_anomalies: int = 10,\n",
1178+
" annotate_fully_top_n_anomalies: int = 3,\n",
1179+
") -> None:\n",
1180+
" \n",
1181+
" if clustering_visualization_dataframe.empty:\n",
1182+
" print(\"No projected data to plot available\")\n",
1183+
" return\n",
1184+
" \n",
1185+
" def truncate(text: str, max_length: int = 22):\n",
1186+
" if len(text) <= max_length:\n",
1187+
" return text\n",
1188+
" return text[:max_length - 3] + \"...\"\n",
1189+
"\n",
1190+
"\n",
1191+
" cluster_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] == 1]\n",
1192+
" cluster_without_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] != 1]\n",
1193+
" cluster_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] == -1]\n",
1194+
" cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n",
1195+
"\n",
1196+
" plot.figure(figsize=(10, 10))\n",
1197+
" plot.title(f\"{title_prefix} (size={size_column}, red=anomaly, blue=noise)\", pad=20)\n",
1198+
"\n",
1199+
" # Plot noise (from clustering)\n",
1200+
" plot.scatter(\n",
1201+
" x=cluster_noise[x_position_column],\n",
1202+
" y=cluster_noise[y_position_column],\n",
1203+
" s=cluster_noise[size_column] * 20 + 2,\n",
1204+
" color='lightblue',\n",
1205+
" alpha=0.4,\n",
1206+
" label='Noise'\n",
1207+
" )\n",
1208+
"\n",
1209+
" # Plot clusters\n",
1210+
" plot.scatter(\n",
1211+
" x=cluster_non_noise[x_position_column],\n",
1212+
" y=cluster_non_noise[y_position_column],\n",
1213+
" s=cluster_non_noise[size_column] * 20 + 2,\n",
1214+
" color='lightgrey',\n",
1215+
" alpha=0.6,\n",
1216+
" label='Clusters'\n",
1217+
" )\n",
1218+
"\n",
1219+
" # Plot anomalies\n",
1220+
" plot.scatter(\n",
1221+
" x=cluster_anomalies[x_position_column],\n",
1222+
" y=cluster_anomalies[y_position_column],\n",
1223+
" s=cluster_anomalies[size_column] * 10 + 2,\n",
1224+
" c=cluster_anomalies[anomaly_score_column],\n",
1225+
" cmap=\"Reds\",\n",
1226+
" alpha=0.95,\n",
1227+
" label='Anomaly',\n",
1228+
" )\n",
1229+
"\n",
1230+
" # Annotate top anomalies\n",
1231+
" anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(annotate_top_n_anomalies)\n",
1232+
" anomalies_in_reversed_order = anomalies.iloc[::-1] # plot most important annotations last to overlap less important ones\n",
1233+
" for dataframe_index, row in anomalies_in_reversed_order.iterrows():\n",
1234+
" index = typing.cast(int, dataframe_index)\n",
1235+
" text = f\"{index + 1}\"\n",
1236+
" xytext = (5, 5)\n",
1237+
" if index < annotate_fully_top_n_anomalies:\n",
1238+
" text = f\"{text}: {truncate(row[code_unit_column])}\"\n",
1239+
" xytext = (5, 5 + (index % 4) * 12)\n",
1240+
"\n",
1241+
" plot.annotate(\n",
1242+
" text=text,\n",
1243+
" xy=(row[x_position_column], row[y_position_column]),\n",
1244+
" xytext=xytext,\n",
1245+
" color='red',\n",
1246+
" **plot_annotation_style\n",
1247+
" )\n",
1248+
"\n",
1249+
" plot.show()"
1250+
]
1251+
},
1252+
{
1253+
"cell_type": "code",
1254+
"execution_count": null,
1255+
"id": "f6af9eb9",
1256+
"metadata": {},
1257+
"outputs": [],
1258+
"source": [
1259+
"java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
1260+
" java_package_anomaly_detection_features_prepared,\n",
1261+
" java_package_anomaly_detection_features\n",
1262+
")"
1263+
]
1264+
},
1265+
{
1266+
"cell_type": "code",
1267+
"execution_count": null,
1268+
"id": "7a679562",
1269+
"metadata": {},
1270+
"outputs": [],
1271+
"source": [
1272+
"plot_features_with_anomalies(\n",
1273+
" java_package_anomaly_detection_features,\n",
1274+
" title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n",
1275+
" x_position_column='featureVisualizationX',\n",
1276+
" y_position_column='featureVisualizationY',\n",
1277+
" annotate_top_n_anomalies=5,\n",
1278+
")"
1279+
]
1280+
},
1281+
{
1282+
"cell_type": "markdown",
1283+
"id": "f9832cc9",
1284+
"metadata": {},
1285+
"source": [
1286+
"##### 1.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)"
1287+
]
1288+
},
1289+
{
1290+
"cell_type": "code",
1291+
"execution_count": null,
1292+
"id": "acbe2034",
1293+
"metadata": {},
1294+
"outputs": [],
1295+
"source": [
1296+
"plot_anomalies(\n",
1297+
" java_package_anomaly_detection_features,\n",
1298+
" title_prefix=\"Java Package Anomalies (2D Feature Visualization Zoomed)\",\n",
1299+
" x_position_column='featureVisualizationX',\n",
1300+
" y_position_column='featureVisualizationY',\n",
1301+
" annotate_top_n_clusters=0,\n",
1302+
" annotate_top_n_non_anomalies=0,\n",
1303+
" percentile_of_distance_to_center=0.7,\n",
1304+
" no_cluster_coloring=True\n",
1305+
")"
1306+
]
1307+
},
11061308
{
11071309
"cell_type": "markdown",
11081310
"id": "0f1b08b6",
11091311
"metadata": {},
11101312
"source": [
1111-
"#### 1.4b Plot anomalies solely based on embeddings"
1313+
"#### 1.4c Plot anomalies solely based on embeddings"
11121314
]
11131315
},
11141316
{
@@ -1914,12 +2116,83 @@
19142116
"plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")"
19152117
]
19162118
},
2119+
{
2120+
"cell_type": "markdown",
2121+
"id": "6eb52ab0",
2122+
"metadata": {},
2123+
"source": [
2124+
"#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
2125+
"\n",
2126+
"This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
2127+
"\n",
2128+
"- Red: detected anomalies \n",
2129+
"- Light blue (light grey in the zoomed plot): code units labeled as noise by HDBSCAN \n",
2130+
"- Light grey (silver in the zoomed plot): code units assigned to a cluster \n",
2131+
"- Size: Article Rank (larger = more important)"
2132+
]
2133+
},
2134+
{
2135+
"cell_type": "code",
2136+
"execution_count": null,
2137+
"id": "129cced0",
2138+
"metadata": {},
2139+
"outputs": [],
2140+
"source": [
2141+
"java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
2142+
" java_type_anomaly_detection_features_prepared,\n",
2143+
" java_type_anomaly_detection_features\n",
2144+
")"
2145+
]
2146+
},
2147+
{
2148+
"cell_type": "code",
2149+
"execution_count": null,
2150+
"id": "f05ef08c",
2151+
"metadata": {},
2152+
"outputs": [],
2153+
"source": [
2154+
"plot_features_with_anomalies(\n",
2155+
" java_type_anomaly_detection_features,\n",
2156+
" title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n",
2157+
" x_position_column='featureVisualizationX',\n",
2158+
" y_position_column='featureVisualizationY',\n",
2159+
" annotate_top_n_anomalies=30\n",
2160+
")"
2161+
]
2162+
},
2163+
{
2164+
"cell_type": "markdown",
2165+
"id": "3472efed",
2166+
"metadata": {},
2167+
"source": [
2168+
"##### 2.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)"
2169+
]
2170+
},
2171+
{
2172+
"cell_type": "code",
2173+
"execution_count": null,
2174+
"id": "c44f04e9",
2175+
"metadata": {},
2176+
"outputs": [],
2177+
"source": [
2178+
"plot_anomalies(\n",
2179+
" java_type_anomaly_detection_features,\n",
2180+
" title_prefix=\"Java Type Anomalies (2D Feature Visualization Zoomed)\",\n",
2181+
" x_position_column='featureVisualizationX',\n",
2182+
" y_position_column='featureVisualizationY',\n",
2183+
" annotate_top_n_clusters=0,\n",
2184+
" annotate_top_n_non_anomalies=0,\n",
2185+
" percentile_of_distance_to_center=0.7,\n",
2186+
" no_cluster_coloring=True\n",
2187+
")"
2188+
]
2189+
},
19172190
{
19182191
"cell_type": "markdown",
19192192
"id": "05275be7",
19202193
"metadata": {},
19212194
"source": [
1922-
"#### 2.4.b Plot anomalies solely based on embeddings"
2195+
"#### 2.4c Plot anomalies solely based on embeddings"
19232196
]
19242197
},
19252198
{

0 commit comments

Comments
 (0)