|
67 | 67 | "from optuna import Study, create_study\n", |
68 | 68 | "\n", |
69 | 69 | "import shap # Explainable AI tool\n", |
| 70 | + "import umap\n", |
70 | 71 | "\n", |
71 | 72 | "import matplotlib.pyplot as plot" |
72 | 73 | ] |
|
921 | 922 | " cluster_label_column: str = \"clusterLabel\",\n", |
922 | 923 | " cluster_medoid_column: str = \"clusterMedoid\",\n", |
923 | 924 | " cluster_size_column: str = \"clusterSize\",\n", |
| 925 | + " cluster_color_map: str = \"tab20\",\n", |
924 | 926 | " anomaly_label_column: str = \"anomalyLabel\",\n", |
925 | 927 | " anomaly_score_column: str = \"anomalyScore\",\n", |
926 | 928 | " size_column: str = \"articleRank\",\n", |
|
929 | 931 | " annotate_top_n_anomalies: int = 10,\n", |
930 | 932 | " annotate_top_n_non_anomalies: int = 5,\n", |
931 | 933 | " annotate_top_n_clusters: int = 20,\n", |
| 934 | + " percentile_of_distance_to_center: float = 0.8,\n", |
| 935 | + " no_cluster_coloring: bool = False,\n", |
932 | 936 | ") -> None:\n", |
933 | 937 | " \n", |
934 | 938 | " if clustering_visualization_dataframe.empty:\n", |
|
966 | 970 | "\n", |
967 | 971 | " distances_to_center = calculate_distances_to_center(clustering_visualization_dataframe, x_position_column, y_position_column)\n", |
968 | 972 | " top_anomaly_columns_mask = mask_top_anomaly_columns(clustering_visualization_dataframe, anomaly_score_column, annotate_top_n_anomalies)\n", |
969 | | - " clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask)\n", |
| 973 | + " clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask, percentile_of_distance_to_center)\n", |
970 | 974 | "\n", |
971 | 975 | " cluster_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] == 1]\n", |
972 | 976 | " cluster_without_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] != 1]\n", |
|
982 | 986 | " y=cluster_noise[y_position_column],\n", |
983 | 987 | " s=cluster_noise[size_column] * 60 + 2,\n", |
984 | 988 | " color='lightgrey',\n", |
985 | | - " alpha=0.4,\n", |
| 989 | + " alpha=0.3,\n", |
986 | 990 | " label='Noise'\n", |
987 | 991 | " )\n", |
988 | 992 | "\n", |
|
991 | 995 | " x=cluster_non_noise[x_position_column],\n", |
992 | 996 | " y=cluster_non_noise[y_position_column],\n", |
993 | 997 | " s=cluster_non_noise[size_column] * 60 + 2,\n", |
994 | | - " c=cluster_non_noise[cluster_label_column],\n", |
995 | | - " cmap='tab20',\n", |
996 | | - " alpha=0.7,\n", |
| 998 | + " c=cluster_non_noise[cluster_label_column] if not no_cluster_coloring else 'silver',\n", |
| 999 | + " cmap=cluster_color_map if not no_cluster_coloring else None,\n", |
| 1000 | + " alpha=0.5,\n", |
997 | 1001 | " label='Clusters'\n", |
998 | 1002 | " )\n", |
999 | 1003 | "\n", |
|
1085 | 1089 | " plot.annotate(\n", |
1086 | 1090 | " text=f\"#{index + 1}: {truncate(row[code_unit_column])} ({row[anomaly_score_column]:.3f})\",\n", |
1087 | 1091 | " xy=(row[x_position_column], row[y_position_column]),\n", |
1088 | | - " xytext=(5, 5 + (index % 5) * 10),\n", |
| 1092 | + " xytext=(5, 5 + (index % 5) * 15),\n", |
1089 | 1093 | " color='red',\n", |
1090 | 1094 | " **plot_annotation_style\n", |
1091 | 1095 | " )\n", |
|
1103 | 1107 | "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")" |
1104 | 1108 | ] |
1105 | 1109 | }, |
| 1110 | + { |
| 1111 | + "cell_type": "markdown", |
| 1112 | + "id": "77dee89a", |
| 1113 | + "metadata": {}, |
| 1114 | + "source": [ |
| 1115 | + "#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n", |
| 1116 | + "\n", |
| 1117 | + "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n", |
| 1118 | + "\n", |
| 1119 | + "- Red: detected anomalies \n", |
| 1120 | + "- Light blue: code units labeled as noise by HDBSCAN (light grey in the zoomed plot) \n", |
| 1121 | + "- Light grey: code units assigned to a cluster (silver in the zoomed plot) \n", |
| 1122 | + "- Size: Article Rank (larger = more important)" |
| 1123 | + ] |
| 1124 | + }, |
| 1125 | + { |
| 1126 | + "cell_type": "code", |
| 1127 | + "execution_count": null, |
| 1128 | + "id": "c30a29f8", |
| 1129 | + "metadata": {}, |
| 1130 | + "outputs": [], |
| 1131 | + "source": [ |
| 1132 | + "def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:\n", |
| 1133 | + " \"\"\"\n", |
| 1134 | + " Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.\n", |
| 1135 | + " see https://umap-learn.readthedocs.io\n", |
| 1136 | + " \"\"\"\n", |
| 1137 | + "\n", |
| 1138 | + " # Check if features are empty\n", |
| 1139 | + " if features is None or len(features) == 0:\n", |
| 1140 | + " print(\"No feature data available\")\n", |
| 1141 | + " return anomaly_detection_results\n", |
| 1142 | + "\n", |
| 1143 | + " # Check if features and anomaly_detection_results have compatible lengths\n", |
| 1144 | + " if features.shape[0] != anomaly_detection_results.shape[0]:\n", |
| 1145 | + " raise ValueError(\"Features and anomaly_detection_results must have the same number of samples.\")\n", |
| 1146 | + "\n", |
| 1147 | + " # Use UMAP to reduce the dimensionality to 2D for visualization\n", |
| 1148 | + " umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n", |
| 1149 | + " two_dimensional_features = umap_reducer.fit_transform(features)\n", |
| 1150 | + " \n", |
| 1151 | + " # Convert to dense numpy array (works for both sparse and dense input)\n", |
| 1152 | + " feature_coordinates = np.asarray(two_dimensional_features)\n", |
| 1153 | + "\n", |
| 1154 | + " anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0]\n", |
| 1155 | + " anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1]\n", |
| 1156 | + "\n", |
| 1157 | + " return anomaly_detection_results" |
| 1158 | + ] |
| 1159 | + }, |
| 1160 | + { |
| 1161 | + "cell_type": "code", |
| 1162 | + "execution_count": null, |
| 1163 | + "id": "f02b5dec", |
| 1164 | + "metadata": {}, |
| 1165 | + "outputs": [], |
| 1166 | + "source": [ |
| 1167 | + "def plot_features_with_anomalies(\n", |
| 1168 | + " clustering_visualization_dataframe: pd.DataFrame,\n", |
| 1169 | + " title_prefix: str,\n", |
| 1170 | + " code_unit_column: str = \"shortCodeUnitName\",\n", |
| 1171 | + " cluster_label_column: str = \"clusterLabel\",\n", |
| 1172 | + " anomaly_label_column: str = \"anomalyLabel\",\n", |
| 1173 | + " anomaly_score_column: str = \"anomalyScore\",\n", |
| 1174 | + " size_column: str = \"articleRank\",\n", |
| 1175 | + " x_position_column: str = 'embeddingVisualizationX',\n", |
| 1176 | + " y_position_column: str = 'embeddingVisualizationY',\n", |
| 1177 | + " annotate_top_n_anomalies: int = 10,\n", |
| 1178 | + " annotate_fully_top_n_anomalies: int = 3,\n", |
| 1179 | + ") -> None:\n", |
| 1180 | + " \n", |
| 1181 | + " if clustering_visualization_dataframe.empty:\n", |
| 1182 | + " print(\"No projected data to plot available\")\n", |
| 1183 | + " return\n", |
| 1184 | + " \n", |
| 1185 | + " def truncate(text: str, max_length: int = 22):\n", |
| 1186 | + " if len(text) <= max_length:\n", |
| 1187 | + " return text\n", |
| 1188 | + " return text[:max_length - 3] + \"...\"\n", |
| 1189 | + "\n", |
| 1190 | + "\n", |
| 1191 | + " cluster_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] == 1]\n", |
| 1192 | + " cluster_without_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] != 1]\n", |
| 1193 | + " cluster_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] == -1]\n", |
| 1194 | + " cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n", |
| 1195 | + "\n", |
| 1196 | + " plot.figure(figsize=(10, 10))\n", |
| 1197 | + " plot.title(f\"{title_prefix} (size={size_column}, red=anomaly, blue=noise)\", pad=20)\n", |
| 1198 | + "\n", |
| 1199 | + " # Plot noise (from clustering)\n", |
| 1200 | + " plot.scatter(\n", |
| 1201 | + " x=cluster_noise[x_position_column],\n", |
| 1202 | + " y=cluster_noise[y_position_column],\n", |
| 1203 | + " s=cluster_noise[size_column] * 20 + 2,\n", |
| 1204 | + " color='lightblue',\n", |
| 1205 | + " alpha=0.4,\n", |
| 1206 | + " label='Noise'\n", |
| 1207 | + " )\n", |
| 1208 | + "\n", |
| 1209 | + " # Plot clusters\n", |
| 1210 | + " plot.scatter(\n", |
| 1211 | + " x=cluster_non_noise[x_position_column],\n", |
| 1212 | + " y=cluster_non_noise[y_position_column],\n", |
| 1213 | + " s=cluster_non_noise[size_column] * 20 + 2,\n", |
| 1214 | + " color='lightgrey',\n", |
| 1215 | + " alpha=0.6,\n", |
| 1216 | + " label='Clusters'\n", |
| 1217 | + " )\n", |
| 1218 | + "\n", |
| 1219 | + " # Plot anomalies\n", |
| 1220 | + " plot.scatter(\n", |
| 1221 | + " x=cluster_anomalies[x_position_column],\n", |
| 1222 | + " y=cluster_anomalies[y_position_column],\n", |
| 1223 | + " s=cluster_anomalies[size_column] * 10 + 2,\n", |
| 1224 | + " c=cluster_anomalies[anomaly_score_column],\n", |
| 1225 | + " cmap=\"Reds\",\n", |
| 1226 | + " alpha=0.95,\n", |
| 1227 | + " label='Anomaly',\n", |
| 1228 | + " )\n", |
| 1229 | + "\n", |
| 1230 | + " # Annotate top anomalies\n", |
| 1231 | + " anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(annotate_top_n_anomalies)\n", |
| 1232 | + " anomalies_in_reversed_order = anomalies.iloc[::-1] # plot most important annotations last to overlap less important ones\n", |
| 1233 | + " for dataframe_index, row in anomalies_in_reversed_order.iterrows():\n", |
| 1234 | + " index = typing.cast(int, dataframe_index)\n", |
| 1235 | + " text = f\"{index + 1}\"\n", |
| 1236 | + " xytext = (5, 5)\n", |
| 1237 | + " if index < annotate_fully_top_n_anomalies:\n", |
| 1238 | + " text = f\"{text}: {truncate(row[code_unit_column])}\"\n", |
| 1239 | + " xytext = (5, 5 + (index % 4) * 12)\n", |
| 1240 | + "\n", |
| 1241 | + " plot.annotate(\n", |
| 1242 | + " text=text,\n", |
| 1243 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 1244 | + " xytext=xytext,\n", |
| 1245 | + " color='red',\n", |
| 1246 | + " **plot_annotation_style\n", |
| 1247 | + " )\n", |
| 1248 | + "\n", |
| 1249 | + " plot.show()" |
| 1250 | + ] |
| 1251 | + }, |
| 1252 | + { |
| 1253 | + "cell_type": "code", |
| 1254 | + "execution_count": null, |
| 1255 | + "id": "f6af9eb9", |
| 1256 | + "metadata": {}, |
| 1257 | + "outputs": [], |
| 1258 | + "source": [ |
| 1259 | + "java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n", |
| 1260 | + " java_package_anomaly_detection_features_prepared,\n", |
| 1261 | + " java_package_anomaly_detection_features\n", |
| 1262 | + ")" |
| 1263 | + ] |
| 1264 | + }, |
| 1265 | + { |
| 1266 | + "cell_type": "code", |
| 1267 | + "execution_count": null, |
| 1268 | + "id": "7a679562", |
| 1269 | + "metadata": {}, |
| 1270 | + "outputs": [], |
| 1271 | + "source": [ |
| 1272 | + "plot_features_with_anomalies(\n", |
| 1273 | + " java_package_anomaly_detection_features,\n", |
| 1274 | + " title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n", |
| 1275 | + " x_position_column='featureVisualizationX',\n", |
| 1276 | + " y_position_column='featureVisualizationY',\n", |
| 1277 | + " annotate_top_n_anomalies=5,\n", |
| 1278 | + ")" |
| 1279 | + ] |
| 1280 | + }, |
| 1281 | + { |
| 1282 | + "cell_type": "markdown", |
| 1283 | + "id": "f9832cc9", |
| 1284 | + "metadata": {}, |
| 1285 | + "source": [ |
| 1286 | + "##### 1.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)" |
| 1287 | + ] |
| 1288 | + }, |
| 1289 | + { |
| 1290 | + "cell_type": "code", |
| 1291 | + "execution_count": null, |
| 1292 | + "id": "acbe2034", |
| 1293 | + "metadata": {}, |
| 1294 | + "outputs": [], |
| 1295 | + "source": [ |
| 1296 | + "plot_anomalies(\n", |
| 1297 | + " java_package_anomaly_detection_features,\n", |
| 1298 | + " title_prefix=\"Java Package Anomalies (2D Feature Visualization Zoomed)\",\n", |
| 1299 | + " x_position_column='featureVisualizationX',\n", |
| 1300 | + " y_position_column='featureVisualizationY',\n", |
| 1301 | + " annotate_top_n_clusters=0,\n", |
| 1302 | + " annotate_top_n_non_anomalies=0,\n", |
| 1303 | + " percentile_of_distance_to_center=0.7,\n", |
| 1304 | + " no_cluster_coloring=True\n", |
| 1305 | + ")" |
| 1306 | + ] |
| 1307 | + }, |
1106 | 1308 | { |
1107 | 1309 | "cell_type": "markdown", |
1108 | 1310 | "id": "0f1b08b6", |
1109 | 1311 | "metadata": {}, |
1110 | 1312 | "source": [ |
1111 | | - "#### 1.4b Plot anomalies solely based on embeddings" |
| 1313 | + "#### 1.4c Plot anomalies solely based on embeddings" |
1112 | 1314 | ] |
1113 | 1315 | }, |
1114 | 1316 | { |
|
1914 | 2116 | "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")" |
1915 | 2117 | ] |
1916 | 2118 | }, |
| 2119 | + { |
| 2120 | + "cell_type": "markdown", |
| 2121 | + "id": "6eb52ab0", |
| 2122 | + "metadata": {}, |
| 2123 | + "source": [ |
| 2124 | + "#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n", |
| 2125 | + "\n", |
| 2126 | + "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n", |
| 2127 | + "\n", |
| 2128 | + "- Red: detected anomalies \n", |
| 2129 | + "- Light blue: code units labeled as noise by HDBSCAN (light grey in the zoomed plot) \n", |
| 2130 | + "- Light grey: code units assigned to a cluster (silver in the zoomed plot) \n", |
| 2131 | + "- Size: Article Rank (larger = more important)" |
| 2132 | + ] |
| 2133 | + }, |
| 2134 | + { |
| 2135 | + "cell_type": "code", |
| 2136 | + "execution_count": null, |
| 2137 | + "id": "129cced0", |
| 2138 | + "metadata": {}, |
| 2139 | + "outputs": [], |
| 2140 | + "source": [ |
| 2141 | + "java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n", |
| 2142 | + " java_type_anomaly_detection_features_prepared,\n", |
| 2143 | + " java_type_anomaly_detection_features\n", |
| 2144 | + ")" |
| 2145 | + ] |
| 2146 | + }, |
| 2147 | + { |
| 2148 | + "cell_type": "code", |
| 2149 | + "execution_count": null, |
| 2150 | + "id": "f05ef08c", |
| 2151 | + "metadata": {}, |
| 2152 | + "outputs": [], |
| 2153 | + "source": [ |
| 2154 | + "plot_features_with_anomalies(\n", |
| 2155 | + " java_type_anomaly_detection_features,\n", |
| 2156 | + " title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n", |
| 2157 | + " x_position_column='featureVisualizationX',\n", |
| 2158 | + " y_position_column='featureVisualizationY',\n", |
| 2159 | + " annotate_top_n_anomalies=30\n", |
| 2160 | + ")" |
| 2161 | + ] |
| 2162 | + }, |
| 2163 | + { |
| 2164 | + "cell_type": "markdown", |
| 2165 | + "id": "3472efed", |
| 2166 | + "metadata": {}, |
| 2167 | + "source": [ |
| 2168 | + "##### 2.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)" |
| 2169 | + ] |
| 2170 | + }, |
| 2171 | + { |
| 2172 | + "cell_type": "code", |
| 2173 | + "execution_count": null, |
| 2174 | + "id": "c44f04e9", |
| 2175 | + "metadata": {}, |
| 2176 | + "outputs": [], |
| 2177 | + "source": [ |
| 2178 | + "plot_anomalies(\n", |
| 2179 | + " java_type_anomaly_detection_features,\n", |
| 2180 | + " title_prefix=\"Java Type Anomalies (2D Feature Visualization Zoomed)\",\n", |
| 2181 | + " x_position_column='featureVisualizationX',\n", |
| 2182 | + " y_position_column='featureVisualizationY',\n", |
| 2183 | + " annotate_top_n_clusters=0,\n", |
| 2184 | + " annotate_top_n_non_anomalies=0,\n", |
| 2185 | + " percentile_of_distance_to_center=0.7,\n", |
| 2186 | + " no_cluster_coloring=True\n", |
| 2187 | + ")" |
| 2188 | + ] |
| 2189 | + }, |
1917 | 2190 | { |
1918 | 2191 | "cell_type": "markdown", |
1919 | 2192 | "id": "05275be7", |
1920 | 2193 | "metadata": {}, |
1921 | 2194 | "source": [ |
1922 | | - "#### 2.4.b Plot anomalies solely based on embeddings" |
| 2195 | + "#### 2.4c Plot anomalies solely based on embeddings" |
1923 | 2196 | ] |
1924 | 2197 | }, |
1925 | 2198 | { |
|
0 commit comments