Skip to content

Commit 1debf85

Browse files
authored
Merge pull request #474 from JohT/feature/document-anomaly-detection-architecture
Document anomaly detection pipeline architecture
2 parents 5d8275e + a53aa52 commit 1debf85

File tree

11 files changed

+1434
-12
lines changed

11 files changed

+1434
-12
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Anomaly Detection Domain
2+
3+
This directory contains the implementation and resources related to the Anomaly Detection domain within the Code Graph Analysis Pipeline project.
4+
5+
## Entry Points
6+
7+
The following scripts serve as entry points for various anomaly detection tasks and reports. They will be invoked by [AllReports.sh](./../../scripts/reports/compilations/AllReports.sh) and its sub-scripts dynamically by their names.
8+
9+
- [anomalyDetectionCsv.sh](./anomalyDetectionCsv.sh): Entry point for CSV reports based solely on Graph queries.
10+
- [anomalyDetectionPython.sh](./anomalyDetectionPython.sh): Entry point for Python-based anomaly detection tasks and reports.
11+
- [anomalyDetectionVisualization.sh](./anomalyDetectionVisualization.sh): Entry point for Graph visualization reports.
12+
- [anomalyDetectionMarkdown.sh](./anomalyDetectionMarkdown.sh): Entry point for generating the Markdown summary report.
13+
14+
## Folder Structure
15+
16+
- [documentation](./documentation): Contains documentation including architecture diagrams.
17+
- [explore](./explore/): Jupyter notebooks for interactive, exploratory anomaly detection analysis.
18+
- [features](./features/): Cypher queries to extract features and run graph algorithms relevant for anomaly detection.
19+
- [graphs](./graphs/): Cypher queries and GraphViz templates for Graph visualizations related to anomaly detection.
20+
- [labels](./labels/): Cypher queries to label nodes that represent specific archetypes.
21+
- [queries](./queries/): Cypher queries to identify anomalies based on various (deterministic/explainable) criteria.
22+
- [reset](./reset/): Cypher queries to reset the graph database state related to anomaly detection.
23+
- [summary](./summary/): Markdown templates and resources for generating the summary report.
24+
25+
## Pipeline Architecture Overview
26+
27+
![Anomaly Detection Architecture](./documentation/Architecture.svg)
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
// Architecture of the anomaly detection pipeline, laid out left to right.
//
// Shape legend as used in this graph:
//   box      - processing step (algorithm, tuner, transformer, model)
//   note     - explainer step (SHAP cluster only)
//   diamond  - hyperparameter handed from a tuner to an algorithm
//   ellipse  - produced value or feature
// Dashed edges carry scores or reference results back to a tuner or proxy model;
// solid edges are the forward data flow.
digraph AnomalyDetectionPipeline {
    rankdir=LR;
    node [fontname="Helvetica", fontsize=10];

    // Leiden community detection
    subgraph cluster_leiden {
        label="Leiden Community Detection";
        style=filled; color=lightblue;
        node [shape=box, style=filled, fillcolor=white];

        Tuning_Leiden [label="Tuning\n(Optuna)"];
        Leiden_Gamma [label="gamma", shape=diamond];
        Leiden_Theta [label="theta", shape=diamond];
        Leiden_Algorithm [label="Leiden Community Detection"];
        CommunityId [label="Community", shape=ellipse];
    }

    // --- Leiden Community Detection relationships ---
    Tuning_Leiden -> Leiden_Gamma;
    Tuning_Leiden -> Leiden_Theta;
    Leiden_Gamma -> Leiden_Algorithm;
    Leiden_Theta -> Leiden_Algorithm;
    // Two separate feedback edges: one per metric reported back to the tuner.
    Leiden_Algorithm -> Tuning_Leiden [label="modularity", style=dashed];
    Leiden_Algorithm -> Tuning_Leiden [label="size", style=dashed];
    Leiden_Algorithm -> CommunityId;

    // Fast Random Projection (FastRP)
    subgraph cluster_fastRP {
        label="Fast Random Projection (FastRP)";
        style=filled; color=lightpink;
        node [shape=box, style=filled, fillcolor=white];

        Tuning_FastRP [label="Tuning\n(Optuna)"];
        FastRP_Dimension [label="dimension", shape=diamond];
        FastRP_Normalization_Strength [label="normalization strength", shape=diamond];
        // Node ID keeps its original spelling so existing edges stay valid;
        // only the displayed label is corrected to "fourth".
        FastRP_Forth_Iteration_Weight [label="fourth iteration weight", shape=diamond];
        FastRP_Algorithm [label="FastRP"];
        NodeEmbeddings [label="Node Embeddings", shape=ellipse];
    }

    // --- FastRP relationships ---
    Tuning_FastRP -> FastRP_Dimension;
    Tuning_FastRP -> FastRP_Normalization_Strength;
    Tuning_FastRP -> FastRP_Forth_Iteration_Weight;
    FastRP_Dimension -> FastRP_Algorithm;
    FastRP_Normalization_Strength -> FastRP_Algorithm;
    FastRP_Forth_Iteration_Weight -> FastRP_Algorithm;
    FastRP_Algorithm -> Tuning_FastRP [label="adjusted mutual info score\n(incl. preview clustering)", style=dashed];
    FastRP_Algorithm -> NodeEmbeddings;

    // Uniform Manifold Approximation and Projection (UMAP)
    subgraph cluster_UMAP {
        label="Uniform Manifold Approximation and Projection (UMAP)\nDimensionality Reduction for Visualization";
        style=filled; color=lightgrey;
        node [shape=box, style=filled, fillcolor=white];

        UMAP_Algorithm [label="UMAP"];
        UMAP_Coordinates [label="2D Coordinates", shape=ellipse];
    }

    // --- UMAP relationships ---
    NodeEmbeddings -> UMAP_Algorithm;
    UMAP_Algorithm -> UMAP_Coordinates;

    // HDBSCAN clustering and tuning
    subgraph cluster_hdbscan {
        label="Hierarchical Density-Based Spatial Clustering (HDBSCAN)";
        style=filled; color=lightgoldenrod;
        node [shape=box, style=filled, fillcolor=white];

        Tuning_HDBSCAN [label="Tuning\n(Optuna)"];
        HDBSCAN_Node [label="HDBSCAN"];
        HDBSCAN_Min_Cluster_Size [label="Min Cluster Size", shape=diamond];
        HDBSCAN_Min_Samples [label="Min Samples", shape=diamond];

        ClusterLabel [label="Label", shape=ellipse];
        ClusterRadius [label="Radius\n(avg,max)", shape=ellipse];
        ClusterSize [label="Size", shape=ellipse];
        NormDistToMedoid [label="Normalized Distance\nTo Medoid", shape=ellipse];
        ClusterNoise [label="Noise\n(label=-1)", shape=ellipse];
        ClusterProbability [label="Probability", shape=ellipse];
        ClusterApproximationOutlierScore [label="Approximation\nOutlierScore\n(= 1 - Probability)", shape=ellipse];
    }

    // --- Inputs into HDBSCAN ---
    CommunityId -> Tuning_HDBSCAN [label="reference"];
    NodeEmbeddings -> HDBSCAN_Node;

    // --- HDBSCAN hyperparameters chosen by the tuner ---
    Tuning_HDBSCAN -> HDBSCAN_Min_Cluster_Size;
    Tuning_HDBSCAN -> HDBSCAN_Min_Samples;
    HDBSCAN_Min_Cluster_Size -> HDBSCAN_Node;
    HDBSCAN_Min_Samples -> HDBSCAN_Node;

    // --- Score fed back to the HDBSCAN tuner ---
    HDBSCAN_Node -> Tuning_HDBSCAN [label="adjusted mutual info score", style=dashed];

    // --- HDBSCAN outputs (cluster features) ---
    HDBSCAN_Node -> ClusterLabel;
    HDBSCAN_Node -> ClusterNoise;
    HDBSCAN_Node -> ClusterRadius;
    HDBSCAN_Node -> ClusterSize;
    HDBSCAN_Node -> NormDistToMedoid;
    HDBSCAN_Node -> ClusterProbability;
    HDBSCAN_Node -> ClusterApproximationOutlierScore;

    // Graph algorithm based features
    subgraph cluster_graph_features {
        label="Graph (Algorithm) Features";
        style=filled; color=lightcyan;
        node [shape=ellipse, style=filled, fillcolor=white];

        ArticleRank [label="ArticleRank"];
        PageRank [label="PageRank"];
        PageRank_minus_ArticleRank [label="PageRank -\nArticleRank"];
        BetweennessCentrality [label="Betweenness\nCentrality"];
        LocalClusteringCoefficient [label="Local Clustering\nCoefficient"];
        Degree [label="Degree\n(in, out, sum)"];
    }

    // Anomaly detection model area (visually emphasized with a thick green border)
    subgraph cluster_anomaly {
        label="Anomaly Detection Model";
        style=filled; color=lightgreen; penwidth=4; pencolor=green; margin="50,50";
        node [shape=box, style=filled, fillcolor=white];

        TuningAnomaly [label="Tuning\n(Optuna)"];
        IsolationMinCluster [label="Min Cluster Size", shape=diamond];
        IsolationEstimators [label="n estimators", shape=diamond];

        ProxyEstimators [label="n estimators", shape=diamond];
        ProxyMaxDepth [label="max depth", shape=diamond];

        AnomalyStandardizer [label="Standardizer"];
        AnomalyPCA [label="Principal Component\nAnalysis (PCA)"];
        IsolationForest [label="Isolation Forest\nAnomaly Detector", margin="0.4,0.4"];
        ProxyRandomForest [label="RandomForest\n(Proxy)"];
        AnomalyScore [label="Score", shape=ellipse];
        AnomalyLabel [label="Label", shape=ellipse];
    }

    // --- Embeddings feed the anomaly model (via PCA) ---
    NodeEmbeddings -> AnomalyPCA;

    // --- HDBSCAN-derived features feed the anomaly model (via the standardizer) ---
    ClusterRadius -> AnomalyStandardizer;
    NormDistToMedoid -> AnomalyStandardizer;
    ClusterApproximationOutlierScore -> AnomalyStandardizer;

    // --- Graph algorithm features feed the anomaly model (via the standardizer) ---
    ArticleRank -> AnomalyStandardizer;
    PageRank -> AnomalyStandardizer;
    PageRank_minus_ArticleRank -> AnomalyStandardizer;
    BetweennessCentrality -> AnomalyStandardizer;
    LocalClusteringCoefficient -> AnomalyStandardizer;
    Degree -> AnomalyStandardizer;

    // --- Isolation Forest hyperparameters chosen by the tuner ---
    TuningAnomaly -> IsolationMinCluster;
    TuningAnomaly -> IsolationEstimators;
    IsolationMinCluster -> IsolationForest;
    IsolationEstimators -> IsolationForest;

    // --- Proxy RandomForest hyperparameters chosen by the same tuner ---
    TuningAnomaly -> ProxyEstimators;
    TuningAnomaly -> ProxyMaxDepth;
    ProxyEstimators -> ProxyRandomForest;
    ProxyMaxDepth -> ProxyRandomForest;

    // --- Model inputs and the tuning loop ---
    // The Isolation Forest result serves as the reference for the proxy RandomForest,
    // whose cross-validated f1 score is reported back to the tuner.
    AnomalyStandardizer -> IsolationForest;
    AnomalyPCA -> IsolationForest;
    IsolationForest -> ProxyRandomForest [label="reference", style=dashed];
    ProxyRandomForest -> TuningAnomaly [label="f1 score\n(cross validation)", style=dashed];

    // --- Anomaly model outputs ---
    IsolationForest -> AnomalyLabel;
    IsolationForest -> AnomalyScore;

    // Explainable AI / SHAP
    subgraph cluster_explainability {
        label="Explainable AI (SHAP)";
        style=filled; color=lavender;
        node [shape=note, style=filled, fillcolor=white];

        SHAP [label="SHAP TreeExplainer"];

        SHAP_Values [label="Top SHAP Values", shape=ellipse];
        SHAP_Features [label="Top Features", shape=ellipse];
        SHAP_Embedding_Sum [label="Node Embeddings\nSHAP Sum", shape=ellipse];
    }

    // --- Explainability connections (proxy RandomForest -> SHAP -> outputs) ---
    ProxyRandomForest -> SHAP;
    SHAP -> SHAP_Values;
    SHAP -> SHAP_Features;
    SHAP -> SHAP_Embedding_Sum;

}

0 commit comments

Comments
 (0)