diff --git a/README.md b/README.md index b4a4764..5240c95 100644 --- a/README.md +++ b/README.md @@ -121,3 +121,29 @@ location = {Virtual Event, Singapore}, series = {KDD '21} } ``` + +## Testing + +### Python Tests +Run the Python tests to verify the implementation: + +```bash +# From the project root directory +PYTHONPATH=. python3 -m pytest tests/test_alphacore.py -v +``` + +### R Tests +Run the R tests to verify the R implementation: + +```r +# From the project root directory +library(testthat) +test_file("tests/testthat/test_alphaCore.R") +``` + +Or using Rscript: +```bash +# From the project root directory +Rscript -e "library(testthat); test_file('tests/testthat/test_alphaCore.R')" +``` + diff --git a/algorithms/alphaCore.R b/algorithms/alphaCore.R index 1c0eec4..630d446 100644 --- a/algorithms/alphaCore.R +++ b/algorithms/alphaCore.R @@ -7,125 +7,148 @@ #' mahalanobis data depth function at the origin. #' #' @param input_graph An igraph object of a directed graph -#' @param featureComputeFun A function that converts a node's edges (with -#' attributes) into node features. Default computes in-degree and strength +#' @param features A list of selected node features; default is ["all"] #' @param stepSize defines the stepsize of each iteration as percentage of node count +#' Higher values (e.g., > 0.1) speed up execution but yield lower ranking resolution, while lower values (e.g., < 0.1) +#' provide finer ranking but increase runtime. #' @param startEpsilon the epsilon to start with. Removes all nodes with depth>epsilon at start +#' Higher values (e.g., > 1.0) remove more nodes early, emphasizing denser cores, +#' whereas lower values (e.g., < 1.0) allow for a more gradual refinement of the ranking. #' @param exponentialDecay dynamically reduces the step size, to have high cores with few nodes +#' which results in higher resolution for the highest cores. Set to TRUE if node ranking precision is more important than speed. #' @return A dataframe of node name and alpha value indicating the ranking alphaCore <- function(input_graph, - featureComputeFun = computeNodeFeaturFun, + features = "all", stepSize = 0.1, startEpsilon = 1, exponentialDecay = TRUE) { - - #1 - node_features = featureComputeFun(input_graph) - #2 - cov_mat_inv <- solve(cov(node_features[, -c("node")]), tol = NULL) - # temporary ---- - cov_mat <- cov(node_features[, -c("node")]) - # temporary ---- - #3 - node_features$depth <- mhdOrigin(node_features[, !c("node")], cov_mat_inv) - #4 + # 1 Extract numerical node features from the graph + node_features <- extractFeatures(input_graph, features) + + # 2 Compute covariance matrix + cov_mat <- cov(node_features[, -1, with = FALSE]) + cov_mat_inv <- tryCatch( + solve(cov_mat), + error = function(e1) { + message("Covariance singular; falling back to pseudo-inverse.") + mat <- MASS::ginv(cov_mat) + if (nrow(mat) == 0 || ncol(mat) == 0) { + stop("Could not invert covariance: no valid features remain.") + } + mat + } + ) + # 3 + node_features$depth <- mhdOrigin(node_features[, -1, with = FALSE], cov_mat_inv) + + # 4 epsilon <- startEpsilon - #5 + # 5 result <- node_features[, c("node")] result$alpha <- 0 - #6 + # 6 result$batch <- 0 - #7 - alpha <- 1 - epsilon # modified - #8 - alpha_prev <- alpha # modified - #9 + # 7 + alpha <- 1 - epsilon + # 8 + alpha_prev <- alpha + # 9 batch_ID <- 0 - #10 + # 10 while (vcount(input_graph) > 0) { - #11 + # 11 repeat{ - #12: for each depth geq epsilon - #13 + # 12: for each depth geq epsilon + # 13 nodes <- node_features[depth >= epsilon]$node result[node %in% nodes, alpha := alpha_prev] - #14 result[node %in% nodes, batch := batch_ID] - #15 input_graph <- delete_vertices(input_graph, nodes) - #16 + # 16 batch_ID <- batch_ID + 1 - #17 - node_features = featureComputeFun(input_graph) - #18 - node_features$depth <- mhdOrigin(node_features[, !c("node")], cov_mat_inv) - - if(length(nodes) == 0){ # 19 : if there doesn't exist a vertex with depth geq epsilon + # 17 + node_features <- extractFeatures(input_graph, features) + # 18 + node_features$depth <- mhdOrigin(node_features[, -1, with = FALSE], cov_mat_inv) + + if (length(nodes) == 0) { # 19 : if there doesn't exist a vertex with depth geq epsilon break } } - #20 + # 20 alpha_prev <- alpha - #22 # reduce epsilon with strategy - if(exponentialDecay) { + # 22 # reduce epsilon with strategy + if (exponentialDecay) { localStepSize <- ceiling(vcount(input_graph) * stepSize) epsilon <- min(head(node_features[order(depth, decreasing = T)], localStepSize)$depth) } else { epsilon <- epsilon - stepSize } - #21 + # 21 alpha <- 1 - epsilon - } return(result) - # } ############################## AUX FUNCTIONS ################################ +#' Compute default node features for alphaCore +#' +#' \code{computeNodeFeaturFun} returns a data.table with indegree and strength +#' +#' @param graph An igraph object +#' @return A data.table with node names, indegree and strength computeNodeFeaturFun <- function(graph) { # get node names nodes <- V(graph)$name # compute indegree indegree <- degree(graph, mode = "in") # compute sum of incoming weights (a.k.a. "strength") - strength <- strength(graph, mode="in") + strength <- strength(graph, mode = "in") # combine results - nodeFeatures <- data.table(node=nodes, indegree=indegree, strength=strength) + nodeFeatures <- data.table(node = nodes, indegree = indegree, strength = strength) return(nodeFeatures) } -computeNodeFeatures <- function(graph, features) { + +#' Compute specified node features +#' +#' \code{computeNodeFeatures} calculates requested network features for each node +#' +#' @param graph An igraph object +#' @param features A character vector of features to compute +#' @return A data.table with node names and the requested features +computeNodeFeatures <- function(graph, features = c("indegree", "strength")) { nodeFeatures <- data.table(node = V(graph)$name) - if("degree" %in% features) { + if ("degree" %in% features) { nodeFeatures[, indegree := degree(graph, mode = "all")] } - if("indegree" %in% features) { + if ("indegree" %in% features) { nodeFeatures[, indegree := degree(graph, mode = "in")] } - if("outdegree" %in% features) { + if ("outdegree" %in% features) { nodeFeatures[, outdegree := degree(graph, mode = "out")] } - if("strength" %in% features) { + if ("strength" %in% features) { nodeFeatures[, strength := strength(graph, mode = "all")] } - if("instrength" %in% features) { + if ("instrength" %in% features) { nodeFeatures[, instrength := strength(graph, mode = "in")] } - if("outstrength" %in% features) { + if ("outstrength" %in% features) { nodeFeatures[, outstrength := strength(graph, mode = "out")] } - if("triangles" %in% features) { + if ("triangles" %in% features) { nodeFeatures[, triangles := count_triangles(graph)] } - if("neighborhoodsize" %in% features) { + if ("neighborhoodsize" %in% features) { nodeFeatures[, neighborhoodsize := neighborhood.size(graph, mode = "all", mindist = 1)] } - if("inneighborhoodsize" %in% features) { + if ("inneighborhoodsize" %in% features) { nodeFeatures[, inneighborhoodsize := neighborhood.size(graph, mode = "in", mindist = 1)] } - if("outneighborhoodsize" %in% features) { + if ("outneighborhoodsize" %in% features) { nodeFeatures[, outneighborhoodsize := neighborhood.size(graph, mode = "out", mindist = 1)] } return(nodeFeatures[]) @@ -138,11 +161,63 @@ customNodeFeatures <- function(features) { } +#' Extract numerical node features from a graph +#' +#' \code{extractFeatures} returns a data.table of node features +#' +#' Extracts numerical vertex attributes from the graph. If specified features +#' don't exist as vertex attributes, they will be computed on demand. +#' +#' @param graph An igraph object +#' @param features Features to extract: "all" for all numeric attributes, or a character vector of specific features +#' @return A data.table with node names and requested features +extractFeatures <- function(graph, features = "all") { + # If the user explicitly requested a feature list, compute those + if (!identical(features, "all")) { + df <- computeNodeFeatures(graph, features = features) + missing <- setdiff(features, names(df)) + if (length(missing) > 0L) { + stop( + "Vertex attribute(s) not found or not numeric: ", + paste(missing, collapse = ", "), + call. = FALSE + ) + } + return(df) + } + + # Otherwise, pull *all* numeric vertex attributes off the graph + attrs <- vertex.attributes(graph) + numeric_names <- names(attrs)[sapply(attrs, is.numeric)] + + # If none are present, fall back to the original indegree+strength + if (length(numeric_names) == 0L) { + message( + "No numeric vertex attributes found; reverting to indegree+strength." + ) + return(computeNodeFeaturFun(graph)) + } + + # Build a data.table: one column 'node', then each numeric attribute + dt <- data.table(node = V(graph)$name) + for (nm in numeric_names) dt[[nm]] <- attrs[[nm]] + return(dt[]) +} + + + +#' Calculate the Mahalanobis depth to the origin +#' +#' \code{mhdOrigin} calculates the Mahalanobis depth of data points to the origin +#' +#' @param data A matrix or data.frame of multivariate data +#' @param sigma_inv The inverse covariance matrix +#' @return A vector of depth values mhdOrigin <- function(data, sigma_inv) { - origin <- rep(0,ncol(data)) # c(0,0,...) + origin <- rep(0, ncol(data)) # c(0,0,...) # We reuse the Mahalanobis distance implementation of the stats package, # which returns the squared Mahalanobis distance: (x - μ)' Σ^-1 (x - μ) = D^2 # To arrive at the Mahalanobis Depth to the origin, we only need to add 1 and # take the reciprocal. - return((1 + stats::mahalanobis(data, center = origin, sigma_inv, inverted = TRUE))^-1) + return((1 + pmax(stats::mahalanobis(data, center = origin, sigma_inv, inverted = TRUE), 0))^-1) } diff --git a/algorithms/alphacore.py b/algorithms/alphacore.py index db29f7b..36cbe92 100644 --- a/algorithms/alphacore.py +++ b/algorithms/alphacore.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import math +import warnings # alphaCore ranking of nodes in a complex, directed network # @@ -9,20 +10,30 @@ # edge attributes and optionally static node features using the # mahalanobis data depth function at the origin. # -# @param graph A networkx directed graph +# @param graph A networkx directed graph. +# @param features A list of selected node features; default is ["all"] # @param stepSize Defines the stepsize of each iteration as percentage of node count +# Higher values (>0.1) speed up execution but lower ranking resolution. +# Lower values (<0.1) provide finer ranking but increase runtime. # @param startEpsi The epsilon to start with. Removes all nodes with depth>epsilon at start +# Higher values (>1.0) remove more nodes early, emphasizing denser cores. +# Lower values (<1.0) refine ranking progressively. # @param expoDecay Dynamically reduces the step size, to have high cores with few nodes if true +# Recommended when ranking precision is more important than speed. # @return A dataframe of columns nodeID, alpha value, and batchID -def alphaCore(graph, stepSize = 0.1, startEpsi = 1, expoDecay = False): - #1 - data = computeNodeFeatures(graph) + +def alphaCore(graph, features=["all"], stepSize = 0.1, startEpsi = 1, expoDecay = False): + #1 Extract numerical node features from the graph; if unavailable, use computeNodeFeatures. + data = extractFeatures(graph, features) + #2 compute cov matrix to be used for all remainder of depth calculations matrix = data.drop("nodeID", axis=1) # convert dataframe to numeric matrix by removing first column containing nodeID cov = np.cov(matrix.values.T) + #3 calculate the Mahalanobis depth and add it to the respective row of the dataframe data['mahal'] = calculateMahalFromCenter(data, 0, cov) - #4 + + #4 epsi = startEpsi #5 node = [] @@ -35,12 +46,13 @@ def alphaCore(graph, stepSize = 0.1, startEpsi = 1, expoDecay = False): alphaPrev = alpha #9 batchID = 0 - #10 + + #10 while graph.number_of_nodes() > 0: #11 while True: depthFound = False # to simulate do-while loop; used to check if there exists a node with depth >= epsi on current iteration - #12 + #12 for row in data.itertuples(): if row.mahal >= epsi: depthFound = True @@ -57,12 +69,12 @@ def alphaCore(graph, stepSize = 0.1, startEpsi = 1, expoDecay = False): if graph.number_of_nodes() == 0 or not depthFound: break #17 - data = computeNodeFeatures(graph) # recompute node properties + data = extractFeatures(graph, features) # recompute node properties #18 data['mahal'] = calculateMahalFromCenter(data, 0, cov) # recompute depth - #20 + #20 alphaPrev = alpha - #21 + #21 if expoDecay and graph.number_of_nodes() > 0: # exponential decay localStepSize = math.ceil(graph.number_of_nodes() * stepSize) data = data.sort_values(ascending=False, by=['mahal']) @@ -75,6 +87,55 @@ def alphaCore(graph, stepSize = 0.1, startEpsi = 1, expoDecay = False): return pd.DataFrame({'nodeID': node, 'alpha': alphaVals, 'batchID': batch}) +# Extract numerical node features from the graph. +# +# @param graph A NetworkX directed graph. +# @param features A list of feature names to extract. If None, defaults to ["all"]. +# @return A DataFrame containing extracted numerical node features. +def extractFeatures(graph, features=["all"]): + #Collect all node attributes and identify numerical features + all_features = set() + numeric_features = set() + selected_features = set() + data = {} + + for node_id, attributes in graph.nodes(data=True): + for key, value in attributes.items(): + all_features.add(key) # Track all available features + if isinstance(value, (int, float)): + numeric_features.add(key) # Track only numeric features + + #Determine the features to extract + if features == ["all"]: + if not numeric_features: + warnings.warn("No numerical node features found. Reverting to default AlphaCore node features.", stacklevel=2) + return computeNodeFeatures(graph) + selected_features = numeric_features # Use all numeric features + else: + for feature in features: + if feature not in all_features: + warnings.warn(f"Debug: Available features: {all_features}", stacklevel=2) + raise ValueError(f"Feature '{feature}' not found in graph nodes. Please check your graph.") + if feature in numeric_features: + selected_features.add(feature) + + if not selected_features: + warnings.warn("None of the selected features contain numerical values. Reverting to default AlphaCore node features.", stacklevel=2) + return computeNodeFeatures(graph) + + #Extract feature values for each node + for node_id, attributes in graph.nodes(data=True): + node_features = {} + for feature in selected_features: + if feature in attributes: + node_features[feature] = attributes.get(feature) + data[node_id] = node_features + + #Convert to DataFrame + df = pd.DataFrame.from_dict(data, orient="index").reset_index().rename(columns={"index": "nodeID"}) + return df + + # Computes the node features of a given directed graph and returns a dataframe containing the features of each node # @@ -112,8 +173,14 @@ def computeNodeFeatures(graph): def calculateMahalFromCenter(data, center, cov): matrix = data.drop("nodeID", axis=1) # convert dataframe to numeric matrix by removing first column containing nodeID x_minus_center = matrix.values - center - x_minus_center_transposed = (matrix.values - center).T - inv_cov = np.linalg.inv(cov) + x_minus_center_transposed = x_minus_center.T + # Try computing the inverse of the covariance matrix; if it fails, fall back to the pseudo-inverse for stability. + try: + inv_cov = np.linalg.inv(cov) + except np.linalg.LinAlgError: + warnings.warn("Covariance matrix is not invertible, using Moore-Penrose pseudo-inverse instead.", stacklevel=2) + inv_cov = np.linalg.pinv(cov) left = np.dot(x_minus_center, inv_cov) mahal = np.dot(left, x_minus_center_transposed) - return np.diagonal(np.reciprocal(1+mahal)) # diagonal contains the depth vaulues corresponding to each row from matrix + mahal_diag = np.diagonal(mahal) # diagonal contains the depth vaulues corresponding to each row from matrix + return np.reciprocal(1 + np.maximum(mahal_diag, 0)) # Ensure stability diff --git a/tests/test_alphacore.py b/tests/test_alphacore.py new file mode 100644 index 0000000..409a227 --- /dev/null +++ b/tests/test_alphacore.py @@ -0,0 +1,33 @@ +import unittest +import networkx as nx +import numpy as np +import pandas as pd +from algorithms.alphacore import alphaCore + +class TestAlphaCore(unittest.TestCase): + def setUp(self): + # Set random seed for reproducibility + np.random.seed(1) + + def test_default_parameters(self): + """Test with default parameters (200-node graph)""" + g = nx.erdos_renyi_graph(200, 2/200, directed=True, seed=1) + # Add edge weights + for idx, (u, v, w) in enumerate(g.edges(data=True)): + w['value'] = idx + + # Run alphaCore + result = alphaCore(g) + + # Check output structure + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(list(result.columns), ['nodeID', 'alpha', 'batchID']) + self.assertEqual(len(result), 200) + self.assertTrue(all((result['alpha'] >= 0) & (result['alpha'] <= 1))) + self.assertTrue(all(result['batchID'] >= 0)) + + print(result.head()) + print(result.tail()) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/testthat/test_alphaCore.R b/tests/testthat/test_alphaCore.R new file mode 100644 index 0000000..598bc0f --- /dev/null +++ b/tests/testthat/test_alphaCore.R @@ -0,0 +1,49 @@ +# test_alphaCore.R +library(testthat) +library(igraph) +library(data.table) +source("../../algorithms/alphaCore.R") + + +# Test 1: Default parameters +test_that("Default parameters: ", { + # Set seed for reproducibility + set.seed(1) + + g <- erdos.renyi.game(200, 2 / 200, directed = T) + E(g)$weight <- 1:ecount(g) + V(g)$name <- paste("v", 1:vcount(g), sep = "") + + # Run alphaCore + result <- alphaCore(g) + + # Test that result has correct structure + expect_true(all(c("node", "alpha", "batch") %in% names(result))) + expect_equal(nrow(result), 200) + expect_true(all(result$alpha >= 0 & result$alpha <= 1)) + expect_true(all(result$batch >= 0)) + + print(head(result, 5)) + print(tail(result, 5)) +}) + +# Test 2: Custom features +test_that("Custom features: ", { + # Set seed for reproducibility + set.seed(1) + + g <- erdos.renyi.game(200, 2 / 200, directed = T) + E(g)$weight <- 1:ecount(g) + V(g)$name <- paste("v", 1:vcount(g), sep = "") + + result <- alphaCore(g, features = c("indegree", "triangles")) + + # Test that result has correct structure + expect_true(all(c("node", "alpha", "batch") %in% names(result))) + expect_equal(nrow(result), 200) + expect_true(all(result$alpha >= 0 & result$alpha <= 1)) + expect_true(all(result$batch >= 0)) + + print(head(result, 6)) + print(tail(result, 5)) +})