From 30af36f23de6df29fed98d594155f5a0df7d545f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Schl=C3=BCter?= <10252511+oschlueter@users.noreply.github.com> Date: Thu, 18 Jul 2019 13:15:08 +0200 Subject: [PATCH 1/3] fixed calculation of denominator for jaccard similarity --- .../graphalgo/similarity/Similarities.java | 6 +- .../similarity/SimilaritiesTest.java | 91 +++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java index fa97b9925..e183fdb1f 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java @@ -45,10 +45,14 @@ public double jaccardSimilarity(@Name("vector1") List vector1, @Name("ve if (vector1 == null || vector2 == null) return 0; HashSet intersectionSet = new HashSet<>(vector1); + + // add size of vector1 and vector2 (ignoring duplicates) before calling retainAll(vector2) + long denom_sum = intersectionSet.size() + new HashSet<>(vector2).size(); + intersectionSet.retainAll(vector2); int intersection = intersectionSet.size(); - long denominator = vector1.size() + vector2.size() - intersection; + long denominator = denom_sum - intersection; return denominator == 0 ? 
0 : (double) intersection / denominator; } diff --git a/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java new file mode 100644 index 000000000..6f1eb539f --- /dev/null +++ b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java @@ -0,0 +1,91 @@ +package org.neo4j.graphalgo.similarity; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; + +import static org.junit.Assert.*; + +@RunWith(Parameterized.class) +public class SimilaritiesTest { + + private final List input; + + @Parameterized.Parameters(name = "{0}") + public static Collection> data() { + return Arrays.asList( + Arrays.asList(1, 2, 3), + Arrays.asList(1, 2, 3, 3), + Arrays.asList(104, 101, 108, 108, 111) + ); + } + + public SimilaritiesTest(List input) { + this.input = input; + } + + @Test + public void jaccardIdenticalInput() { + // given identical input + + // when + Similarities s = new Similarities(); + double result = s.jaccardSimilarity(input, input); + + // then + assertEquals(1.0, result, 0.01); + } + + @Test + public void cosineIdenticalInput() { + // given identical input + + // when + Similarities s = new Similarities(); + double result = s.cosineSimilarity(input, input); + + // then + assertEquals(1.0, result, 0.01); + } + + @Test + public void pearsonIdenticalInput() { + // given identical input + + // when + Similarities s = new Similarities(); + double result = s.pearsonSimilarity(input, input, Collections.emptyMap()); + + // then + assertEquals(1.0, result, 0.01); + } + + @Test + public void euclideanIdenticalInput() { + // given identical input + + // when + Similarities s = new Similarities(); + double result = s.euclideanSimilarity(input, input); + + // then + assertEquals(1.0, result, 0.01); + } + + @Test + public 
void overlapIdenticalInput() { + // given identical input + + // when + Similarities s = new Similarities(); + double result = s.euclideanSimilarity(input, input); + + // then + assertEquals(1.0, result, 0.01); + } +} From d24972416e06061e3cc69bfdad8a7d44fe35ea08 Mon Sep 17 00:00:00 2001 From: oschlueter <10252511+oschlueter@users.noreply.github.com> Date: Thu, 18 Jul 2019 14:26:13 +0200 Subject: [PATCH 2/3] Update SimilaritiesTest.java added license header to new test suite --- .../graphalgo/similarity/SimilaritiesTest.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java index 6f1eb539f..2ead1c20e 100644 --- a/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java +++ b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java @@ -1,3 +1,21 @@ +/* + * Copyright (c) 2017 "Neo4j, Inc." + * + * This file is part of Neo4j Graph Algorithms . + * + * Neo4j Graph Algorithms is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ package org.neo4j.graphalgo.similarity; import org.junit.Test; From dd218dff4fa94dff64740ef4d24495d8e9a64a62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20Schl=C3=BCter?= Date: Thu, 18 Jul 2019 15:06:00 +0200 Subject: [PATCH 3/3] actually test overlapSimilarity instead of euclideanSimilarity and fix calculation of overlapSimilarity as well --- .../java/org/neo4j/graphalgo/similarity/Similarities.java | 6 +++++- .../org/neo4j/graphalgo/similarity/SimilaritiesTest.java | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java index e183fdb1f..1ac68a60e 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java @@ -176,10 +176,14 @@ public double overlapSimilarity(@Name("vector1") List vector1, @Name("ve if (vector1 == null || vector2 == null) return 0; HashSet intersectionSet = new HashSet<>(vector1); + + long size1 = intersectionSet.size(); + long size2 = new HashSet<>(vector2).size(); + intersectionSet.retainAll(vector2); int intersection = intersectionSet.size(); - long denominator = Math.min(vector1.size(), vector2.size()); + long denominator = Math.min(size1, size2); return denominator == 0 ? 
0 : (double) intersection / denominator; } diff --git a/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java index 2ead1c20e..1e70469bb 100644 --- a/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java +++ b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilaritiesTest.java @@ -101,7 +101,7 @@ public void overlapIdenticalInput() { // when Similarities s = new Similarities(); - double result = s.euclideanSimilarity(input, input); + double result = s.overlapSimilarity(input, input); // then assertEquals(1.0, result, 0.01);